Skip to content

feat: bigframes.bigquery.json_extract #868

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,41 @@ def json_set(
return series


def json_extract(
    series: series.Series,
    json_path: str,
) -> series.Series:
    """Extracts the JSON value located at ``json_path`` from each element of a Series.

    The extracted value is returned as a SQL JSON-formatted `STRING` or `JSON`
    value. Single quotes and brackets are used to escape invalid JSONPath
    characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}'])
        >>> bbq.json_extract(s, json_path="$.class")
        0    "{\\\"students\\\":[{\\\"id\\\":5},{\\\"id\\\":12}]}"
        dtype: string

    Args:
        series (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path identifying the data that you want to obtain from
            the input.

    Returns:
        bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.

    Raises:
        TypeError: If the input Series does not hold JSON objects or
            JSON-formatted strings.
    """
    # Build the op first so the unary application below reads as a single step.
    extract_op = ops.JSONExtract(json_path=json_path)
    return series._apply_unary_op(extract_op)


# Search functions defined from
# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions


def vector_search(
base_table: str,
column_to_search: str,
Expand Down
12 changes: 12 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,11 @@ def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
).to_expr()


@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
    """Compile a ``JSONExtract`` op into a call to the ``json_extract`` builtin.

    Args:
        x: The ibis value holding the JSON input.
        op: The op instance; its ``json_path`` selects the value to extract.

    Returns:
        The ibis expression produced by the ``json_extract`` builtin.
    """
    path = op.json_path
    return json_extract(json_obj=x, json_path=path)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down Expand Up @@ -1549,6 +1554,13 @@ def json_set(
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


@ibis.udf.scalar.builtin(name="json_extract")
def json_extract(
    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
) -> ibis_dtypes.JSON:
    """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.

    This is only a signature declaration for the builtin scalar function
    registered under the name ``json_extract``; it intentionally has no Python
    body beyond this docstring.

    Args:
        json_obj: The JSON input to extract from.
        json_path: The JSONPath expression selecting the value to extract.

    Returns:
        The extracted value, declared here as an ibis JSON value.
    """


@ibis.udf.scalar.builtin(name="ML.DISTANCE")
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""
16 changes: 16 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,22 @@ def output_type(self, *input_types):
return dtypes.STRING_DTYPE


## JSON Ops
@dataclasses.dataclass(frozen=True)
class JSONExtract(UnaryOp):
    """Unary op that extracts the value at ``json_path`` from a JSON-like input."""

    name: typing.ClassVar[str] = "json_extract"
    # JSONPath expression identifying the value to extract from the input.
    json_path: str

    def output_type(self, *input_types):
        """Return the output dtype, which is the same as the input dtype.

        Args:
            *input_types: The operand dtypes; only the first one is inspected.

        Returns:
            The first input dtype, unchanged.

        Raises:
            TypeError: If the first input dtype is not JSON-like according to
                ``dtypes.is_json_like``.
        """
        input_type = input_types[0]
        if not dtypes.is_json_like(input_type):
            raise TypeError(
                "Input type must be a valid JSON object or JSON-formatted string type."
                f" Received type: {input_type}"
            )
        return input_type


# Binary Ops
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
Expand Down
27 changes: 27 additions & 0 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,30 @@ def test_json_set_w_invalid_value_type():
def test_json_set_w_invalid_series_type():
    """json_set must raise TypeError for a Series of integers."""
    with pytest.raises(TypeError):
        int_series = bpd.Series([1, 2])
        path_value_pairs = [("$.a", 1)]
        bbq.json_set(int_series, json_path_value_pairs=path_value_pairs)


def test_json_extract_from_json():
    """Extracting ``$.a.b`` from JSON input yields the nested value, or null when absent."""
    json_rows = [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]
    source = _get_series_from_json(json_rows)
    result = bbq.json_extract(source, json_path="$.a.b")
    # After the introduction of the JSON type, the output should be a JSON-formatted series.
    want = _get_series_from_json(["[1,2]", None, "0"])
    result_pd = result.to_pandas()
    want_pd = want.to_pandas()
    pd.testing.assert_series_equal(result_pd, want_pd)


def test_json_extract_from_string():
    """Extracting ``$.a.b`` from JSON-formatted strings matches the JSON-input result."""
    json_strings = ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']
    source = bpd.Series(json_strings)
    result = bbq.json_extract(source, json_path="$.a.b")
    want = _get_series_from_json(["[1,2]", None, "0"])
    result_pd = result.to_pandas()
    want_pd = want.to_pandas()
    # Series names are deliberately excluded from the comparison.
    pd.testing.assert_series_equal(result_pd, want_pd, check_names=False)


def test_json_extract_w_invalid_series_type():
    """json_extract must raise TypeError for a Series of integers."""
    with pytest.raises(TypeError):
        int_series = bpd.Series([1, 2])
        bbq.json_extract(int_series, json_path="$.a")