Skip to content

feat: adds bigframes.bigquery.array_to_string to convert array elements to delimited strings #731

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,7 @@ def array_length(series: series.Series) -> series.Series:
dtype: Int64

Args:
series (bigframes.series.Series):
A Series with array columns.
series (bigframes.series.Series): A Series with array columns.

Returns:
bigframes.series.Series: A Series of integer values indicating
Expand Down Expand Up @@ -104,7 +103,7 @@ def array_agg(

Args:
obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
A GroupBy object to be applied the function.
A GroupBy object to be applied the function.

Returns:
bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
Expand All @@ -119,3 +118,33 @@ def array_agg(
raise ValueError(
f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
)


def array_to_string(series: series.Series, delimiter: str) -> series.Series:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add null_text as an optional parameter, too?

https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions#array_to_string

I recall it's a bit difficult to test arrays containing nulls since there are many paths that don't support them, so I'm okay omitting for now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially experimented with null_text but decided it isn't beneficial at this stage. The null_text parameter controls how NULL elements inside an array are rendered, whereas BigQuery DataFrames (BF) arrays cannot contain NULL elements. This stems from BigQuery permitting NULL array elements in intermediate query results but not in materialized views or tables.

"""Converts array elements within a Series into delimited strings.

**Examples:**

>>> import bigframes.pandas as bpd
>>> import bigframes.bigquery as bbq
>>> import numpy as np
>>> bpd.options.display.progress_bar = None

>>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]])
>>> bbq.array_to_string(s, delimiter=", ")
0 H, i, !
1 Hello, World
2
3
4 Hi
dtype: string

Args:
series (bigframes.series.Series): A Series containing arrays of strings; any other input type raises a TypeError.
delimiter (str): The string used to separate array elements.

Returns:
bigframes.series.Series: A Series containing delimited strings.

"""
return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))
6 changes: 6 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,6 +875,12 @@ def map_op_impl(x: ibis_types.Value, op: ops.MapOp):
return case.else_(x).end()


# Array Ops
@scalar_op_compiler.register_unary_op(ops.ArrayToStringOp, pass_op=True)
def array_to_string_op_impl(x: ibis_types.Value, op: ops.ArrayToStringOp):
    """Compile ``ArrayToStringOp`` into an ibis array join.

    Args:
        x: The ibis value to join; it is treated as an array value.
        op: The operation instance, which supplies the delimiter.

    Returns:
        The ibis expression produced by joining ``x`` with ``op.delimiter``.
    """
    # typing.cast has no runtime effect; it only tells the type checker that
    # ``x`` exposes the array API used below.
    array_value = typing.cast(ibis_types.ArrayValue, x)
    separator = op.delimiter
    return array_value.join(separator)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down
8 changes: 8 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,14 @@ def is_array_like(type: ExpressionType) -> bool:
)


def is_array_string_like(type: ExpressionType) -> bool:
    """Report whether ``type`` is an Arrow-backed list-of-strings dtype.

    Args:
        type: The dtype to inspect.

    Returns:
        True only when ``type`` is a ``pd.ArrowDtype`` whose pyarrow type is a
        ``pa.ListType`` with a string value type; False otherwise.
    """
    # Checks run in the same order as a chained ``and`` so that
    # ``pyarrow_dtype`` is only read from an actual ArrowDtype.
    if not isinstance(type, pd.ArrowDtype):
        return False
    arrow_type = type.pyarrow_dtype
    if not isinstance(arrow_type, pa.ListType):
        return False
    return pa.types.is_string(arrow_type.value_type)


def is_struct_like(type: ExpressionType) -> bool:
return isinstance(type, pd.ArrowDtype) and isinstance(
type.pyarrow_dtype, pa.StructType
Expand Down
13 changes: 13 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,19 @@ def output_type(self, *input_types):
return input_types[0]


## Array Ops
@dataclasses.dataclass(frozen=True)
class ArrayToStringOp(UnaryOp):
    """Unary op describing an array-to-string conversion with a delimiter."""

    # Registered operation name.
    name: typing.ClassVar[str] = "array_to_string"
    # Separator carried by this op instance.
    delimiter: str

    def output_type(self, *input_types):
        """Return the result dtype for the given input dtype.

        Only the first entry of ``input_types`` is inspected.

        Returns:
            ``dtypes.STRING_DTYPE`` when the input is an array of strings.

        Raises:
            TypeError: If the input type is not an array of string type.
        """
        if dtypes.is_array_string_like(input_types[0]):
            return dtypes.STRING_DTYPE
        raise TypeError("Input type must be an array of string type.")


# Binary Ops
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
Expand Down
13 changes: 13 additions & 0 deletions tests/system/small/bigquery/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,16 @@ def test_array_agg_matches_after_explode():
result.to_pandas(), # type: ignore
df.to_pandas(),
)


@pytest.mark.parametrize(
    "data",
    [
        pytest.param([[1, 2], [3, 4], [5]], id="int_array"),
        pytest.param(["hello", "world"], id="string"),
    ],
)
def test_array_to_string_w_type_checks(data):
    """array_to_string must reject inputs that are not arrays of strings."""
    # Both cases are invalid inputs: an array of ints and a plain string
    # Series (not an array at all).
    unsupported = bpd.Series(data)
    with pytest.raises(TypeError):
        bbq.array_to_string(unsupported, delimiter=", ")