Skip to content

Commit 2fd1b81

Browse files
feat: Add Series.combine (#680)
1 parent 9ca92d0 commit 2fd1b81

File tree

7 files changed

+214
-39
lines changed

7 files changed

+214
-39
lines changed

bigframes/core/compile/scalar_op_compiler.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -1298,22 +1298,36 @@ def coalesce_impl(
12981298
return ibis.coalesce(x, y)
12991299

13001300

1301-
@scalar_op_compiler.register_binary_op(ops.cliplower_op)
1302-
def clip_lower(
1301+
@scalar_op_compiler.register_binary_op(ops.maximum_op)
1302+
def maximum_impl(
13031303
value: ibis_types.Value,
13041304
lower: ibis_types.Value,
13051305
):
1306+
# Note: propagates nulls
13061307
return ibis.case().when(lower.isnull() | (value < lower), lower).else_(value).end()
13071308

13081309

1309-
@scalar_op_compiler.register_binary_op(ops.clipupper_op)
1310-
def clip_upper(
1310+
@scalar_op_compiler.register_binary_op(ops.minimum_op)
1311+
def minimum_impl(
13111312
value: ibis_types.Value,
13121313
upper: ibis_types.Value,
13131314
):
1315+
# Note: propagates nulls
13141316
return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end()
13151317

13161318

1319+
@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True)
1320+
def binary_remote_function_op_impl(
1321+
x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp
1322+
):
1323+
if not hasattr(op.func, "bigframes_remote_function"):
1324+
raise TypeError(
1325+
f"only a bigframes remote function is supported as a callable. {constants.FEEDBACK_LINK}"
1326+
)
1327+
x_transformed = op.func(x, y)
1328+
return x_transformed
1329+
1330+
13171331
# Ternary Operations
13181332
@scalar_op_compiler.register_ternary_op(ops.where_op)
13191333
def where_op(

bigframes/operations/__init__.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -545,8 +545,8 @@ def output_type(self, *input_types):
545545

546546
# Binary Ops
547547
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
548-
cliplower_op = create_binary_op(name="clip_lower", type_signature=op_typing.COERCE)
549-
clipupper_op = create_binary_op(name="clip_upper", type_signature=op_typing.COERCE)
548+
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
549+
minimum_op = create_binary_op(name="minimum", type_signature=op_typing.COERCE)
550550
coalesce_op = create_binary_op(name="coalesce", type_signature=op_typing.COERCE)
551551

552552

@@ -587,6 +587,16 @@ def output_type(self, *input_types):
587587
raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}")
588588

589589

590+
@dataclasses.dataclass(frozen=True)
591+
class BinaryRemoteFunctionOp(BinaryOp):
592+
name: typing.ClassVar[str] = "binary_remote_function"
593+
func: typing.Callable
594+
595+
def output_type(self, *input_types):
596+
# This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method
597+
return self.func.output_dtype
598+
599+
590600
add_op = AddOp()
591601
sub_op = SubOp()
592602
mul_op = create_binary_op(name="mul", type_signature=op_typing.BINARY_NUMERIC)
@@ -713,4 +723,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
713723
np.divide: div_op,
714724
np.power: pow_op,
715725
np.arctan2: arctan2_op,
726+
np.maximum: maximum_op,
727+
np.minimum: minimum_op,
716728
}

bigframes/series.py

+34-2
Original file line numberDiff line numberDiff line change
@@ -1031,9 +1031,9 @@ def clip(self, lower, upper):
10311031
if lower is None and upper is None:
10321032
return self
10331033
if lower is None:
1034-
return self._apply_binary_op(upper, ops.clipupper_op, alignment="left")
1034+
return self._apply_binary_op(upper, ops.minimum_op, alignment="left")
10351035
if upper is None:
1036-
return self._apply_binary_op(lower, ops.cliplower_op, alignment="left")
1036+
return self._apply_binary_op(lower, ops.maximum_op, alignment="left")
10371037
value_id, lower_id, upper_id, block = self._align3(lower, upper)
10381038
block, result_id = block.apply_ternary_op(
10391039
value_id, lower_id, upper_id, ops.clip_op
@@ -1374,6 +1374,38 @@ def apply(
13741374
materialized_series = result_series._cached()
13751375
return materialized_series
13761376

1377+
def combine(
1378+
self,
1379+
other,
1380+
func,
1381+
) -> Series:
1382+
if not callable(func):
1383+
raise ValueError(
1384+
"Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported."
1385+
)
1386+
1387+
if not hasattr(func, "bigframes_remote_function"):
1388+
# Keep this in sync with .apply
1389+
try:
1390+
return func(self, other)
1391+
except Exception as ex:
1392+
# This could happen if any of the operators in func is not
1393+
# supported on a Series. Let's guide the customer to use a
1394+
# remote function instead
1395+
if hasattr(ex, "message"):
1396+
ex.message += f"\n{_remote_function_recommendation_message}"
1397+
raise
1398+
1399+
reprojected_series = Series(self._block._force_reproject())
1400+
result_series = reprojected_series._apply_binary_op(
1401+
other, ops.BinaryRemoteFunctionOp(func=func)
1402+
)
1403+
1404+
# return Series with materialized result so that any error in the remote
1405+
# function is caught early
1406+
materialized_series = result_series._cached()
1407+
return materialized_series
1408+
13771409
def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series:
13781410
return Series(self._get_block().add_prefix(prefix))
13791411

tests/system/large/test_remote_function.py

+35
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,41 @@ def stringify(x):
221221
)
222222

223223

224+
# @pytest.mark.flaky(retries=2, delay=120)
225+
def test_remote_function_binop(session, scalars_dfs, dataset_id, bq_cf_connection):
226+
try:
227+
228+
def func(x, y):
229+
return x * abs(y % 4)
230+
231+
remote_func = session.remote_function(
232+
[str, int],
233+
str,
234+
dataset_id,
235+
bq_cf_connection,
236+
reuse=False,
237+
)(func)
238+
239+
scalars_df, scalars_pandas_df = scalars_dfs
240+
241+
scalars_df = scalars_df.dropna()
242+
scalars_pandas_df = scalars_pandas_df.dropna()
243+
bf_result = (
244+
scalars_df["string_col"]
245+
.combine(scalars_df["int64_col"], remote_func)
246+
.to_pandas()
247+
)
248+
pd_result = scalars_pandas_df["string_col"].combine(
249+
scalars_pandas_df["int64_col"], func
250+
)
251+
pandas.testing.assert_series_equal(bf_result, pd_result)
252+
finally:
253+
# clean up the gcp assets created for the remote function
254+
cleanup_remote_function_assets(
255+
session.bqclient, session.cloudfunctionsclient, remote_func
256+
)
257+
258+
224259
@pytest.mark.flaky(retries=2, delay=120)
225260
def test_remote_function_decorator_with_bigframes_series(
226261
session, scalars_dfs, dataset_id, bq_cf_connection

tests/system/small/test_numpy.py

+22-31
Original file line numberDiff line numberDiff line change
@@ -73,27 +73,6 @@ def test_df_ufuncs(scalars_dfs, opname):
7373
pd.testing.assert_frame_equal(bf_result, pd_result)
7474

7575

76-
@pytest.mark.parametrize(
77-
("opname",),
78-
[
79-
("add",),
80-
("subtract",),
81-
("multiply",),
82-
("divide",),
83-
("power",),
84-
("arctan2",),
85-
],
86-
)
87-
def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname):
88-
bf_result = getattr(np, opname)(
89-
floats_product_bf.float64_col_x, floats_product_bf.float64_col_y
90-
).to_pandas()
91-
pd_result = getattr(np, opname)(
92-
floats_product_pd.float64_col_x, floats_product_pd.float64_col_y
93-
)
94-
pd.testing.assert_series_equal(bf_result, pd_result)
95-
96-
9776
@pytest.mark.parametrize(
9877
("opname",),
9978
[
@@ -106,30 +85,42 @@ def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname):
10685
)
10786
def test_df_binary_ufuncs(scalars_dfs, opname):
10887
scalars_df, scalars_pandas_df = scalars_dfs
88+
op = getattr(np, opname)
10989

110-
bf_result = getattr(np, opname)(
111-
scalars_df[["float64_col", "int64_col"]], 5.1
112-
).to_pandas()
113-
pd_result = getattr(np, opname)(
114-
scalars_pandas_df[["float64_col", "int64_col"]], 5.1
115-
)
90+
bf_result = op(scalars_df[["float64_col", "int64_col"]], 5.1).to_pandas()
91+
pd_result = op(scalars_pandas_df[["float64_col", "int64_col"]], 5.1)
11692

11793
pd.testing.assert_frame_equal(bf_result, pd_result)
11894

11995

96+
# Operations tested here don't work on full dataframe in numpy+pandas
97+
# Maybe because of nullable dtypes?
12098
@pytest.mark.parametrize(
12199
("x", "y"),
122100
[
123101
("int64_col", "int64_col"),
124102
("float64_col", "int64_col"),
125103
],
126104
)
127-
def test_series_atan2(scalars_dfs, x, y):
128-
# Test atan2 separately as pandas errors when passing entire df as input, so pass only series
105+
@pytest.mark.parametrize(
106+
("opname",),
107+
[
108+
("add",),
109+
("subtract",),
110+
("multiply",),
111+
("divide",),
112+
("arctan2",),
113+
("minimum",),
114+
("maximum",),
115+
],
116+
)
117+
def test_series_binary_ufuncs(scalars_dfs, x, y, opname):
129118
scalars_df, scalars_pandas_df = scalars_dfs
130119

131-
bf_result = np.arctan2(scalars_df[x], scalars_df[y]).to_pandas()
132-
pd_result = np.arctan2(scalars_pandas_df[x], scalars_pandas_df[y])
120+
op = getattr(np, opname)
121+
122+
bf_result = op(scalars_df[x], scalars_df[y]).to_pandas()
123+
pd_result = op(scalars_pandas_df[x], scalars_pandas_df[y])
133124

134125
pd.testing.assert_series_equal(bf_result, pd_result)
135126

tests/system/small/test_series.py

+35
Original file line numberDiff line numberDiff line change
@@ -3509,6 +3509,41 @@ def test_apply_numpy_ufunc(scalars_dfs, ufunc):
35093509
assert_series_equal(bf_result, pd_result)
35103510

35113511

3512+
@pytest.mark.parametrize(
3513+
("ufunc",),
3514+
[
3515+
pytest.param(numpy.add),
3516+
pytest.param(numpy.divide),
3517+
],
3518+
ids=[
3519+
"add",
3520+
"divide",
3521+
],
3522+
)
3523+
def test_combine_series_ufunc(scalars_dfs, ufunc):
3524+
scalars_df, scalars_pandas_df = scalars_dfs
3525+
3526+
bf_col = scalars_df["int64_col"].dropna()
3527+
bf_result = bf_col.combine(bf_col, ufunc).to_pandas()
3528+
3529+
pd_col = scalars_pandas_df["int64_col"].dropna()
3530+
pd_result = pd_col.combine(pd_col, ufunc)
3531+
3532+
assert_series_equal(bf_result, pd_result, check_dtype=False)
3533+
3534+
3535+
def test_combine_scalar_ufunc(scalars_dfs):
3536+
scalars_df, scalars_pandas_df = scalars_dfs
3537+
3538+
bf_col = scalars_df["int64_col"].dropna()
3539+
bf_result = bf_col.combine(2.5, numpy.add).to_pandas()
3540+
3541+
pd_col = scalars_pandas_df["int64_col"].dropna()
3542+
pd_result = pd_col.combine(2.5, numpy.add)
3543+
3544+
assert_series_equal(bf_result, pd_result, check_dtype=False)
3545+
3546+
35123547
def test_apply_simple_udf(scalars_dfs):
35133548
scalars_df, scalars_pandas_df = scalars_dfs
35143549

third_party/bigframes_vendored/pandas/core/series.py

+56
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,62 @@ def apply(
12791279
"""
12801280
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
12811281

1282+
def combine(
1283+
self,
1284+
other: Series | Hashable,
1285+
func,
1286+
) -> Series:
1287+
"""
1288+
Combine the Series with a Series or scalar according to `func`.
1289+
1290+
Combine the Series and `other` using `func` to perform elementwise
1291+
selection for combined Series.
1292+
`fill_value` is assumed when value is missing at some index
1293+
from one of the two objects being combined.
1294+
1295+
**Examples:**
1296+
1297+
>>> import bigframes.pandas as bpd
1298+
>>> import numpy as np
1299+
>>> bpd.options.display.progress_bar = None
1300+
1301+
Consider 2 Datasets ``s1`` and ``s2`` containing
1302+
highest clocked speeds of different birds.
1303+
1304+
>>> s1 = bpd.Series({'falcon': 330.0, 'eagle': 160.0})
1305+
>>> s1
1306+
falcon 330.0
1307+
eagle 160.0
1308+
dtype: Float64
1309+
>>> s2 = bpd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
1310+
>>> s2
1311+
falcon 345.0
1312+
eagle 200.0
1313+
duck 30.0
1314+
dtype: Float64
1315+
1316+
Now, to combine the two datasets and view the highest speeds
1317+
of the birds across the two datasets
1318+
1319+
>>> s1.combine(s2, np.maximum)
1320+
falcon 345.0
1321+
eagle 200.0
1322+
duck <NA>
1323+
dtype: Float64
1324+
1325+
Args:
1326+
other (Series or scalar):
1327+
The value(s) to be combined with the `Series`.
1328+
func (function):
1329+
BigQuery DataFrames ``remote_function`` to apply.
1330+
Takes two scalars as inputs and returns an element.
1331+
Also accepts some numpy binary functions.
1332+
1333+
Returns:
1334+
Series: The result of combining the Series with the other object.
1335+
"""
1336+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1337+
12821338
def groupby(
12831339
self,
12841340
by=None,

0 commit comments

Comments
 (0)