Skip to content

Commit 802183d

Browse files
authored
feat: support BQ managed functions through read_gbq_function (#1476)
* feat: support read_gbq_function for managed function
* quick fix
* add back attr in wrapper
* fix tests
* add attr for remote function
* quick fix
* fix rebase
* fix rebase
1 parent 28040c3 commit 802183d

File tree

8 files changed

+216
-77
lines changed

8 files changed

+216
-77
lines changed

bigframes/dataframe.py

+22-26
Original file line numberDiff line numberDiff line change
@@ -4108,32 +4108,26 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
41084108
)
41094109

41104110
def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
4111-
# In Bigframes remote function, DataFrame '.apply' method is specifically
4111+
# In Bigframes BigQuery function, DataFrame '.apply' method is specifically
41124112
# designed to work with row-wise or column-wise operations, where the input
41134113
# to the applied function should be a Series, not a scalar.
41144114

41154115
if utils.get_axis_number(axis) == 1:
41164116
msg = bfe.format_message("axis=1 scenario is in preview.")
41174117
warnings.warn(msg, category=bfe.PreviewWarning)
41184118

4119-
# TODO(jialuo): Deprecate the "bigframes_remote_function" attribute.
4120-
# We have some tests using pre-defined remote_function that were
4121-
# defined based on "bigframes_remote_function" instead of
4122-
# "bigframes_bigquery_function". So we need to fix those pre-defined
4123-
# remote functions before deprecating the "bigframes_remote_function"
4124-
# attribute. Check if the function is a remote function.
4125-
if not hasattr(func, "bigframes_remote_function") and not hasattr(
4126-
func, "bigframes_bigquery_function"
4127-
):
4128-
raise ValueError("For axis=1 a bigframes function must be used.")
4119+
if not hasattr(func, "bigframes_bigquery_function"):
4120+
raise ValueError(
4121+
"For axis=1 a BigFrames BigQuery function must be used."
4122+
)
41294123

41304124
is_row_processor = getattr(func, "is_row_processor")
41314125
if is_row_processor:
41324126
# Early check whether the dataframe dtypes are currently supported
4133-
# in the remote function
4127+
# in the bigquery function
41344128
# NOTE: Keep in sync with the value converters used in the gcf code
41354129
# generated in function_template.py
4136-
remote_function_supported_dtypes = (
4130+
bigquery_function_supported_dtypes = (
41374131
bigframes.dtypes.INT_DTYPE,
41384132
bigframes.dtypes.FLOAT_DTYPE,
41394133
bigframes.dtypes.BOOL_DTYPE,
@@ -4142,18 +4136,18 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
41424136
)
41434137
supported_dtypes_types = tuple(
41444138
type(dtype)
4145-
for dtype in remote_function_supported_dtypes
4139+
for dtype in bigquery_function_supported_dtypes
41464140
if not isinstance(dtype, pandas.ArrowDtype)
41474141
)
41484142
# Check ArrowDtype separately since multiple BigQuery types map to
41494143
# ArrowDtype, including BYTES and TIMESTAMP.
41504144
supported_arrow_types = tuple(
41514145
dtype.pyarrow_dtype
4152-
for dtype in remote_function_supported_dtypes
4146+
for dtype in bigquery_function_supported_dtypes
41534147
if isinstance(dtype, pandas.ArrowDtype)
41544148
)
41554149
supported_dtypes_hints = tuple(
4156-
str(dtype) for dtype in remote_function_supported_dtypes
4150+
str(dtype) for dtype in bigquery_function_supported_dtypes
41574151
)
41584152

41594153
for dtype in self.dtypes:
@@ -4186,10 +4180,11 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
41864180
)
41874181
else:
41884182
# This is a special case where we are providing not-pandas-like
4189-
# extension. If the remote function can take one or more params
4190-
# then we assume that here the user intention is to use the
4191-
# column values of the dataframe as arguments to the function.
4192-
# For this to work the following condition must be true:
4183+
# extension. If the bigquery function can take one or more
4184+
# params then we assume that here the user intention is to use
4185+
# the column values of the dataframe as arguments to the
4186+
# function. For this to work the following condition must be
4187+
# true:
41934188
# 1. The number of input params in the function must be the same
41944189
# as the number of columns in the dataframe
41954190
# 2. The dtypes of the columns in the dataframe must be
@@ -4231,15 +4226,16 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
42314226

42324227
return result_series
42334228

4234-
# At this point column-wise or element-wise remote function operation will
4229+
# At this point column-wise or element-wise bigquery function operation will
42354230
# be performed (not supported).
4236-
if hasattr(func, "bigframes_remote_function"):
4231+
if hasattr(func, "bigframes_bigquery_function"):
42374232
raise formatter.create_exception_with_feedback_link(
42384233
NotImplementedError,
4239-
"BigFrames DataFrame '.apply()' does not support remote function "
4240-
"for column-wise (i.e. with axis=0) operations, please use a "
4241-
"regular python function instead. For element-wise operations of "
4242-
"the remote function, please use '.map()'.",
4234+
"BigFrames DataFrame '.apply()' does not support BigFrames "
4235+
"BigQuery function for column-wise (i.e. with axis=0) "
4236+
"operations, please use a regular python function instead. For "
4237+
"element-wise operations of the BigFrames BigQuery function, "
4238+
"please use '.map()'.",
42434239
)
42444240

42454241
# Per-column apply

bigframes/functions/_function_session.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -741,7 +741,7 @@ def wrapper(func):
741741
# with that name and would directly manage their lifecycle.
742742
if created_new and (not name):
743743
self._update_temp_artifacts(
744-
func.bigframes_remote_function, func.bigframes_cloud_function
744+
func.bigframes_bigquery_function, func.bigframes_cloud_function
745745
)
746746
return func
747747

bigframes/functions/function.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,11 @@ def func(*bigframes_args, **bigframes_kwargs):
219219
database=routine_ref.dataset_id,
220220
signature=(ibis_signature.input_types, ibis_signature.output_type),
221221
) # type: ignore
222-
func.bigframes_remote_function = str(routine_ref) # type: ignore
222+
func.bigframes_bigquery_function = str(routine_ref) # type: ignore
223+
224+
# We will keep the "bigframes_remote_function" attr for remote function.
225+
if hasattr(routine, "remote_function_options") and routine.remote_function_options:
226+
func.bigframes_remote_function = func.bigframes_bigquery_function # type: ignore
223227

224228
# set input bigframes data types
225229
has_unknown_dtypes = False

bigframes/series.py

+20-25
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@
6868
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
6969

7070

71-
_remote_function_recommendation_message = (
71+
_bigquery_function_recommendation_message = (
7272
"Your functions could not be applied directly to the Series."
73-
" Try converting it to a remote function."
73+
" Try converting it to a BigFrames BigQuery function."
7474
)
7575

7676
_list = list # Type alias to escape Series.list property
@@ -1530,38 +1530,33 @@ def apply(
15301530

15311531
if not callable(func):
15321532
raise ValueError(
1533-
"Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported."
1533+
"Only a ufunc (a function that applies to the entire Series) or"
1534+
" a BigFrames BigQuery function that only works on single values"
1535+
" are supported."
15341536
)
15351537

1536-
# TODO(jialuo): Deprecate the "bigframes_remote_function" attribute.
1537-
# We have some tests using pre-defined remote_function that were defined
1538-
# based on "bigframes_remote_function" instead of
1539-
# "bigframes_bigquery_function". So we need to fix those pre-defined
1540-
# remote functions before deprecating the "bigframes_remote_function"
1541-
# attribute.
1542-
if not hasattr(func, "bigframes_remote_function") and not hasattr(
1543-
func, "bigframes_bigquery_function"
1544-
):
1538+
if not hasattr(func, "bigframes_bigquery_function"):
15451539
# It is neither a remote function nor a managed function.
15461540
# Then it must be a vectorized function that applies to the Series
15471541
# as a whole.
15481542
if by_row:
15491543
raise ValueError(
1550-
"A vectorized non-remote function can be provided only with by_row=False."
1551-
" For element-wise operation it must be a remote function."
1544+
"A vectorized non-BigFrames BigQuery function can be "
1545+
"provided only with by_row=False. For element-wise operation "
1546+
"it must be a BigFrames BigQuery function."
15521547
)
15531548

15541549
try:
15551550
return func(self)
15561551
except Exception as ex:
15571552
# This could happen if any of the operators in func is not
15581553
# supported on a Series. Let's guide the customer to use a
1559-
# remote function instead
1554+
# bigquery function instead
15601555
if hasattr(ex, "message"):
1561-
ex.message += f"\n{_remote_function_recommendation_message}"
1556+
ex.message += f"\n{_bigquery_function_recommendation_message}"
15621557
raise
15631558

1564-
# We are working with remote function at this point
1559+
# We are working with bigquery function at this point
15651560
result_series = self._apply_unary_op(
15661561
ops.RemoteFunctionOp(func=func, apply_on_null=True)
15671562
)
@@ -1590,21 +1585,21 @@ def combine(
15901585
) -> Series:
15911586
if not callable(func):
15921587
raise ValueError(
1593-
"Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported."
1588+
"Only a ufunc (a function that applies to the entire Series) or"
1589+
" a BigFrames BigQuery function that only works on single values"
1590+
" are supported."
15941591
)
15951592

1596-
if not hasattr(func, "bigframes_remote_function") and not hasattr(
1597-
func, "bigframes_bigquery_function"
1598-
):
1593+
if not hasattr(func, "bigframes_bigquery_function"):
15991594
# Keep this in sync with .apply
16001595
try:
16011596
return func(self, other)
16021597
except Exception as ex:
16031598
# This could happen if any of the operators in func is not
16041599
# supported on a Series. Let's guide the customer to use a
1605-
# remote function instead
1600+
# bigquery function instead
16061601
if hasattr(ex, "message"):
1607-
ex.message += f"\n{_remote_function_recommendation_message}"
1602+
ex.message += f"\n{_bigquery_function_recommendation_message}"
16081603
raise
16091604

16101605
result_series = self._apply_binary_op(
@@ -1749,10 +1744,10 @@ def duplicated(self, keep: str = "first") -> Series:
17491744

17501745
def mask(self, cond, other=None) -> Series:
17511746
if callable(cond):
1752-
if hasattr(cond, "bigframes_remote_function"):
1747+
if hasattr(cond, "bigframes_bigquery_function"):
17531748
cond = self.apply(cond)
17541749
else:
1755-
# For non-remote function assume that it is applicable on Series
1750+
# For non-BigQuery function assume that it is applicable on Series
17561751
cond = self.apply(cond, by_row=False)
17571752

17581753
if not isinstance(cond, Series):

tests/system/large/functions/test_managed_function.py

+45
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,18 @@ def func(x, y):
147147
.to_pandas()
148148
)
149149
pandas.testing.assert_series_equal(bf_result, pd_result)
150+
151+
# Make sure the read_gbq_function path works for this function.
152+
managed_func_ref = session.read_gbq_function(
153+
managed_func.bigframes_bigquery_function
154+
)
155+
bf_result_gbq = (
156+
scalars_df["string_col"]
157+
.combine(scalars_df["int64_col"], managed_func_ref)
158+
.to_pandas()
159+
)
160+
pandas.testing.assert_series_equal(bf_result_gbq, pd_result)
161+
150162
finally:
151163
# clean up the gcp assets created for the managed function.
152164
cleanup_function_assets(
@@ -181,6 +193,23 @@ def featurize(x: int) -> list[array_dtype]: # type: ignore
181193
# Ignore any dtype disparity.
182194
pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
183195

196+
# Make sure the read_gbq_function path works for this function.
197+
featurize_ref = session.read_gbq_function(featurize.bigframes_bigquery_function)
198+
199+
assert hasattr(featurize_ref, "bigframes_bigquery_function")
200+
assert not hasattr(featurize_ref, "bigframes_remote_function")
201+
assert (
202+
featurize_ref.bigframes_bigquery_function
203+
== featurize.bigframes_bigquery_function
204+
)
205+
206+
# Test on the function from read_gbq_function.
207+
got = featurize_ref(10)
208+
assert got == [array_dtype(i) for i in [10, 11, 12]]
209+
210+
bf_result_gbq = bf_int64_col.apply(featurize_ref).to_pandas()
211+
pandas.testing.assert_series_equal(bf_result_gbq, pd_result, check_dtype=False)
212+
184213
finally:
185214
# Clean up the gcp assets created for the managed function.
186215
cleanup_function_assets(
@@ -295,6 +324,22 @@ def foo(x, y, z):
295324
expected_result, bf_result, check_dtype=False, check_index_type=False
296325
)
297326

327+
# Make sure the read_gbq_function path works for this function.
328+
foo_ref = session.read_gbq_function(foo.bigframes_bigquery_function)
329+
330+
assert hasattr(foo_ref, "bigframes_bigquery_function")
331+
assert not hasattr(foo_ref, "bigframes_remote_function")
332+
assert foo_ref.bigframes_bigquery_function == foo.bigframes_bigquery_function
333+
334+
# Test on the function from read_gbq_function.
335+
got = foo_ref(10, 38, "hello")
336+
assert got == ["10", "38.0", "hello"]
337+
338+
bf_result_gbq = bf_df.apply(foo_ref, axis=1).to_pandas()
339+
pandas.testing.assert_series_equal(
340+
bf_result_gbq, expected_result, check_dtype=False, check_index_type=False
341+
)
342+
298343
finally:
299344
# Clean up the gcp assets created for the managed function.
300345
cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient)

0 commit comments

Comments
 (0)