Skip to content

Commit 240a1ac

Browse files
authored
feat: expose max_batching_rows in remote_function (#622)
* feat: expose `max_batching_rows` in `remote_function`
* fix option formation, add tests
* fix type annotation
* assert max_batching_rows after routine creation
* add forgotten assert
1 parent 9d205ae commit 240a1ac

File tree

4 files changed

+89
-6
lines changed

4 files changed

+89
-6
lines changed

bigframes/functions/remote_function.py

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,13 @@ def __init__(
145145
self._cloud_function_docker_repository = cloud_function_docker_repository
146146

147147
def create_bq_remote_function(
148-
self, input_args, input_types, output_type, endpoint, bq_function_name
148+
self,
149+
input_args,
150+
input_types,
151+
output_type,
152+
endpoint,
153+
bq_function_name,
154+
max_batching_rows,
149155
):
150156
"""Create a BigQuery remote function given the artifacts of a user defined
151157
function and the http endpoint of a corresponding cloud function."""
@@ -169,14 +175,25 @@ def create_bq_remote_function(
169175
bq_function_args.append(
170176
f"{name} {third_party_ibis_bqtypes.BigQueryType.from_ibis(input_types[idx])}"
171177
)
178+
179+
remote_function_options = {
180+
"endpoint": endpoint,
181+
"max_batching_rows": max_batching_rows,
182+
}
183+
184+
remote_function_options_str = ", ".join(
185+
[
186+
f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}"
187+
for key, val in remote_function_options.items()
188+
if val is not None
189+
]
190+
)
191+
172192
create_function_ddl = f"""
173193
CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)})
174194
RETURNS {bq_function_return_type}
175195
REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}`
176-
OPTIONS (
177-
endpoint = "{endpoint}",
178-
max_batching_rows = 1000
179-
)"""
196+
OPTIONS ({remote_function_options_str})"""
180197

181198
logger.info(f"Creating BQ remote function: {create_function_ddl}")
182199

@@ -438,6 +455,7 @@ def provision_bq_remote_function(
438455
reuse,
439456
name,
440457
package_requirements,
458+
max_batching_rows,
441459
):
442460
"""Provision a BigQuery remote function."""
443461
# If reuse of any existing function with the same name (indicated by the
@@ -485,7 +503,12 @@ def provision_bq_remote_function(
485503
"Exactly one type should be provided for every input arg."
486504
)
487505
self.create_bq_remote_function(
488-
input_args, input_types, output_type, cf_endpoint, remote_function_name
506+
input_args,
507+
input_types,
508+
output_type,
509+
cf_endpoint,
510+
remote_function_name,
511+
max_batching_rows,
489512
)
490513
else:
491514
logger.info(f"Remote function {remote_function_name} already exists.")
@@ -607,6 +630,7 @@ def remote_function(
607630
cloud_function_service_account: Optional[str] = None,
608631
cloud_function_kms_key_name: Optional[str] = None,
609632
cloud_function_docker_repository: Optional[str] = None,
633+
max_batching_rows: Optional[int] = 1000,
610634
):
611635
"""Decorator to turn a user defined function into a BigQuery remote function.
612636
@@ -723,6 +747,15 @@ def remote_function(
723747
projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME.
724748
For more details see
725749
https://cloud.google.com/functions/docs/securing/cmek#before_you_begin.
750+
max_batching_rows (int, Optional):
751+
The maximum number of rows to be batched for processing in the
752+
BQ remote function. Default value is 1000. A lower number can be
753+
passed to avoid timeouts in case the user code is too complex to
754+
process a large number of rows fast enough. A higher number can be
755+
used to increase throughput in case the user code is fast enough.
756+
`None` can be passed to let BQ remote functions service apply
757+
default batching. For more details see
758+
https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request.
726759
"""
727760
import bigframes.pandas as bpd
728761

@@ -846,6 +879,7 @@ def wrapper(f):
846879
reuse,
847880
name,
848881
packages,
882+
max_batching_rows,
849883
)
850884

851885
# TODO: Move ibis logic to compiler step

bigframes/pandas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,7 @@ def remote_function(
643643
cloud_function_service_account: Optional[str] = None,
644644
cloud_function_kms_key_name: Optional[str] = None,
645645
cloud_function_docker_repository: Optional[str] = None,
646+
max_batching_rows: Optional[int] = 1000,
646647
):
647648
return global_session.with_default_session(
648649
bigframes.session.Session.remote_function,
@@ -656,6 +657,7 @@ def remote_function(
656657
cloud_function_service_account=cloud_function_service_account,
657658
cloud_function_kms_key_name=cloud_function_kms_key_name,
658659
cloud_function_docker_repository=cloud_function_docker_repository,
660+
max_batching_rows=max_batching_rows,
659661
)
660662

661663

bigframes/session/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1541,6 +1541,7 @@ def remote_function(
15411541
cloud_function_service_account: Optional[str] = None,
15421542
cloud_function_kms_key_name: Optional[str] = None,
15431543
cloud_function_docker_repository: Optional[str] = None,
1544+
max_batching_rows: Optional[int] = 1000,
15441545
):
15451546
"""Decorator to turn a user defined function into a BigQuery remote function. Check out
15461547
the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes.
@@ -1635,6 +1636,15 @@ def remote_function(
16351636
projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME.
16361637
For more details see
16371638
https://cloud.google.com/functions/docs/securing/cmek#before_you_begin.
1639+
max_batching_rows (int, Optional):
1640+
The maximum number of rows to be batched for processing in the
1641+
BQ remote function. Default value is 1000. A lower number can be
1642+
passed to avoid timeouts in case the user code is too complex to
1643+
process a large number of rows fast enough. A higher number can be
1644+
used to increase throughput in case the user code is fast enough.
1645+
`None` can be passed to let BQ remote functions service apply
1646+
default batching. For more details see
1647+
https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request.
16381648
Returns:
16391649
callable: A remote function object pointing to the cloud assets created
16401650
in the background to support the remote execution. The cloud assets can be
@@ -1656,6 +1666,7 @@ def remote_function(
16561666
cloud_function_service_account=cloud_function_service_account,
16571667
cloud_function_kms_key_name=cloud_function_kms_key_name,
16581668
cloud_function_docker_repository=cloud_function_docker_repository,
1669+
max_batching_rows=max_batching_rows,
16591670
)
16601671

16611672
def read_gbq_function(

tests/system/large/test_remote_function.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1300,3 +1300,39 @@ def square_num(x):
13001300
cleanup_remote_function_assets(
13011301
session.bqclient, session.cloudfunctionsclient, square_num
13021302
)
1303+
1304+
1305+
@pytest.mark.parametrize(
1306+
("max_batching_rows"),
1307+
[
1308+
10_000,
1309+
None,
1310+
],
1311+
)
1312+
@pytest.mark.flaky(retries=2, delay=120)
1313+
def test_remote_function_max_batching_rows(session, scalars_dfs, max_batching_rows):
1314+
try:
1315+
1316+
def square(x):
1317+
return x * x
1318+
1319+
square_remote = session.remote_function(
1320+
[int], int, reuse=False, max_batching_rows=max_batching_rows
1321+
)(square)
1322+
1323+
bq_routine = session.bqclient.get_routine(
1324+
square_remote.bigframes_remote_function
1325+
)
1326+
assert bq_routine.remote_function_options.max_batching_rows == max_batching_rows
1327+
1328+
scalars_df, scalars_pandas_df = scalars_dfs
1329+
1330+
bf_result = scalars_df["int64_too"].apply(square_remote).to_pandas()
1331+
pd_result = scalars_pandas_df["int64_too"].apply(square)
1332+
1333+
pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
1334+
finally:
1335+
# clean up the gcp assets created for the remote function
1336+
cleanup_remote_function_assets(
1337+
session.bqclient, session.cloudfunctionsclient, square_remote
1338+
)

0 commit comments

Comments (0)