Commit 1832778
feat: ensure "bigframes-api" label is always set on jobs, even if the API is unknown (#722)
* feat: ensure `"bigframes-api"` label is always set on jobs, even if the API is unknown
* remove some dead code; plumb through api_name
* avoid . in label value
* add tests
1 parent 354abc1 commit 1832778
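
The net behavior this commit establishes, summarized as an illustrative sketch (this mirrors the changed `create_job_configs_labels` in bigframes/session/_io/bigquery/__init__.py below; it is not literal repository code):

# Illustrative sketch of the label resolution order after this commit
# (not repository code; see bigframes/session/_io/bigquery/__init__.py below).
from typing import Dict, List, Optional


def resolve_labels(
    user_labels: Dict[str, str],
    api_name: Optional[str],
    api_methods: List[str],
) -> Dict[str, str]:
    # User labels (extra_query_labels) go in first so they are preserved.
    labels = dict(user_labels)

    # An explicitly plumbed-through api_name wins for "bigframes-api".
    if api_name is not None:
        labels["bigframes-api"] = api_name

    # Otherwise fall back to the most recently logged API method.
    if api_methods and "bigframes-api" not in labels:
        labels["bigframes-api"] = api_methods[0]

    # Guarantee the label is always present (internal issue 336521938).
    labels.setdefault("bigframes-api", "unknown")
    return labels


assert resolve_labels({}, None, [])["bigframes-api"] == "unknown"
assert resolve_labels({}, "dataframe-to_csv", ["series-head"]) == {
    "bigframes-api": "dataframe-to_csv"
}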

File tree

12 files changed: +131 -99 lines changed

.kokoro/continuous/e2e.cfg (+1 -1)

@@ -3,7 +3,7 @@
 # Only run this nox session.
 env_vars: {
     key: "NOX_SESSION"
-    value: "unit_prerelease system_prerelease system_noextras e2e notebook"
+    value: "e2e doctest notebook unit_prerelease system_prerelease system_noextras"
 }

 env_vars: {

.kokoro/presubmit/e2e.cfg (+1 -1)

@@ -3,7 +3,7 @@
 # Only run this nox session.
 env_vars: {
     key: "NOX_SESSION"
-    value: "unit_prerelease system_prerelease system_noextras e2e notebook"
+    value: "e2e doctest notebook unit_prerelease system_prerelease system_noextras"
 }

 env_vars: {

bigframes/core/log_adapter.py (+5 -2)

@@ -99,9 +99,12 @@ def add_api_method(api_method_name):
         _api_methods = _api_methods[:MAX_LABELS_COUNT]


-def get_and_reset_api_methods():
+def get_and_reset_api_methods(dry_run: bool = False):
     global _lock
     with _lock:
         previous_api_methods = list(_api_methods)
-        _api_methods.clear()
+
+        # dry_run might not make a job resource, so only reset the log on real queries.
+        if not dry_run:
+            _api_methods.clear()
     return previous_api_methods
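
A hypothetical usage sketch of the new `dry_run` flag (assuming a fresh process and the module-level method log that `add_api_method` feeds):

from bigframes.core import log_adapter

log_adapter.add_api_method("dataframe-head")

# A dry run may never create a job resource, so it reads the logged
# methods without clearing them.
assert log_adapter.get_and_reset_api_methods(dry_run=True) == ["dataframe-head"]

# A real query consumes the log.
assert log_adapter.get_and_reset_api_methods() == ["dataframe-head"]
assert log_adapter.get_and_reset_api_methods() == []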

bigframes/dataframe.py (+9 -3)

@@ -2912,7 +2912,9 @@ def to_csv(
             field_delimiter=sep,
             header=header,
         )
-        _, query_job = self._block.expr.session._start_query(export_data_statement)
+        _, query_job = self._block.expr.session._start_query(
+            export_data_statement, api_name="dataframe-to_csv"
+        )
         self._set_internal_query_job(query_job)

     def to_json(
@@ -2954,7 +2956,9 @@ def to_json(
             format="JSON",
             export_options={},
         )
-        _, query_job = self._block.expr.session._start_query(export_data_statement)
+        _, query_job = self._block.expr.session._start_query(
+            export_data_statement, api_name="dataframe-to_json"
+        )
         self._set_internal_query_job(query_job)

     def to_gbq(
@@ -3086,7 +3090,9 @@ def to_parquet(
             format="PARQUET",
             export_options=export_options,
         )
-        _, query_job = self._block.expr.session._start_query(export_data_statement)
+        _, query_job = self._block.expr.session._start_query(
+            export_data_statement, api_name="dataframe-to_parquet"
+        )
         self._set_internal_query_job(query_job)

     def to_dict(
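
Each export path now passes a stable, method-specific `api_name` (hyphenated rather than dotted, per the commit message's "avoid . in label value"). A hedged usage sketch with placeholder table and bucket names:

import bigframes.pandas as bpd

# Placeholder table and bucket; any readable table and writable GCS path work.
df = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")

# The EXPORT DATA job started by this call should now carry the label
# bigframes-api: dataframe-to_csv, even on code paths that bypass the
# usual API-method log.
df.to_csv("gs://your-bucket/usa_names-*.csv")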

bigframes/functions/remote_function.py (+9 -6)

@@ -24,7 +24,7 @@
 import sys
 import tempfile
 import textwrap
-from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING, Union
+from typing import cast, List, NamedTuple, Optional, Sequence, TYPE_CHECKING, Union
 import warnings

 import ibis
@@ -133,6 +133,8 @@ def __init__(
         cloud_function_service_account,
         cloud_function_kms_key_name,
         cloud_function_docker_repository,
+        *,
+        session: Session,
     ):
         self._gcp_project_id = gcp_project_id
         self._cloud_function_region = cloud_function_region
@@ -145,6 +147,7 @@ def __init__(
         self._cloud_function_service_account = cloud_function_service_account
         self._cloud_function_kms_key_name = cloud_function_kms_key_name
         self._cloud_function_docker_repository = cloud_function_docker_repository
+        self._session = session

     def create_bq_remote_function(
         self,
@@ -216,10 +219,8 @@ def create_bq_remote_function(
         # This requires bigquery.datasets.create IAM permission
         self._bq_client.create_dataset(dataset, exists_ok=True)

-        # TODO: Use session._start_query() so we get progress bar
-        query_job = self._bq_client.query(create_function_ddl)  # Make an API request.
-        query_job.result()  # Wait for the job to complete.
-
+        # TODO(swast): plumb through the original, user-facing api_name.
+        _, query_job = self._session._start_query(create_function_ddl)
         logger.info(f"Created remote function {query_job.ddl_target_routine}")

     def get_cloud_function_fully_qualified_parent(self):
@@ -910,6 +911,7 @@ def remote_function(
     is_row_processor = False

     import bigframes.series
+    import bigframes.session

     if input_types == bigframes.series.Series:
         warnings.warn(
@@ -928,7 +930,7 @@ def remote_function(
     # Some defaults may be used from the session if not provided otherwise
     import bigframes.pandas as bpd

-    session = session or bpd.get_global_session()
+    session = cast(bigframes.session.Session, session or bpd.get_global_session())

     # A BigQuery client is required to perform BQ operations
     if not bigquery_client:
@@ -1040,6 +1042,7 @@ def wrapper(f):
         cloud_function_service_account,
         cloud_function_kms_key_name,
         cloud_function_docker_repository,
+        session=session,  # type: ignore
     )

     rf_name, cf_name = remote_function_client.provision_bq_remote_function(

bigframes/pandas/__init__.py (+7 -1)

@@ -399,6 +399,9 @@ def _set_default_session_location_if_possible(query):
     bqclient = clients_provider.bqclient

     if bigframes.session._io.bigquery.is_query(query):
+        # Intentionally run outside of the session so that we can detect the
+        # location before creating the session. Since it's a dry_run, labels
+        # aren't necessary.
         job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True))
         options.bigquery.location = job.location
     else:
@@ -773,7 +776,10 @@ def clean_up_by_session_id(
         dataset = session._anonymous_dataset
     else:
         dataset = bigframes.session._io.bigquery.create_bq_dataset_reference(
-            client, location=location, project=project
+            client,
+            location=location,
+            project=project,
+            api_name="clean_up_by_session_id",
         )

     bigframes.session._io.bigquery.delete_tables_matching_session_id(
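
The first hunk documents a deliberate choice: the location probe runs outside any session via a dry run, which creates no job resource and therefore needs no labels. A standalone sketch of that trick using the google-cloud-bigquery client directly (table name is a placeholder):

from google.cloud import bigquery

client = bigquery.Client()

# A dry run is free, creates no job resource, and still reports the
# location the query would run in.
job = client.query(
    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
    job_config=bigquery.QueryJobConfig(dry_run=True),
)
print(job.location)  # e.g. "US"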

bigframes/session/__init__.py (+26 -14)

@@ -235,7 +235,9 @@ def __init__(

         self._anonymous_dataset = (
             bigframes.session._io.bigquery.create_bq_dataset_reference(
-                self.bqclient, location=self._location
+                self.bqclient,
+                location=self._location,
+                api_name="session-__init__",
             )
         )

@@ -420,9 +422,11 @@ def _query_to_destination(
         # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
         dry_run_config = bigquery.QueryJobConfig()
         dry_run_config.dry_run = True
-        _, dry_run_job = self._start_query(query, job_config=dry_run_config)
+        _, dry_run_job = self._start_query(
+            query, job_config=dry_run_config, api_name=api_name
+        )
         if dry_run_job.statement_type != "SELECT":
-            _, query_job = self._start_query(query)
+            _, query_job = self._start_query(query, api_name=api_name)
             return query_job.destination, query_job

         # Create a table to workaround BigQuery 10 GB query results limit. See:
@@ -451,23 +455,25 @@ def _query_to_destination(
             bigquery.QueryJobConfig,
             bigquery.QueryJobConfig.from_api_repr(configuration),
         )
-        job_config.labels["bigframes-api"] = api_name
         job_config.destination = temp_table

         try:
             # Write to temp table to workaround BigQuery 10 GB query results
             # limit. See: internal issue 303057336.
             job_config.labels["error_caught"] = "true"
             _, query_job = self._start_query(
-                query, job_config=job_config, timeout=timeout
+                query,
+                job_config=job_config,
+                timeout=timeout,
+                api_name=api_name,
             )
             return query_job.destination, query_job
         except google.api_core.exceptions.BadRequest:
             # Some SELECT statements still aren't compatible with cluster
             # tables as the destination. For example, if the query has a
             # top-level ORDER BY, this conflicts with our ability to cluster
             # the table by the index column(s).
-            _, query_job = self._start_query(query, timeout=timeout)
+            _, query_job = self._start_query(query, timeout=timeout, api_name=api_name)
             return query_job.destination, query_job

     def read_gbq_query(
@@ -811,7 +817,7 @@ def _read_gbq_table(
         dry_run_config = bigquery.QueryJobConfig()
         dry_run_config.dry_run = True
         try:
-            self._start_query(sql, job_config=dry_run_config)
+            self._start_query(sql, job_config=dry_run_config, api_name=api_name)
         except google.api_core.exceptions.NotFound:
             # note that a notfound caused by a simple typo will be
             # caught above when the metadata is fetched, not here
@@ -1777,12 +1783,6 @@ def _prepare_query_job_config(
             bigframes.options.compute.maximum_bytes_billed
         )

-        current_labels = job_config.labels if job_config.labels else {}
-        for key, value in bigframes.options.compute.extra_query_labels.items():
-            if key not in current_labels:
-                current_labels[key] = value
-        job_config.labels = current_labels
-
         if self._bq_kms_key_name:
             job_config.destination_encryption_configuration = (
                 bigquery.EncryptionConfiguration(kms_key_name=self._bq_kms_key_name)
@@ -1818,13 +1818,19 @@ def _start_query(
         job_config: Optional[bigquery.job.QueryJobConfig] = None,
         max_results: Optional[int] = None,
         timeout: Optional[float] = None,
+        api_name: Optional[str] = None,
     ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         """
        Starts BigQuery query job and waits for results.
         """
         job_config = self._prepare_query_job_config(job_config)
         return bigframes.session._io.bigquery.start_query_with_client(
-            self.bqclient, sql, job_config, max_results, timeout
+            self.bqclient,
+            sql,
+            job_config,
+            max_results,
+            timeout,
+            api_name=api_name,
         )

     def _start_query_ml_ddl(
@@ -1970,6 +1976,9 @@ def _execute(
             job_config = bigquery.QueryJobConfig(dry_run=dry_run)
         else:
             job_config.dry_run = dry_run
+
+        # TODO(swast): plumb through the api_name of the user-facing api that
+        # caused this query.
         return self._start_query(
             sql=sql,
             job_config=job_config,
@@ -1982,6 +1991,9 @@ def _peek(
         if not tree_properties.peekable(self._with_cached_executions(array_value.node)):
             warnings.warn("Peeking this value cannot be done efficiently.")
         sql = self._compile_unordered(array_value).peek_sql(n_rows)
+
+        # TODO(swast): plumb through the api_name of the user-facing api that
+        # caused this query.
         return self._start_query(
             sql=sql,
         )

bigframes/session/_io/bigquery/__init__.py (+38 -9)

@@ -28,6 +28,7 @@
 import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq
 import google.api_core.exceptions
 import google.cloud.bigquery as bigquery
+import google.cloud.bigquery.table

 import bigframes
 from bigframes.core import log_adapter
@@ -40,19 +41,34 @@
 # will be limited to this many tables

 LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"
+CHECK_DRIVE_PERMISSIONS = "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions."


 def create_job_configs_labels(
     job_configs_labels: Optional[Dict[str, str]],
     api_methods: typing.List[str],
+    api_name: Optional[str] = None,
 ) -> Dict[str, str]:
     if job_configs_labels is None:
         job_configs_labels = {}

-    if api_methods:
+    # If the user has labels they wish to set, make sure we set those first so
+    # they are preserved.
+    for key, value in bigframes.options.compute.extra_query_labels.items():
+        job_configs_labels[key] = value
+
+    if api_name is not None:
+        job_configs_labels["bigframes-api"] = api_name
+
+    if api_methods and "bigframes-api" not in job_configs_labels:
         job_configs_labels["bigframes-api"] = api_methods[0]
         del api_methods[0]

+    # Make sure we always populate bigframes-api with _something_, even if we
+    # have a code path which doesn't populate the list of api_methods. See
+    # internal issue 336521938.
+    job_configs_labels.setdefault("bigframes-api", "unknown")
+
     labels = list(
         itertools.chain(
             job_configs_labels.keys(),
@@ -193,27 +209,33 @@ def format_option(key: str, value: Union[bool, str]) -> str:
     return f"{key}={repr(value)}"


+def add_labels(job_config, api_name: Optional[str] = None):
+    api_methods = log_adapter.get_and_reset_api_methods(dry_run=job_config.dry_run)
+    job_config.labels = create_job_configs_labels(
+        job_configs_labels=job_config.labels,
+        api_methods=api_methods,
+        api_name=api_name,
+    )
+
+
 def start_query_with_client(
     bq_client: bigquery.Client,
     sql: str,
     job_config: bigquery.job.QueryJobConfig,
     max_results: Optional[int] = None,
     timeout: Optional[float] = None,
+    api_name: Optional[str] = None,
 ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
     """
     Starts query job and waits for results.
     """
-    if not job_config.dry_run:
-        api_methods = log_adapter.get_and_reset_api_methods()
-        job_config.labels = create_job_configs_labels(
-            job_configs_labels=job_config.labels, api_methods=api_methods
-        )
+    add_labels(job_config, api_name=api_name)

     try:
         query_job = bq_client.query(sql, job_config=job_config, timeout=timeout)
     except google.api_core.exceptions.Forbidden as ex:
         if "Drive credentials" in ex.message:
-            ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions."
+            ex.message += CHECK_DRIVE_PERMISSIONS
         raise

     opts = bigframes.options.display
@@ -286,7 +308,10 @@ def delete_tables_matching_session_id(


 def create_bq_dataset_reference(
-    bq_client: bigquery.Client, location=None, project=None
+    bq_client: bigquery.Client,
+    location=None,
+    project=None,
+    api_name: str = "unknown",
 ) -> bigquery.DatasetReference:
     """Create and identify dataset(s) for temporary BQ resources.

@@ -307,7 +332,11 @@ def create_bq_dataset_reference(
     Returns:
         bigquery.DatasetReference: The constructed reference to the anonymous dataset.
     """
-    query_job = bq_client.query("SELECT 1", location=location, project=project)
+    job_config = google.cloud.bigquery.QueryJobConfig()
+    add_labels(job_config, api_name=api_name)
+    query_job = bq_client.query(
+        "SELECT 1", location=location, project=project, job_config=job_config
+    )
     query_job.result()  # blocks until finished

     # The anonymous dataset is used by BigQuery to write query results and
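
A hedged sketch of the fallback behavior the new `create_job_configs_labels` is expected to exhibit, assuming default options (no `extra_query_labels`) and given its `Dict[str, str]` return annotation:

import bigframes.session._io.bigquery as bf_io_bigquery

# No api_name, no logged methods, no user labels: the label is still set
# (internal issue 336521938).
labels = bf_io_bigquery.create_job_configs_labels(
    job_configs_labels=None, api_methods=[]
)
assert labels["bigframes-api"] == "unknown"

# An explicit api_name takes precedence over the logged-method fallback.
labels = bf_io_bigquery.create_job_configs_labels(
    job_configs_labels=None,
    api_methods=["dataframe-head"],
    api_name="dataframe-to_csv",
)
assert labels["bigframes-api"] == "dataframe-to_csv"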
