Skip to content

Commit b2917bb

Browse files
authored
feat: (Preview) Support automatic load of timedelta from BQ tables. (#1429)
* feat: (Preview) Support automatic load of timedelta columns from BigQuery tables * pass in timedelta columns' labels instead of schema traversal * fix duplicate col name issue * check timedelta type locally * fix lint * use dyptes.convert_to_schema_field for tagging timedeltas * fix lint * avoid changing the description of existing tables * refrain from unnecessary schema update
1 parent 673540b commit b2917bb

File tree

4 files changed

+101
-4
lines changed

4 files changed

+101
-4
lines changed

bigframes/dataframe.py

+1
Original file line numberDiff line numberDiff line change
@@ -3733,6 +3733,7 @@ def to_gbq(
37333733
default_project=default_project,
37343734
)
37353735
)
3736+
37363737
query_job = self._session._executor.export_gbq(
37373738
export_array.rename_columns(id_overrides),
37383739
destination=destination,

bigframes/dtypes.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,12 @@ def convert_schema_field(
678678
pa_struct = pa.struct(fields)
679679
pa_type = pa.list_(pa_struct) if is_repeated else pa_struct
680680
return field.name, pd.ArrowDtype(pa_type)
681+
elif (
682+
field.field_type == "INTEGER"
683+
and field.description is not None
684+
and field.description.endswith(TIMEDELTA_DESCRIPTION_TAG)
685+
):
686+
return field.name, TIMEDELTA_DTYPE
681687
elif field.field_type in _TK_TO_BIGFRAMES:
682688
if is_repeated:
683689
pa_type = pa.list_(
@@ -719,7 +725,9 @@ def convert_to_schema_field(
719725
)
720726
if bigframes_dtype.pyarrow_dtype == pa.duration("us"):
721727
# Timedeltas are represented as integers in microseconds.
722-
return google.cloud.bigquery.SchemaField(name, "INTEGER")
728+
return google.cloud.bigquery.SchemaField(
729+
name, "INTEGER", description=TIMEDELTA_DESCRIPTION_TAG
730+
)
723731
raise TypeError(
724732
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
725733
)
@@ -876,3 +884,6 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype:
876884
"STRING",
877885
"ARRAY",
878886
}
887+
888+
889+
TIMEDELTA_DESCRIPTION_TAG = "#microseconds"

bigframes/session/executor.py

+33-3
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
import weakref
3434

3535
import google.api_core.exceptions
36-
import google.cloud.bigquery as bigquery
36+
from google.cloud import bigquery
3737
import google.cloud.bigquery.job as bq_job
3838
import google.cloud.bigquery.table as bq_table
3939
import google.cloud.bigquery_storage_v1
@@ -47,6 +47,7 @@
4747
import bigframes.core.ordering as order
4848
import bigframes.core.schema
4949
import bigframes.core.tree_properties as tree_properties
50+
import bigframes.dtypes
5051
import bigframes.features
5152
import bigframes.session._io.bigquery as bq_io
5253
import bigframes.session.metrics
@@ -320,6 +321,19 @@ def export_gbq(
320321
sql=sql,
321322
job_config=job_config,
322323
)
324+
325+
has_timedelta_col = any(
326+
t == bigframes.dtypes.TIMEDELTA_DTYPE for t in array_value.schema.dtypes
327+
)
328+
329+
if if_exists != "append" and has_timedelta_col:
330+
# Only update schema if this is not modifying an existing table, and the
331+
# new table contains timedelta columns.
332+
assert query_job.destination is not None
333+
table = self.bqclient.get_table(query_job.destination)
334+
table.schema = array_value.schema.to_bigquery()
335+
self.bqclient.update_table(table, ["schema"])
336+
323337
return query_job
324338

325339
def export_gcs(
@@ -649,12 +663,28 @@ def _validate_result_schema(
649663
raise ValueError(
650664
f"This error should only occur while testing. BigFrames internal schema: {internal_schema.to_bigquery()} does not match actual schema: {actual_schema}"
651665
)
652-
if ibis_schema.to_bigquery() != actual_schema:
666+
sanitized_schema = _sanitize_for_ibis(actual_schema)
667+
if ibis_schema.to_bigquery() != sanitized_schema:
653668
raise ValueError(
654-
f"This error should only occur while testing. Ibis schema: {ibis_schema.to_bigquery()} does not match actual schema: {actual_schema}"
669+
f"This error should only occur while testing. Ibis schema: {ibis_schema.to_bigquery()} does not match sanitized schema: {sanitized_schema}"
655670
)
656671

657672

673+
def _sanitize_for_ibis(
674+
schema: Tuple[bigquery.SchemaField, ...]
675+
) -> Tuple[bigquery.SchemaField, ...]:
676+
# Schema inferred from Ibis does not contain description field. We only need to compare the names, types and modes.
677+
return tuple(
678+
bigquery.SchemaField(
679+
f.name,
680+
f.field_type,
681+
f.mode, # type:ignore
682+
fields=_sanitize_for_ibis(f.fields),
683+
)
684+
for f in schema
685+
)
686+
687+
658688
def generate_head_plan(node: nodes.BigFrameNode, n: int):
659689
return nodes.SliceNode(node, start=None, stop=n)
660690

tests/system/small/test_dataframe_io.py

+55
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@
3232

3333
import typing
3434

35+
from google.cloud import bigquery
36+
3537
import bigframes
38+
from bigframes import dtypes
3639
import bigframes.dataframe
3740
import bigframes.features
3841
import bigframes.pandas as bpd
@@ -697,6 +700,58 @@ def test_to_gbq_w_json(bigquery_client):
697700
assert table.schema[1].field_type == "JSON"
698701

699702

703+
def test_to_gbq_with_timedelta(bigquery_client, dataset_id):
704+
destination_table = f"{dataset_id}.test_to_gbq_with_timedelta"
705+
s1 = bpd.Series([1, 2, 3, 4])
706+
s2 = bpd.to_timedelta(bpd.Series([1, 2, 3, 4]), unit="s")
707+
df = bpd.DataFrame({"id": s1, "timedelta_col": s2})
708+
709+
df.to_gbq(destination_table)
710+
table = bigquery_client.get_table(destination_table)
711+
712+
assert table.schema[1].name == "timedelta_col"
713+
assert table.schema[1].field_type == "INTEGER"
714+
assert dtypes.TIMEDELTA_DESCRIPTION_TAG in table.schema[1].description
715+
716+
717+
def test_gbq_round_trip_with_timedelta(session, dataset_id):
718+
destination_table = f"{dataset_id}.test_gbq_roundtrip_with_timedelta"
719+
df = pd.DataFrame(
720+
{
721+
"col_1": [1],
722+
"col_2": [pd.Timedelta(1, "s")],
723+
"col_3": [1.1],
724+
}
725+
)
726+
bpd.DataFrame(df).to_gbq(destination_table)
727+
728+
result = session.read_gbq(destination_table)
729+
730+
assert result["col_1"].dtype == dtypes.INT_DTYPE
731+
assert result["col_2"].dtype == dtypes.TIMEDELTA_DTYPE
732+
assert result["col_3"].dtype == dtypes.FLOAT_DTYPE
733+
734+
735+
def test_to_gbq_timedelta_tag_ignored_when_appending(bigquery_client, dataset_id):
736+
# First, create a table
737+
destination_table = f"{dataset_id}.test_to_gbq_timedelta_tag_ignored_when_appending"
738+
schema = [bigquery.SchemaField("my_col", "INTEGER")]
739+
bigquery_client.create_table(bigquery.Table(destination_table, schema))
740+
741+
# Then, append to that table with timedelta values
742+
df = pd.DataFrame(
743+
{
744+
"my_col": [pd.Timedelta(1, "s")],
745+
}
746+
)
747+
bpd.DataFrame(df).to_gbq(destination_table, if_exists="append")
748+
749+
table = bigquery_client.get_table(destination_table)
750+
assert table.schema[0].name == "my_col"
751+
assert table.schema[0].field_type == "INTEGER"
752+
assert table.schema[0].description is None
753+
754+
700755
@pytest.mark.parametrize(
701756
("index"),
702757
[True, False],

0 commit comments

Comments
 (0)