Skip to content

Commit e6c0cd1

Browse files
authored
perf: if primary keys are defined, read_gbq avoids copying table data (#112)
We make the same uniqueness assumption as the query engine and use these columns as the total ordering. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue b/305260214 🦕
1 parent 752a1d6 commit e6c0cd1

File tree

5 files changed

+153
-17
lines changed

5 files changed

+153
-17
lines changed

bigframes/core/indexers.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import pandas as pd
2222

2323
import bigframes.constants as constants
24+
import bigframes.core.blocks
2425
import bigframes.core.guid as guid
2526
import bigframes.core.indexes as indexes
2627
import bigframes.core.scalar
@@ -214,7 +215,7 @@ def __getitem__(self, key: tuple) -> bigframes.core.scalar.Scalar:
214215
raise ValueError(error_message)
215216
if len(key) != 2:
216217
raise TypeError(error_message)
217-
block = self._dataframe._block
218+
block: bigframes.core.blocks.Block = self._dataframe._block
218219
column_block = block.select_columns([block.value_columns[key[1]]])
219220
column = bigframes.series.Series(column_block)
220221
return column.iloc[key[0]]

bigframes/session.py

+95-15
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,8 @@ def read_gbq_query(
498498
499499
See also: :meth:`Session.read_gbq`.
500500
"""
501+
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
502+
# these docstrings are inline.
501503
return self._read_gbq_query(
502504
query=query,
503505
index_col=index_col,
@@ -515,8 +517,6 @@ def _read_gbq_query(
515517
max_results: Optional[int] = None,
516518
api_name: str,
517519
) -> dataframe.DataFrame:
518-
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
519-
# these docstrings are inline.
520520
if isinstance(index_col, str):
521521
index_cols = [index_col]
522522
else:
@@ -561,6 +561,8 @@ def read_gbq_table(
561561
562562
See also: :meth:`Session.read_gbq`.
563563
"""
564+
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
565+
# these docstrings are inline.
564566
return self._read_gbq_table(
565567
query=query,
566568
index_col=index_col,
@@ -569,6 +571,62 @@ def read_gbq_table(
569571
api_name="read_gbq_table",
570572
)
571573

574+
def _read_gbq_table_to_ibis_with_total_ordering(
575+
self,
576+
table_ref: bigquery.table.TableReference,
577+
*,
578+
api_name: str,
579+
) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]:
580+
"""Create a read-only Ibis table expression representing a table.
581+
582+
If we can get a total ordering from the table, such as via primary key
583+
column(s), then return those too so that ordering generation can be
584+
avoided.
585+
"""
586+
if table_ref.dataset_id.upper() == "_SESSION":
587+
# _SESSION tables aren't supported by the tables.get REST API.
588+
return (
589+
self.ibis_client.sql(
590+
f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`"
591+
),
592+
None,
593+
)
594+
595+
table_expression = self.ibis_client.table(
596+
table_ref.table_id,
597+
database=f"{table_ref.project}.{table_ref.dataset_id}",
598+
)
599+
600+
# If there are primary keys defined, the query engine assumes these
601+
# columns are unique, even if the constraint is not enforced. We make
602+
# the same assumption and use these columns as the total ordering keys.
603+
table = self.bqclient.get_table(table_ref)
604+
605+
# TODO(b/305264153): Use public properties to fetch primary keys once
606+
# added to google-cloud-bigquery.
607+
primary_keys = (
608+
table._properties.get("tableConstraints", {})
609+
.get("primaryKey", {})
610+
.get("columns")
611+
)
612+
613+
if not primary_keys:
614+
return table_expression, None
615+
else:
616+
# Read from a snapshot since we won't have to copy the table data to create a total ordering.
617+
job_config = bigquery.QueryJobConfig()
618+
job_config.labels["bigframes-api"] = api_name
619+
current_timestamp = list(
620+
self.bqclient.query(
621+
"SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
622+
job_config=job_config,
623+
).result()
624+
)[0][0]
625+
table_expression = self.ibis_client.sql(
626+
bigframes_io.create_snapshot_sql(table_ref, current_timestamp)
627+
)
628+
return table_expression, primary_keys
629+
572630
def _read_gbq_table(
573631
self,
574632
query: str,
@@ -581,24 +639,19 @@ def _read_gbq_table(
581639
if max_results and max_results <= 0:
582640
raise ValueError("`max_results` should be a positive number.")
583641

584-
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
585-
# these docstrings are inline.
586642
# TODO(swast): Can we re-use the temp table from other reads in the
587643
# session, if the original table wasn't modified?
588644
table_ref = bigquery.table.TableReference.from_string(
589645
query, default_project=self.bqclient.project
590646
)
591647

592-
if table_ref.dataset_id.upper() == "_SESSION":
593-
# _SESSION tables aren't supported by the tables.get REST API.
594-
table_expression = self.ibis_client.sql(
595-
f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`"
596-
)
597-
else:
598-
table_expression = self.ibis_client.table(
599-
table_ref.table_id,
600-
database=f"{table_ref.project}.{table_ref.dataset_id}",
601-
)
648+
(
649+
table_expression,
650+
total_ordering_cols,
651+
) = self._read_gbq_table_to_ibis_with_total_ordering(
652+
table_ref,
653+
api_name=api_name,
654+
)
602655

603656
for key in col_order:
604657
if key not in table_expression.columns:
@@ -624,7 +677,34 @@ def _read_gbq_table(
624677
ordering = None
625678
is_total_ordering = False
626679

627-
if len(index_cols) != 0:
680+
if total_ordering_cols is not None:
681+
# Note: currently, a table has a total ordering only when the
682+
# primary key(s) are set on a table. The query engine assumes such
683+
# columns are unique, even if not enforced.
684+
is_total_ordering = True
685+
ordering = core.ExpressionOrdering(
686+
ordering_value_columns=[
687+
core.OrderingColumnReference(column_id)
688+
for column_id in total_ordering_cols
689+
],
690+
total_ordering_columns=frozenset(total_ordering_cols),
691+
)
692+
693+
if len(index_cols) != 0:
694+
index_labels = typing.cast(List[Optional[str]], index_cols)
695+
else:
696+
# Use the total_ordering_cols to project offsets to use as the default index.
697+
table_expression = table_expression.order_by(index_cols)
698+
default_index_id = guid.generate_guid("bigframes_index_")
699+
default_index_col = (
700+
ibis.row_number().cast(ibis_dtypes.int64).name(default_index_id)
701+
)
702+
table_expression = table_expression.mutate(
703+
**{default_index_id: default_index_col}
704+
)
705+
index_cols = [default_index_id]
706+
index_labels = [None]
707+
elif len(index_cols) != 0:
628708
index_labels = typing.cast(List[Optional[str]], index_cols)
629709
distinct_table = table_expression.select(*index_cols).distinct()
630710
is_unique_sql = f"""WITH full_table AS (

noxfile.py

-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@
8989
"system",
9090
"doctest",
9191
"cover",
92-
"release_dry_run",
9392
]
9493

9594
# Error if a python version is missing

tests/system/conftest.py

+31
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import logging
1818
import math
1919
import pathlib
20+
import textwrap
2021
import typing
2122
from typing import Dict, Optional
2223

@@ -795,6 +796,36 @@ def penguins_randomforest_classifier_model_name(
795796
return model_name
796797

797798

799+
@pytest.fixture(scope="session")
800+
def usa_names_grouped_table(
801+
session: bigframes.Session, dataset_id_permanent
802+
) -> bigquery.Table:
803+
"""Provides a table with primary key(s) set."""
804+
table_id = f"{dataset_id_permanent}.usa_names_grouped"
805+
try:
806+
return session.bqclient.get_table(table_id)
807+
except google.cloud.exceptions.NotFound:
808+
query = textwrap.dedent(
809+
f"""
810+
CREATE TABLE `{dataset_id_permanent}.usa_names_grouped`
811+
(
812+
total_people INT64,
813+
name STRING,
814+
gender STRING,
815+
year INT64,
816+
PRIMARY KEY(name, gender, year) NOT ENFORCED
817+
)
818+
AS
819+
SELECT SUM(`number`) AS total_people, name, gender, year
820+
FROM `bigquery-public-data.usa_names.usa_1910_2013`
821+
GROUP BY name, gender, year
822+
"""
823+
)
824+
job = session.bqclient.query(query)
825+
job.result()
826+
return session.bqclient.get_table(table_id)
827+
828+
798829
@pytest.fixture()
799830
def deferred_repr():
800831
bigframes.options.display.repr_mode = "deferred"

tests/system/small/test_session.py

+25
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from typing import List
2121

2222
import google.api_core.exceptions
23+
import google.cloud.bigquery as bigquery
2324
import numpy as np
2425
import pandas as pd
2526
import pytest
@@ -231,6 +232,30 @@ def test_read_gbq_w_anonymous_query_results_table(session: bigframes.Session):
231232
pd.testing.assert_frame_equal(result, expected, check_dtype=False)
232233

233234

235+
def test_read_gbq_w_primary_keys_table(
236+
session: bigframes.Session, usa_names_grouped_table: bigquery.Table
237+
):
238+
table = usa_names_grouped_table
239+
# TODO(b/305264153): Use public properties to fetch primary keys once
240+
# added to google-cloud-bigquery.
241+
primary_keys = (
242+
table._properties.get("tableConstraints", {})
243+
.get("primaryKey", {})
244+
.get("columns")
245+
)
246+
assert len(primary_keys) != 0
247+
248+
df = session.read_gbq(f"{table.project}.{table.dataset_id}.{table.table_id}")
249+
result = df.head(100).to_pandas()
250+
251+
# Verify that the DataFrame is already sorted by primary keys.
252+
sorted_result = result.sort_values(primary_keys)
253+
pd.testing.assert_frame_equal(result, sorted_result)
254+
255+
# Verify that we're working from a snapshot rather than a copy of the table.
256+
assert "FOR SYSTEM_TIME AS OF TIMESTAMP" in df.sql
257+
258+
234259
@pytest.mark.parametrize(
235260
("query_or_table", "max_results"),
236261
[

0 commit comments

Comments (0)