fix: Allow to_pandas to download more than 10GB #637

Merged 2 commits on Apr 25, 2024
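
In short: query results without an explicit destination table are subject to BigQuery's roughly 10GB response-size cap, so `to_pandas` could previously fail on larger results. This change routes materialization through `_query_to_destination`, which writes the results to a temporary table first and downloads from there. A minimal sketch of that pattern using the plain google-cloud-bigquery client (project, dataset, and table IDs are hypothetical placeholders):

    from google.cloud import bigquery

    client = bigquery.Client()

    # Writing results to an explicit destination table lifts the
    # response-size cap that applies to anonymous query results.
    config = bigquery.QueryJobConfig(
        destination="my-project.my_dataset.temp_results",  # hypothetical table ID
    )
    job = client.query(
        "SELECT * FROM `my-project.my_dataset.big_table`",  # hypothetical source
        job_config=config,
    )
    df = job.result().to_dataframe()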
9 changes: 7 additions & 2 deletions bigframes/core/blocks.py
@@ -513,9 +513,14 @@ def _materialize_local(
     ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
-        results_iterator, query_job = self.session._execute(
-            self.expr, sorted=materialize_options.ordered
+        _, query_job = self.session._query_to_destination(
+            self.session._to_sql(self.expr, sorted=True),
+            list(self.index_columns),
+            api_name="cached",
+            do_clustering=False,
         )
+        results_iterator = query_job.result()
+
         table_size = (
             self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES
         )
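
After `_query_to_destination` returns, the job's `result()` call yields a `RowIterator` backed by the destination table, which is what `_materialize_local` now iterates. A hedged sketch of the download step (the `create_bqstorage_client` flag is a standard google-cloud-bigquery option, not something this PR introduces):

    import pandas as pd
    from google.cloud import bigquery

    def download_results(query_job: bigquery.QueryJob) -> pd.DataFrame:
        # result() returns a RowIterator reading from the job's destination table.
        rows = query_job.result()
        # Routing the read through the BigQuery Storage API speeds up large downloads.
        return rows.to_dataframe(create_bqstorage_client=True)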
16 changes: 10 additions & 6 deletions bigframes/session/__init__.py
@@ -428,7 +428,8 @@ def _query_to_destination(
         index_cols: List[str],
         api_name: str,
         configuration: dict = {"query": {"useQueryCache": True}},
-    ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]:
+        do_clustering=True,
+    ) -> Tuple[Optional[bigquery.TableReference], bigquery.QueryJob]:
         # If a dry_run indicates this is not a query type job, then don't
         # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
         dry_run_config = bigquery.QueryJobConfig()
@@ -442,11 +443,14 @@ def _query_to_destination(
         # internal issue 303057336.
         # Since we have a `statement_type == 'SELECT'`, schema should be populated.
         schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema)
-        cluster_cols = [
-            item.name
-            for item in schema
-            if (item.name in index_cols) and _can_cluster_bq(item)
-        ][:_MAX_CLUSTER_COLUMNS]
+        if do_clustering:
+            cluster_cols = [
+                item.name
+                for item in schema
+                if (item.name in index_cols) and _can_cluster_bq(item)
+            ][:_MAX_CLUSTER_COLUMNS]
+        else:
+            cluster_cols = []
         temp_table = self._create_empty_temp_table(schema, cluster_cols)

         timeout_ms = configuration.get("jobTimeoutMs") or configuration["query"].get(
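
The new `do_clustering` flag lets callers such as the `to_pandas` path above skip clustering the temporary table entirely. A standalone sketch of the selection logic (the four-column cap reflects BigQuery's clustering limit; `_can_cluster_bq`, the repo's own column-type eligibility check, is elided here):

    from typing import Iterable, List
    from google.cloud import bigquery

    _MAX_CLUSTER_COLUMNS = 4  # BigQuery allows at most four clustering columns

    def choose_cluster_cols(
        schema: Iterable[bigquery.SchemaField],
        index_cols: List[str],
        do_clustering: bool,
    ) -> List[str]:
        if not do_clustering:
            # The download-only path gains nothing from clustering the temp table.
            return []
        # The real code also filters with _can_cluster_bq (column-type check).
        return [f.name for f in schema if f.name in index_cols][:_MAX_CLUSTER_COLUMNS]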
11 changes: 11 additions & 0 deletions tests/system/load/test_large_tables.py
@@ -90,3 +90,14 @@ def test_to_pandas_batches_large_table():
         del df

     assert row_count == expected_row_count
+
+
+def test_to_pandas_large_table():
+    df = bpd.read_gbq("load_testing.scalars_10gb")
+    # df will be downloaded locally
+    expected_row_count, expected_column_count = df.shape
+
+    df = df.to_pandas()
+    row_count, column_count = df.shape
+    assert column_count == expected_column_count
+    assert row_count == expected_row_count
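
For results too large to hold in local memory even after this fix, the batched path exercised by `test_to_pandas_batches_large_table` above remains the alternative; a usage sketch (the per-chunk `print` stands in for real processing):

    import bigframes.pandas as bpd

    df = bpd.read_gbq("load_testing.scalars_10gb")
    # to_pandas_batches() yields pandas DataFrames chunk by chunk instead of
    # materializing the whole result locally at once.
    for chunk in df.to_pandas_batches():
        print(len(chunk))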