Skip to content

Commit 1e3feda

Browse files
feat: add DataFrame.to_arrow to create Arrow Table from DataFrame (#807)
* feat: add `DataFrame.to_arrow` to create Arrow Table from DataFrame

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* type annotation for sample

* align index names in to_arrow

* better assertions

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 3da559e commit 1e3feda

File tree

8 files changed

+500
-0
lines changed

8 files changed

+500
-0
lines changed

bigframes/core/blocks.py

+30
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,36 @@ def _validate_result_schema(self, result_df: pd.DataFrame):
467467
f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}"
468468
)
469469

470+
def to_arrow(
    self,
    *,
    ordered: bool = True,
) -> Tuple[pa.Table, bigquery.QueryJob]:
    """Run the query and download all results as a pyarrow Table.

    Args:
        ordered (bool, default True):
            Whether the downloaded rows must follow this block's
            ordering. Unordered execution may produce a faster query.

    Returns:
        Tuple[pa.Table, bigquery.QueryJob]:
            The downloaded table and the query job that produced it.
    """
    # pa.Table.from_pandas puts index columns last, so update the expression
    # to match: value columns first, then index columns.
    expr = self.expr.select_columns(
        list(self.value_columns) + list(self.index_columns)
    )

    _, query_job = self.session._query_to_destination(
        self.session._to_sql(expr, ordered=ordered),
        list(self.index_columns),
        api_name="cached",
        do_clustering=False,
    )
    results_iterator = query_job.result()
    pa_table = results_iterator.to_arrow()

    # Match pandas->Arrow conventions: string index labels are kept as-is,
    # non-string (e.g. unnamed/None) levels get __index_level_N__ names.
    pa_index_labels = [
        index_label if isinstance(index_label, str) else f"__index_level_{index_level}__"
        for index_level, index_label in enumerate(self._index_labels)
    ]

    pa_table = pa_table.rename_columns(list(self.column_labels) + pa_index_labels)
    return pa_table, query_job
499+
470500
def to_pandas(
471501
self,
472502
max_download_size: Optional[int] = None,

bigframes/dataframe.py

+29
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import numpy
4545
import pandas
4646
import pandas.io.formats.format
47+
import pyarrow
4748
import tabulate
4849

4950
import bigframes
@@ -1183,6 +1184,34 @@ def cov(self, *, numeric_only: bool = False) -> DataFrame:
11831184

11841185
return DataFrame(frame._block.calculate_pairwise_metric(agg_ops.CovOp()))
11851186

1187+
def to_arrow(
    self,
    *,
    ordered: Optional[bool] = None,
) -> pyarrow.Table:
    """Materialize this DataFrame as a pyarrow Table.

    Args:
        ordered (bool, default None):
            Determines whether the resulting Arrow table will be
            deterministically ordered. In some cases, unordered may result
            in a faster-executing query. If set to a value other than
            None, will override Session default.

    Returns:
        pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame.
    """
    # This API is still in preview; let callers know its surface may shift.
    warnings.warn(
        "to_arrow is in preview. Types and unnamed / duplicate name columns may change in future.",
        category=bigframes.exceptions.PreviewWarning,
    )

    self._optimize_query_complexity()

    # Fall back to the session-level ordering policy when unspecified.
    if ordered is None:
        ordered = self._session._strictly_ordered
    pa_table, query_job = self._block.to_arrow(ordered=ordered)

    self._set_internal_query_job(query_job)
    return pa_table
1214+
11861215
def to_pandas(
11871216
self,
11881217
max_download_size: Optional[int] = None,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def test_create_polars_df() -> None:
    """Docs sample: download a BigQuery DataFrames result into a Polars DataFrame.

    The code between the region tags below is published as a documentation
    sample; keep the tags and sample body intact when editing.
    """
    # [START bigquery_dataframes_to_polars]
    import polars

    import bigframes.enums
    import bigframes.pandas as bpd

    bf_df = bpd.read_gbq_table(
        "bigquery-public-data.usa_names.usa_1910_current",
        # Setting index_col to either a unique column or NULL will give the
        # best performance.
        index_col=bigframes.enums.DefaultIndexKind.NULL,
    )
    # TODO(developer): Do some analysis using BigQuery DataFrames.
    # ...

    # Run the query and download the results as an Arrow table to convert into
    # a Polars DataFrame. Use ordered=False if your polars analysis is OK with
    # non-deterministic ordering.
    arrow_table = bf_df.to_arrow(ordered=False)
    polars_df = polars.from_arrow(arrow_table)
    # [END bigquery_dataframes_to_polars]

    # Outside the region tags: sanity-check that the local Polars copy agrees
    # with the remote BigQuery DataFrame on shape and a column aggregate.
    assert polars_df.shape == bf_df.shape
    assert polars_df["number"].sum() == bf_df["number"].sum()

0 commit comments

Comments
 (0)