Skip to content

feat: use ArrowDtype for STRUCT columns in to_pandas #85

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 17 commits were merged on Oct 18, 2023.
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,9 @@ def _to_dataframe(
cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype]
) -> pd.DataFrame:
"""Convert BigQuery data to pandas DataFrame with specific dtypes."""
dtypes = bigframes.dtypes.to_pandas_dtypes_overrides(result.schema)
df = result.to_dataframe(
dtypes=dtypes,
bool_dtype=pd.BooleanDtype(),
int_dtype=pd.Int64Dtype(),
float_dtype=pd.Float64Dtype(),
Expand Down
17 changes: 17 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from typing import Any, Dict, Iterable, Literal, Tuple, Union

import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import google.cloud.bigquery._pandas_helpers
import ibis
import ibis.expr.datatypes as ibis_dtypes
import ibis.expr.types as ibis_types
Expand Down Expand Up @@ -401,3 +403,18 @@ def cast_ibis_value(
raise TypeError(
f"Unsupported cast {value.type()} to {to_type}. {constants.FEEDBACK_LINK}"
)


def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict:
    """Build pandas dtype overrides for the STRUCT columns of a BigQuery schema.

    For each STRUCT field, make sure we specify the full type to use: every
    non-repeated ``RECORD`` field is mapped to a ``pd.ArrowDtype`` wrapping the
    Arrow data type computed for that field. All other fields (including
    repeated records) are left out of the mapping.

    Args:
        schema: The BigQuery schema fields to inspect.

    Returns:
        A dict keyed by field name whose values are the ``pd.ArrowDtype`` to
        use for that column. Empty when the schema contains no non-repeated
        ``RECORD`` fields.
    """
    # TODO(swast): Also override ARRAY fields.
    dtypes = {}
    for field in schema:
        # Repeated records are arrays of structs; they are deliberately not
        # overridden here (see the ARRAY TODO above).
        if field.field_type == "RECORD" and field.mode != "REPEATED":
            # TODO(swast): We're using a private API here. Would likely be
            # better if we called `to_arrow()` and converted to a pandas
            # DataFrame ourselves from that.
            # NOTE(review): an inline review comment on this line says the
            # private helper was copied into the project's third_party folder
            # ("Per discussion with @Linchin") -- confirm which import the
            # merged code uses.
            dtypes[field.name] = pd.ArrowDtype(
                google.cloud.bigquery._pandas_helpers.bq_to_arrow_data_type(field)
            )
    return dtypes
23 changes: 20 additions & 3 deletions tests/system/small/test_dataframe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import google.api_core.exceptions
import pandas as pd
import pyarrow as pa
import pytest

from tests.system.utils import (
Expand Down Expand Up @@ -44,7 +45,7 @@ def test_to_pandas_w_correct_dtypes(scalars_df_default_index):


def test_to_pandas_array_struct_correct_result(session):
"""In future, we should support arrays and structs with arrow types.
"""In future, we should support arrays with arrow types.
For now we fall back to the current connector behavior of converting
to Python objects"""
df = session.read_gbq(
Expand All @@ -59,11 +60,27 @@ def test_to_pandas_array_struct_correct_result(session):
expected = pd.DataFrame(
{
"array_column": [[1, 3, 2]],
"struct_column": [{"string_field": "a", "float_field": 1.2}],
"struct_column": pd.Series(
[{"string_field": "a", "float_field": 1.2}],
dtype=pd.ArrowDtype(
pa.struct(
[
("string_field", pa.string()),
("float_field", pa.float64()),
]
)
),
),
}
)
expected.index = expected.index.astype("Int64")
pd.testing.assert_frame_equal(result, expected)
pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
pd.testing.assert_series_equal(result["array_column"], expected["array_column"])
# assert_series_equal not implemented for struct columns yet. Compare
# values as Python objects, instead.
pd.testing.assert_series_equal(
result["struct_column"].astype("O"), expected["struct_column"].astype("O")
)


@pytest.mark.parametrize(
Expand Down