
Commit c37afe5

Use job_config.schema for data type conversion if specified in load_table_from_dataframe. (#8105)
* Use `job_config.schema` for data type conversion if specified in `load_table_from_dataframe`. Use the BigQuery schema to inform the encoding of the file used in the load job. This fixes an issue where a dataframe with ambiguous types (such as an `object` column containing all `None` values) could not be appended to an existing table, since the schemas wouldn't match in most cases.

* Add a system test for loading a dataframe with non-null values and an explicit schema.
1 parent 4763bcf commit c37afe5
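
For context, here is a minimal usage sketch of the behavior this commit enables (the project, dataset, and table names are invented for illustration): supplying `job_config.schema` lets a dataframe whose `object` column holds only `None` values be loaded into an existing table.

from google.cloud import bigquery
import pandas

client = bigquery.Client()

# An "object" column containing only None values has an ambiguous pandas dtype.
dataframe = pandas.DataFrame({"name": ["alice", "bob"], "note": [None, None]})

# Supplying the schema tells the load path how to encode each column, so the
# intermediate Parquet file matches the destination table instead of relying
# on dtype inference.
job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("note", "STRING"),
    ]
)
load_job = client.load_table_from_dataframe(
    dataframe, "my-project.my_dataset.my_table", job_config=job_config
)
load_job.result()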

File tree

5 files changed: +824 −2 lines changed
@@ -0,0 +1,152 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Shared helper functions for connecting BigQuery and pandas."""

try:
    import pyarrow
    import pyarrow.parquet
except ImportError:  # pragma: NO COVER
    pyarrow = None

from google.cloud.bigquery import schema


STRUCT_TYPES = ("RECORD", "STRUCT")


def pyarrow_datetime():
    return pyarrow.timestamp("us", tz=None)


def pyarrow_numeric():
    return pyarrow.decimal128(38, 9)


def pyarrow_time():
    return pyarrow.time64("us")


def pyarrow_timestamp():
    return pyarrow.timestamp("us", tz="UTC")


if pyarrow:
    BQ_TO_ARROW_SCALARS = {
        "BOOL": pyarrow.bool_,
        "BOOLEAN": pyarrow.bool_,
        "BYTES": pyarrow.binary,
        "DATE": pyarrow.date32,
        "DATETIME": pyarrow_datetime,
        "FLOAT": pyarrow.float64,
        "FLOAT64": pyarrow.float64,
        "GEOGRAPHY": pyarrow.string,
        "INT64": pyarrow.int64,
        "INTEGER": pyarrow.int64,
        "NUMERIC": pyarrow_numeric,
        "STRING": pyarrow.string,
        "TIME": pyarrow_time,
        "TIMESTAMP": pyarrow_timestamp,
    }
else:  # pragma: NO COVER
    BQ_TO_ARROW_SCALARS = {}  # pragma: NO COVER


def bq_to_arrow_struct_data_type(field):
    arrow_fields = []
    for subfield in field.fields:
        arrow_subfield = bq_to_arrow_field(subfield)
        if arrow_subfield:
            arrow_fields.append(arrow_subfield)
        else:
            # Could not determine a subfield type. Fallback to type
            # inference.
            return None
    return pyarrow.struct(arrow_fields)


def bq_to_arrow_data_type(field):
    """Return the Arrow data type, corresponding to a given BigQuery column.

    Returns None if default Arrow type inspection should be used.
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type)
        )
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    if field.field_type.upper() in STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field.field_type.upper())
    if data_type_constructor is None:
        return None
    return data_type_constructor()


def bq_to_arrow_field(bq_field):
    """Return the Arrow field, corresponding to a given BigQuery column.

    Returns None if the Arrow type cannot be determined.
    """
    arrow_type = bq_to_arrow_data_type(bq_field)
    if arrow_type:
        is_nullable = bq_field.mode.upper() == "NULLABLE"
        return pyarrow.field(bq_field.name, arrow_type, nullable=is_nullable)
    return None


def bq_to_arrow_array(series, bq_field):
    arrow_type = bq_to_arrow_data_type(bq_field)
    if bq_field.mode.upper() == "REPEATED":
        return pyarrow.ListArray.from_pandas(series, type=arrow_type)
    if bq_field.field_type.upper() in STRUCT_TYPES:
        return pyarrow.StructArray.from_pandas(series, type=arrow_type)
    return pyarrow.array(series, type=arrow_type)


def to_parquet(dataframe, bq_schema, filepath):
    """Write dataframe as a Parquet file, according to the desired BQ schema.

    This function requires the :mod:`pyarrow` package. Arrow is used as an
    intermediate format.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame to convert to a Parquet file.
        bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
            Desired BigQuery schema. Number of columns must match number of
            columns in the DataFrame.
        filepath (str):
            Path to write Parquet file to.
    """
    if pyarrow is None:
        raise ValueError("pyarrow is required for BigQuery schema conversion.")

    if len(bq_schema) != len(dataframe.columns):
        raise ValueError(
            "Number of columns in schema must match number of columns in dataframe."
        )

    arrow_arrays = []
    arrow_names = []
    for bq_field in bq_schema:
        arrow_names.append(bq_field.name)
        arrow_arrays.append(bq_to_arrow_array(dataframe[bq_field.name], bq_field))

    arrow_table = pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names)
    pyarrow.parquet.write_table(arrow_table, filepath)
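
As a small, hedged illustration of how the helpers above behave (assuming `pyarrow` is installed; the field names are invented), a NULLABLE TIMESTAMP column maps to a nullable Arrow timestamp with microsecond precision and UTC timezone, and a REPEATED column maps to an Arrow list type:

from google.cloud.bigquery import schema

ts_field = schema.SchemaField("created_at", "TIMESTAMP", mode="NULLABLE")
arrow_field = bq_to_arrow_field(ts_field)
# arrow_field.type == pyarrow.timestamp("us", tz="UTC")
# arrow_field.nullable is True, because the BigQuery mode is NULLABLE

repeated_field = schema.SchemaField("tags", "STRING", mode="REPEATED")
# REPEATED columns become Arrow list types of the scalar element type:
# bq_to_arrow_data_type(repeated_field) == pyarrow.list_(pyarrow.string())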

bigquery/google/cloud/bigquery/client.py

+13 −2
@@ -44,6 +44,7 @@
 from google.cloud.bigquery._helpers import _record_field_to_json
 from google.cloud.bigquery._helpers import _str_or_none
 from google.cloud.bigquery._http import Connection
+from google.cloud.bigquery import _pandas_helpers
 from google.cloud.bigquery.dataset import Dataset
 from google.cloud.bigquery.dataset import DatasetListItem
 from google.cloud.bigquery.dataset import DatasetReference
@@ -1271,9 +1272,16 @@ def load_table_from_dataframe(
             project (str, optional):
                 Project ID of the project of where to run the job. Defaults
                 to the client's project.
-            job_config (google.cloud.bigquery.job.LoadJobConfig, optional):
+            job_config (~google.cloud.bigquery.job.LoadJobConfig, optional):
                 Extra configuration options for the job.
 
+                To override the default pandas data type conversions, supply
+                a value for
+                :attr:`~google.cloud.bigquery.job.LoadJobConfig.schema` with
+                column names matching those of the dataframe. The BigQuery
+                schema is used to determine the correct data type conversion.
+                Indexes are not loaded. Requires the :mod:`pyarrow` library.
+
         Returns:
             google.cloud.bigquery.job.LoadJob: A new load job.
 
@@ -1296,7 +1304,10 @@ def load_table_from_dataframe(
         os.close(tmpfd)
 
         try:
-            dataframe.to_parquet(tmppath)
+            if job_config.schema:
+                _pandas_helpers.to_parquet(dataframe, job_config.schema, tmppath)
+            else:
+                dataframe.to_parquet(tmppath)
 
             with open(tmppath, "rb") as parquet_file:
                 return self.load_table_from_file(
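
The new docstring text above describes overriding pandas' default type conversions by naming columns in `job_config.schema`. A hedged sketch of that (the table ID and column name are invented for illustration): an `object` column of `decimal.Decimal` values can be appended as NUMERIC, which plain dtype inference would not guarantee.

import decimal

import pandas
from google.cloud import bigquery

client = bigquery.Client()
dataframe = pandas.DataFrame({"price": [decimal.Decimal("1.50"), None]})

job_config = bigquery.LoadJobConfig(
    schema=[bigquery.SchemaField("price", "NUMERIC")],
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)
# The NUMERIC schema entry makes the column encode as decimal128(38, 9) in the
# intermediate Parquet file, matching the existing table's column type.
client.load_table_from_dataframe(
    dataframe, "my-project.my_dataset.prices", job_config=job_config
).result()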

bigquery/tests/system.py

+158
@@ -27,6 +27,7 @@
 
 import six
 import pytest
+import pytz
 
 try:
     from google.cloud import bigquery_storage_v1beta1
@@ -36,6 +37,10 @@
     import pandas
 except ImportError:  # pragma: NO COVER
     pandas = None
+try:
+    import pyarrow
+except ImportError:  # pragma: NO COVER
+    pyarrow = None
 try:
     import IPython
     from IPython.utils import io
@@ -622,6 +627,159 @@ def test_load_table_from_local_avro_file_then_dump_table(self):
             sorted(row_tuples, key=by_wavelength), sorted(ROWS, key=by_wavelength)
         )
 
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_load_table_from_dataframe_w_nulls(self):
+        """Test that a DataFrame with null columns can be uploaded if a
+        BigQuery schema is specified.
+
+        See: https://github.com/googleapis/google-cloud-python/issues/7370
+        """
+        # Schema with all scalar types.
+        scalars_schema = (
+            bigquery.SchemaField("bool_col", "BOOLEAN"),
+            bigquery.SchemaField("bytes_col", "BYTES"),
+            bigquery.SchemaField("date_col", "DATE"),
+            bigquery.SchemaField("dt_col", "DATETIME"),
+            bigquery.SchemaField("float_col", "FLOAT"),
+            bigquery.SchemaField("geo_col", "GEOGRAPHY"),
+            bigquery.SchemaField("int_col", "INTEGER"),
+            bigquery.SchemaField("num_col", "NUMERIC"),
+            bigquery.SchemaField("str_col", "STRING"),
+            bigquery.SchemaField("time_col", "TIME"),
+            bigquery.SchemaField("ts_col", "TIMESTAMP"),
+        )
+        table_schema = scalars_schema + (
+            # TODO: Array columns can't be read due to NULLABLE versus REPEATED
+            # mode mismatch. See:
+            # https://issuetracker.google.com/133415569#comment3
+            # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"),
+            # TODO: Support writing StructArrays to Parquet. See:
+            # https://jira.apache.org/jira/browse/ARROW-2587
+            # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema),
+        )
+        num_rows = 100
+        nulls = [None] * num_rows
+        dataframe = pandas.DataFrame(
+            {
+                "bool_col": nulls,
+                "bytes_col": nulls,
+                "date_col": nulls,
+                "dt_col": nulls,
+                "float_col": nulls,
+                "geo_col": nulls,
+                "int_col": nulls,
+                "num_col": nulls,
+                "str_col": nulls,
+                "time_col": nulls,
+                "ts_col": nulls,
+            }
+        )
+
+        dataset_id = _make_dataset_id("bq_load_test")
+        self.temp_dataset(dataset_id)
+        table_id = "{}.{}.load_table_from_dataframe_w_nulls".format(
+            Config.CLIENT.project, dataset_id
+        )
+
+        # Create the table before loading so that schema mismatch errors are
+        # identified.
+        table = retry_403(Config.CLIENT.create_table)(
+            Table(table_id, schema=table_schema)
+        )
+        self.to_delete.insert(0, table)
+
+        job_config = bigquery.LoadJobConfig(schema=table_schema)
+        load_job = Config.CLIENT.load_table_from_dataframe(
+            dataframe, table_id, job_config=job_config
+        )
+        load_job.result()
+
+        table = Config.CLIENT.get_table(table)
+        self.assertEqual(tuple(table.schema), table_schema)
+        self.assertEqual(table.num_rows, num_rows)
+
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
+    def test_load_table_from_dataframe_w_explicit_schema(self):
+        # Schema with all scalar types.
+        scalars_schema = (
+            bigquery.SchemaField("bool_col", "BOOLEAN"),
+            bigquery.SchemaField("bytes_col", "BYTES"),
+            bigquery.SchemaField("date_col", "DATE"),
+            bigquery.SchemaField("dt_col", "DATETIME"),
+            bigquery.SchemaField("float_col", "FLOAT"),
+            bigquery.SchemaField("geo_col", "GEOGRAPHY"),
+            bigquery.SchemaField("int_col", "INTEGER"),
+            bigquery.SchemaField("num_col", "NUMERIC"),
+            bigquery.SchemaField("str_col", "STRING"),
+            bigquery.SchemaField("time_col", "TIME"),
+            bigquery.SchemaField("ts_col", "TIMESTAMP"),
+        )
+        table_schema = scalars_schema + (
+            # TODO: Array columns can't be read due to NULLABLE versus REPEATED
+            # mode mismatch. See:
+            # https://issuetracker.google.com/133415569#comment3
+            # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"),
+            # TODO: Support writing StructArrays to Parquet. See:
+            # https://jira.apache.org/jira/browse/ARROW-2587
+            # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema),
+        )
+        dataframe = pandas.DataFrame(
+            {
+                "bool_col": [True, None, False],
+                "bytes_col": [b"abc", None, b"def"],
+                "date_col": [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)],
+                "dt_col": [
+                    datetime.datetime(1, 1, 1, 0, 0, 0),
+                    None,
+                    datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
+                ],
+                "float_col": [float("-inf"), float("nan"), float("inf")],
+                "geo_col": [
+                    "POINT(30 10)",
+                    None,
+                    "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
+                ],
+                "int_col": [-9223372036854775808, None, 9223372036854775807],
+                "num_col": [
+                    decimal.Decimal("-99999999999999999999999999999.999999999"),
+                    None,
+                    decimal.Decimal("99999999999999999999999999999.999999999"),
+                ],
+                "str_col": ["abc", None, "def"],
+                "time_col": [
+                    datetime.time(0, 0, 0),
+                    None,
+                    datetime.time(23, 59, 59, 999999),
+                ],
+                "ts_col": [
+                    datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
+                    None,
+                    datetime.datetime(
+                        9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc
+                    ),
+                ],
+            },
+            dtype="object",
+        )
+
+        dataset_id = _make_dataset_id("bq_load_test")
+        self.temp_dataset(dataset_id)
+        table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format(
+            Config.CLIENT.project, dataset_id
+        )
+
+        job_config = bigquery.LoadJobConfig(schema=table_schema)
+        load_job = Config.CLIENT.load_table_from_dataframe(
+            dataframe, table_id, job_config=job_config
+        )
+        load_job.result()
+
+        table = Config.CLIENT.get_table(table_id)
+        self.assertEqual(tuple(table.schema), table_schema)
+        self.assertEqual(table.num_rows, 3)
+
     def test_load_avro_from_uri_then_dump_table(self):
         from google.cloud.bigquery.job import CreateDisposition
         from google.cloud.bigquery.job import SourceFormat