Skip to content

Commit ee48df8

Browse files
committed
fix(bigquery): interpret datetime columns from pandas dataframe as nanoseconds
Also:
* Enable TIMESTAMP and DATETIME unit tests for `_pandas_helpers`.
* Add more data types to the load-dataframe sample.
1 parent 154c8ec commit ee48df8

File tree

3 files changed

+178
-36
lines changed

3 files changed

+178
-36
lines changed

bigquery/samples/load_table_dataframe.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
def load_table_dataframe(client, table_id):
1717

1818
# [START bigquery_load_table_dataframe]
19-
from google.cloud import bigquery
19+
import datetime
2020

21+
from google.cloud import bigquery
2122
import pandas
23+
import pytz
2224

2325
# TODO(developer): Construct a BigQuery client object.
2426
# client = bigquery.Client()
@@ -27,16 +29,54 @@ def load_table_dataframe(client, table_id):
2729
# table_id = "your-project.your_dataset.your_table_name"
2830

2931
records = [
30-
{"title": u"The Meaning of Life", "release_year": 1983},
31-
{"title": u"Monty Python and the Holy Grail", "release_year": 1975},
32-
{"title": u"Life of Brian", "release_year": 1979},
33-
{"title": u"And Now for Something Completely Different", "release_year": 1971},
32+
{
33+
"title": u"The Meaning of Life",
34+
"release_year": 1983,
35+
"length_minutes": 112.5,
36+
"release_date": datetime.datetime(
37+
1983, 5, 9, 13, 0, 0, tzinfo=pytz.timezone("Europe/Paris")
38+
),
39+
"dvd_release": datetime.datetime(2002, 1, 22, 7, 0, 0),
40+
},
41+
{
42+
"title": u"Monty Python and the Holy Grail",
43+
"release_year": 1975,
44+
"length_minutes": 91.5,
45+
"release_date": datetime.datetime(
46+
1975, 4, 9, 23, 59, 2, tzinfo=pytz.timezone("Europe/London")
47+
),
48+
"dvd_release": datetime.datetime(2002, 7, 16, 9, 0, 0),
49+
},
50+
{
51+
"title": u"Life of Brian",
52+
"release_year": 1979,
53+
"length_minutes": 94.25,
54+
"release_date": datetime.datetime(
55+
1979, 8, 17, 23, 59, 5, tzinfo=pytz.timezone("America/New_York")
56+
),
57+
"dvd_release": datetime.datetime(2008, 1, 14, 8, 0, 0),
58+
},
59+
{
60+
"title": u"And Now for Something Completely Different",
61+
"release_year": 1971,
62+
"length_minutes": 88.0,
63+
"release_date": datetime.datetime(
64+
1971, 9, 28, 23, 59, 7, tzinfo=pytz.timezone("Europe/London")
65+
),
66+
"dvd_release": datetime.datetime(2003, 10, 22, 10, 0, 0),
67+
},
3468
]
3569
dataframe = pandas.DataFrame(
3670
records,
3771
# In the loaded table, the column order reflects the order of the
3872
# columns in the DataFrame.
39-
columns=["title", "release_year"],
73+
columns=[
74+
"title",
75+
"release_year",
76+
"length_minutes",
77+
"release_date",
78+
"dvd_release",
79+
],
4080
# Optionally, set a named index, which can also be written to the
4181
# BigQuery table.
4282
index=pandas.Index(

bigquery/samples/tests/test_load_table_dataframe.py

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import datetime
16+
1517
import pytest
18+
import pytz
1619

1720
from .. import load_table_dataframe
1821

@@ -25,7 +28,80 @@ def test_load_table_dataframe(capsys, client, random_table_id):
2528

2629
table = load_table_dataframe.load_table_dataframe(client, random_table_id)
2730
out, _ = capsys.readouterr()
28-
assert "Loaded 4 rows and 3 columns" in out
31+
expected_column_names = [
32+
"wikidata_id",
33+
"title",
34+
"release_year",
35+
"length_minutes",
36+
"release_date",
37+
"dvd_release",
38+
]
39+
assert "Loaded 4 rows and {} columns".format(len(expected_column_names)) in out
2940

3041
column_names = [field.name for field in table.schema]
31-
assert column_names == ["wikidata_id", "title", "release_year"]
42+
assert column_names == expected_column_names
43+
column_types = [field.field_type for field in table.schema]
44+
assert column_types == [
45+
"STRING",
46+
"STRING",
47+
"INTEGER",
48+
"FLOAT",
49+
"TIMESTAMP",
50+
"DATETIME",
51+
]
52+
53+
df = client.list_rows(table).to_dataframe()
54+
df.sort_values("release_year", inplace=True)
55+
expected_df = pandas.DataFrame(
56+
[
57+
{
58+
"title": u"And Now for Something Completely Different",
59+
"release_year": 1971,
60+
"length_minutes": 88.0,
61+
"release_date": datetime.datetime(
62+
1971, 9, 28, 23, 59, 7, tzinfo=pytz.timezone("Europe/London")
63+
),
64+
"dvd_release": datetime.datetime(2003, 10, 22, 10, 0, 0),
65+
"wikidata_id": u"Q16403",
66+
},
67+
{
68+
"title": u"Monty Python and the Holy Grail",
69+
"release_year": 1975,
70+
"length_minutes": 91.5,
71+
"release_date": datetime.datetime(
72+
1975, 4, 9, 23, 59, 2, tzinfo=pytz.timezone("Europe/London")
73+
),
74+
"dvd_release": datetime.datetime(2002, 7, 16, 9, 0, 0),
75+
"wikidata_id": u"Q25043",
76+
},
77+
{
78+
"title": u"Life of Brian",
79+
"release_year": 1979,
80+
"length_minutes": 94.25,
81+
"release_date": datetime.datetime(
82+
1979, 8, 17, 23, 59, 5, tzinfo=pytz.timezone("America/New_York")
83+
),
84+
"dvd_release": datetime.datetime(2008, 1, 14, 8, 0, 0),
85+
"wikidata_id": u"Q24953",
86+
},
87+
{
88+
"title": u"The Meaning of Life",
89+
"release_year": 1983,
90+
"length_minutes": 112.5,
91+
"release_date": datetime.datetime(
92+
1983, 5, 9, 13, 0, 0, tzinfo=pytz.timezone("Europe/Paris")
93+
),
94+
"dvd_release": datetime.datetime(2002, 1, 22, 7, 0, 0),
95+
"wikidata_id": u"Q24980",
96+
},
97+
],
98+
columns=[
99+
"title",
100+
"release_year",
101+
"length_minutes",
102+
"release_date",
103+
"dvd_release",
104+
"wikidata_id",
105+
]
106+
)
107+
assert df.equals(expected_df)

bigquery/tests/unit/test__pandas_helpers.py

Lines changed: 54 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -386,20 +386,15 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
386386
),
387387
("BOOLEAN", [True, None, False, None]),
388388
("BOOL", [False, None, True, None]),
389-
# TODO: Once https://issues.apache.org/jira/browse/ARROW-5450 is
390-
# resolved, test with TIMESTAMP column. Conversion from pyarrow
391-
# TimestampArray to list of Python objects fails with OverflowError:
392-
# Python int too large to convert to C long.
393-
#
394-
# (
395-
# "TIMESTAMP",
396-
# [
397-
# datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
398-
# None,
399-
# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc),
400-
# datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
401-
# ],
402-
# ),
389+
(
390+
"TIMESTAMP",
391+
[
392+
datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
393+
None,
394+
datetime.datetime(9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc),
395+
datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
396+
],
397+
),
403398
(
404399
"DATE",
405400
[
@@ -418,20 +413,15 @@ def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
418413
datetime.time(12, 0, 0),
419414
],
420415
),
421-
# TODO: Once https://issues.apache.org/jira/browse/ARROW-5450 is
422-
# resolved, test with DATETIME column. Conversion from pyarrow
423-
# TimestampArray to list of Python objects fails with OverflowError:
424-
# Python int too large to convert to C long.
425-
#
426-
# (
427-
# "DATETIME",
428-
# [
429-
# datetime.datetime(1, 1, 1, 0, 0, 0),
430-
# None,
431-
# datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
432-
# datetime.datetime(1970, 1, 1, 0, 0, 0),
433-
# ],
434-
# ),
416+
(
417+
"DATETIME",
418+
[
419+
datetime.datetime(1, 1, 1, 0, 0, 0),
420+
None,
421+
datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
422+
datetime.datetime(1970, 1, 1, 0, 0, 0),
423+
],
424+
),
435425
(
436426
"GEOGRAPHY",
437427
[
@@ -453,6 +443,42 @@ def test_bq_to_arrow_array_w_nullable_scalars(module_under_test, bq_type, rows):
453443
assert rows == roundtrip
454444

455445

446+
@pytest.mark.parametrize(
447+
"bq_type,rows",
448+
[
449+
(
450+
"TIMESTAMP",
451+
[
452+
"1971-09-28T23:59:07+00:00",
453+
"1975-04-09T23:59:02+00:00",
454+
"1979-08-17T23:59:05+00:00",
455+
"NaT",
456+
"1983-05-09T13:00:00+00:00",
457+
],
458+
),
459+
(
460+
"DATETIME",
461+
[
462+
"1971-09-28T23:59:07",
463+
"1975-04-09T23:59:02",
464+
"1979-08-17T23:59:05",
465+
"NaT",
466+
"1983-05-09T13:00:00",
467+
],
468+
),
469+
],
470+
)
471+
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
472+
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
473+
def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows):
474+
rows = [pandas.Timestamp(row) for row in rows]
475+
series = pandas.Series(rows)
476+
bq_field = schema.SchemaField("field_name", bq_type)
477+
arrow_array = module_under_test.bq_to_arrow_array(series, bq_field)
478+
roundtrip = arrow_array.to_pandas()
479+
assert series.equals(roundtrip)
480+
481+
456482
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
457483
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
458484
def test_bq_to_arrow_array_w_arrays(module_under_test):

0 commit comments

Comments (0)