|
27 | 27 |
|
28 | 28 | import six
|
29 | 29 | import pytest
|
| 30 | +import pytz |
30 | 31 |
|
31 | 32 | try:
|
32 | 33 |     from google.cloud import bigquery_storage_v1beta1
|
|
36 | 37 |     import pandas
|
37 | 38 | except ImportError:  # pragma: NO COVER
|
38 | 39 |     pandas = None
|
| 40 | +try: |
| 41 | +    import pyarrow |
| 42 | +except ImportError:  # pragma: NO COVER |
| 43 | +    pyarrow = None |
39 | 44 | try:
|
40 | 45 |     import IPython
|
41 | 46 |     from IPython.utils import io
|
@@ -622,6 +627,159 @@ def test_load_table_from_local_avro_file_then_dump_table(self):
|
622 | 627 |             sorted(row_tuples, key=by_wavelength), sorted(ROWS, key=by_wavelength)
|
623 | 628 |         )
|
624 | 629 |
|
| 630 | +    @unittest.skipIf(pandas is None, "Requires `pandas`") |
| 631 | +    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") |
| 632 | +    def test_load_table_from_dataframe_w_nulls(self): |
| 633 | +        """Test that a DataFrame with null columns can be uploaded if a |
| 634 | +        BigQuery schema is specified. |
| 635 | + |
| 636 | +        See: https://github.com/googleapis/google-cloud-python/issues/7370 |
| 637 | +        """ |
| 638 | +        # Schema with all scalar types. |
| 639 | +        scalars_schema = ( |
| 640 | +            bigquery.SchemaField("bool_col", "BOOLEAN"), |
| 641 | +            bigquery.SchemaField("bytes_col", "BYTES"), |
| 642 | +            bigquery.SchemaField("date_col", "DATE"), |
| 643 | +            bigquery.SchemaField("dt_col", "DATETIME"), |
| 644 | +            bigquery.SchemaField("float_col", "FLOAT"), |
| 645 | +            bigquery.SchemaField("geo_col", "GEOGRAPHY"), |
| 646 | +            bigquery.SchemaField("int_col", "INTEGER"), |
| 647 | +            bigquery.SchemaField("num_col", "NUMERIC"), |
| 648 | +            bigquery.SchemaField("str_col", "STRING"), |
| 649 | +            bigquery.SchemaField("time_col", "TIME"), |
| 650 | +            bigquery.SchemaField("ts_col", "TIMESTAMP"), |
| 651 | +        ) |
| 652 | +        table_schema = scalars_schema + ( |
| 653 | +            # TODO: Array columns can't be read due to NULLABLE versus REPEATED |
| 654 | +            # mode mismatch. See: |
| 655 | +            # https://issuetracker.google.com/133415569#comment3 |
| 656 | +            # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"), |
| 657 | +            # TODO: Support writing StructArrays to Parquet. See: |
| 658 | +            # https://jira.apache.org/jira/browse/ARROW-2587 |
| 659 | +            # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema), |
| 660 | +        ) |
| 661 | +        num_rows = 100 |
| 662 | +        nulls = [None] * num_rows |
| 663 | +        dataframe = pandas.DataFrame( |
| 664 | +            { |
| 665 | +                "bool_col": nulls, |
| 666 | +                "bytes_col": nulls, |
| 667 | +                "date_col": nulls, |
| 668 | +                "dt_col": nulls, |
| 669 | +                "float_col": nulls, |
| 670 | +                "geo_col": nulls, |
| 671 | +                "int_col": nulls, |
| 672 | +                "num_col": nulls, |
| 673 | +                "str_col": nulls, |
| 674 | +                "time_col": nulls, |
| 675 | +                "ts_col": nulls, |
| 676 | +            } |
| 677 | +        ) |
| 678 | + |
| 679 | +        dataset_id = _make_dataset_id("bq_load_test") |
| 680 | +        self.temp_dataset(dataset_id) |
| 681 | +        table_id = "{}.{}.load_table_from_dataframe_w_nulls".format( |
| 682 | +            Config.CLIENT.project, dataset_id |
| 683 | +        ) |
| 684 | + |
| 685 | +        # Create the table before loading so that schema mismatch errors are |
| 686 | +        # identified. |
| 687 | +        table = retry_403(Config.CLIENT.create_table)( |
| 688 | +            Table(table_id, schema=table_schema) |
| 689 | +        ) |
| 690 | +        self.to_delete.insert(0, table) |
| 691 | + |
| 692 | +        job_config = bigquery.LoadJobConfig(schema=table_schema) |
| 693 | +        load_job = Config.CLIENT.load_table_from_dataframe( |
| 694 | +            dataframe, table_id, job_config=job_config |
| 695 | +        ) |
| 696 | +        load_job.result() |
| 697 | + |
| 698 | +        table = Config.CLIENT.get_table(table) |
| 699 | +        self.assertEqual(tuple(table.schema), table_schema) |
| 700 | +        self.assertEqual(table.num_rows, num_rows) |
| 701 | + |
| 702 | +    @unittest.skipIf(pandas is None, "Requires `pandas`") |
| 703 | +    @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") |
| 704 | +    def test_load_table_from_dataframe_w_explicit_schema(self): |
| 705 | +        # Schema with all scalar types. |
| 706 | +        scalars_schema = ( |
| 707 | +            bigquery.SchemaField("bool_col", "BOOLEAN"), |
| 708 | +            bigquery.SchemaField("bytes_col", "BYTES"), |
| 709 | +            bigquery.SchemaField("date_col", "DATE"), |
| 710 | +            bigquery.SchemaField("dt_col", "DATETIME"), |
| 711 | +            bigquery.SchemaField("float_col", "FLOAT"), |
| 712 | +            bigquery.SchemaField("geo_col", "GEOGRAPHY"), |
| 713 | +            bigquery.SchemaField("int_col", "INTEGER"), |
| 714 | +            bigquery.SchemaField("num_col", "NUMERIC"), |
| 715 | +            bigquery.SchemaField("str_col", "STRING"), |
| 716 | +            bigquery.SchemaField("time_col", "TIME"), |
| 717 | +            bigquery.SchemaField("ts_col", "TIMESTAMP"), |
| 718 | +        ) |
| 719 | +        table_schema = scalars_schema + ( |
| 720 | +            # TODO: Array columns can't be read due to NULLABLE versus REPEATED |
| 721 | +            # mode mismatch. See: |
| 722 | +            # https://issuetracker.google.com/133415569#comment3 |
| 723 | +            # bigquery.SchemaField("array_col", "INTEGER", mode="REPEATED"), |
| 724 | +            # TODO: Support writing StructArrays to Parquet. See: |
| 725 | +            # https://jira.apache.org/jira/browse/ARROW-2587 |
| 726 | +            # bigquery.SchemaField("struct_col", "RECORD", fields=scalars_schema), |
| 727 | +        ) |
| 728 | +        dataframe = pandas.DataFrame( |
| 729 | +            { |
| 730 | +                "bool_col": [True, None, False], |
| 731 | +                "bytes_col": [b"abc", None, b"def"], |
| 732 | +                "date_col": [datetime.date(1, 1, 1), None, datetime.date(9999, 12, 31)], |
| 733 | +                "dt_col": [ |
| 734 | +                    datetime.datetime(1, 1, 1, 0, 0, 0), |
| 735 | +                    None, |
| 736 | +                    datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), |
| 737 | +                ], |
| 738 | +                "float_col": [float("-inf"), float("nan"), float("inf")], |
| 739 | +                "geo_col": [ |
| 740 | +                    "POINT(30 10)", |
| 741 | +                    None, |
| 742 | +                    "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))", |
| 743 | +                ], |
| 744 | +                "int_col": [-9223372036854775808, None, 9223372036854775807], |
| 745 | +                "num_col": [ |
| 746 | +                    decimal.Decimal("-99999999999999999999999999999.999999999"), |
| 747 | +                    None, |
| 748 | +                    decimal.Decimal("99999999999999999999999999999.999999999"), |
| 749 | +                ], |
| 750 | +                "str_col": ["abc", None, "def"], |
| 751 | +                "time_col": [ |
| 752 | +                    datetime.time(0, 0, 0), |
| 753 | +                    None, |
| 754 | +                    datetime.time(23, 59, 59, 999999), |
| 755 | +                ], |
| 756 | +                "ts_col": [ |
| 757 | +                    datetime.datetime(1, 1, 1, 0, 0, 0, tzinfo=pytz.utc), |
| 758 | +                    None, |
| 759 | +                    datetime.datetime( |
| 760 | +                        9999, 12, 31, 23, 59, 59, 999999, tzinfo=pytz.utc |
| 761 | +                    ), |
| 762 | +                ], |
| 763 | +            }, |
| 764 | +            dtype="object", |
| 765 | +        ) |
| 766 | + |
| 767 | +        dataset_id = _make_dataset_id("bq_load_test") |
| 768 | +        self.temp_dataset(dataset_id) |
| 769 | +        table_id = "{}.{}.load_table_from_dataframe_w_explicit_schema".format( |
| 770 | +            Config.CLIENT.project, dataset_id |
| 771 | +        ) |
| 772 | + |
| 773 | +        job_config = bigquery.LoadJobConfig(schema=table_schema) |
| 774 | +        load_job = Config.CLIENT.load_table_from_dataframe( |
| 775 | +            dataframe, table_id, job_config=job_config |
| 776 | +        ) |
| 777 | +        load_job.result() |
| 778 | + |
| 779 | +        table = Config.CLIENT.get_table(table_id) |
| 780 | +        self.assertEqual(tuple(table.schema), table_schema) |
| 781 | +        self.assertEqual(table.num_rows, 3) |
| 782 | + |
625 | 783 |     def test_load_avro_from_uri_then_dump_table(self):
|
626 | 784 |         from google.cloud.bigquery.job import CreateDisposition
|
627 | 785 |         from google.cloud.bigquery.job import SourceFormat
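
The pattern these two tests exercise: `Client.load_table_from_dataframe` serializes the DataFrame through pyarrow to Parquet, and an all-`None` column of dtype `object` gives type inference nothing to work with, so the explicit `schema` on `bigquery.LoadJobConfig` is what makes the load succeed (the fix for issue #7370 referenced in the docstring). A minimal sketch of the same pattern outside the test harness, assuming default application credentials; the `my-project.my_dataset.my_table` ID is a placeholder:

```python
import pandas
from google.cloud import bigquery

client = bigquery.Client()

# Columns of all NULLs carry no type information in pandas (dtype ``object``),
# so the BigQuery column types must come from an explicit schema.
dataframe = pandas.DataFrame({"int_col": [None] * 3, "str_col": [None] * 3})

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("int_col", "INTEGER"),
        bigquery.SchemaField("str_col", "STRING"),
    ]
)

load_job = client.load_table_from_dataframe(
    dataframe, "my-project.my_dataset.my_table", job_config=job_config
)
load_job.result()  # Block until the load job completes; raises on failure.
```

Calling `result()` matters here as in the tests: load jobs are asynchronous, and schema mismatch errors only surface once the job finishes.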
|
|