Skip to content

Commit 1246da8

Browse files
authored
feat: make it easier to disable best-effort deduplication with streaming inserts (#734)
* feat: make it easier to disable row insert IDs
* Also accept any iterables for row_ids
1 parent 38b3ef9 commit 1246da8

File tree

4 files changed

+195
-14
lines changed

4 files changed

+195
-14
lines changed

google/cloud/bigquery/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from google.cloud.bigquery.dataset import Dataset
3838
from google.cloud.bigquery.dataset import DatasetReference
3939
from google.cloud.bigquery import enums
40+
from google.cloud.bigquery.enums import AutoRowIDs
4041
from google.cloud.bigquery.enums import KeyResultStatementKind
4142
from google.cloud.bigquery.enums import SqlTypeNames
4243
from google.cloud.bigquery.enums import StandardSqlDataTypes
@@ -144,6 +145,7 @@
144145
"DEFAULT_RETRY",
145146
# Enum Constants
146147
"enums",
148+
"AutoRowIDs",
147149
"Compression",
148150
"CreateDisposition",
149151
"DestinationFormat",

google/cloud/bigquery/client.py

+41-6
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
from google.cloud.bigquery.dataset import Dataset
6969
from google.cloud.bigquery.dataset import DatasetListItem
7070
from google.cloud.bigquery.dataset import DatasetReference
71+
from google.cloud.bigquery.enums import AutoRowIDs
7172
from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
7273
from google.cloud.bigquery.opentelemetry_tracing import create_span
7374
from google.cloud.bigquery import job
@@ -3349,7 +3350,7 @@ def insert_rows_json(
33493350
self,
33503351
table: Union[Table, TableReference, str],
33513352
json_rows: Sequence[Dict],
3352-
row_ids: Sequence[str] = None,
3353+
row_ids: Union[Iterable[str], AutoRowIDs, None] = AutoRowIDs.GENERATE_UUID,
33533354
skip_invalid_rows: bool = None,
33543355
ignore_unknown_values: bool = None,
33553356
template_suffix: str = None,
@@ -3371,11 +3372,20 @@ def insert_rows_json(
33713372
json_rows (Sequence[Dict]):
33723373
Row data to be inserted. Keys must match the table schema fields
33733374
and values must be JSON-compatible representations.
3374-
row_ids (Optional[Sequence[Optional[str]]]):
3375+
row_ids (Union[Iterable[str], AutoRowIDs, None]):
33753376
Unique IDs, one per row being inserted. An ID can also be
33763377
``None``, indicating that an explicit insert ID should **not**
33773378
be used for that row. If the argument is omitted altogether,
33783379
unique IDs are created automatically.
3380+
3381+
.. versionchanged:: 2.21.0
3382+
Can also be an iterable, not just a sequence, or an
3383+
:class:`AutoRowIDs` enum member.
3384+
3385+
.. deprecated:: 2.21.0
3386+
Passing ``None`` to explicitly request autogenerating insert IDs is
3387+
deprecated, use :attr:`AutoRowIDs.GENERATE_UUID` instead.
3388+
33793389
skip_invalid_rows (Optional[bool]):
33803390
Insert all valid rows of a request, even if invalid rows exist.
33813391
The default value is ``False``, which causes the entire request
@@ -3415,12 +3425,37 @@ def insert_rows_json(
34153425
rows_info = []
34163426
data = {"rows": rows_info}
34173427

3418-
for index, row in enumerate(json_rows):
3428+
if row_ids is None:
3429+
warnings.warn(
3430+
"Passing None for row_ids is deprecated. To explicitly request "
3431+
"autogenerated insert IDs, use AutoRowIDs.GENERATE_UUID instead",
3432+
category=DeprecationWarning,
3433+
)
3434+
row_ids = AutoRowIDs.GENERATE_UUID
3435+
3436+
if not isinstance(row_ids, AutoRowIDs):
3437+
try:
3438+
row_ids_iter = iter(row_ids)
3439+
except TypeError:
3440+
msg = "row_ids is neither an iterable nor an AutoRowIDs enum member"
3441+
raise TypeError(msg)
3442+
3443+
for i, row in enumerate(json_rows):
34193444
info = {"json": row}
3420-
if row_ids is not None:
3421-
info["insertId"] = row_ids[index]
3422-
else:
3445+
3446+
if row_ids is AutoRowIDs.GENERATE_UUID:
34233447
info["insertId"] = str(uuid.uuid4())
3448+
elif row_ids is AutoRowIDs.DISABLED:
3449+
info["insertId"] = None
3450+
else:
3451+
try:
3452+
insert_id = next(row_ids_iter)
3453+
except StopIteration:
3454+
msg = f"row_ids did not generate enough IDs, error at index {i}"
3455+
raise ValueError(msg)
3456+
else:
3457+
info["insertId"] = insert_id
3458+
34243459
rows_info.append(info)
34253460

34263461
if skip_invalid_rows is not None:

google/cloud/bigquery/enums.py

+7
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@
2121
from google.cloud.bigquery.query import ScalarQueryParameterType
2222

2323

24+
class AutoRowIDs(enum.Enum):
25+
"""How to handle automatic insert IDs when inserting rows as a stream."""
26+
27+
DISABLED = enum.auto()
28+
GENERATE_UUID = enum.auto()
29+
30+
2431
class Compression(object):
2532
"""The compression type to use for exported files. The default value is
2633
:attr:`NONE`.

tests/unit/test_client.py

+145-8
Original file line numberDiff line numberDiff line change
@@ -5434,7 +5434,7 @@ def test_insert_rows_from_dataframe_w_explicit_none_insert_ids(self):
54345434
method="POST", path=API_PATH, data=EXPECTED_SENT_DATA, timeout=None
54355435
)
54365436

5437-
def test_insert_rows_json(self):
5437+
def test_insert_rows_json_default_behavior(self):
54385438
from google.cloud.bigquery.dataset import DatasetReference
54395439
from google.cloud.bigquery.schema import SchemaField
54405440
from google.cloud.bigquery.table import Table
@@ -5481,29 +5481,127 @@ def test_insert_rows_json(self):
54815481
method="POST", path="/%s" % PATH, data=SENT, timeout=7.5,
54825482
)
54835483

5484-
def test_insert_rows_json_with_string_id(self):
5485-
rows = [{"col1": "val1"}]
5484+
def test_insert_rows_json_w_explicitly_requested_autogenerated_insert_ids(self):
5485+
from google.cloud.bigquery import AutoRowIDs
5486+
5487+
rows = [{"col1": "val1"}, {"col2": "val2"}]
54865488
creds = _make_credentials()
54875489
http = object()
54885490
client = self._make_one(
54895491
project="default-project", credentials=creds, _http=http
54905492
)
54915493
conn = client._connection = make_connection({})
54925494

5493-
with mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows)))):
5494-
errors = client.insert_rows_json("proj.dset.tbl", rows)
5495+
uuid_patcher = mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows))))
5496+
with uuid_patcher:
5497+
errors = client.insert_rows_json(
5498+
"proj.dset.tbl", rows, row_ids=AutoRowIDs.GENERATE_UUID
5499+
)
54955500

54965501
self.assertEqual(len(errors), 0)
5497-
expected = {
5498-
"rows": [{"json": row, "insertId": str(i)} for i, row in enumerate(rows)]
5502+
5503+
# Check row data sent to the backend.
5504+
expected_row_data = {
5505+
"rows": [
5506+
{"json": {"col1": "val1"}, "insertId": "0"},
5507+
{"json": {"col2": "val2"}, "insertId": "1"},
5508+
]
54995509
}
55005510
conn.api_request.assert_called_once_with(
55015511
method="POST",
55025512
path="/projects/proj/datasets/dset/tables/tbl/insertAll",
5503-
data=expected,
5513+
data=expected_row_data,
5514+
timeout=None,
5515+
)
5516+
5517+
def test_insert_rows_json_w_explicitly_disabled_insert_ids(self):
5518+
from google.cloud.bigquery import AutoRowIDs
5519+
5520+
rows = [{"col1": "val1"}, {"col2": "val2"}]
5521+
creds = _make_credentials()
5522+
http = object()
5523+
client = self._make_one(
5524+
project="default-project", credentials=creds, _http=http
5525+
)
5526+
conn = client._connection = make_connection({})
5527+
5528+
errors = client.insert_rows_json(
5529+
"proj.dset.tbl", rows, row_ids=AutoRowIDs.DISABLED,
5530+
)
5531+
5532+
self.assertEqual(len(errors), 0)
5533+
5534+
expected_row_data = {
5535+
"rows": [
5536+
{"json": {"col1": "val1"}, "insertId": None},
5537+
{"json": {"col2": "val2"}, "insertId": None},
5538+
]
5539+
}
5540+
conn.api_request.assert_called_once_with(
5541+
method="POST",
5542+
path="/projects/proj/datasets/dset/tables/tbl/insertAll",
5543+
data=expected_row_data,
5544+
timeout=None,
5545+
)
5546+
5547+
def test_insert_rows_json_with_iterator_row_ids(self):
5548+
rows = [{"col1": "val1"}, {"col2": "val2"}, {"col3": "val3"}]
5549+
creds = _make_credentials()
5550+
http = object()
5551+
client = self._make_one(
5552+
project="default-project", credentials=creds, _http=http
5553+
)
5554+
conn = client._connection = make_connection({})
5555+
5556+
row_ids_iter = map(str, itertools.count(42))
5557+
errors = client.insert_rows_json("proj.dset.tbl", rows, row_ids=row_ids_iter)
5558+
5559+
self.assertEqual(len(errors), 0)
5560+
expected_row_data = {
5561+
"rows": [
5562+
{"json": {"col1": "val1"}, "insertId": "42"},
5563+
{"json": {"col2": "val2"}, "insertId": "43"},
5564+
{"json": {"col3": "val3"}, "insertId": "44"},
5565+
]
5566+
}
5567+
conn.api_request.assert_called_once_with(
5568+
method="POST",
5569+
path="/projects/proj/datasets/dset/tables/tbl/insertAll",
5570+
data=expected_row_data,
55045571
timeout=None,
55055572
)
55065573

5574+
def test_insert_rows_json_with_non_iterable_row_ids(self):
5575+
rows = [{"col1": "val1"}]
5576+
creds = _make_credentials()
5577+
http = object()
5578+
client = self._make_one(
5579+
project="default-project", credentials=creds, _http=http
5580+
)
5581+
client._connection = make_connection({})
5582+
5583+
with self.assertRaises(TypeError) as exc:
5584+
client.insert_rows_json("proj.dset.tbl", rows, row_ids=object())
5585+
5586+
err_msg = str(exc.exception)
5587+
self.assertIn("row_ids", err_msg)
5588+
self.assertIn("iterable", err_msg)
5589+
5590+
def test_insert_rows_json_with_too_few_row_ids(self):
5591+
rows = [{"col1": "val1"}, {"col2": "val2"}, {"col3": "val3"}]
5592+
creds = _make_credentials()
5593+
http = object()
5594+
client = self._make_one(
5595+
project="default-project", credentials=creds, _http=http
5596+
)
5597+
client._connection = make_connection({})
5598+
5599+
insert_ids = ["10", "20"]
5600+
5601+
error_msg_pattern = "row_ids did not generate enough IDs.*index 2"
5602+
with self.assertRaisesRegex(ValueError, error_msg_pattern):
5603+
client.insert_rows_json("proj.dset.tbl", rows, row_ids=insert_ids)
5604+
55075605
def test_insert_rows_json_w_explicit_none_insert_ids(self):
55085606
rows = [{"col1": "val1"}, {"col2": "val2"}]
55095607
creds = _make_credentials()
@@ -5526,6 +5624,45 @@ def test_insert_rows_json_w_explicit_none_insert_ids(self):
55265624
timeout=None,
55275625
)
55285626

5627+
def test_insert_rows_json_w_none_insert_ids_sequence(self):
5628+
rows = [{"col1": "val1"}, {"col2": "val2"}]
5629+
creds = _make_credentials()
5630+
http = object()
5631+
client = self._make_one(
5632+
project="default-project", credentials=creds, _http=http
5633+
)
5634+
conn = client._connection = make_connection({})
5635+
5636+
uuid_patcher = mock.patch("uuid.uuid4", side_effect=map(str, range(len(rows))))
5637+
with warnings.catch_warnings(record=True) as warned, uuid_patcher:
5638+
errors = client.insert_rows_json("proj.dset.tbl", rows, row_ids=None)
5639+
5640+
self.assertEqual(len(errors), 0)
5641+
5642+
# Passing row_ids=None should have resulted in a deprecation warning.
5643+
matches = [
5644+
warning
5645+
for warning in warned
5646+
if issubclass(warning.category, DeprecationWarning)
5647+
and "row_ids" in str(warning)
5648+
and "AutoRowIDs.GENERATE_UUID" in str(warning)
5649+
]
5650+
assert matches, "The expected deprecation warning was not raised."
5651+
5652+
# Check row data sent to the backend.
5653+
expected_row_data = {
5654+
"rows": [
5655+
{"json": {"col1": "val1"}, "insertId": "0"},
5656+
{"json": {"col2": "val2"}, "insertId": "1"},
5657+
]
5658+
}
5659+
conn.api_request.assert_called_once_with(
5660+
method="POST",
5661+
path="/projects/proj/datasets/dset/tables/tbl/insertAll",
5662+
data=expected_row_data,
5663+
timeout=None,
5664+
)
5665+
55295666
def test_insert_rows_w_wrong_arg(self):
55305667
from google.cloud.bigquery.dataset import DatasetReference
55315668
from google.cloud.bigquery.schema import SchemaField

0 commit comments

Comments
 (0)