
Commit c707c30

fix: first pass on making retry configuration more consistent (#695)
* fix: first pass on making retry configuration more consistent

  Currently ReadRows uses the Retry.deadline configuration inconsistently:

  - it's used as the attempt timeout for the first retry attempt
  - it's used as a limit for retry scheduling for reading a single row

  Conceptually there are 3 timeouts that are relevant to ReadRows:

  - attempt timeout: how long a single RPC is allowed to run; this should
    map directly to a gRPC deadline
  - overall timeout: limit how long we should wait across all of the retry
    attempts, possibly truncating the last attempt timeout
  - read timeout: how long we are willing to wait for the next row in a stream

  Ideally Retry.deadline would represent an operation deadline (since that's
  the primary concern of the end user). However, there is no backwards
  compatible way to do this: changing the behavior would cause existing
  applications to start enforcing a very short deadline. This PR tries to
  improve the situation in a backwards compatible way:

  - keep Retry.deadline as a read timeout
  - introduce a new parameter for the overall timeout

  This results in a less than ideal API, but avoids breaking existing
  applications.

* fix old test
* add attempt timeout
* lint
* add some tests
* lint
* refactor confusing logic
* apply fixes from review
* address feedback
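To make the interplay of these three timeouts concrete, here is a minimal standalone sketch, not part of the commit, of how a per-RPC deadline can be derived from an overall deadline and an attempt timeout; the helper name `effective_rpc_timeout` is invented for illustration:

```python
import time


def effective_rpc_timeout(overall_deadline, attempt_timeout):
    """Combine the remaining overall budget with the per-attempt limit.

    ``overall_deadline`` is an absolute ``time.time()`` timestamp (or None);
    ``attempt_timeout`` is a per-RPC limit in seconds (or None).
    """
    remaining = None
    if overall_deadline is not None:
        remaining = overall_deadline - time.time()
    if remaining is None:
        return attempt_timeout  # only the per-attempt limit applies
    if attempt_timeout is None:
        return remaining  # only the overall budget applies
    # The last attempt may be truncated so that it ends at the overall deadline.
    return min(remaining, attempt_timeout)


# e.g. 10s of budget left and a 5s attempt timeout -> a 5s RPC deadline;
# 2s of budget left and a 5s attempt timeout -> a 2s RPC deadline.
print(effective_rpc_timeout(time.time() + 10, 5.0))
```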
1 parent 5c72780 commit c707c30

File tree: 4 files changed, +185 −29 lines changed

google/cloud/bigtable/row_data.py (+58, −9)
```diff
@@ -16,6 +16,8 @@
 import copy
+import time
+
 import six
 
 import grpc
@@ -342,6 +344,10 @@ def _retry_read_rows_exception(exc):
     initial=1.0,
     maximum=15.0,
     multiplier=2.0,
+    # NOTE: this is a soft read timeout: it limits how long we are willing
+    # to schedule retry attempts to read the next row. It does not set the
+    # RPC timeout. Please use the separate overall_timeout parameter of
+    # read_rows to limit the operation duration.
     deadline=60.0,  # 60 seconds
 )
 """The default retry strategy to be used on retry-able errors.
@@ -389,7 +395,14 @@ class PartialRowsData(object):
         STATE_CELL_IN_PROGRESS: CELL_IN_PROGRESS,
     }
 
-    def __init__(self, read_method, request, retry=DEFAULT_RETRY_READ_ROWS):
+    def __init__(
+        self,
+        read_method,
+        request,
+        retry=DEFAULT_RETRY_READ_ROWS,
+        attempt_timeout=None,
+        overall_timeout=None,
+    ):
         # Counter for rows returned to the user
         self._counter = 0
         # In-progress row, unset until first response, after commit/reset
@@ -406,14 +419,14 @@ def __init__(self, read_method, request, retry=DEFAULT_RETRY_READ_ROWS):
         self.read_method = read_method
         self.request = request
         self.retry = retry
+        self._attempt_timeout = attempt_timeout
+        # absolute timestamp when all retry attempts should end
+        if overall_timeout:
+            self._overall_deadline = time.time() + overall_timeout
+        else:
+            self._overall_deadline = None
 
-        # The `timeout` parameter must be somewhat greater than the value
-        # contained in `self.retry`, in order to avoid a race-like condition
-        # and allow registering the first deadline error before invoking the
-        # retry. Otherwise there is a risk of entering an infinite loop that
-        # resets the timeout counter just before it is triggered. The
-        # increment by 1 second here is customary but should not be much
-        # less than that.
-        self.response_iterator = read_method(request, timeout=self.retry._deadline + 1)
+        self.response_iterator = self._create_read_stream(request)
 
         self.rows = {}
         self._state = self.STATE_NEW_ROW
@@ -451,6 +464,28 @@ class as a generator instead.
         for row in self:
             self.rows[row.row_key] = row
 
+    @property
+    def remaining_overall_timeout(self):
+        """Returns the remaining deadline allotted for the entire stream,
+        as a float of seconds."""
+        if not self._overall_deadline:
+            return None
+
+        return self._overall_deadline - time.time()
+
+    def _create_read_stream(self, req):
+        """Starts a new RPC bounded by the overall deadline and attempt timeout.
+
+        :type req: class:`data_messages_v2_pb2.ReadRowsRequest`
+        """
+        effective_timeout = self.remaining_overall_timeout
+        if effective_timeout is None:
+            effective_timeout = self._attempt_timeout
+        elif self._attempt_timeout is not None:
+            effective_timeout = min(effective_timeout, self._attempt_timeout)
+
+        return self.read_method(req, timeout=effective_timeout)
+
     def _create_retry_request(self):
         """Helper for :meth:`__iter__`."""
         req_manager = _ReadRowsRequestManager(
@@ -465,7 +500,7 @@ def _on_error(self, exc):
         if self.last_scanned_row_key:
             retry_request = self._create_retry_request()
 
-        self.response_iterator = self.read_method(retry_request)
+        self.response_iterator = self._create_read_stream(retry_request)
 
     def _read_next(self):
         """Helper for :meth:`__iter__`."""
@@ -476,6 +511,20 @@ def _read_next(self):
 
     def _read_next_response(self):
         """Helper for :meth:`__iter__`."""
+        # Calculate the maximum amount of time that retries should be
+        # scheduled. This does not actually set any deadlines; it only
+        # limits how long we are willing to schedule retries for.
+        remaining_overall_timeout = self.remaining_overall_timeout
+
+        if remaining_overall_timeout is not None:
+            # Make sure that the retry logic doesn't retry after the
+            # operation deadline has passed.
+            if (
+                self.retry.deadline is None
+                or self.retry.deadline > remaining_overall_timeout
+            ):
+                self.retry = self.retry.with_deadline(remaining_overall_timeout)
+
         return self.retry(self._read_next, on_error=self._on_error)()
 
     def __iter__(self):
```
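The other half of the change is the clamping of the retry object in `_read_next_response`. A small standalone sketch of that logic follows; the function `clamp_retry_to_budget` is a name invented here, and only `Retry.deadline` and `Retry.with_deadline` from `google.api_core` are assumed:

```python
from google.api_core import retry as retries


def clamp_retry_to_budget(retry, remaining_overall_timeout):
    """Mirror of the clamping added to _read_next_response: never let the
    retry object schedule attempts past the remaining overall budget."""
    if remaining_overall_timeout is None:
        return retry  # no overall budget configured; leave the retry alone
    if retry.deadline is None or retry.deadline > remaining_overall_timeout:
        return retry.with_deadline(remaining_overall_timeout)
    return retry


# With 4.2s of budget left, the default 60s scheduling deadline shrinks to
# 4.2s, so no new attempt is scheduled after the operation deadline.
clamped = clamp_retry_to_budget(retries.Retry(deadline=60.0), 4.2)
print(clamped.deadline)  # 4.2
```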

google/cloud/bigtable/table.py (+40, −4)
```diff
@@ -481,7 +481,7 @@ def get_cluster_states(self):
             for cluster_id, value_pb in table_pb.cluster_states.items()
         }
 
-    def read_row(self, row_key, filter_=None):
+    def read_row(self, row_key, filter_=None, overall_timeout=60):
         """Read a single row from this table.
 
         For example:
@@ -506,7 +506,11 @@ def read_row(self, row_key, filter_=None):
         """
         row_set = RowSet()
         row_set.add_row_key(row_key)
-        result_iter = iter(self.read_rows(filter_=filter_, row_set=row_set))
+        result_iter = iter(
+            self.read_rows(
+                filter_=filter_, row_set=row_set, overall_timeout=overall_timeout
+            )
+        )
         row = next(result_iter, None)
         if next(result_iter, None) is not None:
             raise ValueError("More than one row was returned.")
@@ -521,6 +525,8 @@ def read_rows(
         end_inclusive=False,
         row_set=None,
         retry=DEFAULT_RETRY_READ_ROWS,
+        attempt_timeout=None,
+        overall_timeout=None,
     ):
         """Read rows from this table.
 
@@ -565,7 +571,22 @@ def read_rows(
             default value :attr:`DEFAULT_RETRY_READ_ROWS` can be used and
             modified with the :meth:`~google.api_core.retry.Retry.with_delay`
             method or the :meth:`~google.api_core.retry.Retry.with_deadline`
-            method.
+            method. This retry object is used to try to fetch the next row:
+            this means that the deadline specified by this object is reset
+            after every row read. Furthermore, this deadline is loosely
+            enforced: it only prevents additional attempts from being
+            scheduled after the deadline; it does not limit how long a
+            single attempt to read the next row will run. Prefer to use
+            overall_timeout below.
+
+        :type attempt_timeout: float
+        :param attempt_timeout: (Optional) the attempt timeout to execute a
+            single RPC. If this attempt fails and there is overall_timeout
+            left, another attempt will be sent.
+
+        :type overall_timeout: float
+        :param overall_timeout: (Optional) the overall operation deadline to
+            completely read the entire ReadRows stream.
 
         :rtype: :class:`.PartialRowsData`
         :returns: A :class:`.PartialRowsData`, a generator for consuming
@@ -582,7 +603,13 @@ def read_rows(
             row_set=row_set,
         )
         data_client = self._instance._client.table_data_client
-        return PartialRowsData(data_client.transport.read_rows, request_pb, retry)
+        return PartialRowsData(
+            data_client.transport.read_rows,
+            request_pb,
+            retry,
+            attempt_timeout=attempt_timeout,
+            overall_timeout=overall_timeout,
+        )
 
     def yield_rows(self, **kwargs):
         """Read rows from this table.
@@ -615,6 +642,15 @@ def yield_rows(self, **kwargs):
         :param row_set: (Optional) The row set containing multiple row keys and
             row_ranges.
 
+        :type attempt_timeout: float
+        :param attempt_timeout: (Optional) the attempt timeout to execute a
+            single RPC. If this attempt fails and there is overall_timeout
+            left, another attempt will be sent.
+
+        :type overall_timeout: float
+        :param overall_timeout: (Optional) the overall operation deadline to
+            completely read the entire ReadRows stream.
+
         :rtype: :class:`.PartialRowData`
         :returns: A :class:`.PartialRowData` for each row returned
         """
```

tests/unit/test_row_data.py (+14, −8)
```diff
@@ -370,20 +370,26 @@ def test_constructor(self):
         self.assertEqual(partial_rows_data.rows, {})
         self.assertEqual(partial_rows_data.retry, DEFAULT_RETRY_READ_ROWS)
 
-    def test_constructor_with_retry(self):
-        from google.cloud.bigtable.row_data import DEFAULT_RETRY_READ_ROWS
-
+    def test_constructor_with_overall_timeout(self):
         client = _Client()
         client._data_stub = mock.MagicMock()
         request = object()
-        retry = DEFAULT_RETRY_READ_ROWS
-        partial_rows_data = self._make_one(client._data_stub.ReadRows, request, retry)
-        partial_rows_data.read_method.assert_called_once_with(
-            request, timeout=DEFAULT_RETRY_READ_ROWS.deadline + 1
+        partial_rows_data = self._make_one(
+            client._data_stub.ReadRows, request, overall_timeout=11
         )
+        partial_rows_data.read_method.assert_called_once_with(request, timeout=mock.ANY)
+
+        # The timeout passed to the first RPC should be slightly less than
+        # 11, but to avoid flakiness on slow test runners it's padded down
+        # by 3 secs.
+        self.assertLess(8, partial_rows_data.read_method.call_args.kwargs["timeout"])
+
         self.assertIs(partial_rows_data.request, request)
         self.assertEqual(partial_rows_data.rows, {})
-        self.assertEqual(partial_rows_data.retry, retry)
+        # The remaining deadline should be slightly less than 11, but to
+        # avoid flakiness on slow test runners it's padded down by 3 secs.
+        self.assertLess(8, partial_rows_data.remaining_overall_timeout)
+        self.assertLessEqual(partial_rows_data.remaining_overall_timeout, 11)
 
     def test___eq__(self):
         client = _Client()
```
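Why does the test assert a padded range instead of an exact value? The wall clock advances between constructing the object and inspecting it, so `remaining_overall_timeout` is never exactly the configured value. A toy illustration, where the `StreamBudget` class is invented for this example:

```python
import time


class StreamBudget(object):
    """Toy stand-in for the overall-deadline bookkeeping in PartialRowsData."""

    def __init__(self, overall_timeout):
        # Converted to an absolute timestamp at construction time.
        self._deadline = time.time() + overall_timeout

    @property
    def remaining(self):
        # Shrinks on every call as the wall clock advances.
        return self._deadline - time.time()


budget = StreamBudget(11)
# The exact value is unpredictable, so assert a window padded by 3 secs.
assert 8 < budget.remaining <= 11
```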

tests/unit/test_table.py (+73, −8)
```diff
@@ -11,8 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-
+import time
 import unittest
 
 import mock
@@ -798,17 +797,12 @@ def mock_create_row_request(table_name, **kwargs):
 
     def test_read_retry_rows(self):
         from google.cloud.bigtable_v2.gapic import bigtable_client
-        from google.cloud.bigtable_admin_v2.gapic import bigtable_table_admin_client
         from google.api_core import retry
 
         data_api = bigtable_client.BigtableClient(mock.Mock())
-        table_api = bigtable_table_admin_client.BigtableTableAdminClient(mock.Mock())
         credentials = _make_credentials()
-        client = self._make_client(
-            project="project-id", credentials=credentials, admin=True
-        )
+        client = self._make_client(project="project-id", credentials=credentials)
         client._table_data_client = data_api
-        client._table_admin_client = table_api
         instance = client.instance(instance_id=self.INSTANCE_ID)
         table = self._make_one(self.TABLE_ID, instance)
 
@@ -857,6 +851,77 @@ def test_read_retry_rows(self):
         result = rows[1]
         self.assertEqual(result.row_key, self.ROW_KEY_2)
 
+    def test_read_retry_rows_timeouts(self):
+        from google.cloud.bigtable_v2.gapic import bigtable_client
+
+        data_api = bigtable_client.BigtableClient(mock.Mock())
+        credentials = _make_credentials()
+        client = self._make_client(project="project-id", credentials=credentials)
+        client._table_data_client = data_api
+        instance = client.instance(instance_id=self.INSTANCE_ID)
+        table = self._make_one(self.TABLE_ID, instance)
+
+        # Patch the stub used by the API method.
+        client._table_data_client.transport.read_rows = mock.Mock(
+            side_effect=[_MockReadRowsIterator()]
+        )
+
+        # By default there is no timeout.
+        list(table.read_rows())
+        self.assertIsNone(
+            client._table_data_client.transport.read_rows.call_args.kwargs["timeout"]
+        )
+
+        # The attempt timeout should be passed through.
+        client._table_data_client.transport.read_rows = mock.Mock(
+            side_effect=[_MockReadRowsIterator()]
+        )
+        list(table.read_rows(attempt_timeout=1.0))
+        self.assertEqual(
+            1.0,
+            client._table_data_client.transport.read_rows.call_args.kwargs["timeout"],
+        )
+
+        # The overall timeout should be passed through.
+        client._table_data_client.transport.read_rows = mock.Mock(
+            side_effect=[_MockReadRowsIterator()]
+        )
+        list(table.read_rows(overall_timeout=10.0))
+        # The RPC timeout should be slightly less than 10.0, but to avoid
+        # test flakiness it's padded by a couple of secs.
+        self.assertLess(
+            8.0,
+            client._table_data_client.transport.read_rows.call_args.kwargs["timeout"],
+        )
+
+        # The attempt timeout limits the overall timeout.
+        client._table_data_client.transport.read_rows = mock.Mock(
+            side_effect=[_MockReadRowsIterator()]
+        )
+        list(table.read_rows(attempt_timeout=5.0, overall_timeout=10.0))
+        self.assertLessEqual(
+            5.0,
+            client._table_data_client.transport.read_rows.call_args.kwargs["timeout"],
+        )
+
+        # The attempt timeout is truncated by the overall timeout.
+        class DelayedFailureIterator(object):
+            def next(self):
+                time.sleep(0.75)
+                raise DeadlineExceeded("delayed error")
+
+            __next__ = next
+
+        client._table_data_client.transport.read_rows = mock.Mock(
+            side_effect=[DelayedFailureIterator(), _MockReadRowsIterator()]
+        )
+        list(table.read_rows(attempt_timeout=1.0, overall_timeout=1.0))
+
+        self.assertGreater(
+            1.0,
+            client._table_data_client.transport.read_rows.call_args.kwargs["timeout"],
+        )
+
     def test_yield_retry_rows(self):
         from google.cloud.bigtable_v2.gapic import bigtable_client
         from google.cloud.bigtable_admin_v2.gapic import bigtable_table_admin_client
```
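The tests above lean on two helpers defined elsewhere in the module: `_MockReadRowsIterator` and the `DeadlineExceeded` exception (presumably `google.api_core.exceptions.DeadlineExceeded`). A hypothetical sketch of the shape the mock iterator needs, inferred only from how these tests use it:

```python
class FakeReadRowsIterator(object):
    """Hypothetical stand-in for _MockReadRowsIterator (not shown in the
    diff): an RPC stream that ends immediately, so list(table.read_rows())
    completes without yielding any rows."""

    def next(self):
        raise StopIteration

    __next__ = next  # Python 2/3 compatible iteration protocol
```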
