Skip to content

Commit 1b56cd0

Browse files
authored
feat(spanner): retry spanner transactions and mutations when RST_STREAM error (#6699)
* feat(spanner): retry spanner transactions and mutations when an RST_STREAM internal error is returned from the backend. * added test for non-retryable internal error
1 parent f5443e8 commit 1b56cd0

File tree

3 files changed

+66
-35
lines changed

3 files changed

+66
-35
lines changed

spanner/client_test.go

+18
Original file line numberDiff line numberDiff line change
@@ -1579,6 +1579,24 @@ func TestClient_ApplyAtLeastOnceInvalidArgument(t *testing.T) {
15791579
}
15801580
}
15811581

1582+
// TestClient_ApplyAtLeastOnce_NonRetryableInternalErrors checks that when the
// mocked commit call fails with an Internal error (here a marshaling error),
// Apply with ApplyAtLeastOnce returns an error whose status code is Internal.
// NOTE(review): only one error is queued, so presumably a retry would succeed
// and make the final check fail — confirm against the mock server's behavior.
func TestClient_ApplyAtLeastOnce_NonRetryableInternalErrors(t *testing.T) {
1583+
t.Parallel()
1584+
server, client, teardown := setupMockedTestServer(t)
1585+
defer teardown()
1586+
// Two insert mutations against the Accounts table to send in one Apply call.
ms := []*Mutation{
1587+
Insert("Accounts", []string{"AccountId", "Nickname", "Balance"}, []interface{}{int64(1), "Foo", int64(50)}),
1588+
Insert("Accounts", []string{"AccountId", "Nickname", "Balance"}, []interface{}{int64(2), "Bar", int64(1)}),
1589+
}
1590+
// Queue a single Internal error for MethodCommitTransaction on the mock server.
server.TestSpanner.PutExecutionTime(MethodCommitTransaction,
1591+
SimulatedExecutionTime{
1592+
Errors: []error{status.Errorf(codes.Internal, "grpc: error while marshaling: string field contains invalid UTF-8")},
1593+
})
1594+
_, err := client.Apply(context.Background(), ms, ApplyAtLeastOnce())
1595+
// The Internal status code must be what the caller of Apply observes.
if status.Code(err) != codes.Internal {
1596+
t.Fatalf("Error mismatch:\ngot: %v\nwant: %v", err, codes.Internal)
1597+
}
1598+
}
1599+
15821600
func TestClient_Apply_ApplyOptions(t *testing.T) {
15831601
t.Parallel()
15841602

spanner/retry.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,13 @@ func (r *spannerRetryer) Retry(err error) (time.Duration, bool) {
8080
}
8181

8282
// runWithRetryOnAbortedOrSessionNotFound executes the given function and
83-
// retries it if it returns an Aborted or Session not found error. The retry
84-
// is delayed if the error was Aborted. The delay between retries is the delay
83+
// retries it if it returns an Aborted error, a Session not found error, or certain Internal errors. The retry
84+
// is delayed if the error was an Aborted or Internal error. The delay between retries is the delay
8585
// returned by Cloud Spanner, or if none is returned, the calculated delay with
8686
// a minimum of 10ms and maximum of 32s. There is no delay before the retry if
8787
// the error was Session not found.
8888
func runWithRetryOnAbortedOrSessionNotFound(ctx context.Context, f func(context.Context) error) error {
89-
retryer := onCodes(DefaultRetryBackoff, codes.Aborted)
89+
retryer := onCodes(DefaultRetryBackoff, codes.Aborted, codes.Internal)
9090
funcWithRetry := func(ctx context.Context) error {
9191
for {
9292
err := f(ctx)

spanner/transaction.go

+45-32
Original file line numberDiff line numberDiff line change
@@ -1387,51 +1387,64 @@ func (t *writeOnlyTransaction) applyAtLeastOnce(ctx context.Context, ms ...*Muta
13871387
ts time.Time
13881388
sh *sessionHandle
13891389
)
1390+
defer func() {
1391+
if sh != nil {
1392+
sh.recycle()
1393+
}
1394+
}()
13901395
mPb, err := mutationsProto(ms)
13911396
if err != nil {
13921397
// Malformed mutation found, just return the error.
13931398
return ts, err
13941399
}
13951400

1396-
// Retry-loop for aborted transactions.
1397-
// TODO: Replace with generic retryer.
1398-
for {
1399-
if sh == nil || sh.getID() == "" || sh.getClient() == nil {
1400-
// No usable session for doing the commit, take one from pool.
1401-
sh, err = t.sp.take(ctx)
1402-
if err != nil {
1403-
// sessionPool.Take already retries for session
1404-
// creations/retrivals.
1405-
return ts, err
1401+
// Make a retryer for Aborted and certain Internal errors.
1402+
retryer := onCodes(DefaultRetryBackoff, codes.Aborted, codes.Internal)
1403+
// Apply the mutation and retry if the commit is aborted.
1404+
applyMutationWithRetry := func(ctx context.Context) error {
1405+
for {
1406+
if sh == nil || sh.getID() == "" || sh.getClient() == nil {
1407+
// No usable session for doing the commit, take one from pool.
1408+
sh, err = t.sp.take(ctx)
1409+
if err != nil {
1410+
// sessionPool.Take already retries for session
1411+
// creations/retrievals.
1412+
return ToSpannerError(err)
1413+
}
14061414
}
1407-
defer sh.recycle()
1408-
}
1409-
res, err := sh.getClient().Commit(contextWithOutgoingMetadata(ctx, sh.getMetadata()), &sppb.CommitRequest{
1410-
Session: sh.getID(),
1411-
Transaction: &sppb.CommitRequest_SingleUseTransaction{
1412-
SingleUseTransaction: &sppb.TransactionOptions{
1413-
Mode: &sppb.TransactionOptions_ReadWrite_{
1414-
ReadWrite: &sppb.TransactionOptions_ReadWrite{},
1415+
res, err := sh.getClient().Commit(contextWithOutgoingMetadata(ctx, sh.getMetadata()), &sppb.CommitRequest{
1416+
Session: sh.getID(),
1417+
Transaction: &sppb.CommitRequest_SingleUseTransaction{
1418+
SingleUseTransaction: &sppb.TransactionOptions{
1419+
Mode: &sppb.TransactionOptions_ReadWrite_{
1420+
ReadWrite: &sppb.TransactionOptions_ReadWrite{},
1421+
},
14151422
},
14161423
},
1417-
},
1418-
Mutations: mPb,
1419-
RequestOptions: createRequestOptions(t.commitPriority, "", t.transactionTag),
1420-
})
1421-
if err != nil && !isAbortedErr(err) {
1422-
if isSessionNotFoundError(err) {
1423-
// Discard the bad session.
1424-
sh.destroy()
1424+
Mutations: mPb,
1425+
RequestOptions: createRequestOptions(t.commitPriority, "", t.transactionTag),
1426+
})
1427+
if err != nil && !isAbortedErr(err) {
1428+
if isSessionNotFoundError(err) {
1429+
// Discard the bad session.
1430+
sh.destroy()
1431+
}
1432+
return toSpannerErrorWithCommitInfo(err, true)
1433+
} else if err == nil {
1434+
if tstamp := res.GetCommitTimestamp(); tstamp != nil {
1435+
ts = time.Unix(tstamp.Seconds, int64(tstamp.Nanos))
1436+
}
14251437
}
1426-
return ts, toSpannerErrorWithCommitInfo(err, true)
1427-
} else if err == nil {
1428-
if tstamp := res.GetCommitTimestamp(); tstamp != nil {
1429-
ts = time.Unix(tstamp.Seconds, int64(tstamp.Nanos))
1438+
delay, shouldRetry := retryer.Retry(err)
1439+
if !shouldRetry {
1440+
return err
1441+
}
1442+
if err := gax.Sleep(ctx, delay); err != nil {
1443+
return err
14301444
}
1431-
break
14321445
}
14331446
}
1434-
return ts, ToSpannerError(err)
1447+
return ts, applyMutationWithRetry(ctx)
14351448
}
14361449

14371450
// isAbortedErr returns true if the error indicates that a gRPC call is

0 commit comments

Comments
 (0)