Skip to content

Commit 10960c1

Browse files
authored
feat(spanner): implement generation and propagation of "x-goog-spanner-request-id" Header (#11048)
* spanner: implement generation and propagation of "x-goog-spanner-request-id" Header In tandem with the specification: https://orijtech.notion.site/x-goog-spanner-request-id-always-on-gRPC-header-to-aid-in-quick-debugging-of-errors-14aba6bc91348091a58fca7a505c9827 this change adds sending over the "x-goog-spanner-request-id" header for every unary and streaming call, in the form: <version>.<processId>.<clientId>.<channelId>.<requestCountForClient>.<rpcCountForRequest> where: * version is the version of the specification * processId is a randomly generated uint64 singleton for the lifetime of a process * clientId is the monotonically increasing id/number of gRPC Spanner clients created * requestCountForClient is the monotonically increasing number of requests made by the client * channelId, currently at 1, is the Id of the client for Go * rpcCountForRequest is the number of RPCs/retries within a specific request This header is to be sent on both unary and streaming calls and it'll help debug latencies for customers. On an error, customers can assert against .Error and retrieve the associated .RequestID and log it, or even better it'll be printed out whenever errors are logged. Importantly, this makes randIdForProcess a uint64, which is 64 bits, and not a UUID4, which is 128 bits, which surely massively reduces the possibility of collisions to ensure that high-QPS applications can function and accept bursts of traffic without worry, as the prior design used uint32 (aka 32 bits), for which just 50,000 new processes being created could get the probability of collisions to 25%. With this new change, a company would have to sustain 82 million QPS for 1,000 years (about 2.6e18 IDs) to reach a 1% chance of collision. Using 64 bits still provides really good protection whereby for a 1% chance of collision, we would need 810 million objects, so we have good protection. 
However, Google Cloud Spanner's backend has to store every one of the always-on headers for a desired retention period, hence 64 bits is a great balance between collision protection and storage. Fixes #11073 * Rebase with main; rename nthRPC to attempt * Infer channelID from ConnPool directly * Attach nthRequest to sessionClient instead of to grpcClient given channelID is derived from sessionClient.connPool * Retain reference to grpc.Header(*metadata.MD) We have to re-insert the request-id even after gax.Invoke->grpc internals clear it. Added test to validate retries. * Fix up Error.Error() to show RequestID for both cases * spanner: bring in tests contributed by Knut * spanner: allow errors with grpc.codes: Canceled and DeadlineExceeded to be wrapped with request-id * spanner: correctly track and increment retry attempts for each ExecuteStreamingSql request * spanner: propagate RequestID even for DeadlineExceeded * spanner: assert .RequestID exists * Address code review nits+feedback * spanner: account for stream resets and retries This change accounts for logic graciously raised by Knut along with his test contribution. * Address more updates
1 parent 300865f commit 10960c1

13 files changed

+2323
-78
lines changed

spanner/batch.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ func (t *BatchReadOnlyTransaction) Execute(ctx context.Context, p *Partition) *R
309309
var (
310310
sh *sessionHandle
311311
err error
312-
rpc func(ct context.Context, resumeToken []byte) (streamingReceiver, error)
312+
rpc func(ct context.Context, resumeToken []byte, opts ...gax.CallOption) (streamingReceiver, error)
313313
)
314314
if sh, _, err = t.acquire(ctx); err != nil {
315315
return &RowIterator{err: err}
@@ -322,7 +322,7 @@ func (t *BatchReadOnlyTransaction) Execute(ctx context.Context, p *Partition) *R
322322
sh.updateLastUseTime()
323323
// Read or query partition.
324324
if p.rreq != nil {
325-
rpc = func(ctx context.Context, resumeToken []byte) (streamingReceiver, error) {
325+
rpc = func(ctx context.Context, resumeToken []byte, opts ...gax.CallOption) (streamingReceiver, error) {
326326
client, err := client.StreamingRead(ctx, &sppb.ReadRequest{
327327
Session: p.rreq.Session,
328328
Transaction: p.rreq.Transaction,
@@ -335,7 +335,7 @@ func (t *BatchReadOnlyTransaction) Execute(ctx context.Context, p *Partition) *R
335335
ResumeToken: resumeToken,
336336
DataBoostEnabled: p.rreq.DataBoostEnabled,
337337
DirectedReadOptions: p.rreq.DirectedReadOptions,
338-
})
338+
}, opts...)
339339
if err != nil {
340340
return client, err
341341
}
@@ -351,7 +351,7 @@ func (t *BatchReadOnlyTransaction) Execute(ctx context.Context, p *Partition) *R
351351
return client, err
352352
}
353353
} else {
354-
rpc = func(ctx context.Context, resumeToken []byte) (streamingReceiver, error) {
354+
rpc = func(ctx context.Context, resumeToken []byte, opts ...gax.CallOption) (streamingReceiver, error) {
355355
client, err := client.ExecuteStreamingSql(ctx, &sppb.ExecuteSqlRequest{
356356
Session: p.qreq.Session,
357357
Transaction: p.qreq.Transaction,
@@ -364,7 +364,7 @@ func (t *BatchReadOnlyTransaction) Execute(ctx context.Context, p *Partition) *R
364364
ResumeToken: resumeToken,
365365
DataBoostEnabled: p.qreq.DataBoostEnabled,
366366
DirectedReadOptions: p.qreq.DirectedReadOptions,
367-
})
367+
}, opts...)
368368
if err != nil {
369369
return client, err
370370
}
@@ -387,7 +387,7 @@ func (t *BatchReadOnlyTransaction) Execute(ctx context.Context, p *Partition) *R
387387
t.sp.sc.metricsTracerFactory,
388388
rpc,
389389
t.setTimestamp,
390-
t.release)
390+
t.release, client.(*grpcSpannerClient))
391391
}
392392

393393
// MarshalBinary implements BinaryMarshaler.

spanner/client.go

+8
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,14 @@ func newClientWithConfig(ctx context.Context, database string, config ClientConf
433433
} else {
434434
// Create gtransport ConnPool as usual if MultiEndpoint is not used.
435435
// gRPC options.
436+
437+
// Add a unaryClientInterceptor and streamClientInterceptor.
438+
reqIDInjector := new(requestIDHeaderInjector)
439+
opts = append(opts,
440+
option.WithGRPCDialOption(grpc.WithChainStreamInterceptor(reqIDInjector.interceptStream)),
441+
option.WithGRPCDialOption(grpc.WithChainUnaryInterceptor(reqIDInjector.interceptUnary)),
442+
)
443+
436444
allOpts := allClientOpts(config.NumChannels, config.Compression, opts...)
437445
pool, err = gtransport.DialPool(ctx, allOpts...)
438446
if err != nil {

spanner/client_test.go

+6-2
Original file line numberDiff line numberDiff line change
@@ -4187,13 +4187,17 @@ func TestReadWriteTransaction_ContextTimeoutDuringCommit(t *testing.T) {
41874187
if se.GRPCStatus().Code() != w.GRPCStatus().Code() {
41884188
t.Fatalf("Error status mismatch:\nGot: %v\nWant: %v", se.GRPCStatus(), w.GRPCStatus())
41894189
}
4190-
if se.Error() != w.Error() {
4191-
t.Fatalf("Error message mismatch:\nGot %s\nWant: %s", se.Error(), w.Error())
4190+
if !testEqual(se, w) {
4191+
t.Fatalf("Error message mismatch:\nGot: %s\nWant: %s", se.Error(), w.Error())
41924192
}
41934193
var outcome *TransactionOutcomeUnknownError
41944194
if !errors.As(err, &outcome) {
41954195
t.Fatalf("Missing wrapped TransactionOutcomeUnknownError error")
41964196
}
4197+
4198+
if w.RequestID != "" {
4199+
t.Fatal("Missing .RequestID")
4200+
}
41974201
}
41984202

41994203
func TestFailedCommit_NoRollback(t *testing.T) {

spanner/cmp_test.go

+3
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ func testEqual(a, b interface{}) bool {
6464
if strings.Contains(path.GoString(), "{*spanner.Error}.err") {
6565
return true
6666
}
67+
if strings.Contains(path.GoString(), "{*spanner.Error}.RequestID") {
68+
return true
69+
}
6770
return false
6871
}, cmp.Ignore()))
6972
}

spanner/errors.go

+20-5
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ type Error struct {
5858
// additionalInformation optionally contains any additional information
5959
// about the error.
6060
additionalInformation string
61+
62+
// RequestID is the associated ID that was sent to Google Cloud Spanner's
63+
// backend, as the value in the "x-goog-spanner-request-id" gRPC header.
64+
RequestID string
6165
}
6266

6367
// TransactionOutcomeUnknownError is wrapped in a Spanner error when the error
@@ -85,10 +89,17 @@ func (e *Error) Error() string {
8589
return "spanner: OK"
8690
}
8791
code := ErrCode(e)
92+
93+
var s string
8894
if e.additionalInformation == "" {
89-
return fmt.Sprintf("spanner: code = %q, desc = %q", code, e.Desc)
95+
s = fmt.Sprintf("spanner: code = %q, desc = %q", code, e.Desc)
96+
} else {
97+
s = fmt.Sprintf("spanner: code = %q, desc = %q, additional information = %s", code, e.Desc, e.additionalInformation)
9098
}
91-
return fmt.Sprintf("spanner: code = %q, desc = %q, additional information = %s", code, e.Desc, e.additionalInformation)
99+
if e.RequestID != "" {
100+
s = fmt.Sprintf("%s, requestID = %q", s, e.RequestID)
101+
}
102+
return s
92103
}
93104

94105
// Unwrap returns the wrapped error (if any).
@@ -123,6 +134,10 @@ func (e *Error) decorate(info string) {
123134
// APIError error having given error code as its status.
124135
func spannerErrorf(code codes.Code, format string, args ...interface{}) error {
125136
msg := fmt.Sprintf(format, args...)
137+
return spannerError(code, msg)
138+
}
139+
140+
func spannerError(code codes.Code, msg string) error {
126141
wrapped, _ := apierror.FromError(status.Error(code, msg))
127142
return &Error{
128143
Code: code,
@@ -172,9 +187,9 @@ func toSpannerErrorWithCommitInfo(err error, errorDuringCommit bool) error {
172187
desc = fmt.Sprintf("%s, %s", desc, transactionOutcomeUnknownMsg)
173188
wrapped = &TransactionOutcomeUnknownError{err: wrapped}
174189
}
175-
return &Error{status.FromContextError(err).Code(), toAPIError(wrapped), desc, ""}
190+
return &Error{status.FromContextError(err).Code(), toAPIError(wrapped), desc, "", ""}
176191
case status.Code(err) == codes.Unknown:
177-
return &Error{codes.Unknown, toAPIError(err), err.Error(), ""}
192+
return &Error{codes.Unknown, toAPIError(err), err.Error(), "", ""}
178193
default:
179194
statusErr := status.Convert(err)
180195
code, desc := statusErr.Code(), statusErr.Message()
@@ -183,7 +198,7 @@ func toSpannerErrorWithCommitInfo(err error, errorDuringCommit bool) error {
183198
desc = fmt.Sprintf("%s, %s", desc, transactionOutcomeUnknownMsg)
184199
wrapped = &TransactionOutcomeUnknownError{err: wrapped}
185200
}
186-
return &Error{code, toAPIError(wrapped), desc, ""}
201+
return &Error{code, toAPIError(wrapped), desc, "", ""}
187202
}
188203
}
189204

spanner/grpc_client.go

+32-15
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package spanner
1919
import (
2020
"context"
2121
"strings"
22+
"sync/atomic"
2223

2324
vkit "cloud.google.com/go/spanner/apiv1"
2425
"cloud.google.com/go/spanner/apiv1/spannerpb"
@@ -67,6 +68,15 @@ type spannerClient interface {
6768
type grpcSpannerClient struct {
6869
raw *vkit.Client
6970
metricsTracerFactory *builtinMetricsTracerFactory
71+
72+
// These fields are used to uniquely track x-goog-spanner-request-id where:
73+
// raw(*vkit.Client) is the channel, and channelID is derived from the ordinal
74+
// count of unique *vkit.Client as retrieved from the session pool.
75+
channelID uint64
76+
// id is derived from the SpannerClient.
77+
id int
78+
// nthRequest is incremented for each new request (but not for retries of requests).
79+
nthRequest *atomic.Uint32
7080
}
7181

7282
var (
@@ -76,13 +86,16 @@ var (
7686

7787
// newGRPCSpannerClient initializes a new spannerClient that uses the gRPC
7888
// Spanner API.
79-
func newGRPCSpannerClient(ctx context.Context, sc *sessionClient, opts ...option.ClientOption) (spannerClient, error) {
89+
func newGRPCSpannerClient(ctx context.Context, sc *sessionClient, channelID uint64, opts ...option.ClientOption) (spannerClient, error) {
8090
raw, err := vkit.NewClient(ctx, opts...)
8191
if err != nil {
8292
return nil, err
8393
}
8494

8595
g := &grpcSpannerClient{raw: raw, metricsTracerFactory: sc.metricsTracerFactory}
96+
clientID := sc.nthClient
97+
g.prepareRequestIDTrackers(clientID, channelID, sc.nthRequest)
98+
8699
clientInfo := []string{"gccl", internal.Version}
87100
if sc.userAgent != "" {
88101
agentWithVersion := strings.SplitN(sc.userAgent, "/", 2)
@@ -118,7 +131,7 @@ func (g *grpcSpannerClient) CreateSession(ctx context.Context, req *spannerpb.Cr
118131
mt := g.newBuiltinMetricsTracer(ctx)
119132
defer recordOperationCompletion(mt)
120133
ctx = context.WithValue(ctx, metricsTracerKey, mt)
121-
resp, err := g.raw.CreateSession(ctx, req, opts...)
134+
resp, err := g.raw.CreateSession(ctx, req, g.optsWithNextRequestID(opts)...)
122135
statusCode, _ := status.FromError(err)
123136
mt.currOp.setStatus(statusCode.Code().String())
124137
return resp, err
@@ -128,7 +141,7 @@ func (g *grpcSpannerClient) BatchCreateSessions(ctx context.Context, req *spanne
128141
mt := g.newBuiltinMetricsTracer(ctx)
129142
defer recordOperationCompletion(mt)
130143
ctx = context.WithValue(ctx, metricsTracerKey, mt)
131-
resp, err := g.raw.BatchCreateSessions(ctx, req, opts...)
144+
resp, err := g.raw.BatchCreateSessions(ctx, req, g.optsWithNextRequestID(opts)...)
132145
statusCode, _ := status.FromError(err)
133146
mt.currOp.setStatus(statusCode.Code().String())
134147
return resp, err
@@ -138,21 +151,21 @@ func (g *grpcSpannerClient) GetSession(ctx context.Context, req *spannerpb.GetSe
138151
mt := g.newBuiltinMetricsTracer(ctx)
139152
defer recordOperationCompletion(mt)
140153
ctx = context.WithValue(ctx, metricsTracerKey, mt)
141-
resp, err := g.raw.GetSession(ctx, req, opts...)
154+
resp, err := g.raw.GetSession(ctx, req, g.optsWithNextRequestID(opts)...)
142155
statusCode, _ := status.FromError(err)
143156
mt.currOp.setStatus(statusCode.Code().String())
144157
return resp, err
145158
}
146159

147160
func (g *grpcSpannerClient) ListSessions(ctx context.Context, req *spannerpb.ListSessionsRequest, opts ...gax.CallOption) *vkit.SessionIterator {
148-
return g.raw.ListSessions(ctx, req, opts...)
161+
return g.raw.ListSessions(ctx, req, g.optsWithNextRequestID(opts)...)
149162
}
150163

151164
func (g *grpcSpannerClient) DeleteSession(ctx context.Context, req *spannerpb.DeleteSessionRequest, opts ...gax.CallOption) error {
152165
mt := g.newBuiltinMetricsTracer(ctx)
153166
defer recordOperationCompletion(mt)
154167
ctx = context.WithValue(ctx, metricsTracerKey, mt)
155-
err := g.raw.DeleteSession(ctx, req, opts...)
168+
err := g.raw.DeleteSession(ctx, req, g.optsWithNextRequestID(opts)...)
156169
statusCode, _ := status.FromError(err)
157170
mt.currOp.setStatus(statusCode.Code().String())
158171
return err
@@ -162,21 +175,23 @@ func (g *grpcSpannerClient) ExecuteSql(ctx context.Context, req *spannerpb.Execu
162175
mt := g.newBuiltinMetricsTracer(ctx)
163176
defer recordOperationCompletion(mt)
164177
ctx = context.WithValue(ctx, metricsTracerKey, mt)
165-
resp, err := g.raw.ExecuteSql(ctx, req, opts...)
178+
resp, err := g.raw.ExecuteSql(ctx, req, g.optsWithNextRequestID(opts)...)
166179
statusCode, _ := status.FromError(err)
167180
mt.currOp.setStatus(statusCode.Code().String())
168181
return resp, err
169182
}
170183

171184
func (g *grpcSpannerClient) ExecuteStreamingSql(ctx context.Context, req *spannerpb.ExecuteSqlRequest, opts ...gax.CallOption) (spannerpb.Spanner_ExecuteStreamingSqlClient, error) {
185+
// Note: This method does not add g.optsWithNextRequestID to inject x-goog-spanner-request-id
186+
// as it is already manually added when creating Stream iterators for ExecuteStreamingSql.
172187
return g.raw.ExecuteStreamingSql(peer.NewContext(ctx, &peer.Peer{}), req, opts...)
173188
}
174189

175190
func (g *grpcSpannerClient) ExecuteBatchDml(ctx context.Context, req *spannerpb.ExecuteBatchDmlRequest, opts ...gax.CallOption) (*spannerpb.ExecuteBatchDmlResponse, error) {
176191
mt := g.newBuiltinMetricsTracer(ctx)
177192
defer recordOperationCompletion(mt)
178193
ctx = context.WithValue(ctx, metricsTracerKey, mt)
179-
resp, err := g.raw.ExecuteBatchDml(ctx, req, opts...)
194+
resp, err := g.raw.ExecuteBatchDml(ctx, req, g.optsWithNextRequestID(opts)...)
180195
statusCode, _ := status.FromError(err)
181196
mt.currOp.setStatus(statusCode.Code().String())
182197
return resp, err
@@ -186,21 +201,23 @@ func (g *grpcSpannerClient) Read(ctx context.Context, req *spannerpb.ReadRequest
186201
mt := g.newBuiltinMetricsTracer(ctx)
187202
defer recordOperationCompletion(mt)
188203
ctx = context.WithValue(ctx, metricsTracerKey, mt)
189-
resp, err := g.raw.Read(ctx, req, opts...)
204+
resp, err := g.raw.Read(ctx, req, g.optsWithNextRequestID(opts)...)
190205
statusCode, _ := status.FromError(err)
191206
mt.currOp.setStatus(statusCode.Code().String())
192207
return resp, err
193208
}
194209

195210
func (g *grpcSpannerClient) StreamingRead(ctx context.Context, req *spannerpb.ReadRequest, opts ...gax.CallOption) (spannerpb.Spanner_StreamingReadClient, error) {
211+
// Note: This method does not add g.optsWithNextRequestID, as it is already
212+
// manually added when creating Stream iterators for StreamingRead.
196213
return g.raw.StreamingRead(peer.NewContext(ctx, &peer.Peer{}), req, opts...)
197214
}
198215

199216
func (g *grpcSpannerClient) BeginTransaction(ctx context.Context, req *spannerpb.BeginTransactionRequest, opts ...gax.CallOption) (*spannerpb.Transaction, error) {
200217
mt := g.newBuiltinMetricsTracer(ctx)
201218
defer recordOperationCompletion(mt)
202219
ctx = context.WithValue(ctx, metricsTracerKey, mt)
203-
resp, err := g.raw.BeginTransaction(ctx, req, opts...)
220+
resp, err := g.raw.BeginTransaction(ctx, req, g.optsWithNextRequestID(opts)...)
204221
statusCode, _ := status.FromError(err)
205222
mt.currOp.setStatus(statusCode.Code().String())
206223
return resp, err
@@ -210,7 +227,7 @@ func (g *grpcSpannerClient) Commit(ctx context.Context, req *spannerpb.CommitReq
210227
mt := g.newBuiltinMetricsTracer(ctx)
211228
defer recordOperationCompletion(mt)
212229
ctx = context.WithValue(ctx, metricsTracerKey, mt)
213-
resp, err := g.raw.Commit(ctx, req, opts...)
230+
resp, err := g.raw.Commit(ctx, req, g.optsWithNextRequestID(opts)...)
214231
statusCode, _ := status.FromError(err)
215232
mt.currOp.setStatus(statusCode.Code().String())
216233
return resp, err
@@ -220,7 +237,7 @@ func (g *grpcSpannerClient) Rollback(ctx context.Context, req *spannerpb.Rollbac
220237
mt := g.newBuiltinMetricsTracer(ctx)
221238
defer recordOperationCompletion(mt)
222239
ctx = context.WithValue(ctx, metricsTracerKey, mt)
223-
err := g.raw.Rollback(ctx, req, opts...)
240+
err := g.raw.Rollback(ctx, req, g.optsWithNextRequestID(opts)...)
224241
statusCode, _ := status.FromError(err)
225242
mt.currOp.setStatus(statusCode.Code().String())
226243
return err
@@ -230,7 +247,7 @@ func (g *grpcSpannerClient) PartitionQuery(ctx context.Context, req *spannerpb.P
230247
mt := g.newBuiltinMetricsTracer(ctx)
231248
defer recordOperationCompletion(mt)
232249
ctx = context.WithValue(ctx, metricsTracerKey, mt)
233-
resp, err := g.raw.PartitionQuery(ctx, req, opts...)
250+
resp, err := g.raw.PartitionQuery(ctx, req, g.optsWithNextRequestID(opts)...)
234251
statusCode, _ := status.FromError(err)
235252
mt.currOp.setStatus(statusCode.Code().String())
236253
return resp, err
@@ -240,12 +257,12 @@ func (g *grpcSpannerClient) PartitionRead(ctx context.Context, req *spannerpb.Pa
240257
mt := g.newBuiltinMetricsTracer(ctx)
241258
defer recordOperationCompletion(mt)
242259
ctx = context.WithValue(ctx, metricsTracerKey, mt)
243-
resp, err := g.raw.PartitionRead(ctx, req, opts...)
260+
resp, err := g.raw.PartitionRead(ctx, req, g.optsWithNextRequestID(opts)...)
244261
statusCode, _ := status.FromError(err)
245262
mt.currOp.setStatus(statusCode.Code().String())
246263
return resp, err
247264
}
248265

249266
func (g *grpcSpannerClient) BatchWrite(ctx context.Context, req *spannerpb.BatchWriteRequest, opts ...gax.CallOption) (spannerpb.Spanner_BatchWriteClient, error) {
250-
return g.raw.BatchWrite(peer.NewContext(ctx, &peer.Peer{}), req, opts...)
267+
return g.raw.BatchWrite(peer.NewContext(ctx, &peer.Peer{}), req, g.optsWithNextRequestID(opts)...)
251268
}

spanner/internal/testutil/inmem_spanner_server.go

+4
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ const (
9090
MethodExecuteBatchDml string = "EXECUTE_BATCH_DML"
9191
MethodStreamingRead string = "EXECUTE_STREAMING_READ"
9292
MethodBatchWrite string = "BATCH_WRITE"
93+
MethodPartitionQuery string = "PARTITION_QUERY"
9394
)
9495

9596
// StatementResult represents a mocked result on the test server. The result is
@@ -1107,6 +1108,9 @@ func (s *inMemSpannerServer) Rollback(ctx context.Context, req *spannerpb.Rollba
11071108
}
11081109

11091110
func (s *inMemSpannerServer) PartitionQuery(ctx context.Context, req *spannerpb.PartitionQueryRequest) (*spannerpb.PartitionResponse, error) {
1111+
if err := s.simulateExecutionTime(MethodPartitionQuery, req); err != nil {
1112+
return nil, err
1113+
}
11101114
s.mu.Lock()
11111115
if s.stopped {
11121116
s.mu.Unlock()

0 commit comments

Comments
 (0)