Skip to content

Commit 26c04f4

Browse files
authored
feat(bigquery): use storage api for query jobs (#6822)
Initial work on using the Storage API for fetching results of a query. This is more efficient because it can download data in parallel by splitting the read session, and because Arrow is a more efficient format. The API surface for users stays the same: they can still transform query results into user-defined structs. Under the hood, the library takes care of converting data represented in Arrow to the user-defined struct. One thing to note is that this introduces the first external dependency on the Apache Arrow Go library. Initially we are going to use it as an experimental feature and explicitly ask users to create a `bqStorage.BigQueryReadClient`. Proposed by issue https://togithub.com/googleapis/google-cloud-go/issues/3880 and by the work on the Python library https://medium.com/google-cloud/announcing-google-cloud-bigquery-version-1-17-0-1fc428512171
1 parent fbe1bd4 commit 26c04f4

18 files changed

+1695
-19
lines changed

bigquery/arrow.go

+221
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
// Copyright 2023 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package bigquery
16+
17+
import (
18+
"bytes"
19+
"encoding/base64"
20+
"errors"
21+
"fmt"
22+
"math/big"
23+
24+
"cloud.google.com/go/civil"
25+
"github.com/apache/arrow/go/v10/arrow"
26+
"github.com/apache/arrow/go/v10/arrow/array"
27+
"github.com/apache/arrow/go/v10/arrow/ipc"
28+
)
29+
30+
// arrowDecoder decodes serialized Arrow record batches into rows of Value.
type arrowDecoder struct {
31+
// tableSchema supplies the FieldSchema used when converting each column.
tableSchema Schema
32+
// rawArrowSchema is the serialized Arrow schema; it is placed ahead of
// every serialized record batch handed to the IPC reader.
rawArrowSchema []byte
33+
// arrowSchema is the parsed form of the serialized Arrow schema.
arrowSchema *arrow.Schema
34+
}
35+
36+
func newArrowDecoderFromSession(session *readSession, schema Schema) (*arrowDecoder, error) {
37+
bqSession := session.bqSession
38+
if bqSession == nil {
39+
return nil, errors.New("read session not initialized")
40+
}
41+
arrowSerializedSchema := bqSession.GetArrowSchema().GetSerializedSchema()
42+
buf := bytes.NewBuffer(arrowSerializedSchema)
43+
r, err := ipc.NewReader(buf)
44+
if err != nil {
45+
return nil, err
46+
}
47+
defer r.Release()
48+
p := &arrowDecoder{
49+
tableSchema: schema,
50+
rawArrowSchema: arrowSerializedSchema,
51+
arrowSchema: r.Schema(),
52+
}
53+
return p, nil
54+
}
55+
56+
func (ap *arrowDecoder) createIPCReaderForBatch(serializedArrowRecordBatch []byte) (*ipc.Reader, error) {
57+
buf := bytes.NewBuffer(ap.rawArrowSchema)
58+
buf.Write(serializedArrowRecordBatch)
59+
return ipc.NewReader(buf, ipc.WithSchema(ap.arrowSchema))
60+
}
61+
62+
// decodeArrowRecords decodes BQ ArrowRecordBatch into rows of []Value.
63+
func (ap *arrowDecoder) decodeArrowRecords(serializedArrowRecordBatch []byte) ([][]Value, error) {
64+
r, err := ap.createIPCReaderForBatch(serializedArrowRecordBatch)
65+
if err != nil {
66+
return nil, err
67+
}
68+
defer r.Release()
69+
rs := make([][]Value, 0)
70+
for r.Next() {
71+
rec := r.Record()
72+
values, err := ap.convertArrowRecordValue(rec)
73+
if err != nil {
74+
return nil, err
75+
}
76+
rs = append(rs, values...)
77+
}
78+
return rs, nil
79+
}
80+
81+
// decodeRetainedArrowRecords decodes BQ ArrowRecordBatch into a list of retained arrow.Record.
82+
func (ap *arrowDecoder) decodeRetainedArrowRecords(serializedArrowRecordBatch []byte) ([]arrow.Record, error) {
83+
r, err := ap.createIPCReaderForBatch(serializedArrowRecordBatch)
84+
if err != nil {
85+
return nil, err
86+
}
87+
defer r.Release()
88+
records := []arrow.Record{}
89+
for r.Next() {
90+
rec := r.Record()
91+
rec.Retain()
92+
records = append(records, rec)
93+
}
94+
return records, nil
95+
}
96+
97+
// convertArrowRows converts an arrow.Record into a series of Value slices.
98+
func (ap *arrowDecoder) convertArrowRecordValue(record arrow.Record) ([][]Value, error) {
99+
rs := make([][]Value, record.NumRows())
100+
for i := range rs {
101+
rs[i] = make([]Value, record.NumCols())
102+
}
103+
for j, col := range record.Columns() {
104+
fs := ap.tableSchema[j]
105+
ft := ap.arrowSchema.Field(j).Type
106+
for i := 0; i < col.Len(); i++ {
107+
v, err := convertArrowValue(col, i, ft, fs)
108+
if err != nil {
109+
return nil, fmt.Errorf("found arrow type %s, but could not convert value: %v", ap.arrowSchema.Field(j).Type, err)
110+
}
111+
rs[i][j] = v
112+
}
113+
}
114+
return rs, nil
115+
}
116+
117+
// convertArrow gets row value in the given column and converts to a Value.
118+
// Arrow is a colunar storage, so we navigate first by column and get the row value.
119+
// More details on conversions can be seen here: https://cloud.google.com/bigquery/docs/reference/storage#arrow_schema_details
120+
func convertArrowValue(col arrow.Array, i int, ft arrow.DataType, fs *FieldSchema) (Value, error) {
121+
if !col.IsValid(i) {
122+
return nil, nil
123+
}
124+
switch ft.(type) {
125+
case *arrow.BooleanType:
126+
v := col.(*array.Boolean).Value(i)
127+
return convertBasicType(fmt.Sprintf("%v", v), fs.Type)
128+
case *arrow.Int8Type:
129+
v := col.(*array.Int8).Value(i)
130+
return convertBasicType(fmt.Sprintf("%v", v), fs.Type)
131+
case *arrow.Int16Type:
132+
v := col.(*array.Int16).Value(i)
133+
return convertBasicType(fmt.Sprintf("%v", v), fs.Type)
134+
case *arrow.Int32Type:
135+
v := col.(*array.Int32).Value(i)
136+
return convertBasicType(fmt.Sprintf("%v", v), fs.Type)
137+
case *arrow.Int64Type:
138+
v := col.(*array.Int64).Value(i)
139+
return convertBasicType(fmt.Sprintf("%v", v), fs.Type)
140+
case *arrow.Float16Type:
141+
v := col.(*array.Float16).Value(i)
142+
return convertBasicType(fmt.Sprintf("%v", v.Float32()), fs.Type)
143+
case *arrow.Float32Type:
144+
v := col.(*array.Float32).Value(i)
145+
return convertBasicType(fmt.Sprintf("%v", v), fs.Type)
146+
case *arrow.Float64Type:
147+
v := col.(*array.Float64).Value(i)
148+
return convertBasicType(fmt.Sprintf("%v", v), fs.Type)
149+
case *arrow.BinaryType:
150+
v := col.(*array.Binary).Value(i)
151+
encoded := base64.StdEncoding.EncodeToString(v)
152+
return convertBasicType(encoded, fs.Type)
153+
case *arrow.StringType:
154+
v := col.(*array.String).Value(i)
155+
return convertBasicType(v, fs.Type)
156+
case *arrow.Date32Type:
157+
v := col.(*array.Date32).Value(i)
158+
return convertBasicType(v.FormattedString(), fs.Type)
159+
case *arrow.Date64Type:
160+
v := col.(*array.Date64).Value(i)
161+
return convertBasicType(v.FormattedString(), fs.Type)
162+
case *arrow.TimestampType:
163+
v := col.(*array.Timestamp).Value(i)
164+
dft := ft.(*arrow.TimestampType)
165+
t := v.ToTime(dft.Unit)
166+
if dft.TimeZone == "" { // Datetime
167+
return Value(civil.DateTimeOf(t)), nil
168+
}
169+
return Value(t.UTC()), nil // Timestamp
170+
case *arrow.Time32Type:
171+
v := col.(*array.Time32).Value(i)
172+
return convertBasicType(v.FormattedString(arrow.Microsecond), fs.Type)
173+
case *arrow.Time64Type:
174+
v := col.(*array.Time64).Value(i)
175+
return convertBasicType(v.FormattedString(arrow.Microsecond), fs.Type)
176+
case *arrow.Decimal128Type:
177+
dft := ft.(*arrow.Decimal128Type)
178+
v := col.(*array.Decimal128).Value(i)
179+
rat := big.NewRat(1, 1)
180+
rat.Num().SetBytes(v.BigInt().Bytes())
181+
d := rat.Denom()
182+
d.Exp(big.NewInt(10), big.NewInt(int64(dft.Scale)), nil)
183+
return Value(rat), nil
184+
case *arrow.Decimal256Type:
185+
dft := ft.(*arrow.Decimal256Type)
186+
v := col.(*array.Decimal256).Value(i)
187+
rat := big.NewRat(1, 1)
188+
rat.Num().SetBytes(v.BigInt().Bytes())
189+
d := rat.Denom()
190+
d.Exp(big.NewInt(10), big.NewInt(int64(dft.Scale)), nil)
191+
return Value(rat), nil
192+
case *arrow.ListType:
193+
arr := col.(*array.List)
194+
dft := ft.(*arrow.ListType)
195+
values := []Value{}
196+
start, end := arr.ValueOffsets(i)
197+
slice := array.NewSlice(arr.ListValues(), start, end)
198+
for j := 0; j < slice.Len(); j++ {
199+
v, err := convertArrowValue(slice, j, dft.Elem(), fs)
200+
if err != nil {
201+
return nil, err
202+
}
203+
values = append(values, v)
204+
}
205+
return values, nil
206+
case *arrow.StructType:
207+
arr := col.(*array.Struct)
208+
nestedValues := []Value{}
209+
fields := ft.(*arrow.StructType).Fields()
210+
for fIndex, f := range fields {
211+
v, err := convertArrowValue(arr.Field(fIndex), i, f.Type, fs.Schema[fIndex])
212+
if err != nil {
213+
return nil, err
214+
}
215+
nestedValues = append(nestedValues, v)
216+
}
217+
return nestedValues, nil
218+
default:
219+
return nil, fmt.Errorf("unknown arrow type: %v", ft)
220+
}
221+
}

bigquery/bigquery.go

+22
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ type Client struct {
5757

5858
projectID string
5959
bqs *bq.Service
60+
rc *readClient
6061
}
6162

6263
// DetectProjectID is a sentinel value that instructs NewClient to detect the
@@ -97,6 +98,21 @@ func NewClient(ctx context.Context, projectID string, opts ...option.ClientOptio
9798
return c, nil
9899
}
99100

101+
// EnableStorageReadClient sets up Storage API connection to be used when fetching
102+
// large datasets from tables, jobs or queries.
103+
// Calling this method twice will return an error.
104+
func (c *Client) EnableStorageReadClient(ctx context.Context, opts ...option.ClientOption) error {
105+
if c.rc != nil {
106+
return fmt.Errorf("failed: storage read client already set up")
107+
}
108+
rc, err := newReadClient(ctx, c.projectID, opts...)
109+
if err != nil {
110+
return err
111+
}
112+
c.rc = rc
113+
return nil
114+
}
115+
100116
// Project returns the project ID or number for this instance of the client, which may have
101117
// either been explicitly specified or autodetected.
102118
func (c *Client) Project() string {
@@ -107,6 +123,12 @@ func (c *Client) Project() string {
107123
// Close should be called when the client is no longer needed.
108124
// It need not be called at program exit.
109125
func (c *Client) Close() error {
126+
if c.rc != nil {
127+
err := c.rc.close()
128+
if err != nil {
129+
return err
130+
}
131+
}
110132
return nil
111133
}
112134

bigquery/examples_test.go

+23
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,29 @@ func ExampleQuery_Read() {
162162
_ = it // TODO: iterate using Next or iterator.Pager.
163163
}
164164

165+
func ExampleQuery_Read_accelerated() {
166+
ctx := context.Background()
167+
client, err := bigquery.NewClient(ctx, "project-id")
168+
if err != nil {
169+
// TODO: Handle error.
170+
}
171+
172+
// Enable Storage API usage for fetching data
173+
err = client.EnableStorageReadClient(ctx)
174+
if err != nil {
175+
// TODO: Handle error.
176+
}
177+
178+
sql := fmt.Sprintf(`SELECT name, number, state FROM %s WHERE state = "CA"`, `bigquery-public-data.usa_names.usa_1910_current`)
179+
q := client.Query(sql)
180+
it, err := q.Read(ctx)
181+
if err != nil {
182+
// TODO: Handle error.
183+
}
184+
185+
_ = it // TODO: iterate using Next or iterator.Pager.
186+
}
187+
165188
func ExampleRowIterator_Next() {
166189
ctx := context.Background()
167190
client, err := bigquery.NewClient(ctx, "project-id")

bigquery/go.mod

+16-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ require (
77
cloud.google.com/go/datacatalog v1.8.1
88
cloud.google.com/go/iam v0.8.0
99
cloud.google.com/go/storage v1.28.1
10+
github.com/apache/arrow/go/v10 v10.0.1
1011
github.com/google/go-cmp v0.5.9
1112
github.com/google/uuid v1.3.0
1213
github.com/googleapis/gax-go/v2 v2.7.0
@@ -22,13 +23,27 @@ require (
2223
require (
2324
cloud.google.com/go/compute v1.14.0 // indirect
2425
cloud.google.com/go/compute/metadata v0.2.3 // indirect
26+
github.com/andybalholm/brotli v1.0.4 // indirect
27+
github.com/apache/thrift v0.16.0 // indirect
28+
github.com/goccy/go-json v0.9.11 // indirect
2529
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
2630
github.com/golang/protobuf v1.5.2 // indirect
31+
github.com/golang/snappy v0.0.4 // indirect
32+
github.com/google/flatbuffers v2.0.8+incompatible // indirect
2733
github.com/google/martian/v3 v3.2.1 // indirect
2834
github.com/googleapis/enterprise-certificate-proxy v0.2.1 // indirect
35+
github.com/klauspost/asmfmt v1.3.2 // indirect
36+
github.com/klauspost/compress v1.15.9 // indirect
37+
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
38+
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect
39+
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect
40+
github.com/pierrec/lz4/v4 v4.1.15 // indirect
41+
github.com/zeebo/xxh3 v1.0.2 // indirect
42+
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 // indirect
2943
golang.org/x/net v0.0.0-20221014081412-f15817d10f9b // indirect
3044
golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 // indirect
31-
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 // indirect
45+
golang.org/x/sys v0.0.0-20220829200755-d48e67d00261 // indirect
3246
golang.org/x/text v0.5.0 // indirect
47+
golang.org/x/tools v0.1.12 // indirect
3348
google.golang.org/appengine v1.6.7 // indirect
3449
)

0 commit comments

Comments
 (0)