Skip to content

Commit a01b271

Browse files
authored
feat: add 'columns' as an alias for 'col_order' (#298)
1 parent a61c5fe commit a01b271

File tree

5 files changed

+71
-38
lines changed

5 files changed

+71
-38
lines changed

bigframes/pandas/__init__.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -486,20 +486,22 @@ def read_gbq(
486486
query_or_table: str,
487487
*,
488488
index_col: Iterable[str] | str = (),
489-
col_order: Iterable[str] = (),
489+
columns: Iterable[str] = (),
490490
max_results: Optional[int] = None,
491491
filters: vendored_pandas_gbq.FiltersType = (),
492492
use_cache: bool = True,
493+
col_order: Iterable[str] = (),
493494
) -> bigframes.dataframe.DataFrame:
494495
_set_default_session_location_if_possible(query_or_table)
495496
return global_session.with_default_session(
496497
bigframes.session.Session.read_gbq,
497498
query_or_table,
498499
index_col=index_col,
499-
col_order=col_order,
500+
columns=columns,
500501
max_results=max_results,
501502
filters=filters,
502503
use_cache=use_cache,
504+
col_order=col_order,
503505
)
504506

505507

@@ -520,18 +522,20 @@ def read_gbq_query(
520522
query: str,
521523
*,
522524
index_col: Iterable[str] | str = (),
523-
col_order: Iterable[str] = (),
525+
columns: Iterable[str] = (),
524526
max_results: Optional[int] = None,
525527
use_cache: bool = True,
528+
col_order: Iterable[str] = (),
526529
) -> bigframes.dataframe.DataFrame:
527530
_set_default_session_location_if_possible(query)
528531
return global_session.with_default_session(
529532
bigframes.session.Session.read_gbq_query,
530533
query,
531534
index_col=index_col,
532-
col_order=col_order,
535+
columns=columns,
533536
max_results=max_results,
534537
use_cache=use_cache,
538+
col_order=col_order,
535539
)
536540

537541

@@ -542,18 +546,20 @@ def read_gbq_table(
542546
query: str,
543547
*,
544548
index_col: Iterable[str] | str = (),
545-
col_order: Iterable[str] = (),
549+
columns: Iterable[str] = (),
546550
max_results: Optional[int] = None,
547551
use_cache: bool = True,
552+
col_order: Iterable[str] = (),
548553
) -> bigframes.dataframe.DataFrame:
549554
_set_default_session_location_if_possible(query)
550555
return global_session.with_default_session(
551556
bigframes.session.Session.read_gbq_table,
552557
query,
553558
index_col=index_col,
554-
col_order=col_order,
559+
columns=columns,
555560
max_results=max_results,
556561
use_cache=use_cache,
562+
col_order=col_order,
557563
)
558564

559565

bigframes/session/__init__.py

+45-21
Original file line numberDiff line numberDiff line change
@@ -232,20 +232,28 @@ def read_gbq(
232232
query_or_table: str,
233233
*,
234234
index_col: Iterable[str] | str = (),
235-
col_order: Iterable[str] = (),
235+
columns: Iterable[str] = (),
236236
max_results: Optional[int] = None,
237237
filters: third_party_pandas_gbq.FiltersType = (),
238238
use_cache: bool = True,
239+
col_order: Iterable[str] = (),
239240
# TODO: Add a verify-index argument that fails if the index is not unique.
240241
) -> dataframe.DataFrame:
241242
# TODO(b/281571214): Generate prompt to show the progress of read_gbq.
242-
query_or_table = self._filters_to_query(query_or_table, col_order, filters)
243+
if columns and col_order:
244+
raise ValueError(
245+
"Must specify either columns (preferred) or col_order, not both"
246+
)
247+
elif col_order:
248+
columns = col_order
249+
250+
query_or_table = self._filters_to_query(query_or_table, columns, filters)
243251

244252
if _is_query(query_or_table):
245253
return self._read_gbq_query(
246254
query_or_table,
247255
index_col=index_col,
248-
col_order=col_order,
256+
columns=columns,
249257
max_results=max_results,
250258
api_name="read_gbq",
251259
use_cache=use_cache,
@@ -257,7 +265,7 @@ def read_gbq(
257265
return self._read_gbq_table(
258266
query_or_table,
259267
index_col=index_col,
260-
col_order=col_order,
268+
columns=columns,
261269
max_results=max_results,
262270
api_name="read_gbq",
263271
use_cache=use_cache,
@@ -388,9 +396,10 @@ def read_gbq_query(
388396
query: str,
389397
*,
390398
index_col: Iterable[str] | str = (),
391-
col_order: Iterable[str] = (),
399+
columns: Iterable[str] = (),
392400
max_results: Optional[int] = None,
393401
use_cache: bool = True,
402+
col_order: Iterable[str] = (),
394403
) -> dataframe.DataFrame:
395404
"""Turn a SQL query into a DataFrame.
396405
@@ -442,10 +451,17 @@ def read_gbq_query(
442451
"""
443452
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
444453
# these docstrings are inline.
454+
if columns and col_order:
455+
raise ValueError(
456+
"Must specify either columns (preferred) or col_order, not both"
457+
)
458+
elif col_order:
459+
columns = col_order
460+
445461
return self._read_gbq_query(
446462
query=query,
447463
index_col=index_col,
448-
col_order=col_order,
464+
columns=columns,
449465
max_results=max_results,
450466
api_name="read_gbq_query",
451467
use_cache=use_cache,
@@ -456,7 +472,7 @@ def _read_gbq_query(
456472
query: str,
457473
*,
458474
index_col: Iterable[str] | str = (),
459-
col_order: Iterable[str] = (),
475+
columns: Iterable[str] = (),
460476
max_results: Optional[int] = None,
461477
api_name: str = "read_gbq_query",
462478
use_cache: bool = True,
@@ -492,7 +508,7 @@ def _read_gbq_query(
492508
return self.read_gbq_table(
493509
f"{destination.project}.{destination.dataset_id}.{destination.table_id}",
494510
index_col=index_cols,
495-
col_order=col_order,
511+
columns=columns,
496512
max_results=max_results,
497513
use_cache=use_cache,
498514
)
@@ -502,9 +518,10 @@ def read_gbq_table(
502518
query: str,
503519
*,
504520
index_col: Iterable[str] | str = (),
505-
col_order: Iterable[str] = (),
521+
columns: Iterable[str] = (),
506522
max_results: Optional[int] = None,
507523
use_cache: bool = True,
524+
col_order: Iterable[str] = (),
508525
) -> dataframe.DataFrame:
509526
"""Turn a BigQuery table into a DataFrame.
510527
@@ -521,10 +538,17 @@ def read_gbq_table(
521538
"""
522539
# NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
523540
# these docstrings are inline.
541+
if columns and col_order:
542+
raise ValueError(
543+
"Must specify either columns (preferred) or col_order, not both"
544+
)
545+
elif col_order:
546+
columns = col_order
547+
524548
return self._read_gbq_table(
525549
query=query,
526550
index_col=index_col,
527-
col_order=col_order,
551+
columns=columns,
528552
max_results=max_results,
529553
api_name="read_gbq_table",
530554
use_cache=use_cache,
@@ -583,7 +607,7 @@ def _read_gbq_table(
583607
query: str,
584608
*,
585609
index_col: Iterable[str] | str = (),
586-
col_order: Iterable[str] = (),
610+
columns: Iterable[str] = (),
587611
max_results: Optional[int] = None,
588612
api_name: str,
589613
use_cache: bool = True,
@@ -602,10 +626,10 @@ def _read_gbq_table(
602626
table_ref, api_name=api_name, use_cache=use_cache
603627
)
604628

605-
for key in col_order:
629+
for key in columns:
606630
if key not in table_expression.columns:
607631
raise ValueError(
608-
f"Column '{key}' of `col_order` not found in this table."
632+
f"Column '{key}' of `columns` not found in this table."
609633
)
610634

611635
if isinstance(index_col, str):
@@ -619,8 +643,8 @@ def _read_gbq_table(
619643
f"Column `{key}` of `index_col` not found in this table."
620644
)
621645

622-
if col_order:
623-
table_expression = table_expression.select([*index_cols, *col_order])
646+
if columns:
647+
table_expression = table_expression.select([*index_cols, *columns])
624648

625649
# If the index is unique and sortable, then we don't need to generate
626650
# an ordering column.
@@ -719,7 +743,7 @@ def _read_bigquery_load_job(
719743
*,
720744
job_config: bigquery.LoadJobConfig,
721745
index_col: Iterable[str] | str = (),
722-
col_order: Iterable[str] = (),
746+
columns: Iterable[str] = (),
723747
) -> dataframe.DataFrame:
724748
if isinstance(index_col, str):
725749
index_cols = [index_col]
@@ -760,7 +784,7 @@ def _read_bigquery_load_job(
760784
return self.read_gbq_table(
761785
table_id,
762786
index_col=index_col,
763-
col_order=col_order,
787+
columns=columns,
764788
)
765789

766790
def read_gbq_model(self, model_name: str):
@@ -959,13 +983,13 @@ def read_csv(
959983
if index_col is None:
960984
index_col = ()
961985

962-
# usecols should only be an iterable of strings (column names) for use as col_order in read_gbq.
963-
col_order: Tuple[Any, ...] = tuple()
986+
# usecols should only be an iterable of strings (column names) for use as columns in read_gbq.
987+
columns: Tuple[Any, ...] = tuple()
964988
if usecols is not None:
965989
if isinstance(usecols, Iterable) and all(
966990
isinstance(col, str) for col in usecols
967991
):
968-
col_order = tuple(col for col in usecols)
992+
columns = tuple(col for col in usecols)
969993
else:
970994
raise NotImplementedError(
971995
"BigQuery engine only supports an iterable of strings for `usecols`. "
@@ -1000,7 +1024,7 @@ def read_csv(
10001024
table,
10011025
job_config=job_config,
10021026
index_col=index_col,
1003-
col_order=col_order,
1027+
columns=columns,
10041028
)
10051029
else:
10061030
if any(arg in kwargs for arg in ("chunksize", "iterator")):

notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@
613613
"source": [
614614
"# Query 3 columns of interest from drug label dataset\n",
615615
"df = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n",
616-
" col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
616+
" columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
617617
"\n",
618618
"# Exclude any rows with missing data\n",
619619
"df = df.dropna()\n",
@@ -825,7 +825,7 @@
825825
"source": [
826826
"# Query 3 columns of interest from drug label dataset\n",
827827
"df_missing = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n",
828-
" col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
828+
" columns=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n",
829829
"\n",
830830
"# Exclude any rows with missing data\n",
831831
"df_missing = df_missing.dropna()\n",

tests/system/small/test_session.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_read_gbq_tokyo(
5252

5353

5454
@pytest.mark.parametrize(
55-
("query_or_table", "col_order"),
55+
("query_or_table", "columns"),
5656
[
5757
pytest.param(
5858
"{scalars_table_id}", ["bool_col", "int64_col"], id="two_cols_in_table"
@@ -79,16 +79,16 @@ def test_read_gbq_tokyo(
7979
),
8080
],
8181
)
82-
def test_read_gbq_w_col_order(
82+
def test_read_gbq_w_columns(
8383
session: bigframes.Session,
8484
scalars_table_id: str,
8585
query_or_table: str,
86-
col_order: List[str],
86+
columns: List[str],
8787
):
8888
df = session.read_gbq(
89-
query_or_table.format(scalars_table_id=scalars_table_id), col_order=col_order
89+
query_or_table.format(scalars_table_id=scalars_table_id), columns=columns
9090
)
91-
assert df.columns.tolist() == col_order
91+
assert df.columns.tolist() == columns
9292

9393

9494
@pytest.mark.parametrize(

third_party/bigframes_vendored/pandas/io/gbq.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@ def read_gbq(
1717
query_or_table: str,
1818
*,
1919
index_col: Iterable[str] | str = (),
20-
col_order: Iterable[str] = (),
20+
columns: Iterable[str] = (),
2121
max_results: Optional[int] = None,
2222
filters: FiltersType = (),
2323
use_cache: bool = True,
24+
col_order: Iterable[str] = (),
2425
):
2526
"""Loads a DataFrame from BigQuery.
2627
@@ -77,11 +78,11 @@ def read_gbq(
7778
7879
Reading data with `columns` and `filters` parameters:
7980
80-
>>> col_order = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed']
81+
>>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed']
8182
>>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant'])]
8283
>>> df = bpd.read_gbq(
8384
... "bigquery-public-data.baseball.games_wide",
84-
... col_order=col_order,
85+
... columns=columns,
8586
... filters=filters,
8687
... )
8788
>>> df.head(1)
@@ -97,7 +98,7 @@ def read_gbq(
9798
`project.dataset.tablename` or `dataset.tablename`.
9899
index_col (Iterable[str] or str):
99100
Name of result column(s) to use for index in results DataFrame.
100-
col_order (Iterable[str]):
101+
columns (Iterable[str]):
101102
List of BigQuery column names in the desired order for results
102103
DataFrame.
103104
max_results (Optional[int], default None):
@@ -113,6 +114,8 @@ def read_gbq(
113114
is to be conducted.
114115
use_cache (bool, default True):
115116
Whether to cache the query inputs. Defaults to True.
117+
col_order (Iterable[str]):
118+
Alias for `columns`, retained for backwards compatibility. Specify either `columns` or `col_order`, not both.
116119
117120
Returns:
118121
bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table.

0 commit comments

Comments
 (0)