Commit 962d7e5

Merge branch 'main' into neo_bqml
2 parents (4ddb46c + d0ab9cc), commit 962d7e5

File tree: 7 files changed, +206 -50 lines changed

bigframes/core/utils.py

+4
@@ -23,6 +23,10 @@
 UNNAMED_INDEX_ID = "bigframes_unnamed_index"


+def is_gcs_path(value) -> typing_extensions.TypeGuard[str]:
+    return isinstance(value, str) and value.startswith("gs://")
+
+
 def get_axis_number(axis: typing.Union[str, int]) -> typing.Literal[0, 1]:
     if axis in {0, "index", "rows"}:
         return 0
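
The new helper gives every exporter a single, None-safe test for Cloud Storage destinations (the old inline path.startswith("gs://") checks assumed a string argument). A minimal usage sketch; the example paths are illustrative:

from bigframes.core import utils

utils.is_gcs_path("gs://my-bucket/out/*.csv")  # True: route to a BigQuery EXPORT DATA job
utils.is_gcs_path("/tmp/out.csv")              # False: handle locally via pandas
utils.is_gcs_path(None)                        # False: no destination, so return a str/bytes result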

bigframes/dataframe.py

+32 -18

@@ -2952,15 +2952,21 @@ def from_records(
         )

     def to_csv(
-        self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True
-    ) -> None:
+        self,
+        path_or_buf=None,
+        sep=",",
+        *,
+        header: bool = True,
+        index: bool = True,
+    ) -> Optional[str]:
         # TODO(swast): Can we support partition columns argument?
         # TODO(chelsealin): Support local file paths.
         # TODO(swast): Some warning that wildcard is recommended for large
         # query results? See:
         # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size
-        if not path_or_buf.startswith("gs://"):
-            raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS)
+        if not utils.is_gcs_path(path_or_buf):
+            pd_df = self.to_pandas()
+            return pd_df.to_csv(path_or_buf, sep=sep, header=header, index=index)
         if "*" not in path_or_buf:
             raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD)

@@ -2977,22 +2983,28 @@ def to_csv(
             export_data_statement, api_name="dataframe-to_csv"
         )
         self._set_internal_query_job(query_job)
+        return None

     def to_json(
         self,
-        path_or_buf: str,
-        orient: Literal[
-            "split", "records", "index", "columns", "values", "table"
-        ] = "columns",
+        path_or_buf=None,
+        orient: Optional[
+            Literal["split", "records", "index", "columns", "values", "table"]
+        ] = None,
         *,
         lines: bool = False,
         index: bool = True,
-    ) -> None:
+    ) -> Optional[str]:
         # TODO(swast): Can we support partition columns argument?
-        # TODO(chelsealin): Support local file paths.
-        if not path_or_buf.startswith("gs://"):
-            raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS)
-
+        if not utils.is_gcs_path(path_or_buf):
+            pd_df = self.to_pandas()
+            return pd_df.to_json(
+                path_or_buf,
+                orient=orient,
+                lines=lines,
+                index=index,
+                default_handler=str,
+            )
         if "*" not in path_or_buf:
             raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD)

@@ -3021,6 +3033,7 @@ def to_json(
             export_data_statement, api_name="dataframe-to_json"
         )
         self._set_internal_query_job(query_job)
+        return None

     def to_gbq(
         self,

@@ -3119,19 +3132,19 @@ def __array__(self, dtype=None) -> numpy.ndarray:

     def to_parquet(
         self,
-        path: str,
+        path=None,
         *,
         compression: Optional[Literal["snappy", "gzip"]] = "snappy",
         index: bool = True,
-    ) -> None:
+    ) -> Optional[bytes]:
         # TODO(swast): Can we support partition columns argument?
         # TODO(chelsealin): Support local file paths.
         # TODO(swast): Some warning that wildcard is recommended for large
         # query results? See:
         # https://cloud.google.com/bigquery/docs/exporting-data#limit_the_exported_file_size
-        if not path.startswith("gs://"):
-            raise NotImplementedError(ERROR_IO_ONLY_GS_PATHS)
-
+        if not utils.is_gcs_path(path):
+            pd_df = self.to_pandas()
+            return pd_df.to_parquet(path, compression=compression, index=index)
         if "*" not in path:
             raise NotImplementedError(ERROR_IO_REQUIRES_WILDCARD)

@@ -3155,6 +3168,7 @@ def to_parquet(
             export_data_statement, api_name="dataframe-to_parquet"
         )
         self._set_internal_query_job(query_job)
+        return None

     def to_dict(
         self,
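
Taken together, the DataFrame exporters now fall back to pandas whenever the destination is not a gs:// URI: the frame is materialized with to_pandas() and written (or returned) locally, while Cloud Storage destinations still run a BigQuery EXPORT DATA job. A hedged usage sketch of the new behavior follows; the DataFrame contents and bucket name are placeholders, and a configured BigQuery session is assumed:

import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

csv_text = df.to_csv()            # no destination: returns the CSV as a str, like pandas
json_text = df.to_json()          # no destination: returns JSON as a str (default_handler=str)
parquet_bytes = df.to_parquet()   # no destination: returns the Parquet payload as bytes

df.to_csv("/tmp/example.csv")          # local path: written via pandas, returns None
df.to_csv("gs://my-bucket/out/*.csv")  # GCS wildcard: exported by BigQuery, returns None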

bigframes/series.py

+30 -11

@@ -1652,9 +1652,22 @@ def to_frame(self, name: blocks.Label = None) -> bigframes.dataframe.DataFrame:
         return bigframes.dataframe.DataFrame(block)

     def to_csv(
-        self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True
-    ) -> None:
-        return self.to_frame().to_csv(path_or_buf, sep=sep, header=header, index=index)
+        self,
+        path_or_buf=None,
+        sep=",",
+        *,
+        header: bool = True,
+        index: bool = True,
+    ) -> Optional[str]:
+        if utils.is_gcs_path(path_or_buf):
+            return self.to_frame().to_csv(
+                path_or_buf, sep=sep, header=header, index=index
+            )
+        else:
+            pd_series = self.to_pandas()
+            return pd_series.to_csv(
+                path_or_buf=path_or_buf, sep=sep, header=header, index=index
+            )

     def to_dict(self, into: type[dict] = dict) -> typing.Mapping:
         return typing.cast(dict, self.to_pandas().to_dict(into))  # type: ignore

@@ -1664,17 +1677,23 @@ def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None:

     def to_json(
         self,
-        path_or_buf: str,
-        orient: typing.Literal[
-            "split", "records", "index", "columns", "values", "table"
-        ] = "columns",
+        path_or_buf=None,
+        orient: Optional[
+            typing.Literal["split", "records", "index", "columns", "values", "table"]
+        ] = None,
         *,
         lines: bool = False,
         index: bool = True,
-    ) -> None:
-        return self.to_frame().to_json(
-            path_or_buf=path_or_buf, orient=orient, lines=lines, index=index
-        )
+    ) -> Optional[str]:
+        if utils.is_gcs_path(path_or_buf):
+            return self.to_frame().to_json(
+                path_or_buf=path_or_buf, orient=orient, lines=lines, index=index
+            )
+        else:
+            pd_series = self.to_pandas()
+            return pd_series.to_json(
+                path_or_buf=path_or_buf, orient=orient, lines=lines, index=index  # type: ignore
+            )

     def to_latex(
         self, buf=None, columns=None, header=True, index=True, **kwargs
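
Series gains the same dispatch: gs:// destinations are delegated to the DataFrame exporters (and so to BigQuery), while everything else is handled locally by pandas.Series. A small sketch under the same assumptions as above; the bucket name is a placeholder:

import bigframes.pandas as bpd

s = bpd.Series([1, 2, 3], name="a")

s.to_csv()                             # no destination: CSV text via pandas.Series.to_csv
s.to_json()                            # no destination: JSON text via pandas.Series.to_json
s.to_csv("gs://my-bucket/out/*.csv")   # GCS path: routed through DataFrame.to_csv / EXPORT DATA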

tests/system/small/test_dataframe.py

+67 -1

@@ -4125,6 +4125,72 @@ def test_df_to_latex(scalars_df_index, scalars_pandas_df_index):
     assert bf_result == pd_result


+def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.to_json()
+    # default_handler for arrow types that have no default conversion
+    pd_result = scalars_pandas_df_index.to_json(default_handler=str)
+
+    assert bf_result == pd_result
+
+
+@skip_legacy_pandas
+def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_json(bf_result_file, orient="table")
+        # default_handler for arrow types that have no default conversion
+        scalars_pandas_df_index.to_json(
+            pd_result_file, orient="table", default_handler=str
+        )
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.to_csv()
+    # default_handler for arrow types that have no default conversion
+    pd_result = scalars_pandas_df_index.to_csv()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_csv(bf_result_file)
+        scalars_pandas_df_index.to_csv(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index):
+    # GEOGRAPHY not supported in parquet export.
+    unsupported = ["geography_col"]
+
+    bf_result = scalars_df_index.drop(columns=unsupported).to_parquet()
+    # default_handler for arrow types that have no default conversion
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index):
+    # GEOGRAPHY not supported in parquet export.
+    unsupported = ["geography_col"]
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file)
+        scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
 def test_df_to_records(scalars_df_index, scalars_pandas_df_index):
     unsupported = ["numeric_col"]
     bf_result = scalars_df_index.drop(columns=unsupported).to_records()

@@ -4166,7 +4232,7 @@ def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index):
         scalars_df_index.to_pickle(bf_result_file)
         scalars_pandas_df_index.to_pickle(pd_result_file)
         bf_result = bf_result_file.read()
-        pd_result = bf_result_file.read()
+        pd_result = pd_result_file.read()

     assert bf_result == pd_result

tests/system/small/test_series.py

+38
@@ -2753,6 +2753,44 @@ def test_to_latex(scalars_df_index, scalars_pandas_df_index):
     assert bf_result == pd_result


+def test_series_to_json_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.int64_col.to_json()
+    pd_result = scalars_pandas_df_index.int64_col.to_json()
+
+    assert bf_result == pd_result
+
+
+@skip_legacy_pandas
+def test_series_to_json_local_file(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.int64_col.to_json(bf_result_file)
+        scalars_pandas_df_index.int64_col.to_json(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_series_to_csv_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.int64_col.to_csv()
+    # default_handler for arrow types that have no default conversion
+    pd_result = scalars_pandas_df_index.int64_col.to_csv()
+
+    assert bf_result == pd_result
+
+
+def test_series_to_csv_local_file(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.int64_col.to_csv(bf_result_file)
+        scalars_pandas_df_index.int64_col.to_csv(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
 def test_to_dict(scalars_df_index, scalars_pandas_df_index):
     bf_result = scalars_df_index["int64_too"].to_dict()

third_party/bigframes_vendored/pandas/core/frame.py

+9 -5

@@ -476,11 +476,11 @@ def to_gbq(

     def to_parquet(
         self,
-        path: str,
+        path: Optional[str],
         *,
         compression: Optional[Literal["snappy", "gzip"]] = "snappy",
         index: bool = True,
-    ) -> None:
+    ) -> Optional[bytes]:
         """Write a DataFrame to the binary Parquet format.

         This function writes the dataframe as a `parquet file

@@ -496,9 +496,13 @@ def to_parquet(
             >>> df.to_parquet(path=gcs_bucket)

         Args:
-            path (str):
+            path (str, path object, file-like object, or None, default None):
+                String, path object (implementing ``os.PathLike[str]``), or file-like
+                object implementing a binary ``write()`` function. If None, the result is
+                returned as bytes. If a string or path, it will be used as Root Directory
+                path when writing a partitioned dataset.
                 Destination URI(s) of Cloud Storage files(s) to store the extracted dataframe
-                in format of ``gs://<bucket_name>/<object_name_or_glob>``.
+                should be formatted ``gs://<bucket_name>/<object_name_or_glob>``.
                 If the data size is more than 1GB, you must use a wildcard to export
                 the data into multiple files and the size of the files varies.

@@ -511,7 +515,7 @@ def to_parquet(
                 If ``False``, they will not be written to the file.

         Returns:
-            None.
+            bytes if no path argument is provided else None
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
517521
