From ade505cb9fac96985e62f0c546f0df9b65f301c9 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 26 Oct 2023 05:12:13 +0000 Subject: [PATCH 01/22] Revert "ci: Disable presubmit LLM tests temporarily (#144)" (#148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1641aff37d601b47e0bc4f25ff148be4f718bd1a, which was merged due to automerge label while still being discussed. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- tests/system/small/ml/test_llm.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index a801c36c83..b7257dde1b 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -26,9 +26,6 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model._bqml_model is not None -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_session(bq_connection, llm_text_pandas_df): import bigframes.pandas as bpd @@ -51,9 +48,6 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_connection(llm_text_pandas_df): from bigframes import _config @@ -80,9 +74,6 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df @@ -94,9 +85,6 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df @@ -108,9 +96,6 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." 
-) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df @@ -123,9 +108,6 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df @@ -157,9 +139,6 @@ def test_create_text_embedding_generator_model_defaults(bq_connection): assert model._bqml_model is not None -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df @@ -173,9 +152,6 @@ def test_embedding_generator_predict_success( assert value.size == 768 -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df From bfd49a54e52e2d502345aa8f9b54457c902abf5c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 11:48:14 -0500 Subject: [PATCH 02/22] refactor: make `to_pandas()` call `to_arrow()` and use local dtypes in DataFrame construction (#132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Towards internal issue 280662868 πŸ¦• --- bigframes/core/blocks.py | 41 +--- bigframes/core/indexes/index.py | 3 +- bigframes/dtypes.py | 6 + bigframes/session/__init__.py | 10 +- bigframes/session/_io/pandas.py | 77 +++++++ tests/system/small/test_dataframe.py | 10 - tests/system/small/test_series.py | 48 ++++- tests/unit/session/test_io_pandas.py | 296 +++++++++++++++++++++++++++ tests/unit/test_dtypes.py | 57 +++--- 9 files changed, 457 insertions(+), 91 deletions(-) create mode 100644 bigframes/session/_io/pandas.py create mode 100644 tests/unit/session/test_io_pandas.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 046d2b3a44..eab4645477 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -28,11 +28,8 @@ from typing import Iterable, List, Optional, Sequence, Tuple import warnings -import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy import pandas as pd -import pyarrow as pa # type: ignore import bigframes.constants as constants import bigframes.core as core @@ -46,6 +43,7 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.session._io.pandas import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common # Type constraint for wherever column labels are used @@ -372,34 +370,11 @@ def reorder_levels(self, ids: typing.Sequence[str]): level_names = [self.col_id_to_index_name[index_id] for index_id in ids] return Block(self.expr, ids, self.column_labels, level_names) - @classmethod - def _to_dataframe( - cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype] - ) -> pd.DataFrame: + def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" - dtypes = 
bigframes.dtypes.to_pandas_dtypes_overrides(result.schema) - df = result.to_dataframe( - dtypes=dtypes, - bool_dtype=pd.BooleanDtype(), - int_dtype=pd.Int64Dtype(), - float_dtype=pd.Float64Dtype(), - string_dtype=pd.StringDtype(storage="pyarrow"), - date_dtype=pd.ArrowDtype(pa.date32()), - datetime_dtype=pd.ArrowDtype(pa.timestamp("us")), - time_dtype=pd.ArrowDtype(pa.time64("us")), - timestamp_dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - ) - - # Convert Geography column from StringDType to GeometryDtype. - for column_name, dtype in schema.items(): - if dtype == gpd.array.GeometryDtype(): - df[column_name] = gpd.GeoSeries.from_wkt( - # https://github.com/geopandas/geopandas/issues/1879 - df[column_name].replace({numpy.nan: None}), - # BigQuery geography type is based on the WGS84 reference ellipsoid. - crs="EPSG:4326", - ) - return df + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + return self._expr._session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -480,8 +455,7 @@ def _compute_and_count( if sampling_method == _HEAD: total_rows = int(results_iterator.total_rows * fraction) results_iterator.max_results = total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) @@ -510,8 +484,7 @@ def _compute_and_count( ) else: total_rows = results_iterator.total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 677bb8529c..b9ffdff21e 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -399,9 +399,10 @@ def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. index_columns = list(self._block.index_columns) + dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() - df = expr._session._rows_to_dataframe(results) + df = expr._session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index index.names = list(self._block._index_labels) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index da221a95ac..079f0cc27a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -169,6 +169,10 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + # BigQuery only supports integers of size 64 bits. 
+ if isinstance(ibis_dtype, ibis_dtypes.Integer): + return pd.Int64Dtype() + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): @@ -372,6 +376,8 @@ def cast_ibis_value( ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), ibis_dtypes.date: (), + ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,), + ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,), ibis_dtypes.time: (), ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),), ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,), diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index db9c5a353c..af1f70d54d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1515,14 +1515,10 @@ def _get_table_size(self, destination_table): return table.num_bytes def _rows_to_dataframe( - self, row_iterator: bigquery.table.RowIterator + self, row_iterator: bigquery.table.RowIterator, dtypes: Dict ) -> pandas.DataFrame: - return row_iterator.to_dataframe( - bool_dtype=pandas.BooleanDtype(), - int_dtype=pandas.Int64Dtype(), - float_dtype=pandas.Float64Dtype(), - string_dtype=pandas.StringDtype(storage="pyarrow"), - ) + arrow_table = row_iterator.to_arrow() + return bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py new file mode 100644 index 0000000000..163127b546 --- /dev/null +++ b/bigframes/session/_io/pandas.py @@ -0,0 +1,77 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Union + +import geopandas # type: ignore +import pandas +import pandas.arrays +import pyarrow # type: ignore +import pyarrow.compute # type: ignore + +import bigframes.constants + + +def arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + if len(dtypes) != arrow_table.num_columns: + raise ValueError( + f"Number of types {len(dtypes)} doesn't match number of columns " + f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}" + ) + + serieses = {} + for field, column in zip(arrow_table.schema, arrow_table): + dtype = dtypes[field.name] + + if dtype == geopandas.array.GeometryDtype(): + series = geopandas.GeoSeries.from_wkt( + column, + # BigQuery geography type is based on the WGS84 reference ellipsoid. + crs="EPSG:4326", + ) + elif dtype == pandas.Float64Dtype(): + # Preserve NA/NaN distinction. Note: This is currently needed, even if we use + # nullable Float64Dtype in the types_mapper. See: + # https://github.com/pandas-dev/pandas/issues/55668 + # Regarding type: ignore, this class has been public at this + # location since pandas 1.2.0. 
See: + # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html + pd_array = pandas.arrays.FloatingArray( # type: ignore + column.to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif dtype == pandas.Int64Dtype(): + # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly + # casts to float64 in an intermediate step. + pd_array = pandas.arrays.IntegerArray( + pyarrow.compute.fill_null(column, 0).to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif isinstance(dtype, pandas.ArrowDtype): + # Avoid conversion logic if we are backing the pandas Series by the + # arrow array. + series = pandas.Series( + pandas.arrays.ArrowExtensionArray(column), # type: ignore + dtype=dtype, + ) + else: + series = column.to_pandas(types_mapper=lambda _: dtype) + + serieses[field.name] = series + + return pandas.DataFrame(serieses) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 19e50eb06d..84e8def83b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2046,16 +2046,6 @@ def test__dir__with_rename(scalars_dfs): def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - # dtypes of empty columns are a known area of divergence from pandas - for column in pd_result.columns: - if ( - pd_result[column].empty and column != "geography_col" - ): # for empty geography_col, bigframes assigns non-object dtype - pd_result[column] = pd_result[column].astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_frame_equal( bf_result, pd_result, diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index bd9edbb1ca..c9510290b6 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -575,7 +575,15 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_mods(scalars_dfs, col_x, col_y, method): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]).to_pandas() + x_bf = scalars_df[col_x] + y_bf = scalars_df[col_y] + bf_series = getattr(x_bf, method)(y_bf) + # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod + if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): + bf_result = bf_series.to_pandas() + else: + bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) pd.testing.assert_series_equal(pd_result, bf_result) @@ -620,8 +628,20 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( scalars_pandas_df[col_y] ) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
+ if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -649,8 +669,20 @@ def test_divmods_scalars(scalars_dfs, col_x, other, method): scalars_df, scalars_pandas_df = scalars_dfs bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. + if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -1941,12 +1973,6 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - if pd_result.empty: - pd_result = pd_result.astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_series_equal( bf_result, pd_result, diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py new file mode 100644 index 0000000000..8b95977ec3 --- /dev/null +++ b/tests/unit/session/test_io_pandas.py @@ -0,0 +1,296 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime +from typing import Dict, Union + +import geopandas # type: ignore +import numpy +import pandas +import pandas.arrays +import pandas.testing +import pyarrow # type: ignore +import pytest + +import bigframes.session._io.pandas + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes", "expected"), + ( + pytest.param( + pyarrow.Table.from_pydict({}), + {}, + pandas.DataFrame(), + id="empty-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.array([None, None, None], type=pyarrow.bool_()), + "float": pyarrow.array([None, None, None], type=pyarrow.float64()), + "int": pyarrow.array([None, None, None], type=pyarrow.int64()), + "string": pyarrow.array([None, None, None], type=pyarrow.string()), + "time": pyarrow.array( + [None, None, None], type=pyarrow.time64("us") + ), + } + ), + { + "bool": "boolean", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + }, + pandas.DataFrame( + { + "bool": pandas.Series([None, None, None], dtype="boolean"), + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [float("nan"), float("nan"), float("nan")], + dtype="float64", + ), + numpy.array([True, True, True], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [None, None, None], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + [None, None, None], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + None, + None, + None, + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + id="nulls-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "date": pyarrow.array( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + type=pyarrow.date32(), + ), + "datetime": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + type=pyarrow.timestamp("us"), + ), + "string": ["123", None, "abc", "xyz"], + "time": pyarrow.array( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + type=pyarrow.time64("us"), + ), + "timestamp": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + type=pyarrow.timestamp("us", datetime.timezone.utc), + ), + } + ), + { + "date": pandas.ArrowDtype(pyarrow.date32()), + "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + "timestamp": pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + }, + pandas.DataFrame( + { + "date": pandas.Series( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + "datetime": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + "timestamp": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + 
datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + dtype=pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + ), + } + ), + id="arrow-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": [True, None, True, False], + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pyarrow.array( + [1.0, None, float("nan"), -1.0], + type=pyarrow.float64(), + ), + "int": pyarrow.array( + [1, None, -1, 2**63 - 1], + type=pyarrow.int64(), + ), + "string": ["123", None, "abc", "xyz"], + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "geocol": [ + "POINT(32 210)", + None, + "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)", + ] + } + ), + {"geocol": geopandas.array.GeometryDtype()}, + pandas.DataFrame( + { + "geocol": geopandas.GeoSeries.from_wkt( + ["POINT(32 210)", None, "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"], + crs="EPSG:4326", + ), + } + ), + id="geography-dtype", + ), + ), +) +def test_arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], + dtypes: Dict, + expected: pandas.DataFrame, +): + actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + pandas.testing.assert_series_equal(actual.dtypes, expected.dtypes) + + # assert_frame_equal is converting to numpy internally, which causes some + # loss of precision with the extreme values in this test. 
+ for column in actual.columns: + assert tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in actual[column].items() + ) == tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in expected[column].items() + ) + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes"), + ( + pytest.param( + pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}), + {"col1": "Int64"}, + id="too-few-dtypes", + ), + pytest.param( + pyarrow.RecordBatch.from_pydict({"col1": [1]}), + {"col1": "Int64", "col2": "string[pyarrow]"}, + id="too-many-dtypes", + ), + ), +) +def test_arrow_to_pandas_wrong_size_dtypes( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"): + bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 3baff2e1f5..6ceaaf911b 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -29,41 +29,42 @@ # TODO(bmil): Add ARRAY, INTERVAL, STRUCT to cover all the standard # BigQuery data types as they appear in Ibis: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - (ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), np.dtype("O")), - (ibis_dtypes.boolean, pd.BooleanDtype()), - (ibis_dtypes.binary, np.dtype("O")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), - (ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us"))), - (ibis_dtypes.float64, pd.Float64Dtype()), - ( + pytest.param( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + np.dtype("O"), + id="bignumeric", + ), + pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), + pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), + pytest.param( + ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" + ), + pytest.param(ibis_dtypes.float64, pd.Float64Dtype(), id="float"), + pytest.param( ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), gpd.array.GeometryDtype(), + id="geography", ), - (ibis_dtypes.int64, pd.Int64Dtype()), - (ibis_dtypes.json, np.dtype("O")), - (ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), np.dtype("O")), - (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), - ( + pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), + pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), + # TODO(tswast): custom dtype (or at least string dtype) for JSON objects + pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), + pytest.param( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + np.dtype("O"), + id="numeric", + ), + pytest.param( + ibis_dtypes.string, pd.StringDtype(storage="pyarrow"), id="string" + ), + pytest.param(ibis_dtypes.time, pd.ArrowDtype(pa.time64("us")), id="time"), + pytest.param( ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), # type: ignore + id="timestamp", ), ], - ids=[ - "bignumeric", - "bool", - "bytes", - "date", - "datetime", - "float", - "geography", - "int64", - "json", - "numeric", - "string", - "time", - "timestamp", - ], ) def test_ibis_dtype_converts(ibis_dtype, bigframes_dtype): """Test all the Ibis data types needed to read BigQuery tables""" From 
d423e102453c070af64aa37741fb9ff6fb9a6d25 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 26 Oct 2023 17:44:14 +0000 Subject: [PATCH 03/22] test: Log slowest tests durations (#146) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- noxfile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 1864da9fe7..d0bbda80fd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -305,8 +305,10 @@ def run_system( "py.test", "--quiet", "-n=20", - # Any individual test taking longer than 10 mins will be terminated. + # Any individual test taking longer than 15 mins will be terminated. "--timeout=900", + # Log 20 slowest tests + "--durations=20", f"--junitxml={prefix_name}_{session.python}_sponge_log.xml", ] if print_duration: From 45c617fee7becc42f1c129246ffdc32f3a963f12 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 26 Oct 2023 11:36:14 -0700 Subject: [PATCH 04/22] docs: link to ML.EVALUATE BQML page for score() methods (#137) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/ml/ensemble.py | 12 ++++++++++++ bigframes/ml/forecasting.py | 6 ++++++ third_party/bigframes_vendored/sklearn/base.py | 14 +++++++++++++- .../bigframes_vendored/sklearn/cluster/_kmeans.py | 9 +++++++-- .../sklearn/decomposition/_pca.py | 8 +++++++- 5 files changed, 45 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 113ad872b5..19ca8608ff 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -507,6 +507,12 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. @@ -676,6 +682,12 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. 
+ See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8a6de1dd81..8e309d5e73 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -112,6 +112,12 @@ def score( ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#time_series_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame only contains 1 column as diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 42868ce51f..768328e552 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -85,6 +85,12 @@ def score(self, X, y): which is a harsh metric since you require for each sample that each label set be correctly predicted. + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models + for the outputs relevant to this model type. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples, n_features). Test samples. @@ -105,7 +111,13 @@ class RegressorMixin: _estimator_type = "regressor" def score(self, X, y): - """Return the evaluation metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index ece62dc147..5369d3662d 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -12,7 +12,6 @@ # License: BSD 3 clause from abc import ABC -from typing import List, Optional from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -83,7 +82,13 @@ def score( X, y=None, ): - """Metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#k-means_models + for the outputs relevant to this model type. 
Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 97fee5a501..011ecc06dd 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -55,7 +55,13 @@ def fit(self, X, y=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X=None, y=None): - """Return the metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + Output matches that of the BigQuery ML.EVALUTE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models + for the outputs relevant to this model type. Args: X (default None): From c639a3657465e2b68a3b93c363bd3ae1e969d2cc Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 26 Oct 2023 12:30:15 -0700 Subject: [PATCH 05/22] feat: populate ibis version in user agent (#140) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/session/clients.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 544f74265f..e33413002f 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -28,12 +28,13 @@ import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 import google.cloud.resourcemanager_v3 +import ibis import pydata_google_auth import bigframes.version _ENV_DEFAULT_PROJECT = "GOOGLE_CLOUD_PROJECT" -_APPLICATION_NAME = f"bigframes/{bigframes.version.__version__}" +_APPLICATION_NAME = f"bigframes/{bigframes.version.__version__} ibis/{ibis.__version__}" _SCOPES = ["https://www.googleapis.com/auth/cloud-platform"] # BigQuery is a REST API, which requires the protocol as part of the URL. From 2ddbf743efc2fd8ffb61ae8d3333fc4b98ce4b55 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 15:12:14 -0500 Subject: [PATCH 06/22] fix: don't override the global logging config (#138) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/clients.py | 3 --- bigframes/remote_function.py | 5 ----- 2 files changed, 8 deletions(-) diff --git a/bigframes/clients.py b/bigframes/clients.py index 4ba9d93d69..de2421e499 100644 --- a/bigframes/clients.py +++ b/bigframes/clients.py @@ -24,9 +24,6 @@ from google.cloud import bigquery_connection_v1, resourcemanager_v3 from google.iam.v1 import iam_policy_pb2, policy_pb2 -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s" -) logger = logging.getLogger(__name__) diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index c82ba84056..a39cd033f6 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -53,11 +53,6 @@ from bigframes import clients import bigframes.constants as constants -# TODO(shobs): Change the min log level to INFO after the development stabilizes -# before June 2023 -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s" -) logger = logging.getLogger(__name__) # Protocol version 4 is available in python version 3.4 and above From 27c57255c7fe11e1ef9b9826d988d80fc17442a6 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Thu, 26 Oct 2023 14:01:04 -0700 Subject: [PATCH 07/22] fix: use indexee's session for loc listlike cases (#152) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/core/indexers.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 4f5a9471b9..d18a0a38ef 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -310,7 +310,9 @@ def _loc_getitem_series_or_dataframe( index_name = temporary_index_names[i] values = [entry[i] for entry in key] index_cols_dict[index_name] = values - keys_df = bigframes.dataframe.DataFrame(index_cols_dict) + keys_df = bigframes.dataframe.DataFrame( + index_cols_dict, session=series_or_dataframe._get_block().expr._session + ) keys_df = keys_df.set_index(temporary_index_names, drop=True) keys_df = keys_df.rename_axis(original_index_names) else: @@ -320,7 +322,10 @@ def _loc_getitem_series_or_dataframe( index_name_is_none = index_name is None if index_name_is_none: index_name = "unnamed_col" - keys_df = bigframes.dataframe.DataFrame({index_name: key}) + keys_df = bigframes.dataframe.DataFrame( + {index_name: key}, + session=series_or_dataframe._get_block().expr._session, + ) keys_df = keys_df.set_index(index_name, drop=True) if index_name_is_none: keys_df.index.name = None From 8e4451841ba09099b0ed5433f9102511741dfbed Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Oct 2023 14:40:13 -0700 Subject: [PATCH 08/22] feat: add pandas.qcut (#104) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/core/reshape/__init__.py | 33 ++++++++++++ bigframes/operations/aggregations.py | 51 +++++++++++++++++++ bigframes/pandas/__init__.py | 13 +++++ tests/system/small/test_pandas.py | 25 +++++++++ .../pandas/core/reshape/tile.py | 30 +++++++++++ 5 files changed, 152 insertions(+) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 339ce7466a..dc61c3baad 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -20,6 +20,7 @@ import bigframes.core as core import bigframes.core.utils as utils import bigframes.dataframe +import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.series @@ -118,3 +119,35 @@ def cut( f"Only labels=False is supported in BigQuery DataFrames so far. 
{constants.FEEDBACK_LINK}" ) return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) + + +def qcut( + x: bigframes.series.Series, + q: typing.Union[int, typing.Sequence[float]], + *, + labels: Optional[bool] = None, + duplicates: typing.Literal["drop", "error"] = "error", +) -> bigframes.series.Series: + if isinstance(q, int) and q <= 0: + raise ValueError("`q` should be a positive integer.") + + if labels is not False: + raise NotImplementedError( + f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + ) + if duplicates != "drop": + raise NotImplementedError( + f"Only duplicates='drop' is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + ) + block = x._block + label = block.col_id_to_label[x._value_column] + block, nullity_id = block.apply_unary_op(x._value_column, ops.notnull_op) + block, result = block.apply_window_op( + x._value_column, + agg_ops.QcutOp(q), + window_spec=core.WindowSpec(grouping_keys=(nullity_id,)), + ) + block, result = block.apply_binary_op( + result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label + ) + return bigframes.series.Series(block.select_column(result)) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 23271e8220..465d188724 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -254,6 +254,53 @@ def handles_ties(self): return True +class QcutOp(WindowOp): + def __init__(self, quantiles: typing.Union[int, typing.Sequence[float]]): + self.name = f"qcut-{quantiles}" + self._quantiles = quantiles + + @numeric_op + def _as_ibis( + self, column: ibis_types.Column, window=None + ) -> ibis_types.IntegerValue: + if isinstance(self._quantiles, int): + quantiles_ibis = dtypes.literal_to_ibis_scalar(self._quantiles) + percent_ranks = typing.cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + float_bucket = typing.cast( + ibis_types.FloatingColumn, (percent_ranks * quantiles_ibis) + ) + return float_bucket.ceil().clip(lower=_ibis_num(1)) - _ibis_num(1) + else: + percent_ranks = typing.cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + out = ibis.case() + first_ibis_quantile = dtypes.literal_to_ibis_scalar(self._quantiles[0]) + out = out.when(percent_ranks < first_ibis_quantile, None) + for bucket_n in range(len(self._quantiles) - 1): + ibis_quantile = dtypes.literal_to_ibis_scalar( + self._quantiles[bucket_n + 1] + ) + out = out.when( + percent_ranks <= ibis_quantile, + dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), + ) + out = out.else_(None) + return out.end() + + @property + def skips_nulls(self): + return False + + @property + def handles_ties(self): + return True + + class NuniqueOp(AggregateOp): name = "nunique" @@ -491,3 +538,7 @@ def lookup_agg_func(key: str) -> AggregateOp: return _AGGREGATIONS_LOOKUP[key] else: raise ValueError(f"Unrecognize aggregate function: {key}") + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5c1928e6f0..8d9726312f 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -134,6 +134,19 @@ def cut( cut.__doc__ = vendored_pandas_tile.cut.__doc__ +def qcut( + x: bigframes.series.Series, + q: int, + *, + labels: Optional[bool] = None, + duplicates: typing.Literal["drop", "error"] 
= "error", +) -> bigframes.series.Series: + return bigframes.core.reshape.qcut(x, q, labels=labels, duplicates=duplicates) + + +qcut.__doc__ = vendored_pandas_tile.qcut.__doc__ + + def merge( left: DataFrame, right: DataFrame, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a429c6551d..f8fa78587f 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -223,3 +223,28 @@ def test_cut(scalars_dfs): bf_result = bf_result.to_pandas() pd_result = pd_result.astype("Int64") pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("q",), + [ + (1,), + (2,), + (7,), + (32,), + ([0, 0.1, 0.3, 0.4, 0.9, 1.0],), + ([0.5, 0.9],), + ], +) +def test_qcut(scalars_dfs, q): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.qcut( + scalars_pandas_df["float64_col"], q, labels=False, duplicates="drop" + ) + bf_result = bpd.qcut(scalars_df["float64_col"], q, labels=False, duplicates="drop") + + bf_result = bf_result.to_pandas() + pd_result = pd_result.astype("Int64") + + pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 4f5f2efef0..24ea655a5f 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -65,3 +65,33 @@ def cut( False : returns an ndarray of integers. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def qcut(x, q, *, labels=None, duplicates="error"): + """ + Quantile-based discretization function. + + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. + + Args: + x (Series): + The input Series to be binned. Must be 1-dimensional. + q (int or list-like of float): + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels (None): + Used as labels for the resulting bins. Must be of the same length as + the resulting bins. If False, return only integer indicators of the + bins. If True, raises an error. + duplicates ({default 'raise', 'drop'}, optional): + If bin edges are not unique, raise ValueError or drop non-uniques. + + Returns: + Series: Categorical or Series of integers if labels is False + The return type (Categorical or Series) depends on the input: a Series + of type category if input is a Series else Categorical. Bins are + represented as categories when categorical data is returned. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 5edcd19e6200db9b9ebe3d4945816b3ebf1f7bcd Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Oct 2023 15:20:15 -0700 Subject: [PATCH 09/22] feat: add unstack to series, add level param (#115) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/core/blocks.py | 26 ++++++++++-- bigframes/dataframe.py | 32 +++++---------- bigframes/series.py | 40 +++++++++++-------- tests/system/conftest.py | 8 +++- tests/system/small/test_dataframe.py | 10 ++++- tests/system/small/test_multiindex.py | 31 ++++++++++++-- .../bigframes_vendored/pandas/core/series.py | 13 ++++++ 7 files changed, 112 insertions(+), 48 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index eab4645477..e8a3968b3d 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -67,6 +67,10 @@ _MONOTONIC_DECREASING = "monotonic_decreasing" +LevelType = typing.Union[str, int] +LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] + + class BlockHolder(typing.Protocol): """Interface for mutable objects with state represented by a block value object.""" @@ -1423,9 +1427,7 @@ def _get_unique_values( raise ValueError(f"Too many unique values: {pd_values}") if len(columns) > 1: - return pd.MultiIndex.from_frame( - pd_values.sort_values(by=list(pd_values.columns), na_position="first") - ) + return pd.MultiIndex.from_frame(pd_values) else: return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first")) @@ -1611,6 +1613,24 @@ def cached(self) -> Block: index_labels=self.index_labels, ) + def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]: + if utils.is_list_like(level): + levels = list(level) + else: + levels = [level] + resolved_level_ids = [] + for level_ref in levels: + if isinstance(level_ref, int): + resolved_level_ids.append(self.index_columns[level_ref]) + elif isinstance(level_ref, typing.Hashable): + matching_ids = self.index_name_to_col_id.get(level_ref, []) + if len(matching_ids) != 1: + raise ValueError("level name cannot be found or is ambiguous") + resolved_level_ids.append(matching_ids[0]) + else: + raise ValueError(f"Unexpected level: {level_ref}") + return resolved_level_ids + def _is_monotonic( self, column_ids: typing.Union[str, Sequence[str]], increasing: bool ) -> bool: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5c0d9b78e1..869075a970 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1038,22 +1038,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): raise ValueError("Columns must be a multiindex to reorder levels.") def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: - if utils.is_list_like(level): - levels = list(level) - else: - levels = [level] - resolved_level_ids = [] - for level_ref in levels: - if isinstance(level_ref, int): - resolved_level_ids.append(self._block.index_columns[level_ref]) - elif isinstance(level_ref, typing.Hashable): - matching_ids = self._block.index_name_to_col_id.get(level_ref, []) - if len(matching_ids) != 1: - raise ValueError("level name cannot be found or is ambiguous") - resolved_level_ids.append(matching_ids[0]) - else: - raise ValueError(f"Unexpected level: {level_ref}") - return resolved_level_ids + return self._block.resolve_index_level(level) def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame: block = self._block.rename(columns=columns) @@ -1802,20 +1787,25 @@ def _stack_multi(self, level: LevelsType = -1): block = block.stack(levels=len(level)) return DataFrame(block) - 
def unstack(self): + def unstack(self, level: LevelsType = -1): + if isinstance(level, int) or isinstance(level, str): + level = [level] + block = self._block # Special case, unstack with mono-index transpose into a series if self.index.nlevels == 1: block = block.stack(how="right", levels=self.columns.nlevels) return bigframes.series.Series(block) - # Pivot by last level of index - index_ids = block.index_columns + # Pivot by index levels + unstack_ids = self._resolve_levels(level) block = block.reset_index(drop=False) - block = block.set_index(index_ids[:-1]) + block = block.set_index( + [col for col in self._block.index_columns if col not in unstack_ids] + ) pivot_block = block.pivot( - columns=[index_ids[-1]], + columns=unstack_ids, values=self._block.value_columns, values_in_index=True, ) diff --git a/bigframes/series.py b/bigframes/series.py index 49df8ab61e..c191452783 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -352,22 +352,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0): return Series(self._block.reorder_levels(resolved_level_ids)) def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]: - if _is_list_like(level): - levels = list(level) - else: - levels = [level] - resolved_level_ids = [] - for level_ref in levels: - if isinstance(level_ref, int): - resolved_level_ids.append(self._block.index_columns[level_ref]) - elif isinstance(level_ref, typing.Hashable): - matching_ids = self._block.index_name_to_col_id.get(level_ref, []) - if len(matching_ids) != 1: - raise ValueError("level name cannot be found or is ambiguous") - resolved_level_ids.append(matching_ids[0]) - else: - raise ValueError(f"Unexpected level: {level_ref}") - return resolved_level_ids + return self._block.resolve_index_level(level) def between(self, left, right, inclusive="both"): if inclusive not in ["both", "neither", "left", "right"]: @@ -918,6 +903,29 @@ def argmin(self) -> int: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) + def unstack(self, level: LevelsType = -1): + if isinstance(level, int) or isinstance(level, str): + level = [level] + + block = self._block + + if self.index.nlevels == 1: + raise ValueError("Series must have multi-index to unstack") + + # Pivot by index levels + unstack_ids = self._resolve_levels(level) + block = block.reset_index(drop=False) + block = block.set_index( + [col for col in self._block.index_columns if col not in unstack_ids] + ) + + pivot_block = block.pivot( + columns=unstack_ids, + values=self._block.value_columns, + values_in_index=False, + ) + return bigframes.dataframe.DataFrame(pivot_block) + def idxmax(self) -> blocks.Label: block = self._block.order_by( [ diff --git a/tests/system/conftest.py b/tests/system/conftest.py index cb664302a8..8885b03d34 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -400,7 +400,11 @@ def hockey_df( hockey_table_id: str, session: bigframes.Session ) -> bigframes.dataframe.DataFrame: """DataFrame pointing at test data.""" - return session.read_gbq(hockey_table_id) + return ( + session.read_gbq(hockey_table_id) + .set_index(["player_name", "season"]) + .sort_index() + ) @pytest.fixture(scope="session") @@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame: "season": pd.Int64Dtype(), }, ) - df.index = df.index.astype("Int64") + df = df.set_index(["player_name", "season"]).sort_index() return df diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 84e8def83b..a746a1867c 100644 --- 
a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns): ], ) def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): - bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas() - pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns) + bf_result = ( + hockey_df.reset_index() + .pivot(values=values, index=index, columns=columns) + .to_pandas() + ) + pd_result = hockey_pandas_df.reset_index().pivot( + values=values, index=index, columns=columns + ) # Pandas produces NaN, where bq dataframes produces pd.NA pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index b5c78de69c..a87dacae04 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -909,13 +909,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_multi_index_unstack(hockey_df, hockey_pandas_df): +@pytest.mark.parametrize( + ("level",), + [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)], +) +def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level): bf_result = ( - hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas() + hockey_df.set_index(["team_name", "position"], append=True) + .unstack(level=level) + .to_pandas() ) pd_result = hockey_pandas_df.set_index( - ["team_name", "season", "position"] - ).unstack() + ["team_name", "position"], append=True + ).unstack(level=level) + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("level",), + [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)], +) +def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level): + bf_result = ( + hockey_df.set_index(["team_name", "position"], append=True)["number"] + .unstack(level=level) + .to_pandas() + ) + pd_result = hockey_pandas_df.set_index(["team_name", "position"], append=True)[ + "number" + ].unstack(level=level) pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index bd1f9a9a18..f0e13e16f5 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1654,6 +1654,19 @@ def clip(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def unstack(self, level): + """ + Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. + + Args: + level (int, str, or list of these, default last level): + Level(s) to unstack, can pass level name. + + Returns: + DataFrame: Unstacked Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def argmax(self): """ Return int position of the smallest value in the Series. 
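As a rough, hypothetical illustration (not part of the patch above), the `unstack` changes are meant to line up with plain pandas semantics such as the following sketch. The `player_name`/`season`/`position`/`number` frame below is a made-up stand-in for the hockey test fixture, and the bigframes versions differ mainly in using nullable dtypes (hence `check_dtype=False` in the tests above).

import pandas as pd

# Toy multi-index frame, loosely modeled on the hockey test fixture.
df = pd.DataFrame(
    {
        "player_name": ["a", "a", "b", "b"],
        "season": [2022, 2023, 2022, 2023],
        "position": ["C", "C", "D", "D"],
        "number": [9, 9, 4, 4],
    }
).set_index(["player_name", "season", "position"])

# DataFrame.unstack now accepts a level (or list of levels) instead of
# always pivoting the last index level.
wide = df.unstack(level="season")

# Series.unstack is the new API: pivot a single value column by an index level.
by_position = df["number"].unstack(level="position")

print(wide)
print(by_position)

Per the tests above, the same calls on a bigframes DataFrame or Series built over equivalent data should produce matching results once materialized with `to_pandas()`.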
From 3afd4a35f4c38dad86dab17ff62444cd418cab88 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 18:02:14 -0500 Subject: [PATCH 10/22] feat: add `DataFrame.to_pandas_batches()` to download large `DataFrame` objects (#136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds on https://togithub.com/googleapis/python-bigquery-dataframes/pull/132 Towards internal issue 280662868 πŸ¦• --- bigframes/core/blocks.py | 29 +++++++++++-- bigframes/dataframe.py | 4 ++ bigframes/session/_io/pandas.py | 20 +++++++-- tests/system/small/test_dataframe_io.py | 8 ++++ tests/unit/session/test_io_pandas.py | 56 +++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 8 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e8a3968b3d..9db193a04e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -416,6 +416,30 @@ def to_pandas( ) return df, query_job + def to_pandas_batches(self): + """Download results one message at a time.""" + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + results_iterator, _ = self._expr.start_query() + for arrow_table in results_iterator.to_arrow_iterable( + bqstorage_client=self._expr._session.bqstoragereadclient + ): + df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + self._copy_index_to_pandas(df) + yield df + + def _copy_index_to_pandas(self, df: pd.DataFrame): + """Set the index on pandas DataFrame to match this block. + + Warning: This method modifies ``df`` inplace. + """ + if self.index_columns: + df.set_index(list(self.index_columns), inplace=True) + # Pandas names is annotated as list[str] rather than the more + # general Sequence[Label] that BigQuery DataFrames has. + # See: https://github.com/pandas-dev/pandas-stubs/issues/804 + df.index.names = self.index.names # type: ignore + def _compute_and_count( self, value_keys: Optional[Iterable[str]] = None, @@ -489,10 +513,7 @@ def _compute_and_count( else: total_rows = results_iterator.total_rows df = self._to_dataframe(results_iterator) - - if self.index_columns: - df.set_index(list(self.index_columns), inplace=True) - df.index.names = self.index.names # type: ignore + self._copy_index_to_pandas(df) return df, total_rows, query_job diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 869075a970..3fd8319876 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -893,6 +893,10 @@ def to_pandas( self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) + def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: + """Stream DataFrame results to an iterable of pandas DataFrame""" + return self._block.to_pandas_batches() + def _compute_dry_run(self) -> bigquery.QueryJob: return self._block._compute_dry_run() diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 163127b546..1af00a2d01 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -46,20 +46,32 @@ def arrow_to_pandas( # Preserve NA/NaN distinction. Note: This is currently needed, even if we use # nullable Float64Dtype in the types_mapper. 
See: # https://github.com/pandas-dev/pandas/issues/55668 + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, float("nan")) # Regarding type: ignore, this class has been public at this # location since pandas 1.2.0. See: # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html pd_array = pandas.arrays.FloatingArray( # type: ignore - column.to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif dtype == pandas.Int64Dtype(): # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly # casts to float64 in an intermediate step. + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, 0) pd_array = pandas.arrays.IntegerArray( - pyarrow.compute.fill_null(column, 0).to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif isinstance(dtype, pandas.ArrowDtype): diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d60083a837..8f5d706f62 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -83,6 +83,14 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): + """Verify to_pandas_batches() APIs returns the expected dtypes.""" + expected = scalars_df_default_index.dtypes + for df in scalars_df_default_index.to_pandas_batches(): + actual = df.dtypes + pd.testing.assert_series_equal(actual, expected) + + @pytest.mark.parametrize( ("index"), [True, False], diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 8b95977ec3..0f6f5dae03 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -231,6 +231,62 @@ ), id="scalar-dtypes", ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.chunked_array( + [[True, None], [True, False]], + type=pyarrow.bool_(), + ), + "bytes": pyarrow.chunked_array( + [[b"123", None], [b"abc", b"xyz"]], + type=pyarrow.binary(), + ), + "float": pyarrow.chunked_array( + [[1.0, None], [float("nan"), -1.0]], + type=pyarrow.float64(), + ), + "int": pyarrow.chunked_array( + [[1, None], [-1, 2**63 - 1]], + type=pyarrow.int64(), + ), + "string": pyarrow.chunked_array( + [["123", None], ["abc", "xyz"]], + type=pyarrow.string(), + ), + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + 
dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes-chunked_array", + ), pytest.param( pyarrow.Table.from_pydict( { From 39df43e243ac0374d1a1eb2a75779324825afbe9 Mon Sep 17 00:00:00 2001 From: Bradford Orr <15842009+orrbradford@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:46:14 -0700 Subject: [PATCH 11/22] =?UTF-8?q?fix:=20resolve=20plotly=20rendering=20iss?= =?UTF-8?q?ue=20by=20using=20ipython=20html=20for=20job=20pro=E2=80=A6=20(?= =?UTF-8?q?#134)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …gress messages Fixes bug that was preventing plotly rendering to show after the progress bar. Original ipywidgets implementation isn't necessary for basic opening of urls Screen recording: https://togithub.com/googleapis/python-bigquery-dataframes/assets/15842009/5225ce05-117a-4808-9ff0-cb2c3aaf3a40 Internal bug: b/297062404 --- bigframes/formatting_helpers.py | 24 ++++--- tests/system/small/test_progress_bar.py | 83 +++++++++++-------------- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 6851bdd2bd..752aeb7a10 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -16,6 +16,7 @@ # TODO(orrbradford): cleanup up typings and documenttion in this file import datetime +import random from typing import Any, Optional, Union import google.api_core.exceptions as api_core_exceptions @@ -57,9 +58,9 @@ def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): Pywidget html table. """ if query_job is None: - return widgets.HTML("No job information available") + return display.HTML("No job information available") if query_job.dry_run: - return widgets.HTML( + return display.HTML( f"Computation deferred. Computation will process {get_formatted_bytes(query_job.total_bytes_processed)}" ) table_html = "" @@ -125,16 +126,20 @@ def wait_for_query_job( Returns: A row iterator over the query results. """ - loading_bar = widgets.HTML(get_query_job_loading_html(query_job)) if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" try: if progress_bar == "notebook": - display.display(loading_bar) + display_id = str(random.random()) + loading_bar = display.HTML(get_query_job_loading_html(query_job)) + display.display(loading_bar, display_id=display_id) query_result = query_job.result(max_results=max_results) query_job.reload() - loading_bar.value = get_query_job_loading_html(query_job) + display.update_display( + display.HTML(get_query_job_loading_html(query_job)), + display_id=display_id, + ) elif progress_bar == "terminal": initial_loading_bar = get_query_job_loading_string(query_job) print(initial_loading_bar) @@ -171,16 +176,19 @@ def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): progress_bar (str, Optional): Which progress bar to show. 
""" - loading_bar = widgets.HTML(get_base_job_loading_html(job)) if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" try: if progress_bar == "notebook": - display.display(loading_bar) + display_id = str(random.random()) + loading_bar = display.HTML(get_base_job_loading_html(job)) + display.display(loading_bar, display_id=display_id) job.result() job.reload() - loading_bar.value = get_base_job_loading_html(job) + display.update_display( + display.HTML(get_base_job_loading_html(job)), display_id=display_id + ) elif progress_bar == "terminal": inital_loading_bar = get_base_job_loading_string(job) print(inital_loading_bar) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 00380c2639..f7fc4eaa8f 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re import tempfile import pandas as pd @@ -19,94 +20,84 @@ import bigframes as bf import bigframes.formatting_helpers as formatting_helpers +job_load_message_regex = r"\w+ job [\w-]+ is \w+\." + def test_progress_bar_dataframe( penguins_df_default_index: bf.dataframe.DataFrame, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output penguins_df_default_index.to_pandas() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 + + assert_loading_msg_exist(capsys.readouterr().out) assert penguins_df_default_index.query_job is not None - for line in lines: - assert html_check in line and open_job_check in line def test_progress_bar_series(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" series = penguins_df_default_index["body_mass_g"].head(10) + capsys.readouterr() # clear output series.to_pandas() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 + + assert_loading_msg_exist(capsys.readouterr().out) assert series.query_job is not None - for line in lines: - assert html_check in line and open_job_check in line def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output penguins_df_default_index["body_mass_g"].head(10).mean() - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_read_gbq(session: bf.Session, penguins_table_id: str, capsys): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" + capsys.readouterr() # clear output session.read_gbq(penguins_table_id) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and 
open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_extract_jobs( penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" path = gcs_folder + "test_read_csv_progress_bar*.csv" + capsys.readouterr() # clear output penguins_df_default_index.to_csv(path) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") - lines = [line for line in lines if len(line) > 0] - assert len(lines) > 0 - for line in lines: - assert html_check in line and open_job_check in line + + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_load_jobs( session: bf.Session, penguins_pandas_df_default_index: pd.DataFrame, capsys ): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" with tempfile.TemporaryDirectory() as dir: path = dir + "/test_read_csv_progress_bar*.csv" penguins_pandas_df_default_index.to_csv(path, index=False) + capsys.readouterr() # clear output session.read_csv(path) - html_check = "HTML(value=" - open_job_check = "Open Job" - lines = capsys.readouterr().out.split("\n") + + assert_loading_msg_exist(capsys.readouterr().out) + + +def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): + numLoadingMsg = 0 + lines = capystOut.split("\n") lines = [line for line in lines if len(line) > 0] + assert len(lines) > 0 for line in lines: - assert html_check in line and open_job_check in line + if re.match(pattern, line) is not None: + numLoadingMsg += 1 + assert numLoadingMsg > 0 def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): - bf.options.display.progress_bar = "notebook" + bf.options.display.progress_bar = "terminal" penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( False ) From eceeb221f553644411b954ae2db0f0ae5a505687 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 26 Oct 2023 17:28:13 -0700 Subject: [PATCH 12/22] refactor: ArrayValue is now a tree that defers conversion to ibis (#110) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/core/__init__.py | 1221 +++-------------- bigframes/core/block_transforms.py | 25 +- bigframes/core/blocks.py | 34 +- bigframes/core/compile/__init__.py | 21 + bigframes/core/compile/compiled.py | 1121 +++++++++++++++ bigframes/core/compile/compiler.py | 185 +++ .../core/{joins => compile}/row_identity.py | 14 +- .../core/{joins => compile}/single_column.py | 35 +- bigframes/core/groupby/__init__.py | 20 +- bigframes/core/indexers.py | 6 +- bigframes/core/indexes/index.py | 15 +- bigframes/core/joins/__init__.py | 9 +- bigframes/core/nodes.py | 245 ++++ bigframes/core/ordering.py | 4 +- bigframes/core/window_spec.py | 35 + bigframes/dataframe.py | 18 +- bigframes/ml/metrics.py | 2 +- bigframes/operations/base.py | 4 +- bigframes/series.py | 34 +- bigframes/session/__init__.py | 44 +- tests/system/small/test_progress_bar.py | 4 +- tests/system/small/test_series.py | 4 +- tests/system/small/test_session.py | 9 +- tests/unit/core/test_blocks.py | 5 +- tests/unit/resources.py | 17 +- tests/unit/test_core.py | 37 +- 26 files changed, 1996 insertions(+), 1172 deletions(-) create mode 100644 bigframes/core/compile/__init__.py create mode 100644 bigframes/core/compile/compiled.py create mode 100644 bigframes/core/compile/compiler.py rename bigframes/core/{joins => compile}/row_identity.py (94%) rename bigframes/core/{joins => compile}/single_column.py (87%) create mode 100644 bigframes/core/nodes.py create mode 100644 bigframes/core/window_spec.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 6c78a07f3b..4653f0ab6a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -14,29 +14,21 @@ from __future__ import annotations from dataclasses import dataclass -import functools -import math -import textwrap +import io import typing -from typing import Collection, Iterable, Literal, Optional, Sequence, Tuple +from typing import Iterable, Literal, Optional, Sequence, Tuple from google.cloud import bigquery import ibis -import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas -import bigframes.constants as constants +import bigframes.core.compile as compiled import bigframes.core.guid -from bigframes.core.ordering import ( - encode_order_string, - ExpressionOrdering, - IntegerEncoding, - OrderingColumnReference, - reencode_order_string, - StringEncoding, -) -import bigframes.core.utils as utils +import bigframes.core.nodes as nodes +from bigframes.core.ordering import OrderingColumnReference +import bigframes.core.ordering as orderings +from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -49,470 +41,190 @@ @dataclass(frozen=True) -class WindowSpec: +class ArrayValue: """ - Specifies a window over which aggregate and analytic function may be applied. - grouping_keys: set of column ids to group on - preceding: Number of preceding rows in the window - following: Number of preceding rows in the window - ordering: List of columns ids and ordering direction to override base ordering + ArrayValue is an immutable type representing a 2D array with per-column types. 
""" - grouping_keys: typing.Sequence[str] = tuple() - ordering: typing.Sequence[OrderingColumnReference] = tuple() - preceding: typing.Optional[int] = None - following: typing.Optional[int] = None - min_periods: int = 0 - - -# TODO(swast): We might want to move this to it's own sub-module. -class ArrayValue: - """Immutable BigQuery DataFrames expression tree. - - Note: Usage of this class is considered to be private and subject to change - at any time. + node: nodes.BigFrameNode - This class is a wrapper around Ibis expressions. Its purpose is to defer - Ibis projection operations to keep generated SQL small and correct when - mixing and matching columns from different versions of a DataFrame. - - Args: - session: - A BigQuery DataFrames session to allow more flexibility in running - queries. - table: An Ibis table expression. - columns: Ibis value expressions that can be projected as columns. - hidden_ordering_columns: Ibis value expressions to store ordering. - ordering: An ordering property of the data frame. - predicates: A list of filters on the data frame. - """ - - def __init__( - self, + @classmethod + def from_ibis( + cls, session: Session, table: ibis_types.Table, columns: Sequence[ibis_types.Value], - hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: ExpressionOrdering = ExpressionOrdering(), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + hidden_ordering_columns: Sequence[ibis_types.Value], + ordering: orderings.ExpressionOrdering, ): - self._session = session - self._table = table - self._predicates = tuple(predicates) if predicates is not None else () - # TODO: Validate ordering - if not ordering.total_ordering_columns: - raise ValueError("Must have total ordering defined by one or more columns") - self._ordering = ordering - # Allow creating a DataFrame directly from an Ibis table expression. - # TODO(swast): Validate that each column references the same table (or - # no table for literal values). - self._columns = tuple(columns) - - # Meta columns store ordering, or other data that doesn't correspond to dataframe columns - self._hidden_ordering_columns = ( - tuple(hidden_ordering_columns) - if hidden_ordering_columns is not None - else () - ) - - # To allow for more efficient lookup by column name, create a - # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} - self._hidden_ordering_column_names = { - column.get_name(): column for column in self._hidden_ordering_columns - } - ### Validation - value_col_ids = self._column_names.keys() - hidden_col_ids = self._hidden_ordering_column_names.keys() - - all_columns = value_col_ids | hidden_col_ids - ordering_valid = all( - col.column_id in all_columns for col in ordering.all_ordering_columns + node = nodes.ReadGbqNode( + table=table, + table_session=session, + columns=tuple(columns), + hidden_ordering_columns=tuple(hidden_ordering_columns), + ordering=ordering, ) - if value_col_ids & hidden_col_ids: - raise ValueError( - f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" - ) - if not ordering_valid: - raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + return cls(node) @classmethod - def mem_expr_from_pandas( - cls, - pd_df: pandas.DataFrame, - session: Optional[Session], - ) -> ArrayValue: - """ - Builds an in-memory only (SQL only) expr from a pandas dataframe. 
+ def from_pandas(cls, pd_df: pandas.DataFrame): + iobytes = io.BytesIO() + # Discard row labels and use simple string ids for columns + column_ids = tuple(str(label) for label in pd_df.columns) + pd_df.reset_index(drop=True).set_axis(column_ids, axis=1).to_feather(iobytes) + node = nodes.ReadLocalNode(iobytes.getvalue(), column_ids=column_ids) + return cls(node) - Caution: If session is None, only a subset of expr functionality will - be available (null Session is usually not supported). - """ - # We can't include any hidden columns in the ArrayValue constructor, so - # grab the column names before we add the hidden ordering column. - column_names = [str(column) for column in pd_df.columns] - # Make sure column names are all strings. - pd_df = pd_df.set_axis(column_names, axis="columns") - pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) - - # ibis memtable cannot handle NA, must convert to None - pd_df = pd_df.astype("object") # type: ignore - pd_df = pd_df.where(pandas.notnull(pd_df), None) + @property + def column_ids(self) -> typing.Sequence[str]: + return self.compile().column_ids - # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. - keys_memtable = ibis.memtable(pd_df) - schema = keys_memtable.schema() - new_schema = [] - for column_index, column in enumerate(schema): - if column == ORDER_ID_COLUMN: - new_type: ibis_dtypes.DataType = ibis_dtypes.int64 - else: - column_type = schema[column] - # The autodetected type might not be one we can support, such - # as NULL type for empty rows, so convert to a type we do - # support. - new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) - ) - # TODO(swast): Ibis memtable doesn't use backticks in struct - # field names, so spaces and other characters aren't allowed in - # the memtable context. Blocked by - # https://github.com/ibis-project/ibis/issues/7187 - column = f"col_{column_index}" - new_schema.append((column, new_type)) + @property + def session(self) -> Session: + required_session = self.node.session + from bigframes import get_global_session - # must set non-null column labels. 
these are not the user-facing labels - pd_df = pd_df.set_axis( - [column for column, _ in new_schema], - axis="columns", - ) - keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + return self.node.session[0] if required_session else get_global_session() - return cls( - session, # type: ignore # Session cannot normally be none, see "caution" above - keys_memtable, - columns=[ - keys_memtable[f"col_{column_index}"].name(column) - for column_index, column in enumerate(column_names) - ], - ordering=ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - ), - hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), - ) - - @property - def columns(self) -> typing.Tuple[ibis_types.Value, ...]: - return self._columns + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + return self.compile().get_column_type(key) - @property - def column_ids(self) -> typing.Sequence[str]: - return tuple(self._column_names.keys()) + def compile(self) -> compiled.CompiledArrayValue: + return compiled.compile_node(self.node) - @property - def _hidden_column_ids(self) -> typing.Sequence[str]: - return tuple(self._hidden_ordering_column_names.keys()) + def shape(self) -> typing.Tuple[int, int]: + """Returns dimensions as (length, width) tuple.""" + width = len(self.compile().columns) + count_expr = self.compile()._to_ibis_expr("unordered").count() - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None + # Support in-memory engines for hermetic unit tests. + if not self.node.session: + try: + length = ibis.pandas.connect({}).execute(count_expr) + return (length, width) + except Exception: + # Not all cases can be handled by pandas engine + pass + + sql = self.session.ibis_client.compile(count_expr) + row_iterator, _ = self.session._start_query( + sql=sql, + max_results=1, ) + length = next(row_iterator)[0] + return (length, width) - @property - def _ibis_order(self) -> Sequence[ibis_types.Value]: - """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" - return _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - self._ordering.all_ordering_columns, + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + return self.compile().to_sql( + offset_column=offset_column, + col_id_overrides=col_id_overrides, + sorted=sorted, ) - def builder(self) -> ArrayValueBuilder: - """Creates a mutable builder for expressions.""" - # Since ArrayValue is intended to be immutable (immutability offers - # potential opportunities for caching, though we might need to introduce - # more node types for that to be useful), we create a builder class. 
- return ArrayValueBuilder( - self._session, - self._table, - columns=self._columns, - hidden_ordering_columns=self._hidden_ordering_columns, - ordering=self._ordering, - predicates=self._predicates, + def start_query( + self, + job_config: Optional[bigquery.job.QueryJobConfig] = None, + max_results: Optional[int] = None, + *, + sorted: bool = True, + ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """Execute a query and return metadata about the results.""" + # TODO(swast): Cache the job ID so we can look it up again if they ask + # for the results? We'd need a way to invalidate the cache if DataFrame + # becomes mutable, though. Or move this method to the immutable + # expression class. + # TODO(swast): We might want to move this method to Session and/or + # provide our own minimal metadata class. Tight coupling to the + # BigQuery client library isn't ideal, especially if we want to support + # a LocalSession for unit testing. + # TODO(swast): Add a timeout here? If the query is taking a long time, + # maybe we just print the job metadata that we have so far? + sql = self.to_sql(sorted=sorted) # type:ignore + return self.session._start_query( + sql=sql, + job_config=job_config, + max_results=max_results, ) - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - # Must generate offsets if we are dropping a column that ordering depends on - expr = self - for ordering_column in set(columns).intersection( - [col.column_id for col in self._ordering.ordering_value_columns] - ): - expr = self._hide_column(ordering_column) - - expr_builder = expr.builder() - remain_cols = [ - column for column in expr.columns if column.get_name() not in columns - ] - expr_builder.columns = remain_cols - return expr_builder.build() - - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - ibis_type = typing.cast( - bigframes.dtypes.IbisDtype, self._get_any_column(key).type() + def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: + """Write the ArrayValue to a session table and create a new block object that references it.""" + compiled = self.compile() + ibis_expr = compiled._to_ibis_expr("unordered", expose_hidden_cols=True) + destination = self.session._ibis_to_session_table( + ibis_expr, cluster_cols=cluster_cols, api_name="cache" ) - return typing.cast( - bigframes.dtypes.Dtype, - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + table_expression = self.session.ibis_client.table( + f"{destination.project}.{destination.dataset_id}.{destination.table_id}" + ) + new_columns = [table_expression[column] for column in compiled.column_ids] + new_hidden_columns = [ + table_expression[column] + for column in compiled._hidden_ordering_column_names + ] + return ArrayValue.from_ibis( + self.session, + table_expression, + columns=new_columns, + hidden_ordering_columns=new_hidden_columns, + ordering=compiled._ordering, ) - def _get_ibis_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column.""" - if key not in self.column_ids: - raise ValueError( - "Column name {} not in set of values: {}".format(key, self.column_ids) - ) - return typing.cast(ibis_types.Value, self._column_names[key]) - - def _get_any_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column. 
Will also get hidden columns.""" - all_columns = {**self._column_names, **self._hidden_ordering_column_names} - if key not in all_columns.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, all_columns.keys() - ) - ) - return typing.cast(ibis_types.Value, all_columns[key]) + # Operations - def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: - """Gets the Ibis expression for a given hidden column.""" - if key not in self._hidden_ordering_column_names.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, self._hidden_ordering_column_names.keys() - ) - ) - return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: + return ArrayValue( + nodes.DropColumnsNode(child=self.node, columns=tuple(columns)) + ) def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - condition = typing.cast( - ibis_types.BooleanValue, self._get_ibis_column(predicate_id) - ) - if keep_null: - condition = typing.cast( - ibis_types.BooleanValue, - condition.fillna( - typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) - ), + return ArrayValue( + nodes.FilterNode( + child=self.node, predicate_id=predicate_id, keep_null=keep_null ) - return self._filter(condition) - - def _filter(self, predicate_value: ibis_types.BooleanValue) -> ArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - expr = self.builder() - expr.ordering = expr.ordering.with_non_sequential() - expr.predicates = [*self._predicates, predicate_value] - return expr.build() + ) def order_by( self, by: Sequence[OrderingColumnReference], stable: bool = False ) -> ArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) - return expr_builder.build() - - def reversed(self) -> ArrayValue: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_reverse() - return expr_builder.build() - - def _uniform_sampling(self, fraction: float) -> ArrayValue: - """Sampling the table on given fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - table = self._to_ibis_expr( - "unordered", expose_hidden_cols=True, fraction=fraction - ) - columns = [table[column_name] for column_name in self._column_names] - hidden_ordering_columns = [ - table[column_name] for column_name in self._hidden_ordering_column_names - ] return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, + nodes.OrderByNode(child=self.node, by=tuple(by), stable=stable) ) - @property - def _offsets(self) -> ibis_types.IntegerColumn: - if not self._ordering.is_sequential: - raise ValueError( - "Expression does not have offsets. Generate them first using project_offsets." - ) - if not self._ordering.total_order_col: - raise ValueError( - "Ordering is invalid. Marked as sequential but no total order columns." - ) - column = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, column) - - def _project_offsets(self) -> ArrayValue: - """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" - if self._ordering.is_sequential: - return self - # TODO(tbergeron): Enforce total ordering - table = self._to_ibis_expr( - ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN - ) - columns = [table[column_name] for column_name in self._column_names] - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(True, is_sequential=True), - ) - return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=[table[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - def _hide_column(self, column_id) -> ArrayValue: - """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" - expr_builder = self.builder() - # Need to rename column as caller might be creating a new row with the same name but different values. - # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") - expr_builder.hidden_ordering_columns = [ - *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name), - ] - expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) - return expr_builder.build() + def reversed(self) -> ArrayValue: + return ArrayValue(nodes.ReversedNode(child=self.node)) def promote_offsets(self, col_id: str) -> ArrayValue: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. """ - # Special case: offsets already exist - ordering = self._ordering - - if (not ordering.is_sequential) or (not ordering.total_order_col): - return self._project_offsets().promote_offsets(col_id) - expr_builder = self.builder() - expr_builder.columns = [ - self._get_any_column(ordering.total_order_col.column_id).name(col_id), - *self.columns, - ] - return expr_builder.build() + return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id)) def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: - return self._projection( - [self._get_ibis_column(col_id) for col_id in column_ids] + return ArrayValue( + nodes.SelectNode(child=self.node, column_ids=tuple(column_ids)) ) - def _projection(self, columns: Iterable[ibis_types.Value]) -> ArrayValue: - """Creates a new expression based on this expression with new columns.""" - # TODO(swast): We might want to do validation here that columns derive - # from the same table expression instead of (in addition to?) at - # construction time. - - expr = self - for ordering_column in set(self.column_ids).intersection( - [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] - ): - # Need to hide ordering columns that are being dropped. Alternatively, could project offsets - expr = expr._hide_column(ordering_column) - builder = expr.builder() - builder.columns = list(columns) - new_expr = builder.build() - return new_expr - - def shape(self) -> typing.Tuple[int, int]: - """Returns dimensions as (length, width) tuple.""" - width = len(self.columns) - count_expr = self._to_ibis_expr("unordered").count() - sql = self._session.ibis_client.compile(count_expr) - - # Support in-memory engines for hermetic unit tests. 
- if not isinstance(sql, str): - length = self._session.ibis_client.execute(count_expr) - else: - row_iterator, _ = self._session._start_query( - sql=sql, - max_results=1, - ) - length = next(row_iterator)[0] - return (length, width) - def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: """Append together multiple ArrayValue objects.""" - if len(other) == 0: - return self - tables = [] - prefix_base = 10 - prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) - # Must normalize all ids to the same encoding size - max_encoding_size = max( - self._ordering.string_encoding.length, - *[expression._ordering.string_encoding.length for expression in other], - ) - for i, expr in enumerate([self, *other]): - ordering_prefix = str(i).zfill(prefix_size) - table = expr._to_ibis_expr( - ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN - ) - # Rename the value columns based on horizontal offset before applying union. - table = table.select( - [ - table[col].name(f"column_{i}") - if col != ORDER_ID_COLUMN - else ( - ordering_prefix - + reencode_order_string( - table[ORDER_ID_COLUMN], max_encoding_size - ) - ).name(ORDER_ID_COLUMN) - for i, col in enumerate(table.columns) - ] - ) - tables.append(table) - combined_table = ibis.union(*tables) - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - string_encoding=StringEncoding(True, prefix_size + max_encoding_size), - ) return ArrayValue( - self._session, - combined_table, - columns=[ - combined_table[col] - for col in combined_table.columns - if col != ORDER_ID_COLUMN - ], - hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], - ordering=ordering, + nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]])) ) def project_unary_op( self, column_name: str, op: ops.UnaryOp, output_name=None ) -> ArrayValue: """Creates a new expression based on this expression with unary operation applied to one column.""" - value = op._as_ibis(self._get_ibis_column(column_name)).name( - output_name or column_name + return ArrayValue( + nodes.ProjectUnaryOpNode( + child=self.node, input_id=column_name, op=op, output_id=output_name + ) ) - return self._set_or_replace_by_id(output_name or column_name, value) def project_binary_op( self, @@ -522,11 +234,15 @@ def project_binary_op( output_column_id: str, ) -> ArrayValue: """Creates a new expression based on this expression with binary operation applied to two columns.""" - value = op( - self._get_ibis_column(left_column_id), - self._get_ibis_column(right_column_id), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) + return ArrayValue( + nodes.ProjectBinaryOpNode( + child=self.node, + left_input_id=left_column_id, + right_input_id=right_column_id, + op=op, + output_id=output_column_id, + ) + ) def project_ternary_op( self, @@ -537,12 +253,16 @@ def project_ternary_op( output_column_id: str, ) -> ArrayValue: """Creates a new expression based on this expression with ternary operation applied to three columns.""" - value = op( - self._get_ibis_column(col_id_1), - self._get_ibis_column(col_id_2), - self._get_ibis_column(col_id_3), - ).name(output_column_id) - return self._set_or_replace_by_id(output_column_id, value) + return ArrayValue( + nodes.ProjectTernaryOpNode( + child=self.node, + input_id1=col_id_1, + input_id2=col_id_2, + input_id3=col_id_3, + op=op, + output_id=output_column_id, + ) + ) def aggregate( self, @@ -557,46 
+277,14 @@ def aggregate( by_column_id: column id of the aggregation key, this is preserved through the transform dropna: whether null keys should be dropped """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: agg_op._as_ibis(table[col_in]) - for col_in, agg_op, col_out in aggregations - } - if by_column_ids: - result = table.group_by(by_column_ids).aggregate(**stats) - # Must have deterministic ordering, so order by the unique "by" column - ordering = ExpressionOrdering( - [ - OrderingColumnReference(column_id=column_id) - for column_id in by_column_ids - ], - total_ordering_columns=frozenset(by_column_ids), - ) - columns = tuple(result[key] for key in result.columns) - expr = ArrayValue(self._session, result, columns=columns, ordering=ordering) - if dropna: - for column_id in by_column_ids: - expr = expr._filter( - ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) - ) - # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation - return expr._project_offsets() - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. - ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) - return ArrayValue( - self._session, - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, + return ArrayValue( + nodes.AggregateNode( + child=self.node, + aggregations=tuple(aggregations), + by_column_ids=tuple(by_column_ids), + dropna=dropna, ) + ) def corr_aggregate( self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] @@ -607,25 +295,8 @@ def corr_aggregate( Arguments: corr_aggregations: left_column_id, right_column_id, output_column_id tuples """ - table = self._to_ibis_expr("unordered") - stats = { - col_out: table[col_left].corr(table[col_right], how="pop") - for col_left, col_right, col_out in corr_aggregations - } - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. 
- ordering = ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], - total_ordering_columns=frozenset([ORDER_ID_COLUMN]), - integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), - ) return ArrayValue( - self._session, - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, + nodes.CorrNode(child=self.node, corr_aggregations=tuple(corr_aggregations)) ) def project_window_op( @@ -647,231 +318,17 @@ def project_window_op( never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ - column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) - window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) - - window_op = op._as_ibis(column, window) - - clauses = [] - if op.skips_nulls and not never_skip_nulls: - clauses.append((column.isnull(), ibis.NA)) - if window_spec.min_periods: - if op.skips_nulls: - # Most operations do not count NULL values towards min_periods - observation_count = agg_ops.count_op._as_ibis(column, window) - else: - # Operations like count treat even NULLs as valid observations for the sake of min_periods - # notnull is just used to convert null values to non-null (FALSE) values to be counted - denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) - observation_count = agg_ops.count_op._as_ibis(denulled_value, window) - clauses.append( - ( - observation_count < ibis_types.literal(window_spec.min_periods), - ibis.NA, - ) - ) - if clauses: - case_statement = ibis.case() - for clause in clauses: - case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() - window_op = case_statement - - result = self._set_or_replace_by_id(output_name or column_name, window_op) - # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. - return result._reproject_to_table() if not skip_reproject_unsafe else result - - def to_sql( - self, - offset_column: typing.Optional[str] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - sorted: bool = False, - ) -> str: - offsets_id = offset_column or ORDER_ID_COLUMN - - sql = self._session.ibis_client.compile( - self._to_ibis_expr( - ordering_mode="offset_col" - if (offset_column or sorted) - else "unordered", - order_col_name=offsets_id, - col_id_overrides=col_id_overrides, - ) - ) - if sorted: - sql = textwrap.dedent( - f""" - SELECT * EXCEPT (`{offsets_id}`) - FROM ({sql}) - ORDER BY `{offsets_id}` - """ - ) - return typing.cast(str, sql) - - def _to_ibis_expr( - self, - ordering_mode: Literal["string_encoded", "offset_col", "unordered"], - order_col_name: Optional[str] = ORDER_ID_COLUMN, - expose_hidden_cols: bool = False, - fraction: Optional[float] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - ): - """ - Creates an Ibis table expression representing the DataFrame. - - ArrayValue objects are sorted, so the following options are available - to reflect this in the ibis expression. - - * "offset_col": Zero-based offsets are generated as a column, this will - not sort the rows however. 
- * "string_encoded": An ordered string column is provided in output table. - * "unordered": No ordering information will be provided in output. Only - value columns are projected. - - For offset or ordered column, order_col_name can be used to assign the - output label for the ordering column. If none is specified, the default - column name will be 'bigframes_ordering_id' - - Args: - ordering_mode: - How to construct the Ibis expression from the ArrayValue. See - above for details. - order_col_name: - If the ordering mode outputs a single ordering or offsets - column, use this as the column name. - expose_hidden_cols: - If True, include the hidden ordering columns in the results. - Only compatible with `order_by` and `unordered` - ``ordering_mode``. - col_id_overrides: - overrides the column ids for the result - Returns: - An ibis expression representing the data help by the ArrayValue object. - """ - assert ordering_mode in ( - "string_encoded", - "offset_col", - "unordered", - ) - if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): - raise ValueError( - f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" + return ArrayValue( + nodes.WindowOpNode( + child=self.node, + column_name=column_name, + op=op, + window_spec=window_spec, + output_name=output_name, + never_skip_nulls=never_skip_nulls, + skip_reproject_unsafe=skip_reproject_unsafe, ) - - columns = list(self._columns) - columns_to_drop: list[ - str - ] = [] # Ordering/Filtering columns that will be dropped at end - - if self._reduced_predicate is not None: - columns.append(self._reduced_predicate) - # Usually drop predicate as it is will be all TRUE after filtering - if not expose_hidden_cols: - columns_to_drop.append(self._reduced_predicate.get_name()) - - order_columns = self._create_order_columns( - ordering_mode, order_col_name, expose_hidden_cols ) - columns.extend(order_columns) - - # Special case for empty tables, since we can't create an empty - # projection. - if not columns: - return ibis.memtable([]) - - # Make sure all dtypes are the "canonical" ones for BigFrames. This is - # important for operations like UNION where the schema must match. 
- table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns - ) - base_table = table - if self._reduced_predicate is not None: - table = table.filter(base_table[PREDICATE_COLUMN]) - table = table.drop(*columns_to_drop) - if col_id_overrides: - table = table.relabel(col_id_overrides) - if fraction is not None: - table = table.filter(ibis.random() < ibis.literal(fraction)) - return table - - def _create_order_columns( - self, - ordering_mode: str, - order_col_name: Optional[str], - expose_hidden_cols: bool, - ) -> typing.Sequence[ibis_types.Value]: - # Generate offsets if current ordering id semantics are not sufficiently strict - if ordering_mode == "offset_col": - return (self._create_offset_column().name(order_col_name),) - elif ordering_mode == "string_encoded": - return (self._create_string_ordering_column().name(order_col_name),) - elif expose_hidden_cols: - return self._hidden_ordering_columns - return () - - def _create_offset_column(self) -> ibis_types.IntegerColumn: - if self._ordering.total_order_col and self._ordering.is_sequential: - offsets = self._get_any_column(self._ordering.total_order_col.column_id) - return typing.cast(ibis_types.IntegerColumn, offsets) - else: - window = ibis.window(order_by=self._ibis_order) - if self._predicates: - window = window.group_by(self._reduced_predicate) - offsets = ibis.row_number().over(window) - return typing.cast(ibis_types.IntegerColumn, offsets) - - def _create_string_ordering_column(self) -> ibis_types.StringColumn: - if self._ordering.total_order_col and self._ordering.is_string_encoded: - string_order_ids = self._get_any_column( - self._ordering.total_order_col.column_id - ) - return typing.cast(ibis_types.StringColumn, string_order_ids) - if ( - self._ordering.total_order_col - and self._ordering.integer_encoding.is_encoded - ): - # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers - int_values = self._get_any_column(self._ordering.total_order_col.column_id) - return encode_order_string( - typing.cast(ibis_types.IntegerColumn, int_values), - ) - else: - # Have to build string from scratch - window = ibis.window(order_by=self._ibis_order) - if self._predicates: - window = window.group_by(self._reduced_predicate) - row_nums = typing.cast( - ibis_types.IntegerColumn, ibis.row_number().over(window) - ) - return encode_order_string(row_nums) - - def start_query( - self, - job_config: Optional[bigquery.job.QueryJobConfig] = None, - max_results: Optional[int] = None, - *, - sorted: bool = True, - ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - """Execute a query and return metadata about the results.""" - # TODO(swast): Cache the job ID so we can look it up again if they ask - # for the results? We'd need a way to invalidate the cache if DataFrame - # becomes mutable, though. Or move this method to the immutable - # expression class. - # TODO(swast): We might want to move this method to Session and/or - # provide our own minimal metadata class. Tight coupling to the - # BigQuery client library isn't ideal, especially if we want to support - # a LocalSession for unit testing. - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? 
- sql = self.to_sql(sorted=True) # type:ignore - return self._session._start_query( - sql=sql, - job_config=job_config, - max_results=max_results, - ) - - def _get_table_size(self, destination_table): - return self._session._get_table_size(destination_table) def _reproject_to_table(self) -> ArrayValue: """ @@ -881,74 +338,25 @@ def _reproject_to_table(self) -> ArrayValue: some operations such as window operations that cannot be used recursively in projections. """ - table = self._to_ibis_expr( - "unordered", - expose_hidden_cols=True, - ) - columns = [table[column_name] for column_name in self._column_names] - ordering_col_ids = [ - ref.column_id for ref in self._ordering.all_ordering_columns - ] - hidden_ordering_columns = [ - table[column_name] - for column_name in self._hidden_ordering_column_names - if column_name in ordering_col_ids - ] return ArrayValue( - self._session, - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, - ) - - def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): - group_by: typing.List[ibis_types.Value] = ( - [ - typing.cast( - ibis_types.Column, _as_identity(self._get_ibis_column(column)) - ) - for column in window_spec.grouping_keys - ] - if window_spec.grouping_keys - else [] - ) - if self._reduced_predicate is not None: - group_by.append(self._reduced_predicate) - if window_spec.ordering: - order_by = _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - window_spec.ordering, + nodes.ReprojectOpNode( + child=self.node, ) - if not allow_ties: - # Most operator need an unambiguous ordering, so the table's total ordering is appended - order_by = tuple([*order_by, *self._ibis_order]) - elif (window_spec.following is not None) or (window_spec.preceding is not None): - # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. - order_by = tuple(self._ibis_order) - else: - # Unbound grouping window. Suitable for aggregations but not for analytic function application. - order_by = None - return ibis.window( - preceding=window_spec.preceding, - following=window_spec.following, - order_by=order_by, - group_by=group_by, ) def unpivot( self, row_labels: typing.Sequence[typing.Hashable], unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]] ], *, passthrough_columns: typing.Sequence[str] = (), index_col_ids: typing.Sequence[str] = ["index"], dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] ] = pandas.Float64Dtype(), - how="left", + how: typing.Literal["left", "right"] = "left", ) -> ArrayValue: """ Unpivot ArrayValue columns. 
@@ -963,133 +371,23 @@ def unpivot( Returns: ArrayValue: The unpivoted ArrayValue """ - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr("unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=[ - *old_ordering.ordering_value_columns, - OrderingColumnReference(unpivot_offset_id), - ], - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=[ - OrderingColumnReference(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ], - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - 
unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] return ArrayValue( - session=self._session, - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, + nodes.UnpivotNode( + child=self.node, + row_labels=tuple(row_labels), + unpivot_columns=tuple(unpivot_columns), + passthrough_columns=tuple(passthrough_columns), + index_col_ids=tuple(index_col_ids), + dtype=dtype, + how=how, + ) ) def assign(self, source_id: str, destination_id: str) -> ArrayValue: - return self._set_or_replace_by_id( - destination_id, self._get_ibis_column(source_id) + return ArrayValue( + nodes.AssignNode( + child=self.node, source_id=source_id, destination_id=destination_id + ) ) def assign_constant( @@ -1098,128 +396,41 @@ def assign_constant( value: typing.Any, dtype: typing.Optional[bigframes.dtypes.Dtype], ) -> ArrayValue: - # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. - ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) - if ibis_value is None: - raise NotImplementedError( - f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" - ) - expr = self._set_or_replace_by_id(destination_id, ibis_value) - return expr._reproject_to_table() - - def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> ArrayValue: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = [ - col_ref.column_id for col_ref in self._ordering.ordering_value_columns - ] - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: - """Write the ArrayValue to a session table and create a new block object that references it.""" - ibis_expr = self._to_ibis_expr("unordered", expose_hidden_cols=True) - destination = self._session._ibis_to_session_table( - ibis_expr, cluster_cols=cluster_cols, api_name="cache" - ) - table_expression = self._session.ibis_client.table( - f"{destination.project}.{destination.dataset_id}.{destination.table_id}" - ) - new_columns = [table_expression[column] for column in self.column_ids] - new_hidden_columns = [ - table_expression[column] for column in self._hidden_ordering_column_names - ] return ArrayValue( - self._session, - table_expression, - columns=new_columns, - hidden_ordering_columns=new_hidden_columns, - ordering=self._ordering, + nodes.AssignConstantNode( + child=self.node, destination_id=destination_id, value=value, dtype=dtype + ) ) - -class ArrayValueBuilder: - """Mutable expression class. - Use ArrayValue.builder() to create from a ArrayValue object. 
- """ - - def __init__( + def join( self, - session: Session, - table: ibis_types.Table, - ordering: ExpressionOrdering, - columns: Collection[ibis_types.Value] = (), - hidden_ordering_columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + self_column_ids: typing.Sequence[str], + other: ArrayValue, + other_column_ids: typing.Sequence[str], + *, + how: Literal[ + "inner", + "left", + "outer", + "right", + ], + allow_row_identity_join: bool = True, ): - self.session = session - self.table = table - self.columns = list(columns) - self.hidden_ordering_columns = list(hidden_ordering_columns) - self.ordering = ordering - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> ArrayValue: return ArrayValue( - session=self.session, - table=self.table, - columns=self.columns, - hidden_ordering_columns=self.hidden_ordering_columns, - ordering=self.ordering, - predicates=self.predicates, - ) - - -def _reduce_predicate_list( - predicate_list: typing.Collection[ibis_types.BooleanValue], -) -> ibis_types.BooleanValue: - """Converts a list of predicates BooleanValues into a single BooleanValue.""" - if len(predicate_list) == 0: - raise ValueError("Cannot reduce empty list of predicates") - if len(predicate_list) == 1: - (item,) = predicate_list - return item - return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list) - - -def _convert_ordering_to_table_values( - value_lookup: typing.Mapping[str, ibis_types.Value], - ordering_columns: typing.Sequence[OrderingColumnReference], -) -> typing.Sequence[ibis_types.Value]: - column_refs = ordering_columns - ordering_values = [] - for ordering_col in column_refs: - column = typing.cast(ibis_types.Column, value_lookup[ordering_col.column_id]) - ordering_value = ( - ibis.asc(column) - if ordering_col.direction.is_ascending - else ibis.desc(column) + nodes.JoinNode( + left_child=self.node, + right_child=other.node, + left_column_ids=tuple(self_column_ids), + right_column_ids=tuple(other_column_ids), + how=how, + allow_row_identity_join=allow_row_identity_join, + ) ) - # Bigquery SQL considers NULLS to be "smallest" values, but we need to override in these cases. - if (not ordering_col.na_last) and (not ordering_col.direction.is_ascending): - # Force nulls to be first - is_null_val = typing.cast(ibis_types.Column, column.isnull()) - ordering_values.append(ibis.desc(is_null_val)) - elif (ordering_col.na_last) and (ordering_col.direction.is_ascending): - # Force nulls to be last - is_null_val = typing.cast(ibis_types.Column, column.isnull()) - ordering_values.append(ibis.asc(is_null_val)) - ordering_values.append(ordering_value) - return ordering_values + def _uniform_sampling(self, fraction: float) -> ArrayValue: + """Sampling the table on given fraction. -def _as_identity(value: ibis_types.Value): - # Some types need to be converted to string to enable groupby - if value.type().is_float64() or value.type().is_geospatial(): - return value.cast(ibis_dtypes.str) - return value + .. warning:: + The row numbers of result is non-deterministic, avoid to use. 
+ """ + return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index b0f05f4798..3706bf1681 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -21,6 +21,7 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.ordering as ordering +import bigframes.core.window_spec as windows import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -68,21 +69,21 @@ def indicate_duplicates( if keep == "first": # Count how many copies occur up to current copy of value # Discard this value if there are copies BEFORE - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( grouping_keys=tuple(columns), following=0, ) elif keep == "last": # Count how many copies occur up to current copy of values # Discard this value if there are copies AFTER - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( grouping_keys=tuple(columns), preceding=0, ) else: # keep == False # Count how many copies of the value occur in entire series. # Discard this value if there are copies ANYWHERE - window_spec = core.WindowSpec(grouping_keys=tuple(columns)) + window_spec = windows.WindowSpec(grouping_keys=tuple(columns)) block, dummy = block.create_constant(1) block, val_count_col_id = block.apply_window_op( dummy, @@ -131,7 +132,7 @@ def value_counts( ) count_id = agg_ids[0] if normalize: - unbound_window = core.WindowSpec() + unbound_window = windows.WindowSpec() block, total_count_id = block.apply_window_op( count_id, agg_ops.sum_op, unbound_window ) @@ -153,7 +154,7 @@ def value_counts( def pct_change(block: blocks.Block, periods: int = 1) -> blocks.Block: column_labels = block.column_labels - window_spec = core.WindowSpec( + window_spec = windows.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -195,7 +196,7 @@ def rank( ops.isnull_op, ) nullity_col_ids.append(nullity_col_id) - window = core.WindowSpec( + window = windows.WindowSpec( # BigQuery has syntax to reorder nulls with "NULLS FIRST/LAST", but that is unavailable through ibis presently, so must order on a separate nullity expression first. ordering=( ordering.OrderingColumnReference( @@ -229,7 +230,7 @@ def rank( block, result_id = block.apply_window_op( rownum_col_ids[i], agg_op, - window_spec=core.WindowSpec(grouping_keys=[columns[i]]), + window_spec=windows.WindowSpec(grouping_keys=(columns[i],)), skip_reproject_unsafe=(i < (len(columns) - 1)), ) post_agg_rownum_col_ids.append(result_id) @@ -311,7 +312,7 @@ def nsmallest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=core.WindowSpec(ordering=order_refs), + window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) block, condition = block.apply_unary_op( counter, ops.partial_right(ops.le_op, n) @@ -343,7 +344,7 @@ def nlargest( block, counter = block.apply_window_op( column_ids[0], agg_ops.rank_op, - window_spec=core.WindowSpec(ordering=order_refs), + window_spec=windows.WindowSpec(ordering=tuple(order_refs)), ) block, condition = block.apply_unary_op( counter, ops.partial_right(ops.le_op, n) @@ -440,14 +441,14 @@ def _mean_delta_to_power( grouping_column_ids: typing.Sequence[str], ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]: """Calculate (x-mean(x))^n. 
Useful for calculating moment statistics such as skew and kurtosis.""" - window = core.WindowSpec(grouping_keys=grouping_column_ids) + window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids)) block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window) delta_ids = [] cube_op = ops.partial_right(ops.pow_op, n_power) for val_id, mean_val_id in zip(column_ids, mean_ids): block, delta_id = block.apply_binary_op(val_id, mean_val_id, ops.sub_op) block, delta_power_id = block.apply_unary_op(delta_id, cube_op) - block = block.drop_columns(delta_id) + block = block.drop_columns([delta_id]) delta_ids.append(delta_power_id) return block, delta_ids @@ -645,7 +646,7 @@ def _idx_extrema( for idx_col in original_block.index_columns ], ] - window_spec = core.WindowSpec(ordering=order_refs) + window_spec = windows.WindowSpec(ordering=tuple(order_refs)) idx_col = original_block.index_columns[0] block, result_col = block.apply_window_op( idx_col, agg_ops.first_op, window_spec diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 9db193a04e..cc13edeaf9 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -35,7 +35,6 @@ import bigframes.core as core import bigframes.core.guid as guid import bigframes.core.indexes as indexes -import bigframes.core.joins as joins import bigframes.core.joins.name_resolution as join_names import bigframes.core.ordering as ordering import bigframes.core.utils @@ -378,7 +377,7 @@ def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" dtypes = dict(zip(self.index_columns, self.index_dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) - return self._expr._session._rows_to_dataframe(result, dtypes) + return self._expr.session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -422,7 +421,7 @@ def to_pandas_batches(self): dtypes.update(zip(self.value_columns, self.dtypes)) results_iterator, _ = self._expr.start_query() for arrow_table in results_iterator.to_arrow_iterable( - bqstorage_client=self._expr._session.bqstoragereadclient + bqstorage_client=self._expr.session.bqstoragereadclient ): df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) self._copy_index_to_pandas(df) @@ -454,7 +453,9 @@ def _compute_and_count( results_iterator, query_job = expr.start_query(max_results=max_results) - table_size = expr._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + table_size = ( + expr.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + ) fraction = ( max_download_size / table_size if (max_download_size is not None) and (table_size != 0) @@ -819,7 +820,9 @@ def aggregate_all_and_stack( axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, - dtype=pd.Float64Dtype(), + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
+ ] = pd.Float64Dtype(), ) -> Block: axis_n = utils.get_axis_number(axis) if axis_n == 0: @@ -829,7 +832,7 @@ def aggregate_all_and_stack( result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( row_labels=self.column_labels.to_list(), index_col_ids=["index"], - unpivot_columns=[(value_col_id, self.value_columns)], + unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]), dtype=dtype, ) return Block(result_expr, index_columns=["index"], column_labels=[None]) @@ -841,7 +844,7 @@ def aggregate_all_and_stack( stacked_expr = expr_with_offsets.unpivot( row_labels=self.column_labels.to_list(), index_col_ids=[guid.generate_guid()], - unpivot_columns=[(value_col_id, self.value_columns)], + unpivot_columns=[(value_col_id, tuple(self.value_columns))], passthrough_columns=[*self.index_columns, offset_col], dtype=dtype, ) @@ -1029,13 +1032,13 @@ def summarize( for col_id in column_ids ] columns = [ - (col_id, [f"{col_id}-{stat.name}" for stat in stats]) + (col_id, tuple(f"{col_id}-{stat.name}" for stat in stats)) for col_id in column_ids ] expr = self.expr.aggregate(aggregations).unpivot( labels, - unpivot_columns=columns, - index_col_ids=[label_col_id], + unpivot_columns=tuple(columns), + index_col_ids=tuple([label_col_id]), ) labels = self._get_labels_for_columns(column_ids) return Block(expr, column_labels=labels, index_columns=[label_col_id]) @@ -1342,7 +1345,7 @@ def stack(self, how="left", levels: int = 1): passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, index_col_ids=added_index_columns, - dtype=dtypes, + dtype=tuple(dtypes), how=how, ) new_index_level_names = self.column_labels.names[-levels:] @@ -1382,7 +1385,7 @@ def _create_stack_column( dtype = self._column_type(input_id) input_columns.append(input_id) # Input column i is the first one that - return input_columns, dtype or pd.Float64Dtype() + return tuple(input_columns), dtype or pd.Float64Dtype() def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype: col_offset = self.value_columns.index(col_id) @@ -1497,8 +1500,7 @@ def merge( sort: bool, suffixes: tuple[str, str] = ("_x", "_y"), ) -> Block: - joined_expr = joins.join_by_column( - self.expr, + joined_expr = self.expr.join( left_join_ids, other.expr, right_join_ids, @@ -1708,7 +1710,7 @@ def _is_monotonic( return result -def block_from_local(data, session=None) -> Block: +def block_from_local(data) -> Block: pd_data = pd.DataFrame(data) columns = pd_data.columns @@ -1730,7 +1732,7 @@ def block_from_local(data, session=None) -> Block: ) index_ids = pd_data.columns[: len(index_labels)] - keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session) + keys_expr = core.ArrayValue.from_pandas(pd_data) return Block( keys_expr, column_labels=columns, diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py new file mode 100644 index 0000000000..c86f4463dc --- /dev/null +++ b/bigframes/core/compile/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.core.compile.compiled import CompiledArrayValue +from bigframes.core.compile.compiler import compile_node + +__all__ = [ + "compile_node", + "CompiledArrayValue", +] diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py new file mode 100644 index 0000000000..1134f1aab0 --- /dev/null +++ b/bigframes/core/compile/compiled.py @@ -0,0 +1,1121 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import functools +import math +import textwrap +import typing +from typing import Collection, Iterable, Literal, Optional, Sequence + +import ibis +import ibis.backends.bigquery as ibis_bigquery +import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.types as ibis_types +import pandas + +import bigframes.constants as constants +import bigframes.core.guid +from bigframes.core.ordering import ( + encode_order_string, + ExpressionOrdering, + IntegerEncoding, + OrderingColumnReference, + reencode_order_string, + StringEncoding, +) +import bigframes.core.utils as utils +from bigframes.core.window_spec import WindowSpec +import bigframes.dtypes +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +ORDER_ID_COLUMN = "bigframes_ordering_id" +PREDICATE_COLUMN = "bigframes_predicate" + + +class CompiledArrayValue: + """Immutable BigQuery DataFrames expression tree. + + Note: Usage of this class is considered to be private and subject to change + at any time. + + This class is a wrapper around Ibis expressions. Its purpose is to defer + Ibis projection operations to keep generated SQL small and correct when + mixing and matching columns from different versions of a DataFrame. + + Args: + table: An Ibis table expression. + columns: Ibis value expressions that can be projected as columns. + hidden_ordering_columns: Ibis value expressions to store ordering. + ordering: An ordering property of the data frame. + predicates: A list of filters on the data frame. + """ + + def __init__( + self, + table: ibis_types.Table, + columns: Sequence[ibis_types.Value], + hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, + ordering: ExpressionOrdering = ExpressionOrdering(), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self._table = table + self._predicates = tuple(predicates) if predicates is not None else () + # TODO: Validate ordering + if not ordering.total_ordering_columns: + raise ValueError("Must have total ordering defined by one or more columns") + self._ordering = ordering + # Allow creating a DataFrame directly from an Ibis table expression. + # TODO(swast): Validate that each column references the same table (or + # no table for literal values). 
+ self._columns = tuple(columns) + + # Meta columns store ordering, or other data that doesn't correspond to dataframe columns + self._hidden_ordering_columns = ( + tuple(hidden_ordering_columns) + if hidden_ordering_columns is not None + else () + ) + + # To allow for more efficient lookup by column name, create a + # dictionary mapping names to column values. + self._column_names = {column.get_name(): column for column in self._columns} + self._hidden_ordering_column_names = { + column.get_name(): column for column in self._hidden_ordering_columns + } + ### Validation + value_col_ids = self._column_names.keys() + hidden_col_ids = self._hidden_ordering_column_names.keys() + + all_columns = value_col_ids | hidden_col_ids + ordering_valid = all( + col.column_id in all_columns for col in ordering.all_ordering_columns + ) + if value_col_ids & hidden_col_ids: + raise ValueError( + f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" + ) + if not ordering_valid: + raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") + + @classmethod + def mem_expr_from_pandas( + cls, + pd_df: pandas.DataFrame, + ) -> CompiledArrayValue: + """ + Builds an in-memory only (SQL only) expr from a pandas dataframe. + """ + # We can't include any hidden columns in the ArrayValue constructor, so + # grab the column names before we add the hidden ordering column. + column_names = [str(column) for column in pd_df.columns] + # Make sure column names are all strings. + pd_df = pd_df.set_axis(column_names, axis="columns") + pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) + + # ibis memtable cannot handle NA, must convert to None + pd_df = pd_df.astype("object") # type: ignore + pd_df = pd_df.where(pandas.notnull(pd_df), None) + + # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. + keys_memtable = ibis.memtable(pd_df) + schema = keys_memtable.schema() + new_schema = [] + for column_index, column in enumerate(schema): + if column == ORDER_ID_COLUMN: + new_type: ibis_dtypes.DataType = ibis_dtypes.int64 + else: + column_type = schema[column] + # The autodetected type might not be one we can support, such + # as NULL type for empty rows, so convert to a type we do + # support. + new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) + ) + # TODO(swast): Ibis memtable doesn't use backticks in struct + # field names, so spaces and other characters aren't allowed in + # the memtable context. Blocked by + # https://github.com/ibis-project/ibis/issues/7187 + column = f"col_{column_index}" + new_schema.append((column, new_type)) + + # must set non-null column labels. 
these are not the user-facing labels + pd_df = pd_df.set_axis( + [column for column, _ in new_schema], + axis="columns", + ) + keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + + return cls( + keys_memtable, + columns=[ + keys_memtable[f"col_{column_index}"].name(column) + for column_index, column in enumerate(column_names) + ], + ordering=ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + ), + hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), + ) + + @property + def columns(self) -> typing.Tuple[ibis_types.Value, ...]: + return self._columns + + @property + def column_ids(self) -> typing.Sequence[str]: + return tuple(self._column_names.keys()) + + @property + def _hidden_column_ids(self) -> typing.Sequence[str]: + return tuple(self._hidden_ordering_column_names.keys()) + + @property + def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: + """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" + return ( + _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) + if self._predicates + else None + ) + + @property + def _ibis_order(self) -> Sequence[ibis_types.Value]: + """Returns a sequence of ibis values which can be directly used to order a table expression. Has direction modifiers applied.""" + return _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + self._ordering.all_ordering_columns, + ) + + def builder(self) -> ArrayValueBuilder: + """Creates a mutable builder for expressions.""" + # Since ArrayValue is intended to be immutable (immutability offers + # potential opportunities for caching, though we might need to introduce + # more node types for that to be useful), we create a builder class. + return ArrayValueBuilder( + self._table, + columns=self._columns, + hidden_ordering_columns=self._hidden_ordering_columns, + ordering=self._ordering, + predicates=self._predicates, + ) + + def drop_columns(self, columns: Iterable[str]) -> CompiledArrayValue: + # Must generate offsets if we are dropping a column that ordering depends on + expr = self + for ordering_column in set(columns).intersection( + [col.column_id for col in self._ordering.ordering_value_columns] + ): + expr = self._hide_column(ordering_column) + + expr_builder = expr.builder() + remain_cols = [ + column for column in expr.columns if column.get_name() not in columns + ] + expr_builder.columns = remain_cols + return expr_builder.build() + + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + ibis_type = typing.cast( + bigframes.dtypes.IbisDtype, self._get_any_column(key).type() + ) + return typing.cast( + bigframes.dtypes.Dtype, + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type), + ) + + def _get_ibis_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column.""" + if key not in self.column_ids: + raise ValueError( + "Column name {} not in set of values: {}".format(key, self.column_ids) + ) + return typing.cast(ibis_types.Value, self._column_names[key]) + + def _get_any_column(self, key: str) -> ibis_types.Value: + """Gets the Ibis expression for a given column. 
Will also get hidden columns.""" + all_columns = {**self._column_names, **self._hidden_ordering_column_names} + if key not in all_columns.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, all_columns.keys() + ) + ) + return typing.cast(ibis_types.Value, all_columns[key]) + + def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: + """Gets the Ibis expression for a given hidden column.""" + if key not in self._hidden_ordering_column_names.keys(): + raise ValueError( + "Column name {} not in set of values: {}".format( + key, self._hidden_ordering_column_names.keys() + ) + ) + return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) + + def filter(self, predicate_id: str, keep_null: bool = False) -> CompiledArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + condition = typing.cast( + ibis_types.BooleanValue, self._get_ibis_column(predicate_id) + ) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> CompiledArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + expr = self.builder() + expr.ordering = expr.ordering.with_non_sequential() + expr.predicates = [*self._predicates, predicate_value] + return expr.build() + + def order_by( + self, by: Sequence[OrderingColumnReference], stable: bool = False + ) -> CompiledArrayValue: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_ordering_columns(by, stable=stable) + return expr_builder.build() + + def reversed(self) -> CompiledArrayValue: + expr_builder = self.builder() + expr_builder.ordering = self._ordering.with_reverse() + return expr_builder.build() + + def _uniform_sampling(self, fraction: float) -> CompiledArrayValue: + """Sampling the table on given fraction. + + .. warning:: + The row numbers of result is non-deterministic, avoid to use. + """ + table = self._to_ibis_expr( + "unordered", expose_hidden_cols=True, fraction=fraction + ) + columns = [table[column_name] for column_name in self._column_names] + hidden_ordering_columns = [ + table[column_name] for column_name in self._hidden_ordering_column_names + ] + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + + @property + def _offsets(self) -> ibis_types.IntegerColumn: + if not self._ordering.is_sequential: + raise ValueError( + "Expression does not have offsets. Generate them first using project_offsets." + ) + if not self._ordering.total_order_col: + raise ValueError( + "Ordering is invalid. Marked as sequential but no total order columns." + ) + column = self._get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, column) + + def _project_offsets(self) -> CompiledArrayValue: + """Create a new expression that contains offsets. Should only be executed when offsets are needed for an operations. 
Has no effect on expression semantics.""" + if self._ordering.is_sequential: + return self + # TODO(tbergeron): Enforce total ordering + table = self._to_ibis_expr( + ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN + ) + columns = [table[column_name] for column_name in self._column_names] + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(True, is_sequential=True), + ) + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=[table[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def _hide_column(self, column_id) -> CompiledArrayValue: + """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" + expr_builder = self.builder() + # Need to rename column as caller might be creating a new row with the same name but different values. + # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. + new_name = bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") + expr_builder.hidden_ordering_columns = [ + *self._hidden_ordering_columns, + self._get_ibis_column(column_id).name(new_name), + ] + expr_builder.ordering = self._ordering.with_column_remap({column_id: new_name}) + return expr_builder.build() + + def promote_offsets(self, col_id: str) -> CompiledArrayValue: + """ + Convenience function to promote copy of column offsets to a value column. Can be used to reset index. + """ + # Special case: offsets already exist + ordering = self._ordering + + if (not ordering.is_sequential) or (not ordering.total_order_col): + return self._project_offsets().promote_offsets(col_id) + expr_builder = self.builder() + expr_builder.columns = [ + self._get_any_column(ordering.total_order_col.column_id).name(col_id), + *self.columns, + ] + return expr_builder.build() + + def select_columns(self, column_ids: typing.Sequence[str]) -> CompiledArrayValue: + """Creates a new expression based on this expression with new columns.""" + columns = [self._get_ibis_column(col_id) for col_id in column_ids] + expr = self + for ordering_column in set(self.column_ids).intersection( + [col_ref.column_id for col_ref in self._ordering.ordering_value_columns] + ): + # Need to hide ordering columns that are being dropped. Alternatively, could project offsets + expr = expr._hide_column(ordering_column) + builder = expr.builder() + builder.columns = list(columns) + new_expr = builder.build() + return new_expr + + def concat(self, other: typing.Sequence[CompiledArrayValue]) -> CompiledArrayValue: + """Append together multiple ArrayValue objects.""" + if len(other) == 0: + return self + tables = [] + prefix_base = 10 + prefix_size = math.ceil(math.log(len(other) + 1, prefix_base)) + # Must normalize all ids to the same encoding size + max_encoding_size = max( + self._ordering.string_encoding.length, + *[expression._ordering.string_encoding.length for expression in other], + ) + for i, expr in enumerate([self, *other]): + ordering_prefix = str(i).zfill(prefix_size) + table = expr._to_ibis_expr( + ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN + ) + # Rename the value columns based on horizontal offset before applying union. 
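The concat path here gives every input a fixed-width, zero-padded prefix and re-encodes each input's string ordering id to a common length before the union. A tiny sketch of why that preserves a total order across inputs (plain Python, illustrative names only):

    def prefixed_id(input_index: int, order_id: str, prefix_size: int, id_size: int) -> str:
        # Fixed widths mean plain string comparison respects (input, row) order.
        return str(input_index).zfill(prefix_size) + order_id.zfill(id_size)

    # Rows from input 0 sort before rows from input 1, regardless of their own ids...
    assert prefixed_id(0, "99", 2, 4) < prefixed_id(1, "00", 2, 4)
    # ...and within an input the re-encoded ordering id breaks ties.
    assert prefixed_id(1, "07", 2, 4) < prefixed_id(1, "10", 2, 4)
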
+ table = table.select( + [ + table[col].name(f"column_{i}") + if col != ORDER_ID_COLUMN + else ( + ordering_prefix + + reencode_order_string( + table[ORDER_ID_COLUMN], max_encoding_size + ) + ).name(ORDER_ID_COLUMN) + for i, col in enumerate(table.columns) + ] + ) + tables.append(table) + combined_table = ibis.union(*tables) + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + string_encoding=StringEncoding(True, prefix_size + max_encoding_size), + ) + return CompiledArrayValue( + combined_table, + columns=[ + combined_table[col] + for col in combined_table.columns + if col != ORDER_ID_COLUMN + ], + hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def project_unary_op( + self, column_name: str, op: ops.UnaryOp, output_name=None + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with unary operation applied to one column.""" + value = op._as_ibis(self._get_ibis_column(column_name)).name( + output_name or column_name + ) + return self._set_or_replace_by_id(output_name or column_name, value) + + def project_binary_op( + self, + left_column_id: str, + right_column_id: str, + op: ops.BinaryOp, + output_column_id: str, + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with binary operation applied to two columns.""" + value = op( + self._get_ibis_column(left_column_id), + self._get_ibis_column(right_column_id), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def project_ternary_op( + self, + col_id_1: str, + col_id_2: str, + col_id_3: str, + op: ops.TernaryOp, + output_column_id: str, + ) -> CompiledArrayValue: + """Creates a new expression based on this expression with ternary operation applied to three columns.""" + value = op( + self._get_ibis_column(col_id_1), + self._get_ibis_column(col_id_2), + self._get_ibis_column(col_id_3), + ).name(output_column_id) + return self._set_or_replace_by_id(output_column_id, value) + + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> CompiledArrayValue: + """ + Apply aggregations to the expression. 
+ Arguments: + aggregations: input_column_id, operation, output_column_id tuples + by_column_id: column id of the aggregation key, this is preserved through the transform + dropna: whether null keys should be dropped + """ + table = self._to_ibis_expr("unordered") + stats = { + col_out: agg_op._as_ibis(table[col_in]) + for col_in, agg_op, col_out in aggregations + } + if by_column_ids: + result = table.group_by(by_column_ids).aggregate(**stats) + # Must have deterministic ordering, so order by the unique "by" column + ordering = ExpressionOrdering( + tuple( + [ + OrderingColumnReference(column_id=column_id) + for column_id in by_column_ids + ] + ), + total_ordering_columns=frozenset(by_column_ids), + ) + columns = tuple(result[key] for key in result.columns) + expr = CompiledArrayValue(result, columns=columns, ordering=ordering) + if dropna: + for column_id in by_column_ids: + expr = expr._filter( + ops.notnull_op._as_ibis(expr._get_ibis_column(column_id)) + ) + # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation + return expr._project_offsets() + else: + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [OrderingColumnReference(ORDER_ID_COLUMN)] + ), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return CompiledArrayValue( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def corr_aggregate( + self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] + ) -> CompiledArrayValue: + """ + Get correlations between each lef_column_id and right_column_id, stored in the respective output_column_id. + This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. + Arguments: + corr_aggregations: left_column_id, right_column_id, output_column_id tuples + """ + table = self._to_ibis_expr("unordered") + stats = { + col_out: table[col_left].corr(table[col_right], how="pop") + for col_left, col_right, col_out in corr_aggregations + } + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it. + ordering = ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ORDER_ID_COLUMN)]), + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return CompiledArrayValue( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + + def project_window_op( + self, + column_name: str, + op: agg_ops.WindowOp, + window_spec: WindowSpec, + output_name=None, + *, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ) -> CompiledArrayValue: + """ + Creates a new expression based on this expression with unary operation applied to one column. 
+ column_name: the id of the input column present in the expression + op: the windowable operator to apply to the input column + window_spec: a specification of the window over which to apply the operator + output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + never_skip_nulls: will disable null skipping for operators that would otherwise do so + skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection + """ + column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) + window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) + + window_op = op._as_ibis(column, window) + + clauses = [] + if op.skips_nulls and not never_skip_nulls: + clauses.append((column.isnull(), ibis.NA)) + if window_spec.min_periods: + if op.skips_nulls: + # Most operations do not count NULL values towards min_periods + observation_count = agg_ops.count_op._as_ibis(column, window) + else: + # Operations like count treat even NULLs as valid observations for the sake of min_periods + # notnull is just used to convert null values to non-null (FALSE) values to be counted + denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) + observation_count = agg_ops.count_op._as_ibis(denulled_value, window) + clauses.append( + ( + observation_count < ibis_types.literal(window_spec.min_periods), + ibis.NA, + ) + ) + if clauses: + case_statement = ibis.case() + for clause in clauses: + case_statement = case_statement.when(clause[0], clause[1]) + case_statement = case_statement.else_(window_op).end() + window_op = case_statement + + result = self._set_or_replace_by_id(output_name or column_name, window_op) + # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. + return result._reproject_to_table() if not skip_reproject_unsafe else result + + def to_sql( + self, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + offsets_id = offset_column or ORDER_ID_COLUMN + + sql = ibis_bigquery.Backend().compile( + self._to_ibis_expr( + ordering_mode="offset_col" + if (offset_column or sorted) + else "unordered", + order_col_name=offsets_id, + col_id_overrides=col_id_overrides, + ) + ) + if sorted: + sql = textwrap.dedent( + f""" + SELECT * EXCEPT (`{offsets_id}`) + FROM ({sql}) + ORDER BY `{offsets_id}` + """ + ) + return typing.cast(str, sql) + + def _to_ibis_expr( + self, + ordering_mode: Literal["string_encoded", "offset_col", "unordered"], + order_col_name: Optional[str] = ORDER_ID_COLUMN, + expose_hidden_cols: bool = False, + fraction: Optional[float] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + ): + """ + Creates an Ibis table expression representing the DataFrame. + + ArrayValue objects are sorted, so the following options are available + to reflect this in the ibis expression. + + * "offset_col": Zero-based offsets are generated as a column, this will + not sort the rows however. + * "string_encoded": An ordered string column is provided in output table. + * "unordered": No ordering information will be provided in output. Only + value columns are projected. 
+ + For offset or ordered column, order_col_name can be used to assign the + output label for the ordering column. If none is specified, the default + column name will be 'bigframes_ordering_id' + + Args: + ordering_mode: + How to construct the Ibis expression from the ArrayValue. See + above for details. + order_col_name: + If the ordering mode outputs a single ordering or offsets + column, use this as the column name. + expose_hidden_cols: + If True, include the hidden ordering columns in the results. + Only compatible with `order_by` and `unordered` + ``ordering_mode``. + col_id_overrides: + overrides the column ids for the result + Returns: + An ibis expression representing the data help by the ArrayValue object. + """ + assert ordering_mode in ( + "string_encoded", + "offset_col", + "unordered", + ) + if expose_hidden_cols and ordering_mode in ("ordered_col", "offset_col"): + raise ValueError( + f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" + ) + + columns = list(self._columns) + columns_to_drop: list[ + str + ] = [] # Ordering/Filtering columns that will be dropped at end + + if self._reduced_predicate is not None: + columns.append(self._reduced_predicate) + # Usually drop predicate as it is will be all TRUE after filtering + if not expose_hidden_cols: + columns_to_drop.append(self._reduced_predicate.get_name()) + + order_columns = self._create_order_columns( + ordering_mode, order_col_name, expose_hidden_cols + ) + columns.extend(order_columns) + + # Special case for empty tables, since we can't create an empty + # projection. + if not columns: + return ibis.memtable([]) + + # Make sure all dtypes are the "canonical" ones for BigFrames. This is + # important for operations like UNION where the schema must match. 
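The cast to canonical BigFrames dtypes matters because set operations require both inputs to agree on column types. A local pyarrow analogue of the same idea (illustrative values; pyarrow's concat_tables requires matching schemas by default):

    import pyarrow as pa

    a = pa.table({"x": pa.array([1, 2], type=pa.int64())})
    b = pa.table({"x": pa.array([1.5], type=pa.float64())})
    target = pa.schema([("x", pa.float64())])
    # Cast both inputs to a shared "canonical" type before concatenating,
    # mirroring the select over canonical-typed columns above.
    combined = pa.concat_tables([a.cast(target), b.cast(target)])
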
+ table = self._table.select( + bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + ) + base_table = table + if self._reduced_predicate is not None: + table = table.filter(base_table[PREDICATE_COLUMN]) + table = table.drop(*columns_to_drop) + if col_id_overrides: + table = table.relabel(col_id_overrides) + if fraction is not None: + table = table.filter(ibis.random() < ibis.literal(fraction)) + return table + + def _create_order_columns( + self, + ordering_mode: str, + order_col_name: Optional[str], + expose_hidden_cols: bool, + ) -> typing.Sequence[ibis_types.Value]: + # Generate offsets if current ordering id semantics are not sufficiently strict + if ordering_mode == "offset_col": + return (self._create_offset_column().name(order_col_name),) + elif ordering_mode == "string_encoded": + return (self._create_string_ordering_column().name(order_col_name),) + elif expose_hidden_cols: + return self._hidden_ordering_columns + return () + + def _create_offset_column(self) -> ibis_types.IntegerColumn: + if self._ordering.total_order_col and self._ordering.is_sequential: + offsets = self._get_any_column(self._ordering.total_order_col.column_id) + return typing.cast(ibis_types.IntegerColumn, offsets) + else: + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + offsets = ibis.row_number().over(window) + return typing.cast(ibis_types.IntegerColumn, offsets) + + def _create_string_ordering_column(self) -> ibis_types.StringColumn: + if self._ordering.total_order_col and self._ordering.is_string_encoded: + string_order_ids = self._get_any_column( + self._ordering.total_order_col.column_id + ) + return typing.cast(ibis_types.StringColumn, string_order_ids) + if ( + self._ordering.total_order_col + and self._ordering.integer_encoding.is_encoded + ): + # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers + int_values = self._get_any_column(self._ordering.total_order_col.column_id) + return encode_order_string( + typing.cast(ibis_types.IntegerColumn, int_values), + ) + else: + # Have to build string from scratch + window = ibis.window(order_by=self._ibis_order) + if self._predicates: + window = window.group_by(self._reduced_predicate) + row_nums = typing.cast( + ibis_types.IntegerColumn, ibis.row_number().over(window) + ) + return encode_order_string(row_nums) + + def _reproject_to_table(self) -> CompiledArrayValue: + """ + Internal operators that projects the internal representation into a + new ibis table expression where each value column is a direct + reference to a column in that table expression. Needed after + some operations such as window operations that cannot be used + recursively in projections. 
+ """ + table = self._to_ibis_expr( + "unordered", + expose_hidden_cols=True, + ) + columns = [table[column_name] for column_name in self._column_names] + ordering_col_ids = [ + ref.column_id for ref in self._ordering.all_ordering_columns + ] + hidden_ordering_columns = [ + table[column_name] + for column_name in self._hidden_ordering_column_names + if column_name in ordering_col_ids + ] + return CompiledArrayValue( + table, + columns=columns, + hidden_ordering_columns=hidden_ordering_columns, + ordering=self._ordering, + ) + + def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = False): + group_by: typing.List[ibis_types.Value] = ( + [ + typing.cast( + ibis_types.Column, _as_identity(self._get_ibis_column(column)) + ) + for column in window_spec.grouping_keys + ] + if window_spec.grouping_keys + else [] + ) + if self._reduced_predicate is not None: + group_by.append(self._reduced_predicate) + if window_spec.ordering: + order_by = _convert_ordering_to_table_values( + {**self._column_names, **self._hidden_ordering_column_names}, + window_spec.ordering, + ) + if not allow_ties: + # Most operator need an unambiguous ordering, so the table's total ordering is appended + order_by = tuple([*order_by, *self._ibis_order]) + elif (window_spec.following is not None) or (window_spec.preceding is not None): + # If window spec has following or preceding bounds, we need to apply an unambiguous ordering. + order_by = tuple(self._ibis_order) + else: + # Unbound grouping window. Suitable for aggregations but not for analytic function application. + order_by = None + return ibis.window( + preceding=window_spec.preceding, + following=window_spec.following, + order_by=order_by, + group_by=group_by, + ) + + def unpivot( + self, + row_labels: typing.Sequence[typing.Hashable], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], + *, + passthrough_columns: typing.Sequence[str] = (), + index_col_ids: typing.Sequence[str] = ["index"], + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), + how="left", + ) -> CompiledArrayValue: + """ + Unpivot ArrayValue columns. + + Args: + row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. + unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. + passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. + index_col_id (str): The column id to be used for the row labels. + dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
+ + Returns: + ArrayValue: The unpivoted ArrayValue + """ + if how not in ("left", "right"): + raise ValueError("'how' must be 'left' or 'right'") + table = self._to_ibis_expr("unordered", expose_hidden_cols=True) + row_n = len(row_labels) + hidden_col_ids = self._hidden_ordering_column_names.keys() + if not all( + len(source_columns) == row_n for _, source_columns in unpivot_columns + ): + raise ValueError("Columns and row labels must all be same length.") + + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + if isinstance(row_labels[0], tuple): + labels_table = ibis.memtable(row_labels) + labels_ibis_types = [ + labels_table[col].type() for col in labels_table.columns + ] + else: + labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] + labels_dtypes = [ + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + for ibis_type in labels_ibis_types + ] + + label_columns = [] + for label_part, (col_id, label_dtype) in enumerate( + zip(index_col_ids, labels_dtypes) + ): + # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels + labels_as_tuples = [ + label if isinstance(label, tuple) else (label,) for label in row_labels + ] + cases = [ + ( + i, + bigframes.dtypes.literal_to_ibis_scalar( + label_tuple[label_part], # type:ignore + force_dtype=label_dtype, # type:ignore + ), + ) + for i, label_tuple in enumerate(labels_as_tuples) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(col_id) + ) + label_columns.append(labels_value) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore + ) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, + *label_columns, + *unpivot_values, + *hidden_col_ids, + unpivot_offset_id, + ) + + # Extend the original ordering using unpivot_offset_id + old_ordering = self._ordering + if how == "left": + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + *old_ordering.ordering_value_columns, + OrderingColumnReference(unpivot_offset_id), + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + else: # how=="right" + new_ordering = ExpressionOrdering( + ordering_value_columns=tuple( + [ + OrderingColumnReference(unpivot_offset_id), + *old_ordering.ordering_value_columns, + ] + ), + total_ordering_columns=frozenset( + [*old_ordering.total_ordering_columns, unpivot_offset_id] + ), + ) + value_columns = [ + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns + ] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] + hidden_ordering_columns = [ + 
unpivot_table[unpivot_offset_id], + *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], + ] + return CompiledArrayValue( + table=unpivot_table, + columns=[ + *[unpivot_table[col_id] for col_id in index_col_ids], + *value_columns, + *passthrough_values, + ], + hidden_ordering_columns=hidden_ordering_columns, + ordering=new_ordering, + ) + + def assign(self, source_id: str, destination_id: str) -> CompiledArrayValue: + return self._set_or_replace_by_id( + destination_id, self._get_ibis_column(source_id) + ) + + def assign_constant( + self, + destination_id: str, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> CompiledArrayValue: + # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis. + ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype) + if ibis_value is None: + raise NotImplementedError( + f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}" + ) + expr = self._set_or_replace_by_id(destination_id, ibis_value) + return expr._reproject_to_table() + + def _set_or_replace_by_id( + self, id: str, new_value: ibis_types.Value + ) -> CompiledArrayValue: + """Safely assign by id while maintaining ordering integrity.""" + # TODO: Split into explicit set and replace methods + ordering_col_ids = [ + col_ref.column_id for col_ref in self._ordering.ordering_value_columns + ] + if id in ordering_col_ids: + return self._hide_column(id)._set_or_replace_by_id(id, new_value) + + builder = self.builder() + if id in self.column_ids: + builder.columns = [ + val if (col_id != id) else new_value.name(id) + for col_id, val in zip(self.column_ids, self._columns) + ] + else: + builder.columns = [*self.columns, new_value.name(id)] + return builder.build() + + +class ArrayValueBuilder: + """Mutable expression class. + Use ArrayValue.builder() to create from a ArrayValue object. 
+ """ + + def __init__( + self, + table: ibis_types.Table, + ordering: ExpressionOrdering, + columns: Collection[ibis_types.Value] = (), + hidden_ordering_columns: Collection[ibis_types.Value] = (), + predicates: Optional[Collection[ibis_types.BooleanValue]] = None, + ): + self.table = table + self.columns = list(columns) + self.hidden_ordering_columns = list(hidden_ordering_columns) + self.ordering = ordering + self.predicates = list(predicates) if predicates is not None else None + + def build(self) -> CompiledArrayValue: + return CompiledArrayValue( + table=self.table, + columns=self.columns, + hidden_ordering_columns=self.hidden_ordering_columns, + ordering=self.ordering, + predicates=self.predicates, + ) + + +def _reduce_predicate_list( + predicate_list: typing.Collection[ibis_types.BooleanValue], +) -> ibis_types.BooleanValue: + """Converts a list of predicates BooleanValues into a single BooleanValue.""" + if len(predicate_list) == 0: + raise ValueError("Cannot reduce empty list of predicates") + if len(predicate_list) == 1: + (item,) = predicate_list + return item + return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list) + + +def _convert_ordering_to_table_values( + value_lookup: typing.Mapping[str, ibis_types.Value], + ordering_columns: typing.Sequence[OrderingColumnReference], +) -> typing.Sequence[ibis_types.Value]: + column_refs = ordering_columns + ordering_values = [] + for ordering_col in column_refs: + column = typing.cast(ibis_types.Column, value_lookup[ordering_col.column_id]) + ordering_value = ( + ibis.asc(column) + if ordering_col.direction.is_ascending + else ibis.desc(column) + ) + # Bigquery SQL considers NULLS to be "smallest" values, but we need to override in these cases. + if (not ordering_col.na_last) and (not ordering_col.direction.is_ascending): + # Force nulls to be first + is_null_val = typing.cast(ibis_types.Column, column.isnull()) + ordering_values.append(ibis.desc(is_null_val)) + elif (ordering_col.na_last) and (ordering_col.direction.is_ascending): + # Force nulls to be last + is_null_val = typing.cast(ibis_types.Column, column.isnull()) + ordering_values.append(ibis.asc(is_null_val)) + ordering_values.append(ordering_value) + return ordering_values + + +def _as_identity(value: ibis_types.Value): + # Some types need to be converted to string to enable groupby + if value.type().is_float64() or value.type().is_geospatial(): + return value.cast(ibis_dtypes.str) + return value diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py new file mode 100644 index 0000000000..195d830122 --- /dev/null +++ b/bigframes/core/compile/compiler.py @@ -0,0 +1,185 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations
+
+import functools
+import io
+import typing
+
+import pandas as pd
+
+import bigframes.core.compile as compiled
+import bigframes.core.compile.single_column
+import bigframes.core.nodes as nodes
+
+if typing.TYPE_CHECKING:
+    import bigframes.core
+    import bigframes.session
+
+
+@functools.cache
+def compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue:
+    """Compile node into CompiledArrayValue. Caches result."""
+    return _compile_node(node)
+
+
+@functools.singledispatch
+def _compile_node(node: nodes.BigFrameNode) -> compiled.CompiledArrayValue:
+    """Defines transformation but isn't cached; always use compile_node instead."""
+    raise ValueError(f"Can't compile unrecognized node: {node}")
+
+
+@_compile_node.register
+def compile_join(node: nodes.JoinNode):
+    compiled_left = compile_node(node.left_child)
+    compiled_right = compile_node(node.right_child)
+    return bigframes.core.compile.single_column.join_by_column(
+        compiled_left,
+        node.left_column_ids,
+        compiled_right,
+        node.right_column_ids,
+        how=node.how,
+        allow_row_identity_join=node.allow_row_identity_join,
+    )
+
+
+@_compile_node.register
+def compile_select(node: nodes.SelectNode):
+    return compile_node(node.child).select_columns(node.column_ids)
+
+
+@_compile_node.register
+def compile_drop(node: nodes.DropColumnsNode):
+    return compile_node(node.child).drop_columns(node.columns)
+
+
+@_compile_node.register
+def compile_readlocal(node: nodes.ReadLocalNode):
+    array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes))
+    return compiled.CompiledArrayValue.mem_expr_from_pandas(array_as_pd)
+
+
+@_compile_node.register
+def compile_readgbq(node: nodes.ReadGbqNode):
+    return compiled.CompiledArrayValue(
+        node.table,
+        node.columns,
+        node.hidden_ordering_columns,
+        node.ordering,
+    )
+
+
+@_compile_node.register
+def compile_promote_offsets(node: nodes.PromoteOffsetsNode):
+    return compile_node(node.child).promote_offsets(node.col_id)
+
+
+@_compile_node.register
+def compile_filter(node: nodes.FilterNode):
+    return compile_node(node.child).filter(node.predicate_id, node.keep_null)
+
+
+@_compile_node.register
+def compile_orderby(node: nodes.OrderByNode):
+    return compile_node(node.child).order_by(node.by, node.stable)
+
+
+@_compile_node.register
+def compile_reversed(node: nodes.ReversedNode):
+    return compile_node(node.child).reversed()
+
+
+@_compile_node.register
+def compile_project_unary(node: nodes.ProjectUnaryOpNode):
+    return compile_node(node.child).project_unary_op(
+        node.input_id, node.op, node.output_id
+    )
+
+
+@_compile_node.register
+def compile_project_binary(node: nodes.ProjectBinaryOpNode):
+    return compile_node(node.child).project_binary_op(
+        node.left_input_id, node.right_input_id, node.op, node.output_id
+    )
+
+
+@_compile_node.register
+def compile_project_ternary(node: nodes.ProjectTernaryOpNode):
+    return compile_node(node.child).project_ternary_op(
+        node.input_id1, node.input_id2, node.input_id3, node.op, node.output_id
+    )
+
+
+@_compile_node.register
+def compile_concat(node: nodes.ConcatNode):
+    compiled_nodes = [compile_node(node) for node in node.children]
+    return compiled_nodes[0].concat(compiled_nodes[1:])
+
+
+@_compile_node.register
+def compile_aggregate(node: nodes.AggregateNode):
+    return compile_node(node.child).aggregate(
+        node.aggregations, node.by_column_ids, node.dropna
+    )
+
+
+@_compile_node.register
+def compile_corr(node: nodes.CorrNode):
+    return compile_node(node.child).corr_aggregate(node.corr_aggregations)
+
+
+@_compile_node.register +def compile_window(node: nodes.WindowOpNode): + return compile_node(node.child).project_window_op( + node.column_name, + node.op, + node.window_spec, + node.output_name, + never_skip_nulls=node.never_skip_nulls, + skip_reproject_unsafe=node.skip_reproject_unsafe, + ) + + +@_compile_node.register +def compile_reproject(node: nodes.ReprojectOpNode): + return compile_node(node.child)._reproject_to_table() + + +@_compile_node.register +def compile_unpivot(node: nodes.UnpivotNode): + return compile_node(node.child).unpivot( + node.row_labels, + node.unpivot_columns, + passthrough_columns=node.passthrough_columns, + index_col_ids=node.index_col_ids, + dtype=node.dtype, + how=node.how, + ) + + +@_compile_node.register +def compile_assign(node: nodes.AssignNode): + return compile_node(node.child).assign(node.source_id, node.destination_id) + + +@_compile_node.register +def compile_assign_constant(node: nodes.AssignConstantNode): + return compile_node(node.child).assign_constant( + node.destination_id, node.value, node.dtype + ) + + +@_compile_node.register +def compiler_random_sample(node: nodes.RandomSampleNode): + return compile_node(node.child)._uniform_sampling(node.fraction) diff --git a/bigframes/core/joins/row_identity.py b/bigframes/core/compile/row_identity.py similarity index 94% rename from bigframes/core/joins/row_identity.py rename to bigframes/core/compile/row_identity.py index 76e456ec94..2e9bc0527c 100644 --- a/bigframes/core/joins/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -23,15 +23,16 @@ import ibis.expr.types as ibis_types import bigframes.constants as constants -import bigframes.core as core +import bigframes.core.compile as compiled import bigframes.core.joins.name_resolution as naming +import bigframes.core.ordering as orderings SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} def join_by_row_identity( - left: core.ArrayValue, right: core.ArrayValue, *, how: str -) -> core.ArrayValue: + left: compiled.CompiledArrayValue, right: compiled.CompiledArrayValue, *, how: str +) -> compiled.CompiledArrayValue: """Compute join when we are joining by row identity not a specific column.""" if how not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( @@ -101,8 +102,8 @@ def join_by_row_identity( ) # Assume that left ordering is sufficient since 1:1 join over same base table join_total_order_cols = left_total_order_cols - new_ordering = core.ExpressionOrdering( - ordering_columns, total_ordering_columns=join_total_order_cols + new_ordering = orderings.ExpressionOrdering( + tuple(ordering_columns), total_ordering_columns=join_total_order_cols ) hidden_ordering_columns = [ @@ -117,8 +118,7 @@ def join_by_row_identity( if key.column_id in right._hidden_ordering_column_names.keys() ] - joined_expr = core.ArrayValue( - left._session, + joined_expr = compiled.CompiledArrayValue( left._table, columns=joined_columns, hidden_ordering_columns=hidden_ordering_columns, diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/compile/single_column.py similarity index 87% rename from bigframes/core/joins/single_column.py rename to bigframes/core/compile/single_column.py index 0c0e2008b5..b992aa1d1d 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,16 +23,16 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types -import bigframes.core as core -import bigframes.core.joins.name_resolution as naming -import bigframes.core.joins.row_identity -import 
bigframes.core.ordering +import bigframes.core.compile as compiled +import bigframes.core.compile.row_identity +import bigframes.core.joins as joining +import bigframes.core.ordering as orderings def join_by_column( - left: core.ArrayValue, + left: compiled.CompiledArrayValue, left_column_ids: typing.Sequence[str], - right: core.ArrayValue, + right: compiled.CompiledArrayValue, right_column_ids: typing.Sequence[str], *, how: Literal[ @@ -42,7 +42,7 @@ def join_by_column( "right", ], allow_row_identity_join: bool = True, -) -> core.ArrayValue: +) -> compiled.CompiledArrayValue: """Join two expressions by column equality. Arguments: @@ -61,7 +61,7 @@ def join_by_column( """ if ( allow_row_identity_join - and how in bigframes.core.joins.row_identity.SUPPORTED_ROW_IDENTITY_HOW + and how in bigframes.core.compile.row_identity.SUPPORTED_ROW_IDENTITY_HOW and left._table.equals(right._table) # Make sure we're joining on exactly the same column(s), at least with # regards to value its possible that they both have the same names but @@ -73,15 +73,15 @@ def join_by_column( for lcol, rcol in zip(left_column_ids, right_column_ids) ) ): - return bigframes.core.joins.row_identity.join_by_row_identity( + return bigframes.core.compile.row_identity.join_by_row_identity( left, right, how=how ) else: # Value column mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result - l_public_mapping, r_public_mapping = naming.JOIN_NAME_REMAPPER( + l_public_mapping, r_public_mapping = joining.JOIN_NAME_REMAPPER( left.column_ids, right.column_ids ) - l_hidden_mapping, r_hidden_mapping = naming.JoinNameRemapper( + l_hidden_mapping, r_hidden_mapping = joining.JoinNameRemapper( namespace="hidden" )(left._hidden_column_ids, right._hidden_column_ids) l_mapping = {**l_public_mapping, **l_hidden_mapping} @@ -134,8 +134,7 @@ def join_by_column( for col in right._hidden_ordering_columns ], ] - return core.ArrayValue( - left._session, + return compiled.CompiledArrayValue( combined_table, columns=columns, hidden_ordering_columns=hidden_ordering_columns, @@ -151,12 +150,12 @@ def value_to_join_key(value: ibis_types.Value): def join_orderings( - left: core.ExpressionOrdering, - right: core.ExpressionOrdering, + left: orderings.ExpressionOrdering, + right: orderings.ExpressionOrdering, left_id_mapping: Mapping[str, str], right_id_mapping: Mapping[str, str], left_order_dominates: bool = True, -) -> core.ExpressionOrdering: +) -> orderings.ExpressionOrdering: left_ordering_refs = [ ref.with_name(left_id_mapping[ref.column_id]) for ref in left.all_ordering_columns @@ -176,7 +175,7 @@ def join_orderings( right_total_order_cols = frozenset( [right_id_mapping[id] for id in right.total_ordering_columns] ) - return core.ExpressionOrdering( - ordering_value_columns=joined_refs, + return orderings.ExpressionOrdering( + ordering_value_columns=tuple(joined_refs), total_ordering_columns=left_total_order_cols | right_total_order_cols, ) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index db0843fcbc..2a19a83dd5 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -193,7 +193,7 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame: def shift(self, periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -201,7 +201,7 @@ def shift(self, periods=1) -> series.Series: def diff(self, 
periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -210,7 +210,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, min_periods=min_periods or window, @@ -225,7 +225,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), following=0, min_periods=min_periods, ) @@ -389,7 +389,7 @@ def _apply_window_op( ): """Apply window op to groupby. Defaults to grouped cumulative window.""" window_spec = window or core.WindowSpec( - grouping_keys=self._by_col_ids, following=0 + grouping_keys=tuple(self._by_col_ids), following=0 ) columns = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( @@ -528,7 +528,7 @@ def cumcount(self, *args, **kwargs) -> series.Series: def shift(self, periods=1) -> series.Series: """Shift index by desired number of periods.""" window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -536,7 +536,7 @@ def shift(self, periods=1) -> series.Series: def diff(self, periods=1) -> series.Series: window = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -545,7 +545,7 @@ def diff(self, periods=1) -> series.Series: def rolling(self, window: int, min_periods=None) -> windows.Window: # To get n size window, need current row and n-1 preceding rows. window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), preceding=window - 1, following=0, min_periods=min_periods or window, @@ -564,7 +564,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( - grouping_keys=self._by_col_ids, + grouping_keys=tuple(self._by_col_ids), following=0, min_periods=min_periods, ) @@ -597,7 +597,7 @@ def _apply_window_op( ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" window_spec = window or core.WindowSpec( - grouping_keys=self._by_col_ids, following=0 + grouping_keys=tuple(self._by_col_ids), following=0 ) label = self._value_name if not discard_name else None diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index d18a0a38ef..f6ce084714 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -311,7 +311,7 @@ def _loc_getitem_series_or_dataframe( values = [entry[i] for entry in key] index_cols_dict[index_name] = values keys_df = bigframes.dataframe.DataFrame( - index_cols_dict, session=series_or_dataframe._get_block().expr._session + index_cols_dict, session=series_or_dataframe._get_block().expr.session ) keys_df = keys_df.set_index(temporary_index_names, drop=True) keys_df = keys_df.rename_axis(original_index_names) @@ -324,7 +324,7 @@ def _loc_getitem_series_or_dataframe( index_name = "unnamed_col" keys_df = bigframes.dataframe.DataFrame( {index_name: key}, - session=series_or_dataframe._get_block().expr._session, + session=series_or_dataframe._get_block().expr.session, ) keys_df = keys_df.set_index(index_name, drop=True) if index_name_is_none: @@ -343,7 +343,7 @@ def _loc_getitem_series_or_dataframe( elif pd.api.types.is_scalar(key): index_name = "unnamed_col" keys_df = bigframes.dataframe.DataFrame( - {index_name: [key]}, session=series_or_dataframe._get_block().expr._session + {index_name: [key]}, session=series_or_dataframe._get_block().expr.session ) keys_df = keys_df.set_index(index_name, drop=True) keys_df.index.name = None diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index b9ffdff21e..6c66c36062 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -26,8 +26,7 @@ import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks -import bigframes.core.joins as joins -import bigframes.core.joins.name_resolution as join_names +import bigframes.core.joins as joining import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dtypes @@ -402,7 +401,7 @@ def to_pandas(self) -> pandas.Index: dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() - df = expr._session._rows_to_dataframe(results, dtypes) + df = expr.session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index index.names = list(self._block._index_labels) @@ -461,11 +460,10 @@ def join_mono_indexed( ) -> Tuple[IndexValue, Tuple[Mapping[str, str], Mapping[str, str]],]: left_expr = left._block.expr right_expr = right._block.expr - get_column_left, get_column_right = join_names.JOIN_NAME_REMAPPER( + get_column_left, get_column_right = joining.JOIN_NAME_REMAPPER( left_expr.column_ids, right_expr.column_ids ) - combined_expr = joins.join_by_column( - left._block.expr, + combined_expr = left._block.expr.join( left._block.index_columns, right._block.expr, right._block.index_columns, @@ -520,12 +518,11 @@ def join_multi_indexed( left_expr = left._block.expr right_expr = right._block.expr - get_column_left, get_column_right = join_names.JOIN_NAME_REMAPPER( + get_column_left, get_column_right = joining.JOIN_NAME_REMAPPER( left_expr.column_ids, right_expr.column_ids ) - combined_expr = joins.join_by_column( - left_expr, + combined_expr = left_expr.join( left_join_ids, right_expr, right_join_ids, diff --git a/bigframes/core/joins/__init__.py 
b/bigframes/core/joins/__init__.py
index 3f9447aef0..5d407ec22b 100644
--- a/bigframes/core/joins/__init__.py
+++ b/bigframes/core/joins/__init__.py
@@ -15,11 +15,6 @@
 """Helpers to join ArrayValue objects."""
 
 from bigframes.core.joins.merge import merge
-from bigframes.core.joins.row_identity import join_by_row_identity
-from bigframes.core.joins.single_column import join_by_column
+from bigframes.core.joins.name_resolution import JOIN_NAME_REMAPPER, JoinNameRemapper
 
-__all__ = (
-    "join_by_row_identity",
-    "join_by_column",
-    "merge",
-)
+__all__ = ("merge", "JoinNameRemapper", "JOIN_NAME_REMAPPER")
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
new file mode 100644
index 0000000000..7b252b164f
--- /dev/null
+++ b/bigframes/core/nodes.py
@@ -0,0 +1,245 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+import functools
+import typing
+from typing import Optional, Tuple
+
+import pandas
+
+import bigframes.core.guid
+from bigframes.core.ordering import OrderingColumnReference
+import bigframes.core.window_spec as window
+import bigframes.dtypes
+import bigframes.operations as ops
+import bigframes.operations.aggregations as agg_ops
+
+if typing.TYPE_CHECKING:
+    import ibis.expr.types as ibis_types
+
+    import bigframes.core.ordering as orderings
+    import bigframes.session
+
+
+@dataclass(frozen=True)
+class BigFrameNode:
+    """
+    Immutable node for representing 2D typed array as a tree of operators.
+
+    All subclasses must be hashable so as to be usable as caching key.
+    """
+
+    @property
+    def deterministic(self) -> bool:
+        """Whether this node will evaluate deterministically."""
+        return True
+
+    @property
+    def child_nodes(self) -> typing.Sequence[BigFrameNode]:
+        """Direct children of this node"""
+        return tuple([])
+
+    @functools.cached_property
+    def session(self):
+        sessions = []
+        for child in self.child_nodes:
+            if child.session is not None:
+                sessions.append(child.session)
+        unique_sessions = len(set(sessions))
+        if unique_sessions > 1:
+            raise ValueError("Cannot combine sources from multiple sessions.")
+        elif unique_sessions == 1:
+            return sessions[0]
+        return None
+
+
+@dataclass(frozen=True)
+class UnaryNode(BigFrameNode):
+    child: BigFrameNode
+
+    @property
+    def child_nodes(self) -> typing.Sequence[BigFrameNode]:
+        return (self.child,)
+
+
+@dataclass(frozen=True)
+class JoinNode(BigFrameNode):
+    left_child: BigFrameNode
+    right_child: BigFrameNode
+    left_column_ids: typing.Tuple[str, ...]
+    right_column_ids: typing.Tuple[str, ...]
+    how: typing.Literal[
+        "inner",
+        "left",
+        "outer",
+        "right",
+    ]
+    allow_row_identity_join: bool = True
+
+    @property
+    def child_nodes(self) -> typing.Sequence[BigFrameNode]:
+        return (self.left_child, self.right_child)
+
+
+@dataclass(frozen=True)
+class ConcatNode(BigFrameNode):
+    children: Tuple[BigFrameNode, ...]
+ + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return self.children + + +# Input Nodex +@dataclass(frozen=True) +class ReadLocalNode(BigFrameNode): + feather_bytes: bytes + column_ids: typing.Tuple[str, ...] + + +# TODO: Refactor to take raw gbq object reference +@dataclass(frozen=True) +class ReadGbqNode(BigFrameNode): + table: ibis_types.Table = field() + table_session: bigframes.session.Session = field() + columns: Tuple[ibis_types.Value, ...] = field() + hidden_ordering_columns: Tuple[ibis_types.Value, ...] = field() + ordering: orderings.ExpressionOrdering = field() + + @property + def session(self): + return (self.table_session,) + + +# Unary nodes +@dataclass(frozen=True) +class DropColumnsNode(UnaryNode): + columns: Tuple[str, ...] + + +@dataclass(frozen=True) +class PromoteOffsetsNode(UnaryNode): + col_id: str + + +@dataclass(frozen=True) +class FilterNode(UnaryNode): + predicate_id: str + keep_null: bool = False + + +@dataclass(frozen=True) +class OrderByNode(UnaryNode): + by: Tuple[OrderingColumnReference, ...] + stable: bool = False + + +@dataclass(frozen=True) +class ReversedNode(UnaryNode): + pass + + +@dataclass(frozen=True) +class SelectNode(UnaryNode): + column_ids: typing.Tuple[str, ...] + + +@dataclass(frozen=True) +class ProjectUnaryOpNode(UnaryNode): + input_id: str + op: ops.UnaryOp + output_id: Optional[str] = None + + +@dataclass(frozen=True) +class ProjectBinaryOpNode(UnaryNode): + left_input_id: str + right_input_id: str + op: ops.BinaryOp + output_id: str + + +@dataclass(frozen=True) +class ProjectTernaryOpNode(UnaryNode): + input_id1: str + input_id2: str + input_id3: str + op: ops.TernaryOp + output_id: str + + +@dataclass(frozen=True) +class AggregateNode(UnaryNode): + aggregations: typing.Tuple[typing.Tuple[str, agg_ops.AggregateOp, str], ...] + by_column_ids: typing.Tuple[str, ...] = tuple([]) + dropna: bool = True + + +# TODO: Unify into aggregate +@dataclass(frozen=True) +class CorrNode(UnaryNode): + corr_aggregations: typing.Tuple[typing.Tuple[str, str, str], ...] + + +@dataclass(frozen=True) +class WindowOpNode(UnaryNode): + column_name: str + op: agg_ops.WindowOp + window_spec: window.WindowSpec + output_name: typing.Optional[str] = None + never_skip_nulls: bool = False + skip_reproject_unsafe: bool = False + + +@dataclass(frozen=True) +class ReprojectOpNode(UnaryNode): + pass + + +@dataclass(frozen=True) +class UnpivotNode(UnaryNode): + row_labels: typing.Tuple[typing.Hashable, ...] + unpivot_columns: typing.Tuple[ + typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]], ... + ] + passthrough_columns: typing.Tuple[str, ...] = () + index_col_ids: typing.Tuple[str, ...] = ("index",) + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
+    ] = (pandas.Float64Dtype(),)
+    how: typing.Literal["left", "right"] = "left"
+
+
+@dataclass(frozen=True)
+class AssignNode(UnaryNode):
+    source_id: str
+    destination_id: str
+
+
+@dataclass(frozen=True)
+class AssignConstantNode(UnaryNode):
+    destination_id: str
+    value: typing.Hashable
+    dtype: typing.Optional[bigframes.dtypes.Dtype]
+
+
+@dataclass(frozen=True)
+class RandomSampleNode(UnaryNode):
+    fraction: float
+
+    @property
+    def deterministic(self) -> bool:
+        return False
diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py
index d5f07ecf91..2cecd2fe7b 100644
--- a/bigframes/core/ordering.py
+++ b/bigframes/core/ordering.py
@@ -86,7 +86,7 @@ class IntegerEncoding:
 class ExpressionOrdering:
     """Immutable object that holds information about the ordering of rows in a ArrayValue object."""
 
-    ordering_value_columns: Sequence[OrderingColumnReference] = ()
+    ordering_value_columns: typing.Tuple[OrderingColumnReference, ...] = ()
     integer_encoding: IntegerEncoding = IntegerEncoding(False)
     string_encoding: StringEncoding = StringEncoding(False)
     # A table has a total ordering defined by the identities of a set of 1 or more columns.
@@ -170,7 +170,7 @@ def with_column_remap(self, mapping: typing.Mapping[str, str]):
             mapping.get(col_id, col_id) for col_id in self.total_ordering_columns
         )
         return ExpressionOrdering(
-            new_value_columns,
+            tuple(new_value_columns),
             integer_encoding=self.integer_encoding,
             string_encoding=self.string_encoding,
             total_ordering_columns=new_total_order,
diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py
new file mode 100644
index 0000000000..3458bfb1b8
--- /dev/null
+++ b/bigframes/core/window_spec.py
@@ -0,0 +1,35 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+import typing
+
+import bigframes.core.ordering as orderings
+
+
+@dataclass(frozen=True)
+class WindowSpec:
+    """
+    Specifies a window over which aggregate and analytic functions may be applied.
+    grouping_keys: set of column ids to group on
+    preceding: Number of preceding rows in the window
+    following: Number of following rows in the window
+    ordering: List of column ids and ordering direction to override base ordering
+    """
+
+    grouping_keys: typing.Tuple[str, ...] = tuple()
+    ordering: typing.Tuple[orderings.OrderingColumnReference, ...] 
= tuple() + preceding: typing.Optional[int] = None + following: typing.Optional[int] = None + min_periods: int = 0 diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3fd8319876..9d22c02d87 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -170,9 +170,7 @@ def __init__( if isinstance(dt, pandas.ArrowDtype) ) ): - self._block = blocks.block_from_local( - pd_dataframe, session or bigframes.pandas.get_global_session() - ) + self._block = blocks.block_from_local(pd_dataframe) elif session: self._block = session.read_pandas(pd_dataframe)._get_block() else: @@ -299,7 +297,7 @@ def values(self) -> numpy.ndarray: @property def _session(self) -> bigframes.Session: - return self._get_block().expr._session + return self._get_block().expr.session def __len__(self): rows, _ = self.shape @@ -1107,7 +1105,7 @@ def _assign_single_item( ) local_df = bigframes.dataframe.DataFrame( - {k: v}, session=self._get_block().expr._session + {k: v}, session=self._get_block().expr.session ) # local_df is likely (but not guarunteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE @@ -2203,7 +2201,7 @@ def to_csv( field_delimiter=sep, header=header, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_json( @@ -2245,7 +2243,7 @@ def to_json( format="JSON", export_options={}, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_gbq( @@ -2274,7 +2272,7 @@ def to_gbq( write_disposition=dispositions[if_exists], destination=bigquery.table.TableReference.from_string( destination_table, - default_project=self._block.expr._session.bqclient.project, + default_project=self._block.expr.session.bqclient.project, ), ) @@ -2321,7 +2319,7 @@ def to_parquet( format="PARQUET", export_options=export_options, ) - _, query_job = self._block.expr._session._start_query(export_data_statement) + _, query_job = self._block.expr.session._start_query(export_data_statement) self._set_internal_query_job(query_job) def to_dict( @@ -2464,7 +2462,7 @@ def _run_io_query( """Executes a query job presenting this dataframe and returns the destination table.""" expr = self._block.expr - session = expr._session + session = expr.session sql = self._create_io_query(index=index, ordering_id=ordering_id) _, query_job = session._start_query( sql=sql, job_config=job_config # type: ignore diff --git a/bigframes/ml/metrics.py b/bigframes/ml/metrics.py index 3bcb621f74..5731b946ca 100644 --- a/bigframes/ml/metrics.py +++ b/bigframes/ml/metrics.py @@ -96,7 +96,7 @@ def roc_curve( y_true_series, y_score_series = utils.convert_to_series(y_true, y_score) - session = y_true_series._block.expr._session + session = y_true_series._block.expr.session # We operate on rows, so, remove the index if there is one # TODO(bmil): check that the indexes are equivalent before removing diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index b9abb2cc03..d33befe4da 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -94,9 +94,7 @@ def __init__( if isinstance(dt, pd.ArrowDtype) ) ): - self._block = blocks.block_from_local( - pd_dataframe, session or bigframes.pandas.get_global_session() - ) + self._block = blocks.block_from_local(pd_dataframe) elif 
session: self._block = session.read_pandas(pd_dataframe)._get_block() else: diff --git a/bigframes/series.py b/bigframes/series.py index c191452783..37d00d16f3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -29,7 +29,6 @@ import bigframes.constants as constants import bigframes.core -from bigframes.core import WindowSpec import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.groupby as groupby @@ -43,6 +42,7 @@ import bigframes.core.scalar as scalars import bigframes.core.utils as utils import bigframes.core.window +import bigframes.core.window_spec import bigframes.dataframe import bigframes.dtypes import bigframes.formatting_helpers as formatter @@ -367,43 +367,43 @@ def between(self, left, right, inclusive="both"): def cumsum(self) -> Series: return self._apply_window_op( - agg_ops.sum_op, bigframes.core.WindowSpec(following=0) + agg_ops.sum_op, bigframes.core.window_spec.WindowSpec(following=0) ) def ffill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.WindowSpec(preceding=limit, following=0) + window = bigframes.core.window_spec.WindowSpec(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) pad = ffill def bfill(self, *, limit: typing.Optional[int] = None) -> Series: - window = bigframes.core.WindowSpec(preceding=0, following=limit) + window = bigframes.core.window_spec.WindowSpec(preceding=0, following=limit) return self._apply_window_op(agg_ops.FirstNonNullOp(), window) def cummax(self) -> Series: return self._apply_window_op( - agg_ops.max_op, bigframes.core.WindowSpec(following=0) + agg_ops.max_op, bigframes.core.window_spec.WindowSpec(following=0) ) def cummin(self) -> Series: return self._apply_window_op( - agg_ops.min_op, bigframes.core.WindowSpec(following=0) + agg_ops.min_op, bigframes.core.window_spec.WindowSpec(following=0) ) def cumprod(self) -> Series: return self._apply_window_op( - agg_ops.product_op, bigframes.core.WindowSpec(following=0) + agg_ops.product_op, bigframes.core.window_spec.WindowSpec(following=0) ) def shift(self, periods: int = 1) -> Series: - window = bigframes.core.WindowSpec( + window = bigframes.core.window_spec.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) return self._apply_window_op(agg_ops.ShiftOp(periods), window) def diff(self, periods: int = 1) -> Series: - window = bigframes.core.WindowSpec( + window = bigframes.core.window_spec.WindowSpec( preceding=periods if periods > 0 else None, following=-periods if periods < 0 else None, ) @@ -805,7 +805,7 @@ def mode(self) -> Series: block, max_value_count_col_id = block.apply_window_op( value_count_col_id, agg_ops.max_op, - window_spec=WindowSpec(), + window_spec=bigframes.core.window_spec.WindowSpec(), ) block, is_mode_col_id = block.apply_binary_op( value_count_col_id, @@ -1009,9 +1009,7 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> Any: return self._block.get_stat(self._value_column, op) def _apply_window_op( - self, - op: agg_ops.WindowOp, - window_spec: bigframes.core.WindowSpec, + self, op: agg_ops.WindowOp, window_spec: bigframes.core.window_spec.WindowSpec ): block = self._block block, result_id = block.apply_window_op( @@ -1070,7 +1068,7 @@ def sort_index(self, *, axis=0, ascending=True, na_position="last") -> Series: def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: # To get n size window, need current row and n-1 preceding rows. 
- window_spec = WindowSpec( + window_spec = bigframes.core.window_spec.WindowSpec( preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( @@ -1078,7 +1076,9 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: - window_spec = WindowSpec(following=0, min_periods=min_periods) + window_spec = bigframes.core.window_spec.WindowSpec( + following=0, min_periods=min_periods + ) return bigframes.core.window.Window( self._block, window_spec, self._block.value_columns, is_series=True ) @@ -1251,7 +1251,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): "Cannot reindex with index with different nlevels" ) new_indexer = bigframes.dataframe.DataFrame( - index=index, session=self._get_block().expr._session + index=index, session=self._get_block().expr.session )[[]] # multiindex join is senstive to index names, so we will set all these result = new_indexer.rename_axis(range(new_indexer.index.nlevels)).join( @@ -1415,7 +1415,7 @@ def map( elif isinstance(arg, Mapping): map_df = bigframes.dataframe.DataFrame( {"keys": list(arg.keys()), self.name: list(arg.values())}, - session=self._get_block().expr._session, + session=self._get_block().expr.session, ) map_df = map_df.set_index("keys") elif callable(arg): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index af1f70d54d..473de62f53 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -68,6 +68,7 @@ import bigframes.core.blocks as blocks import bigframes.core.guid as guid from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference +import bigframes.core.ordering as orderings import bigframes.core.utils as utils import bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers @@ -206,6 +207,10 @@ def _session_dataset_id(self): def _project(self): return self.bqclient.project + def __hash__(self): + # Stable hash needed to use in expression tree + return hash(self._session_id) + def _create_and_bind_bq_session(self): """Create a BQ session and bind the session id with clients to capture BQ activities: go/bigframes-transient-data""" @@ -592,11 +597,13 @@ def _read_gbq_table( # primary key(s) are set on a table. The query engine assumes such # columns are unique, even if not enforced. 
is_total_ordering = True - ordering = core.ExpressionOrdering( - ordering_value_columns=[ - core.OrderingColumnReference(column_id) - for column_id in total_ordering_cols - ], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple( + [ + core.OrderingColumnReference(column_id) + for column_id in total_ordering_cols + ] + ), total_ordering_columns=frozenset(total_ordering_cols), ) @@ -634,10 +641,13 @@ def _read_gbq_table( distinct_count = row["distinct_count"] is_total_ordering = total_count == distinct_count - ordering = core.ExpressionOrdering( - ordering_value_columns=[ - core.OrderingColumnReference(column_id) for column_id in index_cols - ], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple( + [ + core.OrderingColumnReference(column_id) + for column_id in index_cols + ] + ), total_ordering_columns=frozenset(index_cols), ) @@ -713,7 +723,7 @@ def _read_gbq_with_ordering( index_cols: Iterable[str] = (), index_labels: Iterable[Optional[str]] = (), hidden_cols: Iterable[str] = (), - ordering: core.ExpressionOrdering, + ordering: orderings.ExpressionOrdering, is_total_ordering: bool = False, api_name: str, ) -> dataframe.DataFrame: @@ -826,7 +836,7 @@ def _read_ibis( index_labels: Iterable[blocks.Label], column_keys: Iterable[str], column_labels: Iterable[blocks.Label], - ordering: core.ExpressionOrdering, + ordering: orderings.ExpressionOrdering, ) -> dataframe.DataFrame: """Turns a table expression (plus index column) into a DataFrame.""" @@ -843,7 +853,7 @@ def _read_ibis( hidden_ordering_columns.append(table_expression[ref.column_id]) block = blocks.Block( - core.ArrayValue( + core.ArrayValue.from_ibis( self, table_expression, columns, hidden_ordering_columns, ordering ), index_columns=[index_col.get_name() for index_col in index_cols], @@ -959,8 +969,8 @@ def _read_pandas( ) self._start_generic_job(load_job) - ordering = core.ExpressionOrdering( - ordering_value_columns=[OrderingColumnReference(ordering_col)], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple([OrderingColumnReference(ordering_col)]), total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) @@ -1303,7 +1313,7 @@ def _create_sequential_ordering( table: ibis_types.Table, index_cols: Iterable[str] = (), api_name: str = "", - ) -> Tuple[ibis_types.Table, core.ExpressionOrdering]: + ) -> Tuple[ibis_types.Table, orderings.ExpressionOrdering]: # Since this might also be used as the index, don't use the default # "ordering ID" name. 
default_ordering_name = guid.generate_guid("bigframes_ordering_") @@ -1320,8 +1330,8 @@ def _create_sequential_ordering( f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}" ) ordering_reference = core.OrderingColumnReference(default_ordering_name) - ordering = core.ExpressionOrdering( - ordering_value_columns=[ordering_reference], + ordering = orderings.ExpressionOrdering( + ordering_value_columns=tuple([ordering_reference]), total_ordering_columns=frozenset([default_ordering_name]), integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), ) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index f7fc4eaa8f..084b723fba 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -98,7 +98,7 @@ def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): bf.options.display.progress_bar = "terminal" - penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( + penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( False ) penguins_df_default_index.to_pandas() @@ -117,7 +117,7 @@ def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): - penguins_df_default_index._block._expr._session.bqclient.default_query_job_config.use_query_cache = ( + penguins_df_default_index._block._expr.session.bqclient.default_query_job_config.use_query_cache = ( False ) penguins_df_default_index.to_pandas() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c9510290b6..05d8b84185 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2858,7 +2858,7 @@ def test_map_series_input(scalars_dfs): pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] pd_map_series.index = new_index bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr._session + pd_map_series, session=scalars_df._get_block().expr.session ) pd_result = scalars_pandas_df.int64_too.map(pd_map_series) @@ -2877,7 +2877,7 @@ def test_map_series_input_duplicates_error(scalars_dfs): pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] pd_map_series.index = new_index bf_map_series = series.Series( - pd_map_series, session=scalars_df._get_block().expr._session + pd_map_series, session=scalars_df._get_block().expr.session ) with pytest.raises(pd.errors.InvalidIndexError): diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 127a88a760..bf72e444eb 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -318,7 +318,6 @@ def test_read_pandas(session, scalars_dfs): _, scalars_pandas_df = scalars_dfs df = session.read_pandas(scalars_pandas_df) - assert df._block._expr._ordering is not None result = df.to_pandas() expected = scalars_pandas_df @@ -350,9 +349,8 @@ def test_read_pandas_rowid_exists_adds_suffix(session, scalars_pandas_df_default pandas_df = scalars_pandas_df_default_index.copy() pandas_df["rowid"] = np.arange(pandas_df.shape[0]) - df = session.read_pandas(pandas_df) - total_order_col = df._block._expr._ordering.total_order_col - assert total_order_col and total_order_col.column_id == "rowid_2" + df_roundtrip = session.read_pandas(pandas_df).to_pandas() + 
pd.testing.assert_frame_equal(df_roundtrip, pandas_df, check_dtype=False) def test_read_pandas_tokyo( @@ -385,7 +383,6 @@ def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): # Convert default pandas dtypes to match BigQuery DataFrames dtypes. dtype=dtype, ) - assert df._block._expr._ordering is not None # TODO(chelsealin): If we serialize the index, can more easily compare values. pd.testing.assert_index_equal(df.columns, scalars_df.columns) @@ -441,7 +438,6 @@ def test_read_csv_local_default_engine(session, scalars_dfs, sep): # Convert default pandas dtypes to match BigQuery DataFrames dtypes. dtype=dtype, ) - assert df._block._expr._ordering is not None # TODO(chelsealin): If we serialize the index, can more easily compare values. pd.testing.assert_index_equal(df.columns, scalars_df.columns) @@ -976,7 +972,6 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): orient="records", ) - assert df._block._expr._ordering is not None pd.testing.assert_index_equal(df.columns, scalars_df.columns) # The auto detects of BigQuery load job have restrictions to detect the bytes, diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py index a7e9b5a84b..86715d090c 100644 --- a/tests/unit/core/test_blocks.py +++ b/tests/unit/core/test_blocks.py @@ -18,8 +18,6 @@ import bigframes.core.blocks as blocks -from .. import resources - @pytest.mark.parametrize( ("data",), @@ -76,9 +74,8 @@ ) def test_block_from_local(data): expected = pandas.DataFrame(data) - session = resources.create_pandas_session({}) - block = blocks.block_from_local(data, session=session) + block = blocks.block_from_local(data) pandas.testing.assert_index_equal(block.column_labels, expected.columns) assert tuple(block.index_labels) == tuple(expected.index.names) diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 0a68600a35..f660d774f0 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -22,6 +22,7 @@ import bigframes import bigframes.core as core +import bigframes.core.ordering import bigframes.session.clients """Utilities for creating test resources.""" @@ -61,14 +62,20 @@ def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Sess def create_arrayvalue( df: pandas.DataFrame, total_ordering_columns: List[str] -) -> bigframes.core.ArrayValue: +) -> core.ArrayValue: session = create_pandas_session({"test_table": df}) ibis_table = session.ibis_client.table("test_table") columns = tuple(ibis_table[key] for key in ibis_table.columns) - ordering = core.ExpressionOrdering( - [core.OrderingColumnReference(column) for column in total_ordering_columns], + ordering = bigframes.core.ordering.ExpressionOrdering( + tuple( + [core.OrderingColumnReference(column) for column in total_ordering_columns] + ), total_ordering_columns=frozenset(total_ordering_columns), ) - return core.ArrayValue( - session=session, table=ibis_table, columns=columns, ordering=ordering + return core.ArrayValue.from_ibis( + session=session, + table=ibis_table, + columns=columns, + hidden_ordering_columns=(), + ordering=ordering, ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 69b9e79807..d9672b2635 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -16,6 +16,7 @@ import pandas import bigframes.core as core +import bigframes.core.ordering import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -37,15 +38,19 @@ def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): ) ibis_table 
= session.ibis_client.table("test_table") columns = (ibis_table["col1"], ibis_table["col2"], ibis_table["col3"]) - ordering = core.ExpressionOrdering( - [core.OrderingColumnReference("col1")], + ordering = bigframes.core.ordering.ExpressionOrdering( + tuple([core.OrderingColumnReference("col1")]), total_ordering_columns=frozenset(["col1"]), ) - actual = core.ArrayValue( - session=session, table=ibis_table, columns=columns, ordering=ordering + actual = core.ArrayValue.from_ibis( + session=session, + table=ibis_table, + columns=columns, + ordering=ordering, + hidden_ordering_columns=(), ) - assert actual._table is ibis_table - assert len(actual.columns) == 3 + assert actual.compile()._table is ibis_table + assert len(actual.column_ids) == 3 def test_arrayvalue_with_get_column_type(): @@ -78,7 +83,7 @@ def test_arrayvalue_with_get_column(): ), total_ordering_columns=["col1"], ) - col1 = value._get_ibis_column("col1") + col1 = value.compile()._get_ibis_column("col1") assert isinstance(col1, ibis_types.Value) assert col1.get_name() == "col1" assert col1.type().is_int64() @@ -95,7 +100,7 @@ def test_arrayvalues_to_ibis_expr_with_get_column(): ), total_ordering_columns=["col1"], ) - expr = value._get_ibis_column("col1") + expr = value.compile()._get_ibis_column("col1") assert expr.get_name() == "col1" assert expr.type().is_int64() @@ -112,7 +117,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._to_ibis_expr("unordered") + actual = expr.compile()._to_ibis_expr("unordered") assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -131,8 +136,8 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string")) - assert value.columns[0].type().is_int64() + expr = value.project_unary_op("col1", ops.AsTypeOp("string")).compile() + assert value.compile().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -147,7 +152,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op("col2", "col3", ops.add_op, "col4") + expr = value.project_binary_op("col2", "col3", ops.add_op, "col4").compile() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 4 @@ -166,7 +171,9 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_ternary_op("col2", "col3", "col4", ops.where_op, "col5") + expr = value.project_ternary_op( + "col2", "col3", "col4", ops.where_op, "col5" + ).compile() assert expr.columns[4].type().is_float64() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 5 @@ -188,7 +195,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): aggregations=(("col1", agg_ops.sum_op, "col4"),), by_column_ids=["col1"], dropna=False, - ) + ).compile() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 2 assert actual.columns[0] == "col1" @@ -207,7 +214,7 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): ), total_ordering_columns=["col1"], ) - expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]) + expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]).compile() actual = expr._to_ibis_expr("unordered") assert len(expr.columns) == 1 assert actual.columns[0] == 
"col4" From 29032d06811569121f7be2a7de915740df7daf6e Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Thu, 26 Oct 2023 18:20:58 -0700 Subject: [PATCH 13/22] fix: fix bug with column names under repeated column assignment (#150) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/dataframe.py | 23 +++++++++++------------ tests/system/small/test_dataframe.py | 22 ++++++++++++++++++++-- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9d22c02d87..01117d3e0a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1110,19 +1110,18 @@ def _assign_single_item( # local_df is likely (but not guarunteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE - this_offsets_col_id = bigframes.core.guid.generate_guid() - this_expr = self._get_block()._expr.promote_offsets(this_offsets_col_id) - block = blocks.Block( - expr=this_expr, - index_labels=self.index.names, - index_columns=self._block.index_columns, - column_labels=[this_offsets_col_id] + list(self._block.value_columns), - ) # offsets are temporarily the first value column, label set to id - this_df_with_offsets = DataFrame(data=block) - join_result = this_df_with_offsets.join( - other=local_df, on=this_offsets_col_id, how="left" + new_column_block = local_df._block + original_index_column_ids = self._block.index_columns + self_block = self._block.reset_index(drop=False) + result_index, (get_column_left, get_column_right) = self_block.index.join( + new_column_block.index, how="left", block_identity_join=True ) - return join_result.drop(columns=[this_offsets_col_id]) + result_block = result_index._block + result_block = result_block.set_index( + [get_column_left[col_id] for col_id in original_index_column_ids], + index_labels=self._block.index_labels, + ) + return DataFrame(result_block) else: return self._assign_scalar(k, v) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a746a1867c..e459e3bee3 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -505,14 +505,32 @@ def test_assign_new_column_w_setitem_list(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) +def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() pd_df = scalars_pandas_df.copy() # set the custom index - pd_df = pd_df.set_index("string_col") - bf_df = bf_df.set_index("string_col") + pd_df = pd_df.set_index(["string_col", "int64_col"]) + bf_df = bf_df.set_index(["string_col", "int64_col"]) bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] From 2d7128d9b2107c3667a5ad7f153d446bfdc04df5 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 27 Oct 2023 22:46:13 +0000 Subject: [PATCH 14/22] test: refactor remote function tests (#147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes moves the tests that deploy cloud function to large remote function tests, and the tests that do not make call to bigquery service to unit tests. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- tests/system/large/test_remote_function.py | 90 ++++++++++++ tests/system/small/test_remote_function.py | 156 ++++----------------- tests/unit/test_remote_function.py | 28 ++++ 3 files changed, 148 insertions(+), 126 deletions(-) create mode 100644 tests/unit/test_remote_function.py diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 730a1dbde4..c8f8f66eba 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -27,6 +27,7 @@ import pytest import test_utils.prefixer +import bigframes from bigframes.remote_function import ( get_cloud_function_name, get_remote_function_locations, @@ -1120,3 +1121,92 @@ def plusone(x): ) for dir_ in dirs_to_cleanup: shutil.rmtree(dir_) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_via_session_context_connection_setter( + scalars_dfs, dataset_id, bq_cf_connection +): + # Creating a session scoped only to this test as we would be setting a + # property in it + context = bigframes.BigQueryOptions() + context.bq_connection = bq_cf_connection + session = bigframes.connect(context) + + try: + # Without an explicit bigquery connection, the one present in Session, + # set via context setter would be used. Without an explicit `reuse` the + # default behavior of reuse=True will take effect. Please note that the + # udf is same as the one used in other tests in this file so the underlying + # cloud function would be common with reuse=True. Since we are using a + # unique dataset_id, even though the cloud function would be reused, the bq + # remote function would still be created, making use of the bq connection + # set in the BigQueryOptions above. 
+ @session.remote_function([int], int, dataset=dataset_id) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_default_connection(session, scalars_dfs, dataset_id): + try: + + @session.remote_function([int], int, dataset=dataset_id) + def square(x): + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_int64_col_filter = bf_int64_col.notnull() + bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] + bf_result_col = bf_int64_col_filtered.apply(square) + bf_result = ( + bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_int64_col_filter = pd_int64_col.notnull() + pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] + pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) + # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. + # pd_int64_col_filtered.dtype is Int64Dtype() + # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. + # For this test let's force the pandas dtype to be same as bigframes' dtype. + pd_result_col = pd_result_col.astype(pandas.Int64Dtype()) + pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index d024a57ded..89907a53df 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -13,14 +13,11 @@ # limitations under the License. 
from google.cloud import bigquery -from ibis.backends.bigquery import datatypes as bq_types -from ibis.expr import datatypes as ibis_types import pandas as pd import pytest import bigframes from bigframes import remote_function as rf -import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal_ignore_ordering @@ -65,45 +62,14 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") -def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: - return bigframes.Session(bigframes.BigQueryOptions(bq_connection=bq_cf_connection)) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_specified( - bq_cf_connection_location, -) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location) - ) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_mistached( - bq_cf_connection_location_mistached, -) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_mistached) - ) - - -@pytest.fixture(scope="module") -def session_with_bq_connection_location_project_specified( - bq_cf_connection_location_project, +def session_with_bq_connection_and_permanent_dataset( + bq_cf_connection, dataset_id_permanent ) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_project) + session = bigframes.Session( + bigframes.BigQueryOptions(bq_connection=bq_cf_connection) ) - - -def test_supported_types_correspond(): - # The same types should be representable by the supported Python and BigQuery types. - ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} - ibis_types_from_bigquery = { - bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS - } - - assert ibis_types_from_python == ibis_types_from_bigquery + session._session_dataset = bigquery.Dataset(dataset_id_permanent) + return session @pytest.mark.flaky(retries=2, delay=120) @@ -311,11 +277,13 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): +def test_remote_function_direct_session_param( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): @rf.remote_function( [int], int, - session=session_with_bq_connection, + session=session_with_bq_connection_and_permanent_dataset, ) def square(x): return x * x @@ -345,7 +313,9 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_default(session_with_bq_connection, scalars_dfs): +def test_remote_function_via_session_default( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): # Session has bigquery connection initialized via context. Without an # explicit dataset the default dataset from the session would be used. # Without an explicit bigquery connection, the one present in Session set @@ -353,7 +323,7 @@ def test_remote_function_via_session_default(session_with_bq_connection, scalars # the default behavior of reuse=True will take effect. Please note that the # udf is same as the one used in other tests in this file so the underlying # cloud function would be common and quickly reused. 
- @session_with_bq_connection.remote_function([int], int) + @session_with_bq_connection_and_permanent_dataset.remote_function([int], int) def square(x): return x * x @@ -421,87 +391,15 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_via_session_context_connection_setter( - scalars_dfs, dataset_id, bq_cf_connection +def test_dataframe_applymap( + session_with_bq_connection_and_permanent_dataset, scalars_dfs ): - # Creating a session scoped only to this test as we would be setting a - # property in it - context = bigframes.BigQueryOptions() - context.bq_connection = bq_cf_connection - session = bigframes.connect(context) - - # Without an explicit bigquery connection, the one present in Session, - # set via context setter would be used. Without an explicit `reuse` the - # default behavior of reuse=True will take effect. Please note that the - # udf is same as the one used in other tests in this file so the underlying - # cloud function would be common with reuse=True. Since we are using a - # unique dataset_id, even though the cloud function would be reused, the bq - # remote function would still be created, making use of the bq connection - # set in the BigQueryOptions above. - @session.remote_function([int], int, dataset=dataset_id) - def square(x): - return x * x - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] - bf_int64_col_filter = bf_int64_col.notnull() - bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] - bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_int64_col_filter = pd_int64_col.notnull() - pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] - pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) - # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. - # pd_int64_col_filtered.dtype is Int64Dtype() - # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. - # For this test let's force the pandas dtype to be same as bigframes' dtype. - pd_result_col = pd_result_col.astype(pd.Int64Dtype()) - pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_default_connection(scalars_dfs, dataset_id): - @bpd.remote_function([int], int, dataset=dataset_id) - def square(x): - return x * x - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_int64_col = scalars_df["int64_col"] - bf_int64_col_filter = bf_int64_col.notnull() - bf_int64_col_filtered = bf_int64_col[bf_int64_col_filter] - bf_result_col = bf_int64_col_filtered.apply(square) - bf_result = ( - bf_int64_col_filtered.to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_int64_col = scalars_pandas_df["int64_col"] - pd_int64_col_filter = pd_int64_col.notnull() - pd_int64_col_filtered = pd_int64_col[pd_int64_col_filter] - pd_result_col = pd_int64_col_filtered.apply(lambda x: x * x) - # TODO(shobs): Figure why pandas .apply() changes the dtype, i.e. - # pd_int64_col_filtered.dtype is Int64Dtype() - # pd_int64_col_filtered.apply(lambda x: x * x).dtype is int64. - # For this test let's force the pandas dtype to be same as bigframes' dtype. 
- pd_result_col = pd_result_col.astype(pd.Int64Dtype()) - pd_result = pd_int64_col_filtered.to_frame().assign(result=pd_result_col) - - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) - - -@pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap(session_with_bq_connection, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -524,11 +422,15 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_dataframe_applymap_na_ignore(session_with_bq_connection, scalars_dfs): +def test_dataframe_applymap_na_ignore( + session_with_bq_connection_and_permanent_dataset, scalars_dfs +): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs int64_cols = ["int64_col", "int64_too"] @@ -549,11 +451,13 @@ def add_one(x): @pytest.mark.flaky(retries=2, delay=120) -def test_series_map(session_with_bq_connection, scalars_dfs): +def test_series_map(session_with_bq_connection_and_permanent_dataset, scalars_dfs): def add_one(x): return x + 1 - remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + remote_add_one = session_with_bq_connection_and_permanent_dataset.remote_function( + [int], int + )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -635,7 +539,7 @@ def square1(x): @pytest.mark.flaky(retries=2, delay=120) -def test_read_gbq_function_reads_udfs(bigquery_client, scalars_dfs, dataset_id): +def test_read_gbq_function_reads_udfs(bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) arg = bigquery.RoutineArgument( name="x", diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py new file mode 100644 index 0000000000..540f4020d3 --- /dev/null +++ b/tests/unit/test_remote_function.py @@ -0,0 +1,28 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ibis.backends.bigquery import datatypes as bq_types +from ibis.expr import datatypes as ibis_types + +from bigframes import remote_function as rf + + +def test_supported_types_correspond(): + # The same types should be representable by the supported Python and BigQuery types. 
+ ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} + ibis_types_from_bigquery = { + bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS + } + + assert ibis_types_from_python == ibis_types_from_bigquery From 4e4409c5b235171f3770aec852193026519948fd Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 27 Oct 2023 19:22:28 -0700 Subject: [PATCH 15/22] feat: add dataframe melt (#116) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/core/blocks.py | 41 +++++++++++++++++- bigframes/dataframe.py | 38 ++++++++++++++++ tests/system/small/test_dataframe.py | 43 +++++++++++++++++++ tests/system/small/test_multiindex.py | 28 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 28 ++++++++++++ 5 files changed, 176 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index cc13edeaf9..635e7db865 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1356,13 +1356,50 @@ def stack(self, how="left", levels: int = 1): index_columns = [*added_index_columns, *self.index_columns] index_labels = [*new_index_level_names, *self._index_labels] - block = Block( + return Block( unpivot_expr, index_columns=index_columns, column_labels=result_index, index_labels=index_labels, ) - return block + + def melt( + self, + id_vars=typing.Sequence[str], + value_vars=typing.Sequence[str], + var_names=typing.Sequence[typing.Hashable], + value_name: typing.Hashable = "value", + ): + # TODO: Implement col_level and ignore_index + unpivot_col_id = guid.generate_guid() + var_col_ids = tuple([guid.generate_guid() for _ in var_names]) + # single unpivot col + unpivot_col = (unpivot_col_id, tuple(value_vars)) + value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] + id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] + + dtype = self._expr.get_column_type(value_vars[0]) + + unpivot_expr = self._expr.unpivot( + row_labels=value_labels, + passthrough_columns=id_vars, + unpivot_columns=(unpivot_col,), + index_col_ids=var_col_ids, + dtype=dtype, + how="right", + ) + index_id = guid.generate_guid() + unpivot_expr = unpivot_expr.promote_offsets(index_id) + # Need to reorder to get id_vars before var_col and unpivot_col + unpivot_expr = unpivot_expr.select_columns( + [index_id, *id_vars, *var_col_ids, unpivot_col_id] + ) + + return Block( + unpivot_expr, + column_labels=[*id_labels, *var_names, value_name], + index_columns=[index_id], + ) def _create_stack_column( self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 01117d3e0a..49d7ad991a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1673,6 +1673,44 @@ def idxmin(self) -> bigframes.series.Series: def idxmax(self) -> bigframes.series.Series: return bigframes.series.Series(block_ops.idxmax(self._block)) + def 
melt( + self, + id_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None, + var_name: typing.Union[ + typing.Hashable, typing.Sequence[typing.Hashable] + ] = None, + value_name: typing.Hashable = "value", + ): + if var_name is None: + # Determine default var_name. Attempt to use column labels if they are unique + if self.columns.nlevels > 1: + if len(set(self.columns.names)) == len(self.columns.names): + var_name = self.columns.names + else: + var_name = [f"variable_{i}" for i in range(len(self.columns.names))] + else: + var_name = self.columns.name or "variable" + + var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,) + + if id_vars is not None: + id_col_ids = [self._resolve_label_exact(col) for col in id_vars] + else: + id_col_ids = [] + if value_vars is not None: + val_col_ids = [self._resolve_label_exact(col) for col in value_vars] + else: + val_col_ids = [ + col_id + for col_id in self._block.value_columns + if col_id not in id_col_ids + ] + + return DataFrame( + self._block.melt(id_col_ids, val_col_ids, var_name, value_name) + ) + def describe(self) -> DataFrame: df_numeric = self._drop_non_numeric(keep_bool=False) if len(df_numeric.columns) == 0: diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e459e3bee3..b503f9a31d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1919,6 +1919,49 @@ def test_df_stack(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_df_unstack(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs # To match bigquery dataframes diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index a87dacae04..d6bf46f77c 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -752,6 +752,34 @@ def test_column_multi_index_stack(level): ) +def test_column_multi_index_melt(): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas 
<2.1 uses different stack implementation") + + level1 = pandas.Index(["b", "a", "b"]) + level2 = pandas.Index(["a", "b", "b"]) + level3 = pandas.Index(["b", "b", "a"]) + + multi_columns = pandas.MultiIndex.from_arrays( + [level1, level2, level3], names=["l1", "l2", "l3"] + ) + pd_df = pandas.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=[5, 2, None], + columns=multi_columns, + dtype="Int64", + ) + bf_df = bpd.DataFrame(pd_df) + + bf_result = bf_df.melt().to_pandas() + pd_result = pd_df.melt() + + # BigFrames uses different string and int types, but values are identical + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]") diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 13a81b4645..67836a8fd2 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2010,6 +2010,34 @@ def idxmax(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def melt(self, id_vars, value_vars, var_name, value_name): + """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + + Parameters + ---------- + id_vars (tuple, list, or ndarray, optional): + Column(s) to use as identifier variables. + value_vars (tuple, list, or ndarray, optional): + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name (scalar): + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name (scalar, default 'value'): + Name to use for the 'value' column. + + Returns: + DataFrame: Unpivoted DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Count number of distinct elements in specified axis. From ac44ccd3936cdb28755d2bbe16377d489f08d5e5 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:07:58 -0700 Subject: [PATCH 16/22] docs: add artithmetic df sample code (#153) * docs: add artithmetic df sample code * fix: address comments --- bigframes/session/__init__.py | 4 +- .../bigframes_vendored/pandas/core/frame.py | 494 +++++++++++++++++- 2 files changed, 492 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 473de62f53..932a41f283 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -352,7 +352,7 @@ def read_gbq_query( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Simple query input: + Simple query input: >>> df = bpd.read_gbq_query(''' ... SELECT @@ -368,7 +368,7 @@ def read_gbq_query( [2 rows x 3 columns] - Preserve ordering in a query input. + Preserve ordering in a query input. >>> df = bpd.read_gbq_query(''' ... 
SELECT diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 67836a8fd2..013d170114 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -697,6 +697,7 @@ def align( Join method is specified for each axis Index. + Args: other (DataFrame or Series): join ({{'outer', 'inner', 'left', 'right'}}, default 'outer'): @@ -978,9 +979,9 @@ def sort_values( Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. - kind (str, default `quicksort`): - Choice of sorting algorithm. Accepts 'quicksort’, β€˜mergesort’, - β€˜heapsort’, β€˜stable’. Ignored except when determining whether to + kind (str, default 'quicksort'): + Choice of sorting algorithm. Accepts 'quicksort', 'mergesort', + 'heapsort', 'stable'. Ignored except when determining whether to sort stably. 'mergesort' or 'stable' will result in stable reorder. na_position ({'first', 'last'}, default `last`): ``{'first', 'last'}``, default 'last' Puts NaNs at the beginning @@ -1014,6 +1015,29 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].eq(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``==``: + >>> df["degrees"] == 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1036,6 +1060,30 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ne(360) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``!=``: + + >>> df["degrees"] != 360 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1061,6 +1109,30 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].le(180) + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``<=``: + + >>> df["degrees"] <= 180 + circle False + triangle True + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1087,6 +1159,30 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].lt(180) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``<``: + + >>> df["degrees"] < 180 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1113,6 +1209,30 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can use method name: + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].ge(360) + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``>=``: + + >>> df["degrees"] >= 360 + circle True + triangle False + rectangle True + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1139,6 +1259,28 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: floating point columns are considered different (i.e. `NaN` != `NaN`). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df["degrees"].gt(360) + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + + You can also use arithmetic operator ``>``: + + >>> df["degrees"] > 360 + circle False + triangle False + rectangle False + Name: degrees, dtype: boolean + Args: other (scalar, sequence, Series, or DataFrame): Any single or multiple element data structure, or list-like object. @@ -1162,6 +1304,32 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].add(df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + + You can also use arithmetic operator ``+``: + + >>> df['A'] + (df['B']) + 0 5 + 1 7 + 2 9 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. 
@@ -1185,6 +1353,32 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].sub(df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + + You can also use arithmetic operator ``-``: + + >>> df['A'] - (df['B']) + 0 -3 + 1 -3 + 2 -3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1208,6 +1402,29 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rsub(df['B']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``-``: + + >>> df['B'] - (df['A']) + 0 3 + 1 3 + 2 3 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1231,6 +1448,32 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].mul(df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + + You can also use arithmetic operator ``*``: + + >>> df['A'] * (df['B']) + 0 4 + 1 10 + 2 18 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1254,6 +1497,32 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].truediv(df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + + You can also use arithmetic operator ``/``: + + >>> df['A'] / (df['B']) + 0 0.25 + 1 0.4 + 2 0.5 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1277,6 +1546,29 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rtruediv(df['B']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + + It's equivalent to using arithmetic operator: ``/``: + + >>> df['B'] / (df['A']) + 0 4.0 + 1 2.5 + 2 2.0 + dtype: Float64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1300,6 +1592,32 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... 
}) + + You can use method name: + + >>> df['A'].floordiv(df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + + You can also use arithmetic operator ``//``: + + >>> df['A'] // (df['B']) + 0 0 + 1 0 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1323,6 +1641,29 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rfloordiv(df['B']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``//``: + + >>> df['B'] // (df['A']) + 0 4 + 1 2 + 2 2 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1346,6 +1687,32 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].mod(df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + + You can also use arithmetic operator ``%``: + + >>> df['A'] % (df['B']) + 0 1 + 1 2 + 2 3 + dtype: Int64 + Args: other: Any single or multiple element data structure, or list-like object. @@ -1369,6 +1736,29 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rmod(df['B']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``%``: + + >>> df['B'] % (df['A']) + 0 0 + 1 1 + 2 0 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1382,7 +1772,7 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pow(self, other, axis: str | int = "columns") -> DataFrame: - """Get Exponential power of dataframe and other, element-wise (binary operator `pow`). + """Get Exponential power of dataframe and other, element-wise (binary operator `**`). Equivalent to ``dataframe ** other``, but with support to substitute a fill_value for missing data in one of the inputs. With reverse version, `rpow`. @@ -1393,6 +1783,32 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + + You can use method name: + + >>> df['A'].pow(df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + + You can also use arithmetic operator ``**``: + + >>> df['A'] ** (df['B']) + 0 1 + 1 32 + 2 729 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1417,6 +1833,29 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: .. note:: Mismatched indices will be unioned together. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'A': [1, 2, 3], + ... 'B': [4, 5, 6], + ... }) + >>> df['A'].rpow(df['B']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + + It's equivalent to using arithmetic operator: ``**``: + + >>> df['B'] ** (df['A']) + 0 4 + 1 25 + 2 216 + dtype: Int64 + Args: other (float, int, or Series): Any single or multiple element data structure, or list-like object. @@ -1438,6 +1877,21 @@ def combine( to element-wise combine columns. The row and column indexes of the resulting DataFrame will be the union of the two. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 + >>> df1.combine(df2, take_smaller) + A B + 0 0 3 + 1 0 3 + + [2 rows x 2 columns] + Args: other (DataFrame): The DataFrame to merge column-wise. @@ -1468,6 +1922,20 @@ def combine_first(self, other) -> DataFrame: second.loc[index, col] are not missing values, upon calling first.combine_first(second). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df1 = bpd.DataFrame({'A': [None, 0], 'B': [None, 4]}) + >>> df2 = bpd.DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine_first(df2) + A B + 0 1.0 3.0 + 1 0.0 4.0 + + [2 rows x 2 columns] + Args: other (DataFrame): Provided DataFrame to use to fill null values. @@ -1485,6 +1953,24 @@ def update( Aligns on indices. There is no return value. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600]}) + >>> new_df = bpd.DataFrame({'B': [4, 5, 6], + ... 'C': [7, 8, 9]}) + >>> df.update(new_df) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + [3 rows x 2 columns] + Args: other (DataFrame, or object coercible into a DataFrame): Should have at least one matching index/column label From 79a638eda80c482b640b523426ffd95c42747edc Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 30 Oct 2023 18:56:14 +0000 Subject: [PATCH 17/22] feat: Implement operator `@` for `DataFrame.dot` (#139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/297502513 πŸ¦• --- bigframes/dataframe.py | 2 ++ tests/system/small/test_dataframe.py | 33 +++++++++++++++++++++++++++ tests/system/small/test_multiindex.py | 16 +++++++++++++ 3 files changed, 51 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 49d7ad991a..3369fb4868 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2707,3 +2707,5 @@ def get_right_id(id): result = result[other.name].rename() return result + + __matmul__ = dot diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b503f9a31d..c96faa3526 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3264,6 +3264,23 @@ def test_df_dot( ) +def test_df_dot_operator( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + def test_df_dot_series( matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df ): @@ -3278,3 +3295,19 @@ def test_df_dot_series( bf_result, pd_result, ) + + +def test_df_dot_operator_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. 
+ pd_result = pd_result.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index d6bf46f77c..bc35f633fd 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -998,6 +998,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + # right multi-index right_index = pandas.MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "bb")]) bf1 = bpd.DataFrame(left_matrix) @@ -1005,6 +1008,9 @@ def test_df_multi_index_dot_not_supported(): with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): bf1.dot(bf2) + with pytest.raises(NotImplementedError, match="Multi-index input is not supported"): + bf1 @ bf2 + def test_column_multi_index_dot_not_supported(): left_matrix = [[1, 2, 3], [2, 5, 7]] @@ -1022,6 +1028,11 @@ def test_column_multi_index_dot_not_supported(): ): bf1.dot(bf2) + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 + # right multi-columns bf1 = bpd.DataFrame(left_matrix) bf2 = bpd.DataFrame(right_matrix, columns=multi_level_columns) @@ -1029,3 +1040,8 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1.dot(bf2) + + with pytest.raises( + NotImplementedError, match="Multi-level column input is not supported" + ): + bf1 @ bf2 From cfebfaa91f945f0024ef743d38acb0b2ec8c4079 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 31 Oct 2023 10:54:17 -0700 Subject: [PATCH 18/22] test: add code snippets for loading data from BigQuery Job (#154) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: add code snippets for loading data from BigQuery Job * fix: address the comments * fix: fix the broken test * use BigQuery Client library to get the job_id * feat: Implement operator `@` for `DataFrame.dot` (#139) Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/297502513 πŸ¦• * fix: fix the comments --------- Co-authored-by: Shobhit Singh --- .../load_data_from_biquery_job_test.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 samples/snippets/load_data_from_biquery_job_test.py diff --git a/samples/snippets/load_data_from_biquery_job_test.py b/samples/snippets/load_data_from_biquery_job_test.py new file mode 100644 index 0000000000..5271574a49 --- /dev/null +++ b/samples/snippets/load_data_from_biquery_job_test.py @@ -0,0 +1,51 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_load_data_from_bigquery_job(): + from google.cloud import bigquery + + # Construct a BigQuery client object. + client = bigquery.Client(project="bigframes-dev", location="us") + + query = """ + SELECT * + FROM `bigquery-public-data.ml_datasets.penguins` + LIMIT 20 + """ + query_job = client.query(query) + JOB_ID = query_job.job_id + your_project_id = "bigframes-dev" + + # [START bigquery_dataframes_load_data_from_bigquery_job] + from google.cloud import bigquery + + import bigframes.pandas as bpd + + # Project ID inserted based on the query results selected to explore + project = your_project_id + # Location inserted based on the query results selected to explore + location = "us" + client = bigquery.Client(project=project, location=location) + + # Job ID inserted based on the query results selcted to explore + job_id = JOB_ID + job = client.get_job(job_id) + destination = str(job.destination) + + # Load data from a BigQuery table using BigFrames DataFrames: + bq_df = bpd.read_gbq_table(destination) + + # [END bigquery_dataframes_load_data_from_bigquery_job] + assert bq_df is not None From 63c7919e28d2e0b864142320b47374d807f07c03 Mon Sep 17 00:00:00 2001 From: Bradford Orr <15842009+orrbradford@users.noreply.github.com> Date: Tue, 31 Oct 2023 12:46:47 -0700 Subject: [PATCH 19/22] feat: add bigframes.options.compute.maximum_bytes_billed option that sets maximum bytes billed on query jobs (#133) -implement context manager for global options -maximum_bytes_billed only applies to query jobs. This limitation will be set per query. 
Operations that trigger multiple jobs may result in total usage beyond this setting --- bigframes/__init__.py | 3 +- bigframes/_config/__init__.py | 11 +++++ bigframes/_config/compute_options.py | 35 +++++++++++++++ bigframes/_config/display_options.py | 23 ++++------ bigframes/pandas/__init__.py | 4 ++ bigframes/session/__init__.py | 19 +++++--- docs/reference/bigframes/options.rst | 2 + docs/templates/toc.yml | 2 + tests/system/conftest.py | 7 --- tests/system/small/test_progress_bar.py | 17 ++++--- tests/unit/test_compute_options.py | 30 +++++++++++++ .../pandas/_config/config.py | 45 +++++++++++++++++++ 12 files changed, 162 insertions(+), 36 deletions(-) create mode 100644 bigframes/_config/compute_options.py create mode 100644 tests/unit/test_compute_options.py create mode 100644 third_party/bigframes_vendored/pandas/_config/config.py diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 8f41790072..bd1476957b 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -14,7 +14,7 @@ """BigQuery DataFrames provides a DataFrame API scaled by the BigQuery engine.""" -from bigframes._config import options +from bigframes._config import option_context, options from bigframes._config.bigquery_options import BigQueryOptions from bigframes.core.global_session import close_session, get_global_session from bigframes.session import connect, Session @@ -28,4 +28,5 @@ "connect", "Session", "__version__", + "option_context", ] diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index e26eaf8800..8dcebfce6a 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -18,8 +18,10 @@ """ import bigframes._config.bigquery_options as bigquery_options +import bigframes._config.compute_options as compute_options import bigframes._config.display_options as display_options import bigframes._config.sampling_options as sampling_options +import third_party.bigframes_vendored.pandas._config.config as pandas_config class Options: @@ -29,6 +31,7 @@ def __init__(self): self._bigquery_options = bigquery_options.BigQueryOptions() self._display_options = display_options.DisplayOptions() self._sampling_options = sampling_options.SamplingOptions() + self._compute_options = compute_options.ComputeOptions() @property def bigquery(self) -> bigquery_options.BigQueryOptions: @@ -49,6 +52,11 @@ def sampling(self) -> sampling_options.SamplingOptions: parameters in specific functions.""" return self._sampling_options + @property + def compute(self) -> compute_options.ComputeOptions: + """Options controlling object computation.""" + return self._compute_options + options = Options() """Global options for default session.""" @@ -58,3 +66,6 @@ def sampling(self) -> sampling_options.SamplingOptions: "Options", "options", ) + + +option_context = pandas_config.option_context diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py new file mode 100644 index 0000000000..20c31d3906 --- /dev/null +++ b/bigframes/_config/compute_options.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Options for displaying objects.""" + +import dataclasses +from typing import Optional + + +@dataclasses.dataclass +class ComputeOptions: + """ + Encapsulates configuration for compute options. + + Attributes: + maximum_bytes_billed (int, Options): + Limits the bytes billed for query jobs. Queries that will have + bytes billed beyond this limit will fail (without incurring a + charge). If unspecified, this will be set to your project default. + See `maximum_bytes_billed `_. + + """ + + maximum_bytes_billed: Optional[int] = None diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index 8bd2743f17..ad3ea3f68c 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -40,17 +40,12 @@ def pandas_repr(display_options: DisplayOptions): This context manager makes sure we reset the pandas options when we're done so that we don't override pandas behavior. """ - original_max_cols = pd.options.display.max_columns - original_max_rows = pd.options.display.max_rows - original_show_dimensions = pd.options.display.show_dimensions - - pd.options.display.max_columns = display_options.max_columns - pd.options.display.max_rows = display_options.max_rows - pd.options.display.show_dimensions = True # type: ignore - - try: - yield - finally: - pd.options.display.max_columns = original_max_cols - pd.options.display.max_rows = original_max_rows - pd.options.display.show_dimensions = original_show_dimensions + with pd.option_context( + "display.max_columns", + display_options.max_columns, + "display.max_rows", + display_options.max_rows, + "display.show_dimensions", + True, + ) as pandas_context: + yield (pandas_context) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 8d9726312f..0fab1109dc 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -462,6 +462,9 @@ def read_gbq_function(function_name: str): options = config.options """Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" +option_context = config.option_context +"""Global :class:`~bigframes._config.option_context` to configure BigQuery DataFrames.""" + # Session management APIs get_global_session = global_session.get_global_session close_session = global_session.close_session @@ -494,6 +497,7 @@ def read_gbq_function(function_name: str): # Other public pandas attributes "NamedAgg", "options", + "option_context", # Session management APIs "get_global_session", "close_session", diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 932a41f283..4858c7726a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1504,12 +1504,10 @@ def _start_query( max_results: Optional[int] = None, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ - Starts query job and waits for results + Starts query job and waits for results. 
""" - if job_config is not None: - query_job = self.bqclient.query(sql, job_config=job_config) - else: - query_job = self.bqclient.query(sql) + job_config = self._prepare_job_config(job_config) + query_job = self.bqclient.query(sql, job_config=job_config) opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: @@ -1538,6 +1536,17 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() + def _prepare_job_config( + self, job_config: Optional[bigquery.QueryJobConfig] = None + ) -> bigquery.QueryJobConfig: + if job_config is None: + job_config = self.bqclient.default_query_job_config + if bigframes.options.compute.maximum_bytes_billed is not None: + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + return job_config + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/docs/reference/bigframes/options.rst b/docs/reference/bigframes/options.rst index d831a519fe..991399eb88 100644 --- a/docs/reference/bigframes/options.rst +++ b/docs/reference/bigframes/options.rst @@ -12,3 +12,5 @@ Options and settings .. autoclass:: bigframes._config.display_options.DisplayOptions .. autoclass:: bigframes._config.sampling_options.SamplingOptions + +.. autoclass:: bigframes._config.compute_options.ComputeOptions diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 4fe2ec1a6a..9879721d28 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -13,6 +13,8 @@ uid: bigframes._config.display_options.DisplayOptions - name: SamplingOptions uid: bigframes._config.sampling_options.SamplingOptions + - name: ComputeOptions + uid: bigframes._config.compute_options.ComputeOptions name: Options and settings - items: - name: Session diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 8885b03d34..f9f69c6c8e 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -898,13 +898,6 @@ def usa_names_grouped_table( return session.bqclient.get_table(table_id) -@pytest.fixture() -def deferred_repr(): - bigframes.options.display.repr_mode = "deferred" - yield - bigframes.options.display.repr_mode = "head" - - @pytest.fixture() def restore_sampling_settings(): enable_downsampling = bigframes.options.sampling.enable_downsampling diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 084b723fba..30ea63b483 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -135,12 +135,11 @@ def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): assert string in query_job_repr -def test_query_job_dry_run( - penguins_df_default_index: bf.dataframe.DataFrame, capsys, deferred_repr -): - repr(penguins_df_default_index) - repr(penguins_df_default_index["body_mass_g"]) - lines = capsys.readouterr().out.split("\n") - lines = filter(None, lines) - for line in lines: - assert "Computation deferred. Computation will process" in line +def test_query_job_dry_run(penguins_df_default_index: bf.dataframe.DataFrame, capsys): + with bf.option_context("display.repr_mode", "deferred"): + repr(penguins_df_default_index) + repr(penguins_df_default_index["body_mass_g"]) + lines = capsys.readouterr().out.split("\n") + lines = filter(None, lines) + for line in lines: + assert "Computation deferred. 
Computation will process" in line diff --git a/tests/unit/test_compute_options.py b/tests/unit/test_compute_options.py new file mode 100644 index 0000000000..499a0a5fef --- /dev/null +++ b/tests/unit/test_compute_options.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import bigframes as bf + +from . import resources + + +def test_maximum_bytes_option(): + session = resources.create_bigquery_session() + num_query_calls = 0 + with bf.option_context("compute.maximum_bytes_billed", 10000): + # clear initial method calls + session.bqclient.method_calls = [] + session._start_query("query") + for call in session.bqclient.method_calls: + _, _, kwargs = call + num_query_calls += 1 + assert kwargs["job_config"].maximum_bytes_billed == 10000 + assert num_query_calls > 0 diff --git a/third_party/bigframes_vendored/pandas/_config/config.py b/third_party/bigframes_vendored/pandas/_config/config.py new file mode 100644 index 0000000000..8abaca76c7 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/_config/config.py @@ -0,0 +1,45 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/_config/config.py +import contextlib +import operator + +import bigframes + + +class option_context(contextlib.ContextDecorator): + """ + Context manager to temporarily set options in the `with` statement context. + + You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. + + Examples + -------- + >>> import bigframes + >>> with bigframes.option_context('display.max_rows', 10, 'display.max_columns', 5): + ... pass + """ + + def __init__(self, *args) -> None: + if len(args) % 2 != 0 or len(args) < 2: + raise ValueError( + "Need to invoke as option_context(pat, val, [(pat, val), ...])." + ) + + self.ops = list(zip(args[::2], args[1::2])) + + def __enter__(self) -> None: + self.undo = [ + (pat, operator.attrgetter(pat)(bigframes.options)) for pat, val in self.ops + ] + + for pat, val in self.ops: + self._set_option(pat, val) + + def __exit__(self, *args) -> None: + if self.undo: + for pat, val in self.undo: + self._set_option(pat, val) + + def _set_option(self, pat, val): + root, attr = pat.rsplit(".", 1) + parent = operator.attrgetter(root)(bigframes.options) + setattr(parent, attr, val) From 0801d96830dab467232277dea9fd2dacee41055c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 31 Oct 2023 18:00:20 -0500 Subject: [PATCH 20/22] docs: fix indentation on `read_gbq_function` code sample (#163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # πŸ¦• --- bigframes/session/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4858c7726a..5a61ed534f 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1467,13 +1467,13 @@ def read_gbq_function( **Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None - >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" - >>> func = bpd.read_gbq_function(function_name=function_name) - >>> func.bigframes_remote_function - 'bqutil.fn.cw_lower_case_ascii_only' + >>> function_name = "bqutil.fn.cw_lower_case_ascii_only" + >>> func = bpd.read_gbq_function(function_name=function_name) + >>> func.bigframes_remote_function + 'bqutil.fn.cw_lower_case_ascii_only' Args: function_name (str): From d8baad5b71ec67a35a0fb6132ee16e4c7418c456 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Wed, 1 Nov 2023 13:41:27 -0700 Subject: [PATCH 21/22] feat: add pd.get_dummies (#149) * feat: add pd.get_dummies * remove unneeded prefix case * param/documentation fixes * be stricter about types in test * be stricter about types in series test * remove unneeded comment * adjust for type difference in pandas 1 * add example code (tested) * fix None columns and add test cases * variable names and _get_unique_values per-column * account for pandas 1 behavior difference * remove already_seen set * avoid unnecessary join/projection * fix column ordering edge case * adjust for picky examples checker * example tweak * make part of the example comments * use ellipsis in doctest comment * add to doctest string * extract parameter standardization * extract submethods --------- Co-authored-by: Henry J Solberg --- bigframes/pandas/__init__.py | 177 ++++++++++++++++++ tests/system/small/test_pandas.py | 112 +++++++++++ .../pandas/core/reshape/concat.py | 2 +- .../pandas/core/reshape/encoding.py | 119 ++++++++++++ .../pandas/core/reshape/merge.py | 1 - .../pandas/core/reshape/tile.py | 2 +- 6 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 third_party/bigframes_vendored/pandas/core/reshape/encoding.py diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 0fab1109dc..1c52b103fb 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -45,14 +45,18 @@ ) import bigframes._config as config +import bigframes.constants as constants +import bigframes.core.blocks import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape import bigframes.dataframe +import bigframes.operations as ops import bigframes.series import bigframes.session import bigframes.session.clients import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat +import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -134,6 +138,179 @@ def cut( cut.__doc__ = vendored_pandas_tile.cut.__doc__ +def get_dummies( + data: 
Union[DataFrame, Series], + prefix: Union[List, dict, str, None] = None, + prefix_sep: Union[List, dict, str, None] = "_", + dummy_na: bool = False, + columns: Optional[List] = None, + drop_first: bool = False, + dtype: Any = None, +) -> DataFrame: + # simplify input parameters into per-input-label lists + # also raise errors for invalid parameters + column_labels, prefixes, prefix_seps = _standardize_get_dummies_params( + data, prefix, prefix_sep, columns, dtype + ) + + # combine prefixes into per-column-id list + full_columns_prefixes, columns_ids = _determine_get_dummies_columns_from_labels( + data, column_labels, prefix is not None, prefixes, prefix_seps + ) + + # run queries to compute unique values + block = data._block + max_unique_value = ( + bigframes.core.blocks._BQ_MAX_COLUMNS + - len(block.value_columns) + - len(block.index_columns) + - 1 + ) // len(column_labels) + columns_values = [ + block._get_unique_values([col_id], max_unique_value) for col_id in columns_ids + ] + + # for each dummified column, add the content of the output columns via block operations + intermediate_col_ids = [] + for i in range(len(columns_values)): + level = columns_values[i].get_level_values(0).sort_values().dropna() + if drop_first: + level = level[1:] + column_label = full_columns_prefixes[i] + column_id = columns_ids[i] + block, new_intermediate_col_ids = _perform_get_dummies_block_operations( + block, level, column_label, column_id, dummy_na + ) + intermediate_col_ids.extend(new_intermediate_col_ids) + + # drop dummified columns (and the intermediate columns we added) + block = block.drop_columns(columns_ids + intermediate_col_ids) + return DataFrame(block) + + +get_dummies.__doc__ = vendored_pandas_encoding.get_dummies.__doc__ + + +def _standardize_get_dummies_params( + data: Union[DataFrame, Series], + prefix: Union[List, dict, str, None], + prefix_sep: Union[List, dict, str, None], + columns: Optional[List], + dtype: Any, +) -> Tuple[List, List[str], List[str]]: + block = data._block + + if isinstance(data, Series): + columns = [block.column_labels[0]] + if columns is not None and not pandas.api.types.is_list_like(columns): + raise TypeError("Input must be a list-like for parameter `columns`") + if dtype is not None and dtype not in [ + pandas.BooleanDtype, + bool, + "Boolean", + "boolean", + "bool", + ]: + raise NotImplementedError( + f"Only Boolean dtype is currently supported. {constants.FEEDBACK_LINK}" + ) + + if columns is None: + default_dummy_types = [pandas.StringDtype, "string[pyarrow]"] + columns = [] + columns_set = set() + for col_id in block.value_columns: + label = block.col_id_to_label[col_id] + if ( + label not in columns_set + and block.expr.get_column_type(col_id) in default_dummy_types + ): + columns.append(label) + columns_set.add(label) + + column_labels: List = typing.cast(List, columns) + + def parse_prefix_kwarg(kwarg, kwarg_name) -> Optional[List[str]]: + if kwarg is None: + return None + if isinstance(kwarg, str): + return [kwarg] * len(column_labels) + if isinstance(kwarg, dict): + return [kwarg[column] for column in column_labels] + kwarg = typing.cast(List, kwarg) + if pandas.api.types.is_list_like(kwarg) and len(kwarg) != len(column_labels): + raise ValueError( + f"Length of '{kwarg_name}' ({len(kwarg)}) did not match " + f"the length of the columns being encoded ({len(column_labels)})." 
+ ) + if pandas.api.types.is_list_like(kwarg): + return list(map(str, kwarg)) + raise TypeError(f"{kwarg_name} kwarg must be a string, list, or dictionary") + + prefix_seps = parse_prefix_kwarg(prefix_sep or "_", "prefix_sep") + prefix_seps = typing.cast(List, prefix_seps) + prefixes = parse_prefix_kwarg(prefix, "prefix") + if prefixes is None: + prefixes = column_labels + prefixes = typing.cast(List, prefixes) + + return column_labels, prefixes, prefix_seps + + +def _determine_get_dummies_columns_from_labels( + data: Union[DataFrame, Series], + column_labels: List, + prefix_given: bool, + prefixes: List[str], + prefix_seps: List[str], +) -> Tuple[List[str], List[str]]: + block = data._block + + columns_ids = [] + columns_prefixes = [] + for i in range(len(column_labels)): + label = column_labels[i] + empty_prefix = label is None or (isinstance(data, Series) and not prefix_given) + full_prefix = "" if empty_prefix else prefixes[i] + prefix_seps[i] + + for col_id in block.label_to_col_id[label]: + columns_ids.append(col_id) + columns_prefixes.append(full_prefix) + + return columns_prefixes, columns_ids + + +def _perform_get_dummies_block_operations( + block: bigframes.core.blocks.Block, + level: pandas.Index, + column_label: str, + column_id: str, + dummy_na: bool, +) -> Tuple[bigframes.core.blocks.Block, List[str]]: + intermediate_col_ids = [] + for value in level: + new_column_label = f"{column_label}{value}" + if column_label == "": + new_column_label = value + new_block, new_id = block.apply_unary_op( + column_id, ops.BinopPartialLeft(ops.eq_op, value) + ) + intermediate_col_ids.append(new_id) + block, _ = new_block.apply_unary_op( + new_id, + ops.BinopPartialRight(ops.fillna_op, False), + result_label=new_column_label, + ) + if dummy_na: + # dummy column name for na depends on the dtype + na_string = str(pandas.Index([None], dtype=level.dtype)[0]) + new_column_label = f"{column_label}{na_string}" + block, _ = block.apply_unary_op( + column_id, ops.isnull_op, result_label=new_column_label + ) + return block, intermediate_col_ids + + def qcut( x: bigframes.series.Series, q: int, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index f8fa78587f..0292ebd206 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -45,6 +45,118 @@ def test_concat_series(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("kwargs"), + [ + { + "prefix": ["prefix1", "prefix2"], + "prefix_sep": "_", + "dummy_na": None, + "columns": ["bool_col", "int64_col"], + "drop_first": False, + }, + { + "prefix": "prefix", + "prefix_sep": ["_", ","], + "dummy_na": False, + "columns": ["int64_too", "string_col"], + "drop_first": False, + }, + { + "prefix": None, + "prefix_sep": ".", + "dummy_na": True, + "columns": ["time_col", "float64_col"], + "drop_first": True, + }, + ], +) +def test_get_dummies_dataframe(scalars_dfs, kwargs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = bpd.get_dummies(scalars_df, **kwargs, dtype=bool) + pd_result = pd.get_dummies(scalars_pandas_df, **kwargs, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_dummies_dataframe_duplicate_labels(scalars_dfs): + if 
pd.__version__.startswith("1."): + pytest.skip("pandas has different behavior in 1.x") + + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_renamed_df = scalars_df.rename( + columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + ) + scalars_renamed_pandas_df = scalars_pandas_df.rename( + columns={"int64_too": "int64_col", "float64_col": None, "string_col": None} + ) + + bf_result = bpd.get_dummies( + scalars_renamed_df, columns=["int64_col", None], dtype=bool + ) + pd_result = pd.get_dummies( + scalars_renamed_pandas_df, columns=["int64_col", None], dtype=bool + ) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + + pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_dummies_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.date_col + pd_series = scalars_pandas_df.date_col + + bf_result = bpd.get_dummies(bf_series, dtype=bool) + pd_result = pd.get_dummies(pd_series, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + pd_result.columns = pd_result.columns.astype(object) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_get_dummies_series_nameless(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_series = scalars_df.date_col.rename(None) + pd_series = scalars_pandas_df.date_col.rename(None) + + bf_result = bpd.get_dummies(bf_series, dtype=bool) + pd_result = pd.get_dummies(pd_series, dtype=bool) + # dtype argument above is needed for pandas v1 only + + # adjust for expected dtype differences + for (column_name, type_name) in zip(pd_result.columns, pd_result.dtypes): + if type_name == "bool": + pd_result[column_name] = pd_result[column_name].astype("boolean") + pd_result.columns = pd_result.columns.astype(object) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + @pytest.mark.parametrize( ("how"), [ diff --git a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py index 6e6d2d8b5c..b0472c524a 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/concat.py """ -Concat routines. 
+Concat routines """ from __future__ import annotations diff --git a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py new file mode 100644 index 0000000000..da92b58f50 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py @@ -0,0 +1,119 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/encoding.py +""" +Encoding routines +""" +from __future__ import annotations + +from bigframes import constants + + +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + drop_first=False, + dtype=None, +): + """ + Convert categorical variable into dummy/indicator variables. + + Each variable is converted in as many 0/1 variables as there are + different values. Columns in the output are each named after a value; + if the input is a DataFrame, the name of the original variable is + prepended to the value. + + **Examples:** + >>> import bigframes.pandas as pd + >>> pd.options.display.progress_bar = None + >>> s = pd.Series(list('abca')) + >>> pd.get_dummies(s) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + + [4 rows x 3 columns] + + >>> s1 = pd.Series(['a', 'b', None]) + >>> pd.get_dummies(s1) + a b + 0 True False + 1 False True + 2 False False + + [3 rows x 2 columns] + + >>> pd.get_dummies(s1, dummy_na=True) + a b + 0 True False False + 1 False True False + 2 False False True + + [3 rows x 3 columns] + + >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}) + >>> pd.get_dummies(df, prefix=['col1', 'col2']) + C col1_a col1_b col2_a col2_b col2_c + 0 1 True False False True False + 1 2 False True True False False + 2 3 True False False False True + + [3 rows x 6 columns] + + >>> pd.get_dummies(pd.Series(list('abcaa'))) + a b c + 0 True False False + 1 False True False + 2 False False True + 3 True False False + 4 True False False + + [5 rows x 3 columns] + + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + b c + 0 False False + 1 True False + 2 False True + 3 False False + 4 False False + + [5 rows x 2 columns] + + Args: + data (Series or DataFrame): + Data of which to get dummy indicators. + + prefix (str, list of str, or dict of str, default None): + String to append DataFrame column names. Pass a list with length + equal to the number of columns when calling get_dummies on a + DataFrame. Alternatively, prefix can be a dictionary mapping column + names to prefixes. + + prefix_sep (str, list of str, or dict of str, default '_'): + Separator/delimiter to use, appended to prefix. Or pass a list or + dictionary as with prefix. + + dummy_na (bool, default False): + Add a column to indicate NaNs, if False NaNs are ignored. + + columns (list-like, default None): + Column names in the DataFrame to be encoded. If columns is None + then only the columns with string dtype will be converted. + + drop_first (bool, default False): + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + + dtype (dtype, default bool): + Data type for new columns. Only a single dtype is allowed. + + Returns: + DataFrame: Dummy-coded data. If data contains other columns than the + dummy-coded one(s), these will be prepended, unaltered, to the + result. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py index cc81de405b..b03f366fca 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -16,7 +16,6 @@ def merge( sort=False, suffixes=("_x", "_y"), ): - """ Merge DataFrame objects with a database-style join. diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 24ea655a5f..d4471ed68e 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/tile.py """ -Quantilization functions and related stuff +Quantilization functions and related routines """ from __future__ import annotations From bf1ec89f8da2c7b2d042b7516a16a9e7cda6db06 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Wed, 1 Nov 2023 14:49:24 -0700 Subject: [PATCH 22/22] chore(main): release 0.12.0 (#151) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93ebadb56f..845d3634bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,35 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.12.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.11.0...v0.12.0) (2023-11-01) + + +### Features + +* Add `DataFrame.melt` ([#113](https://github.com/googleapis/python-bigquery-dataframes/issues/113)) ([4e4409c](https://github.com/googleapis/python-bigquery-dataframes/commit/4e4409c5b235171f3770aec852193026519948fd)) +* Add `DataFrame.to_pandas_batches()` to download large `DataFrame` objects ([#136](https://github.com/googleapis/python-bigquery-dataframes/issues/136)) ([3afd4a3](https://github.com/googleapis/python-bigquery-dataframes/commit/3afd4a35f4c38dad86dab17ff62444cd418cab88)) +* Add bigframes.options.compute.maximum_bytes_billed option that sets maximum bytes billed on query jobs ([#133](https://github.com/googleapis/python-bigquery-dataframes/issues/133)) ([63c7919](https://github.com/googleapis/python-bigquery-dataframes/commit/63c7919e28d2e0b864142320b47374d807f07c03)) +* Add pandas.qcut ([#104](https://github.com/googleapis/python-bigquery-dataframes/issues/104)) ([8e44518](https://github.com/googleapis/python-bigquery-dataframes/commit/8e4451841ba09099b0ed5433f9102511741dfbed)) +* Add pd.get_dummies ([#149](https://github.com/googleapis/python-bigquery-dataframes/issues/149)) ([d8baad5](https://github.com/googleapis/python-bigquery-dataframes/commit/d8baad5b71ec67a35a0fb6132ee16e4c7418c456)) +* Add unstack to series, add level param 
([#115](https://github.com/googleapis/python-bigquery-dataframes/issues/115)) ([5edcd19](https://github.com/googleapis/python-bigquery-dataframes/commit/5edcd19e6200db9b9ebe3d4945816b3ebf1f7bcd)) +* Implement operator `@` for `DataFrame.dot` ([#139](https://github.com/googleapis/python-bigquery-dataframes/issues/139)) ([79a638e](https://github.com/googleapis/python-bigquery-dataframes/commit/79a638eda80c482b640b523426ffd95c42747edc)) +* Populate ibis version in user agent ([#140](https://github.com/googleapis/python-bigquery-dataframes/issues/140)) ([c639a36](https://github.com/googleapis/python-bigquery-dataframes/commit/c639a3657465e2b68a3b93c363bd3ae1e969d2cc)) + + +### Bug Fixes + +* Don't override the global logging config ([#138](https://github.com/googleapis/python-bigquery-dataframes/issues/138)) ([2ddbf74](https://github.com/googleapis/python-bigquery-dataframes/commit/2ddbf743efc2fd8ffb61ae8d3333fc4b98ce4b55)) +* Fix bug with column names under repeated column assignment ([#150](https://github.com/googleapis/python-bigquery-dataframes/issues/150)) ([29032d0](https://github.com/googleapis/python-bigquery-dataframes/commit/29032d06811569121f7be2a7de915740df7daf6e)) +* Resolve plotly rendering issue by using ipython html for job pro… ([#134](https://github.com/googleapis/python-bigquery-dataframes/issues/134)) ([39df43e](https://github.com/googleapis/python-bigquery-dataframes/commit/39df43e243ac0374d1a1eb2a75779324825afbe9)) +* Use indexee's session for loc listlike cases ([#152](https://github.com/googleapis/python-bigquery-dataframes/issues/152)) ([27c5725](https://github.com/googleapis/python-bigquery-dataframes/commit/27c57255c7fe11e1ef9b9826d988d80fc17442a6)) + + +### Documentation + +* Add artithmetic df sample code ([#153](https://github.com/googleapis/python-bigquery-dataframes/issues/153)) ([ac44ccd](https://github.com/googleapis/python-bigquery-dataframes/commit/ac44ccd3936cdb28755d2bbe16377d489f08d5e5)) +* Fix indentation on `read_gbq_function` code sample ([#163](https://github.com/googleapis/python-bigquery-dataframes/issues/163)) ([0801d96](https://github.com/googleapis/python-bigquery-dataframes/commit/0801d96830dab467232277dea9fd2dacee41055c)) +* Link to ML.EVALUATE BQML page for score() methods ([#137](https://github.com/googleapis/python-bigquery-dataframes/issues/137)) ([45c617f](https://github.com/googleapis/python-bigquery-dataframes/commit/45c617fee7becc42f1c129246ffdc32f3a963f12)) + ## [0.11.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.10.0...v0.11.0) (2023-10-26) diff --git a/bigframes/version.py b/bigframes/version.py index 18edfa5615..b324ed7234 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.11.0" +__version__ = "0.12.0"
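A minimal usage sketch of the features added in this release, mirroring the examples in the new tests and docstrings above; it assumes BigQuery credentials and a default project are already configured, and the byte limit and sample data are illustrative:

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None

    # Temporarily cap bytes billed via the new compute options, using the
    # option_context helper now exported from bigframes.pandas.
    # The 10 GB limit below is an illustrative value, not a recommendation.
    with bpd.option_context("compute.maximum_bytes_billed", 10_000_000_000):
        s = bpd.Series(list("abca"))
        # get_dummies produces one boolean indicator column per distinct value.
        dummies = bpd.get_dummies(s)
        print(dummies.to_pandas())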