From bc82804da43c03c2311cd56f47a2316d3aae93d2 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 16 Apr 2024 16:32:17 -0700 Subject: [PATCH 01/15] feat: Add quantile statistic (#613) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bigframes/constants.py | 3 + bigframes/core/block_transforms.py | 34 +++++++++++ bigframes/core/blocks.py | 13 +++-- bigframes/core/compile/aggregate_compiler.py | 8 +++ bigframes/core/groupby/__init__.py | 57 +++++++++++++++++-- bigframes/dataframe.py | 30 +++++++++- bigframes/operations/aggregations.py | 12 ++++ bigframes/series.py | 19 +++++-- tests/system/small/test_dataframe.py | 30 +++++++++- tests/system/small/test_groupby.py | 35 ++++++++++++ tests/system/small/test_series.py | 21 +++++++ .../ibis/backends/bigquery/registry.py | 8 +++ .../bigframes_vendored/pandas/core/frame.py | 40 ++++++++++++- .../pandas/core/groupby/__init__.py | 30 ++++++++++ .../bigframes_vendored/pandas/core/series.py | 42 +++++++++++++- .../pandas/plotting/_core.py | 4 ++ 16 files changed, 366 insertions(+), 20 deletions(-) diff --git a/bigframes/constants.py b/bigframes/constants.py index 0751501085..c6d8f3acc2 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -92,3 +92,6 @@ LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset( ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS ) + +# BigQuery default is 10000, leave 100 for overhead +MAX_COLUMNS = 9900 diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index c789b2a69c..1eae73014c 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,6 +15,7 @@ import functools import typing +from typing import Sequence import pandas as pd @@ -105,6 +106,39 @@ def indicate_duplicates( ) +def quantile( + block: blocks.Block, + columns: Sequence[str], + qs: Sequence[float], + grouping_column_ids: Sequence[str] = (), +) -> blocks.Block: + # TODO: handle windowing and more interpolation methods + window = core.WindowSpec( + grouping_keys=tuple(grouping_column_ids), + ) + quantile_cols = [] + labels = [] + if len(columns) * len(qs) > constants.MAX_COLUMNS: + raise NotImplementedError("Too many aggregates requested.") + for col in columns: + for q in qs: + label = block.col_id_to_label[col] + new_label = (*label, q) if isinstance(label, tuple) else (label, q) + labels.append(new_label) + block, quantile_col = block.apply_window_op( + col, + agg_ops.QuantileOp(q), + window_spec=window, + ) + quantile_cols.append(quantile_col) + block, results = block.aggregate( + grouping_column_ids, + tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), + dropna=True, + ) + return block.select_columns(results).with_column_labels(labels) + + def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: supported_methods = [ "linear", diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 5b411e5416..f6850020df 100644 --- 
a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1498,12 +1498,17 @@ def stack(self, how="left", levels: int = 1): row_label_tuples = utils.index_as_tuples(row_labels) - if col_labels is not None: + if col_labels is None: + result_index: pd.Index = pd.Index([None]) + result_col_labels: Sequence[Tuple] = list([()]) + elif (col_labels.nlevels == 1) and all( + col_labels.isna() + ): # isna not implemented for MultiIndex for newer pandas versions + result_index = pd.Index([None]) + result_col_labels = utils.index_as_tuples(col_labels.drop_duplicates()) + else: result_index = col_labels.drop_duplicates().dropna(how="all") result_col_labels = utils.index_as_tuples(result_index) - else: - result_index = pd.Index([None]) - result_col_labels = list([()]) # Get matching columns unpivot_columns: List[Tuple[str, List[str]]] = [] diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index ae21243506..98d296c779 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -148,6 +148,14 @@ def _( return cast(ibis_types.NumericValue, value) +@compile_unary_agg.register +@numeric_op +def _( + op: agg_ops.QuantileOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + return _apply_window_if_present(column.quantile(op.q), window) + + @compile_unary_agg.register @numeric_op def _( diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index e2b28553c6..0f53342352 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import Sequence, Union import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby import pandas as pd @@ -115,14 +116,35 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: def median( self, numeric_only: bool = False, *, exact: bool = False ) -> df.DataFrame: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" - ) if not numeric_only: self._raise_on_non_numeric("median") + if exact: + return self.quantile(0.5) return self._aggregate_all(agg_ops.median_op, numeric_only=True) + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("quantile") + q_cols = tuple( + col + for col in self._selected_cols + if self._column_type(col) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + q_cols, + qs=tuple(q) if multi_q else (q,), # type: ignore + grouping_column_ids=self._by_col_ids, + ) + result_df = df.DataFrame(result) + if multi_q: + return result_df.stack() + else: + return result_df.droplevel(-1, 1) + def min(self, numeric_only: bool = False, *args) -> df.DataFrame: return self._aggregate_all(agg_ops.min_op, numeric_only=numeric_only) @@ -466,8 +488,31 @@ def sum(self, *args) -> series.Series: def mean(self, *args) -> series.Series: return self._aggregate(agg_ops.mean_op) - def median(self, *args, **kwargs) -> series.Series: - return self._aggregate(agg_ops.mean_op) + def median( + self, + *args, + exact: bool = False, + **kwargs, + ) -> series.Series: + if exact: + return self.quantile(0.5) + else: + return self._aggregate(agg_ops.median_op) + + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> series.Series: + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + (self._value_column,), + qs=tuple(q) if multi_q else (q,), # type: ignore + grouping_column_ids=self._by_col_ids, + ) + if multi_q: + return series.Series(result.stack()) + else: + return series.Series(result.stack()).droplevel(-1) def std(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.std_op) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2deef95277..953a89c34f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2009,8 +2009,34 @@ def median( frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_stack(agg_ops.median_op) - return bigframes.series.Series(block.select_column("values")) + if exact: + return self.quantile() + else: + block = frame._block.aggregate_all_and_stack(agg_ops.median_op) + return bigframes.series.Series(block.select_column("values")) + + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): + if not numeric_only: + frame = self._raise_on_non_numeric("quantile") + else: + frame = self._drop_non_numeric() + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + frame._block, frame._block.value_columns, qs=tuple(q) if multi_q else (q,) # type: ignore + ) + if multi_q: + return DataFrame(result.stack()).droplevel(0) + else: + result_df = ( + DataFrame(result) + .stack(list(range(0, frame.columns.nlevels))) + .droplevel(0) + ) + result_series = bigframes.series.Series(result_df._block) + result_series.name = q + return result_series def std( self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index f33dc16e30..0d27d1d75d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -109,6 +109,18 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return input_types[0] 
+@dataclasses.dataclass(frozen=True) +class QuantileOp(UnaryAggregateOp): + q: float + + @property + def name(self): + return f"{int(self.q*100)}%" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) + + @dataclasses.dataclass(frozen=True) class ApproxQuartilesOp(UnaryAggregateOp): quartile: int diff --git a/bigframes/series.py b/bigframes/series.py index 2f9123f9a3..b834411bce 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -23,7 +23,7 @@ import os import textwrap import typing -from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -968,10 +968,19 @@ def mean(self) -> float: def median(self, *, exact: bool = False) -> float: if exact: - raise NotImplementedError( - f"Only approximate median is supported. {constants.FEEDBACK_LINK}" - ) - return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + return typing.cast(float, self.quantile(0.5)) + else: + return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + + def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> Union[Series, float]: + qs = tuple(q) if utils.is_list_like(q) else (q,) + result = block_ops.quantile(self._block, (self._value_column,), qs=qs) + if utils.is_list_like(q): + result = result.stack() + result = result.drop_levels([result.index_columns[0]]) + return Series(result) + else: + return cast(float, Series(result).to_pandas().squeeze()) def sum(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.sum_op)) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e70764fcc0..7fef7a9dc7 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2504,7 +2504,10 @@ def test_df_melt_default(scalars_dfs): # Pandas produces int64 index, Bigframes produces Int64 (nullable) pd.testing.assert_frame_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, ) @@ -3029,6 +3032,31 @@ def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): ) +def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): + q = 0.45 + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): + q = [0, 0.33, 0.67, 1.0] + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pd_result.index = pd_result.index.astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("op"), [ diff --git a/tests/system/small/test_groupby.py 
b/tests/system/small/test_groupby.py index ba79ba1ab1..7b36a06f49 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -65,6 +65,24 @@ def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): assert ((pd_min <= bf_result_computed) & (bf_result_computed <= pd_max)).all().all() +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + col_names = ["int64_too", "float64_col", "int64_col", "string_col"] + bf_result = ( + scalars_df_index[col_names].groupby("string_col").quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("string_col").quantile(q) + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("operator"), [ @@ -389,3 +407,20 @@ def test_dataframe_groupby_nonnumeric_with_mean(): pd.testing.assert_frame_equal( pd_result, bf_result, check_index_type=False, check_dtype=False ) + + +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + bf_result = ( + scalars_df_index.groupby("string_col")["int64_col"].quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].quantile(q) + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d27cd0a236..87267696ba 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1320,6 +1320,27 @@ def test_median(scalars_dfs): assert pd_min < bf_result < pd_max +def test_median_exact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].median(exact=True) + pd_result = scalars_pandas_df[col_name].median() + assert math.isclose(pd_result, bf_result) + + +def test_series_quantile(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name] + + pd_result = pd_series.quantile([0.0, 0.4, 0.6, 1.0]) + bf_result = bf_series.quantile([0.0, 0.4, 0.6, 1.0]) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index 88826b31ce..fddeab19a2 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -3,6 +3,7 @@ import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops from ibis.backends.bigquery.registry import OPERATION_REGISTRY +import ibis.expr.operations.reductions as ibis_reductions def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile): @@ -31,12 +32,19 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): return f"GENERATE_ARRAY(0, {arg})" +def _quantile(translator, op: ibis_reductions.Quantile): + arg = translator.translate(op.arg) + quantile = translator.translate(op.quantile) + return f"PERCENTILE_CONT({arg}, {quantile})" + + patched_ops = { 
vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore + ibis_reductions.Quantile: _quantile, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6707dc1403..e894900646 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4509,13 +4509,51 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Default False. Include only float, int, boolean columns. exact (bool. default False): Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + one. Returns: bigframes.series.Series: Series with the median of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): + """ + Return values at the given quantile over requested axis. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: Float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + [2 rows x 2 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value between 0 <= q <= 1, the quantile(s) to compute. + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame: + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def var(self, axis=0, *, numeric_only: bool = False): """Return unbiased variance over requested axis. diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ed4ca66f38..6310d7e271 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -85,6 +85,36 @@ def median( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile(self, q=0.5, *, numeric_only: bool = False): + """ + Return group values at the given quantile, a la numpy.percentile. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + + [2 rows x 1 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value(s) between 0 and 1 providing the quantile(s) to compute. + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame: Return type determined by caller of GroupBy object. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def std( self, *, diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 46bc9714f8..5e3b4c46ef 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3,7 +3,16 @@ """ from __future__ import annotations -from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING +from typing import ( + Hashable, + IO, + Literal, + Mapping, + Optional, + Sequence, + TYPE_CHECKING, + Union, +) from bigframes_vendored.pandas.core.generic import NDFrame import numpy @@ -3151,6 +3160,37 @@ def median(self, *, exact: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, + q: Union[float, Sequence[float]] = 0.5, + ) -> Union[Series, float]: + """ + Return value at the given quantile. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4]) + >>> s.quantile(.5) + 2.5 + >>> s.quantile([.25, .5, .75]) + 0.25 1.75 + 0.5 2.5 + 0.75 3.25 + dtype: Float64 + + Args: + q (float or array-like, default 0.5 (50% quantile)): + The quantile(s) to compute, which can lie in range: 0 <= q <= 1. + + Returns: + float or Series: + If ``q`` is an array, a Series will be returned where the + index is ``q`` and the values are the quantiles, otherwise + a float will be returned. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def prod(self): """Return the product of the values over the requested axis. diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index 19f56965df..bf016357a6 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -11,6 +11,7 @@ class PlotAccessor: For Series: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -57,6 +58,7 @@ def hist( >>> import bigframes.pandas as bpd >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -93,6 +95,7 @@ def line( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'one': [1, 2, 3, 4], @@ -160,6 +163,7 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 
'sales': [3, 2, 3, 9, 10, 6], From 250548c248fe3a4fdfa92494aa0d550de8608612 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Tue, 16 Apr 2024 17:42:09 -0700 Subject: [PATCH 02/15] test: add a bigquery usage report to notebook test session (#604) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: add a bigquery usage report to notebook test session * filter out mocks * remove pointless type hint * fix replace statement * account for dry runs * ipynb only * use env var via nox * don't import bigframes from noxfile * address comments * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- bigframes/session/_io/bigquery.py | 31 +++++++++++++++++ noxfile.py | 58 ++++++++++++++++++++++++++----- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index 75283a060a..ac6ba4bae4 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -18,6 +18,7 @@ import datetime import itertools +import os import textwrap import types from typing import Dict, Iterable, Optional, Sequence, Tuple, Union @@ -34,6 +35,8 @@ MAX_LABELS_COUNT = 64 TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}" +LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" + def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], @@ -243,4 +246,32 @@ def start_query_with_client( ) else: results_iterator = query_job.result(max_results=max_results) + + if LOGGING_NAME_ENV_VAR in os.environ: + # when running notebooks via pytest nbmake + pytest_log_job(query_job) + return results_iterator, query_job + + +def pytest_log_job(query_job: bigquery.QueryJob): + """For pytest runs only, log information about the query job + to a file in order to create a performance report. + """ + if LOGGING_NAME_ENV_VAR not in os.environ: + raise EnvironmentError( + "Environment variable {env_var} is not set".format( + env_var=LOGGING_NAME_ENV_VAR ) + ) + test_name = os.environ[LOGGING_NAME_ENV_VAR] + current_directory = os.getcwd() + bytes_processed = query_job.total_bytes_processed + if not isinstance(bytes_processed, int): + return # filter out mocks + if query_job.configuration.dry_run: + # dry runs don't process their total_bytes_processed + bytes_processed = 0 + bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") + with open(bytes_file, "a") as f: + f.write(str(bytes_processed) + "\n") diff --git a/noxfile.py b/noxfile.py index fa9c0a57d8..9479a7a318 100644 --- a/noxfile.py +++ b/noxfile.py @@ -764,6 +764,8 @@ def notebook(session: nox.Session): "--nbmake-timeout=900", # 15 minutes ] + logging_name_env_var = "BIGFRAMES_PERFORMANCE_LOG_NAME" + try: # Populate notebook parameters and make a backup so that the notebooks # are runnable. 
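The report file these two files cooperate on follows a deliberately simple contract: each query appends one integer (total bytes processed) to `<BIGFRAMES_PERFORMANCE_LOG_NAME>.bytesprocessed` in the working directory, and the nox session later sums those files. A standalone sketch of the write path, using a hypothetical `FakeJob` stand-in for `google.cloud.bigquery.QueryJob` (the names here are illustrative, not part of the patch):

import os

os.environ["BIGFRAMES_PERFORMANCE_LOG_NAME"] = "my_notebook.ipynb"  # nox sets this per notebook

class FakeJob:
    # hypothetical stand-in for google.cloud.bigquery.QueryJob
    total_bytes_processed = 1024

    class configuration:
        dry_run = False

def log_job(job) -> None:
    # Same contract as pytest_log_job above: append one integer per query.
    name = os.environ["BIGFRAMES_PERFORMANCE_LOG_NAME"]
    bytes_processed = job.total_bytes_processed
    if not isinstance(bytes_processed, int):
        return  # mocked jobs report None and are filtered out
    if job.configuration.dry_run:
        bytes_processed = 0  # dry runs process no bytes
    with open(name + ".bytesprocessed", "a") as f:
        f.write(str(bytes_processed) + "\n")

log_job(FakeJob())  # my_notebook.ipynb.bytesprocessed now contains "1024"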
@@ -773,13 +775,21 @@ def notebook(session: nox.Session): *notebooks, ) - # Run self-contained notebooks in single session.run - # achieve parallelization via -n - session.run( - *pytest_command, - "-nauto", - *notebooks, - ) + # Run notebooks in parallel session.run's, since each notebook + # takes an environment variable for performance logging + processes = [] + for notebook in notebooks: + session.env[logging_name_env_var] = os.path.basename(notebook) + process = Process( + target=session.run, + args=(*pytest_command, notebook), + ) + process.start() + processes.append(process) + + for process in processes: + process.join() + finally: # Prevent our notebook changes from getting checked in to git # accidentally. @@ -789,11 +799,12 @@ def notebook(session: nox.Session): *notebooks, ) - # Run regionalized notebooks in parallel session.run's, since each notebook - # takes a different region via env param. + # Additionally run regionalized notebooks in parallel session.run's. + # Each notebook takes a different region via env param. processes = [] for notebook, regions in notebooks_reg.items(): for region in regions: + session.env[logging_name_env_var] = os.path.basename(notebook) process = Process( target=session.run, args=(*pytest_command, notebook), @@ -805,6 +816,35 @@ def notebook(session: nox.Session): for process in processes: process.join() + # when run via pytest, notebooks output a .bytesprocessed report + # collect those reports and print a summary + _print_bytes_processed_report() + + +def _print_bytes_processed_report(): + """Add an informational report about http queries and bytes + processed to the testlog output for purposes of measuring + bigquery-related performance changes. + """ + print("---BIGQUERY USAGE REPORT---") + cumulative_queries = 0 + cumulative_bytes = 0 + for report in Path("notebooks/").glob("*/*.bytesprocessed"): + with open(report, "r") as f: + filename = report.stem + lines = f.read().splitlines() + query_count = len(lines) + total_bytes = sum([int(line) for line in lines]) + format_string = f"{filename} - query count: {query_count}, bytes processed sum: {total_bytes}" + print(format_string) + cumulative_bytes += total_bytes + cumulative_queries += query_count + print( + "---total queries: {total_queries}, total bytes: {total_bytes}---".format( + total_queries=cumulative_queries, total_bytes=cumulative_bytes + ) + ) + @nox.session(python="3.10") def release_dry_run(session): From 34f9f61eee6878c74f50197f657682e37474becc Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Wed, 17 Apr 2024 10:50:46 -0400 Subject: [PATCH 03/15] chore(python): bump idna from 3.4 to 3.7 in .kokoro (#608) Source-Link: https://github.com/googleapis/synthtool/commit/d50980e704793a2d3310bfb3664f3a82f24b5796 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:5a4c19d17e597b92d786e569be101e636c9c2817731f80a5adec56b2aa8fe070 Co-authored-by: Owl Bot Co-authored-by: Anthonios Partheniou --- .github/.OwlBot.lock.yaml | 4 ++-- .github/auto-label.yaml | 5 +++++ .github/blunderbuss.yml | 17 +++++++++++++++++ .kokoro/requirements.txt | 6 +++--- 4 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 .github/blunderbuss.yml diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 4bdeef3904..81f87c5691 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. 
docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:a8a80fc6456e433df53fc2a0d72ca0345db0ddefb409f1b75b118dfd1babd952 -# created: 2024-03-15T16:25:47.905264637Z + digest: sha256:5a4c19d17e597b92d786e569be101e636c9c2817731f80a5adec56b2aa8fe070 +# created: 2024-04-12T11:35:58.922854369Z diff --git a/.github/auto-label.yaml b/.github/auto-label.yaml index b2016d119b..8b37ee8971 100644 --- a/.github/auto-label.yaml +++ b/.github/auto-label.yaml @@ -13,3 +13,8 @@ # limitations under the License. requestsize: enabled: true + +path: + pullrequest: true + paths: + samples: "samples" diff --git a/.github/blunderbuss.yml b/.github/blunderbuss.yml new file mode 100644 index 0000000000..8d9cb1008e --- /dev/null +++ b/.github/blunderbuss.yml @@ -0,0 +1,17 @@ +# Blunderbuss config +# +# This file controls who is assigned for pull requests and issues. +# Note: This file is autogenerated. To make changes to the assignee +# team, please update `codeowner_team` in `.repo-metadata.json`. +assign_issues: + - googleapis/api-bigquery-dataframe + +assign_issues_by: + - labels: + - "samples" + to: + - googleapis/python-samples-reviewers + - googleapis/api-bigquery-dataframe + +assign_prs: + - googleapis/api-bigquery-dataframe diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index dd61f5f320..51f92b8e12 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -252,9 +252,9 @@ googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via google-api-core -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via requests importlib-metadata==6.8.0 \ --hash=sha256:3ebb78df84a805d7698245025b975d9d67053cd94c79245ba4b3eb694abe68bb \ From 9f8f181279133abdb7da3aa045df6fa278587013 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Wed, 17 Apr 2024 09:40:51 -0700 Subject: [PATCH 04/15] fix: address technical writers fb (#611) * fix: address technical writers fb --- bigframes/ml/model_selection.py | 2 +- .../pandas/core/indexes/accessor.py | 2 +- .../bigframes_vendored/sklearn/base.py | 2 +- .../sklearn/metrics/_classification.py | 8 +++--- .../bigframes_vendored/sklearn/pipeline.py | 7 ++--- .../sklearn/preprocessing/_encoder.py | 26 ++++++++++++------- .../sklearn/preprocessing/_label.py | 2 +- 7 files changed, 28 insertions(+), 21 deletions(-) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 42c13fdb40..48eb5a93a7 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -35,7 +35,7 @@ def train_test_split( Args: *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series): A sequence of BigQuery DataFrames or Series that can be joined on - their indexes + their indexes. test_size (default None): The proportion of the dataset to include in the test split. If None, this will default to the complement of train_size. 
If both diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 3f0175359a..f34612cb11 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -37,7 +37,7 @@ def dayofweek(self): """The day of the week with Monday=0, Sunday=6. Return the day of the week. It is assumed the week starts on - Monday, which is denoted by 0 and ends on Sunday which is denoted + Monday, which is denoted by 0 and ends on Sunday, which is denoted by 6. **Examples:** diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index fd8db7a227..1a151a1119 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -153,7 +153,7 @@ def fit_transform(self, X, y=None): Target values (None for unsupervised transformations). Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new) + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new). Transformed DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index 00bbf8cd60..8e8b2c1952 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -122,7 +122,7 @@ def recall_score( ): """Compute the recall. - The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + The recall is the ratio ``tp / (tp + fn)``, where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. @@ -170,7 +170,7 @@ def precision_score( ): """Compute the precision. - The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + The precision is the ratio ``tp / (tp + fp)``, where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. @@ -244,9 +244,9 @@ def f1_score( dtype: float64 Args: - y_true: Series or DataFrame of shape (n_samples,) + y_true: Series or DataFrame of shape (n_samples,). Ground truth (correct) target values. - y_pred: Series or DataFrame of shape (n_samples,) + y_pred: Series or DataFrame of shape (n_samples,). Estimated targets as returned by a classifier. average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ default='binary' diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py index aed1565960..8a98ee4141 100644 --- a/third_party/bigframes_vendored/sklearn/pipeline.py +++ b/third_party/bigframes_vendored/sklearn/pipeline.py @@ -20,13 +20,14 @@ class Pipeline(BaseEstimator, metaclass=ABCMeta): """Pipeline of transforms with a final estimator. Sequentially apply a list of transforms and a final estimator. - Intermediate steps of the pipeline must be `transforms`, that is, they + Intermediate steps of the pipeline must be `transforms`. That is, they must implement `fit` and `transform` methods. The final estimator only needs to implement `fit`. 
The purpose of the pipeline is to assemble several steps that can be - cross-validated together while setting different parameters. This simplifies code, and allows deploying an estimator - and peprocessing together, e.g. with `Pipeline.to_gbq(...).` + cross-validated together while setting different parameters. This + simplifies code and allows for deploying an estimator and preprocessing + together, e.g. with `Pipeline.to_gbq(...).` """ def fit( diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 5e5e8ac042..b883e82249 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -23,15 +23,21 @@ class OneHotEncoder(BaseEstimator): Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. - .. code-block:: - - from bigframes.ml.preprocessing import OneHotEncoder - import bigframes.pandas as bpd - - enc = OneHotEncoder() - X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) - enc.fit(X) - print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + >>> from bigframes.ml.preprocessing import OneHotEncoder + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> enc = OneHotEncoder() + >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) + >>> enc.fit(X) + OneHotEncoder() + + >>> print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + onehotencoded_a onehotencoded_b + 0 [{'index': 1, 'value': 1.0}] [{'index': 1, 'value': 1.0}] + 1 [{'index': 2, 'value': 1.0}] [{'index': 0, 'value': 1.0}] + + [2 rows x 2 columns] Args: drop (Optional[Literal["most_frequent"]], default None): @@ -52,7 +58,7 @@ class OneHotEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. - Default None, set limit to 1,000,000. + Default None. Set limit to 1,000,000. diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index cc6b995c8c..61a44db92f 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -26,7 +26,7 @@ class LabelEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. - Default None, set limit to 1,000,000. + Default None. Set limit to 1,000,000. 
""" def fit(self, y): From 8f9ece6d13f57f02d677bf0e3fea97dea94ae240 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 17 Apr 2024 15:48:38 -0700 Subject: [PATCH 05/15] fix: infer narrowest numeric type when combining numeric columns (#602) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Swe帽a (Swast) --- bigframes/core/__init__.py | 88 +++++- bigframes/core/block_transforms.py | 2 +- bigframes/core/blocks.py | 16 +- bigframes/core/compile/compiled.py | 284 ------------------ bigframes/core/compile/compiler.py | 12 - bigframes/core/compile/scalar_op_compiler.py | 38 +++ bigframes/core/expression.py | 3 - bigframes/core/join_def.py | 5 + bigframes/core/nodes.py | 84 ------ bigframes/dataframe.py | 8 +- bigframes/operations/__init__.py | 100 +++--- tests/system/small/test_dataframe.py | 10 +- .../bigframes_vendored/pandas/core/frame.py | 84 +++--- .../bigframes_vendored/pandas/core/series.py | 6 +- 14 files changed, 233 insertions(+), 507 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 3fa690ef37..9e6b86fc30 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -354,10 +354,7 @@ def unpivot( *, passthrough_columns: typing.Sequence[str] = (), index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] - ] = pandas.Float64Dtype(), - how: typing.Literal["left", "right"] = "left", + join_side: typing.Literal["left", "right"] = "left", ) -> ArrayValue: """ Unpivot ArrayValue columns. @@ -367,23 +364,88 @@ def unpivot( unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. index_col_id (str): The column id to be used for the row labels. - dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
Returns: ArrayValue: The unpivoted ArrayValue """ + # There will be N labels, used to disambiguate which of N source columns produced each output row + explode_offsets_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + labels_array = self._create_unpivot_labels_array(row_labels, index_col_ids) + labels_array = labels_array.promote_offsets(explode_offsets_id) + + # Unpivot creates N output rows for each input row, labels disambiguate these N rows + joined_array = self._cross_join_w_labels(labels_array, join_side) + + # Build the output rows as a case statement that selects between the N input columns + unpivot_exprs = [] + # Supports producing multiple stacked output columns for stacking only part of hierarchical index + for col_id, input_ids in unpivot_columns: + # row explode offset used to choose the input column + # we use offset instead of label as labels are not necessarily unique + cases = tuple( + ( + ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), + ex.free_var(id_or_null) + if (id_or_null is not None) + else ex.const(None), + ) + for i, id_or_null in enumerate(input_ids) + ) + col_expr = ops.case_when_op.as_expr(*cases) + unpivot_exprs.append((col_expr, col_id)) + + label_exprs = ((ex.free_var(id), id) for id in index_col_ids) + # passthrough columns are unchanged, just repeated N times each + passthrough_exprs = ((ex.free_var(id), id) for id in passthrough_columns) return ArrayValue( - nodes.UnpivotNode( - child=self.node, - row_labels=tuple(row_labels), - unpivot_columns=tuple(unpivot_columns), - passthrough_columns=tuple(passthrough_columns), - index_col_ids=tuple(index_col_ids), - dtype=dtype, - how=how, + nodes.ProjectionNode( + child=joined_array.node, + assignments=(*label_exprs, *unpivot_exprs, *passthrough_exprs), ) ) + def _cross_join_w_labels( + self, labels_array: ArrayValue, join_side: typing.Literal["left", "right"] + ) -> ArrayValue: + """ + Convert each row in self to N rows, one for each label in labels array. 
+ """ + table_join_side = ( + join_def.JoinSide.LEFT if join_side == "left" else join_def.JoinSide.RIGHT + ) + labels_join_side = table_join_side.inverse() + labels_mappings = tuple( + join_def.JoinColumnMapping(labels_join_side, id, id) + for id in labels_array.schema.names + ) + table_mappings = tuple( + join_def.JoinColumnMapping(table_join_side, id, id) + for id in self.schema.names + ) + join = join_def.JoinDefinition( + conditions=(), mappings=(*labels_mappings, *table_mappings), type="cross" + ) + if join_side == "left": + joined_array = self.join(labels_array, join_def=join) + else: + joined_array = labels_array.join(self, join_def=join) + return joined_array + + def _create_unpivot_labels_array( + self, + former_column_labels: typing.Sequence[typing.Hashable], + col_ids: typing.Sequence[str], + ) -> ArrayValue: + """Create an ArrayValue from a list of label tuples.""" + rows = [] + for row_offset in range(len(former_column_labels)): + row_label = former_column_labels[row_offset] + row_label = (row_label,) if not isinstance(row_label, tuple) else row_label + row = {col_ids[i]: row_label[i] for i in range(len(col_ids))} + rows.append(row) + + return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) + def join( self, other: ArrayValue, diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 1eae73014c..562689a736 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -857,5 +857,5 @@ def _idx_extrema( # Stack the entire column axis to produce single-column result # Assumption: uniform dtype for stackability return block.aggregate_all_and_stack( - agg_ops.AnyValueOp(), dtype=block.dtypes[0] + agg_ops.AnyValueOp(), ).with_column_labels([original_block.index.name]) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f6850020df..0f9cacd83d 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -914,9 +914,6 @@ def aggregate_all_and_stack( axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
- ] = pd.Float64Dtype(), ) -> Block: axis_n = utils.get_axis_number(axis) if axis_n == 0: @@ -931,7 +928,6 @@ def aggregate_all_and_stack( row_labels=self.column_labels.to_list(), index_col_ids=index_col_ids, unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]), - dtype=dtype, ) return Block( result_expr, @@ -949,7 +945,6 @@ def aggregate_all_and_stack( index_col_ids=[guid.generate_guid()], unpivot_columns=[(value_col_id, tuple(self.value_columns))], passthrough_columns=[*self.index_columns, offset_col], - dtype=dtype, ) index_aggregations = [ (ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.free_var(col_id)), col_id) @@ -1512,13 +1507,10 @@ def stack(self, how="left", levels: int = 1): # Get matching columns unpivot_columns: List[Tuple[str, List[str]]] = [] - dtypes = [] for val in result_col_labels: col_id = guid.generate_guid("unpivot_") input_columns, dtype = self._create_stack_column(val, row_label_tuples) unpivot_columns.append((col_id, input_columns)) - if dtype: - dtypes.append(dtype or pd.Float64Dtype()) added_index_columns = [guid.generate_guid() for _ in range(row_labels.nlevels)] unpivot_expr = self._expr.unpivot( @@ -1526,8 +1518,7 @@ def stack(self, how="left", levels: int = 1): passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, index_col_ids=added_index_columns, - dtype=tuple(dtypes), - how=how, + join_side=how, ) new_index_level_names = self.column_labels.names[-levels:] if how == "left": @@ -1559,15 +1550,12 @@ def melt( value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] - dtype = self._expr.get_column_type(value_vars[0]) - unpivot_expr = self._expr.unpivot( row_labels=value_labels, passthrough_columns=id_vars, unpivot_columns=(unpivot_col,), index_col_ids=var_col_ids, - dtype=dtype, - how="right", + join_side="right", ) index_id = guid.generate_guid() unpivot_expr = unpivot_expr.promote_offsets(index_id) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index f1c5d62010..a59d599679 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -40,10 +40,8 @@ OrderingExpression, ) import bigframes.core.schema as schemata -import bigframes.core.utils as utils from bigframes.core.window_spec import WindowSpec import bigframes.dtypes -import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops ORDER_ID_COLUMN = "bigframes_ordering_id" @@ -109,36 +107,6 @@ def filter(self: T, predicate: ex.Expression) -> T: """Filter the table on a given expression, the predicate must be a boolean expression.""" ... - @abc.abstractmethod - def unpivot( - self: T, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> T: - """ - Unpivot ArrayValue columns. - - Args: - row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. - unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. - passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. - index_col_id (str): The column id to be used for the row labels. 
- dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. - - Returns: - ArrayValue: The unpivoted ArrayValue - """ - ... - @abc.abstractmethod def _reproject_to_table(self: T) -> T: """ @@ -332,115 +300,6 @@ def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: expr.predicates = [*self._predicates, predicate_value] return expr.build() - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> UnorderedIR: - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr() - row_n = len(row_labels) - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - op_compiler.compile_row_op( - ops.AsTypeOp(col_dtype), (unpivot_table[col],) - ) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - unpivot_offset_id, - ) - - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - return UnorderedIR( - 
table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - ) - def aggregate( self, aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], @@ -920,149 +779,6 @@ def project_window_op( # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. return result._reproject_to_table() if not skip_reproject_unsafe else result - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> OrderedIR: - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - op_compiler.compile_row_op( - ops.AsTypeOp(col_dtype), (unpivot_table[col],) - ) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # 
Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - *old_ordering.ordering_value_columns, - ascending_over(unpivot_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - ascending_over(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] - return OrderedIR( - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, - ) - def _reproject_to_table(self) -> OrderedIR: table = self._to_ibis_expr( ordering_mode="unordered", diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 638e3eacdd..a68023d13d 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -179,18 +179,6 @@ def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True): return compile_node(node.child, ordered)._reproject_to_table() -@_compile_node.register -def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): - return compile_node(node.child, ordered).unpivot( - node.row_labels, - node.unpivot_columns, - passthrough_columns=node.passthrough_columns, - index_col_ids=node.index_col_ids, - dtype=node.dtype, - how=node.how, - ) - - @_compile_node.register def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True): return compile_node(node.child, ordered).explode(node.column_ids) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 53a25d63ed..072d974b39 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -189,6 +189,25 @@ def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): return decorator + def register_nary_op(self, op_ref: typing.Union[ops.NaryOp, type[ops.NaryOp]]): + """ + Decorator to register a nary op implementation. + + Args: + op_ref (NaryOp or NaryOp type): + Class or instance of operator that is implemented by the decorated function. 
+ """ + key = typing.cast(str, op_ref.name) + + def decorator(impl: typing.Callable[..., ibis_types.Value]): + def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): + return impl(*args) + + self._register(key, normalized_impl) + return impl + + return decorator + def _register( self, op_name: str, @@ -1346,6 +1365,25 @@ def clip_op( ) +@scalar_op_compiler.register_nary_op(ops.case_when_op) +def switch_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value: + # ibis can handle most type coercions, but we need to force bool -> int + # TODO: dispatch coercion depending on bigframes dtype schema + result_values = cases_and_outputs[1::2] + do_upcast_bool = any(t.type().is_numeric() for t in result_values) + if do_upcast_bool: + # Just need to upcast to int, ibis can handle further coercion + result_values = tuple( + val.cast(ibis_dtypes.int64) if val.type().is_boolean() else val + for val in result_values + ) + + case_val = ibis.case() + for predicate, output in zip(cases_and_outputs[::2], result_values): + case_val = case_val.when(predicate, output) + return case_val.end() + + # Helpers def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 4980f5369d..70eb519a1b 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -190,9 +190,6 @@ class OpExpression(Expression): op: bigframes.operations.RowOp inputs: typing.Tuple[Expression, ...] - def __post_init__(self): - assert self.op.arguments == len(self.inputs) - @property def unbound_variables(self) -> typing.Tuple[str, ...]: return tuple( diff --git a/bigframes/core/join_def.py b/bigframes/core/join_def.py index 4646a0d6ae..632a1864da 100644 --- a/bigframes/core/join_def.py +++ b/bigframes/core/join_def.py @@ -22,6 +22,11 @@ class JoinSide(enum.Enum): LEFT = 0 RIGHT = 1 + def inverse(self) -> JoinSide: + if self == JoinSide.LEFT: + return JoinSide.RIGHT + return JoinSide.LEFT + JoinType = Literal["inner", "outer", "left", "right", "cross"] diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index a1072b0d68..688e165732 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -21,8 +21,6 @@ import typing from typing import Callable, Tuple -import pandas - import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.join_def import JoinColumnMapping, JoinDefinition, JoinSide @@ -579,88 +577,6 @@ def relation_ops_created(self) -> int: return 0 -@dataclass(frozen=True) -class UnpivotNode(UnaryNode): - # TODO: Refactor unpivot - row_labels: typing.Tuple[typing.Hashable, ...] - unpivot_columns: typing.Tuple[ - typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]], ... - ] - passthrough_columns: typing.Tuple[str, ...] = () - index_col_ids: typing.Tuple[str, ...] = ("index",) - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
- ] = (pandas.Float64Dtype(),) - how: typing.Literal["left", "right"] = "left" - - def __hash__(self): - return self._node_hash - - @property - def row_preserving(self) -> bool: - return False - - @property - def non_local(self) -> bool: - return True - - @property - def joins(self) -> bool: - return True - - @functools.cached_property - def schema(self) -> schemata.ArraySchema: - def infer_dtype( - values: typing.Iterable[typing.Hashable], - ) -> bigframes.dtypes.Dtype: - item_types = map(lambda x: bigframes.dtypes.infer_literal_type(x), values) - etype = functools.reduce( - lambda t1, t2: bigframes.dtypes.lcd_type(t1, t2) - if (t1 and t2) - else None, - item_types, - ) - return bigframes.dtypes.dtype_for_etype(etype) - - label_tuples = [ - label if isinstance(label, tuple) else (label,) for label in self.row_labels - ] - idx_dtypes = [ - infer_dtype(map(lambda x: typing.cast(tuple, x)[i], label_tuples)) - for i in range(len(self.index_col_ids)) - ] - - index_items = [ - schemata.SchemaItem(id, dtype) - for id, dtype in zip(self.index_col_ids, idx_dtypes) - ] - value_dtypes = ( - self.dtype - if isinstance(self.dtype, tuple) - else (self.dtype,) * len(self.unpivot_columns) - ) - value_items = [ - schemata.SchemaItem(col[0], dtype) - for col, dtype in zip(self.unpivot_columns, value_dtypes) - ] - passthrough_items = [ - schemata.SchemaItem(id, self.child.schema.get_type(id)) - for id in self.passthrough_columns - ] - return schemata.ArraySchema((*index_items, *value_items, *passthrough_items)) - - @property - def variables_introduced(self) -> int: - return ( - len(self.schema.items) - len(self.passthrough_columns) + OVERHEAD_VARIABLES - ) - - @property - def relation_ops_created(self) -> int: - # Unpivot is essentially a cross join and a projection. - return 2 - - @dataclass(frozen=True) class RandomSampleNode(UnaryNode): fraction: float diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 953a89c34f..11e592542c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1961,9 +1961,7 @@ def any( frame = self._raise_on_non_boolean("any") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_stack( - agg_ops.any_op, dtype=pandas.BooleanDtype(), axis=axis - ) + block = frame._block.aggregate_all_and_stack(agg_ops.any_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def all( @@ -1973,9 +1971,7 @@ def all( frame = self._raise_on_non_boolean("all") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_stack( - agg_ops.all_op, dtype=pandas.BooleanDtype(), axis=axis - ) + block = frame._block.aggregate_all_and_stack(agg_ops.all_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def sum( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index d631ba8508..a7c385a2b8 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -15,7 +15,9 @@ from __future__ import annotations import dataclasses +import functools import typing +from typing import Tuple, Union import numpy as np import pandas as pd @@ -34,11 +36,6 @@ class RowOp(typing.Protocol): def name(self) -> str: ... - @property - def arguments(self) -> int: - """The number of column argument the operation takes""" - ... - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: ... @@ -48,21 +45,29 @@ def order_preserving(self) -> bool: ... 
-# These classes can be used to create simple ops that don't take local parameters -# All is needed is a unique name, and to register an implementation in ibis_mappings.py @dataclasses.dataclass(frozen=True) -class UnaryOp: +class NaryOp: @property def name(self) -> str: raise NotImplementedError("RowOp abstract base class has no implementation") + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + raise NotImplementedError("Abstract operation has no output type") + + @property + def order_preserving(self) -> bool: + """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" + return False + + +# These classes can be used to create simple ops that don't take local parameters +# All is needed is a unique name, and to register an implementation in ibis_mappings.py +@dataclasses.dataclass(frozen=True) +class UnaryOp(NaryOp): @property def arguments(self) -> int: return 1 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, input_id: typing.Union[str, bigframes.core.expression.Expression] = "arg" ) -> bigframes.core.expression.Expression: @@ -72,25 +77,13 @@ def as_expr( self, (_convert_expr_input(input_id),) ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - @dataclasses.dataclass(frozen=True) -class BinaryOp: - @property - def name(self) -> str: - raise NotImplementedError("RowOp abstract base class has no implementation") - +class BinaryOp(NaryOp): @property def arguments(self) -> int: return 2 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, left_input: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -106,25 +99,13 @@ def as_expr( ), ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - @dataclasses.dataclass(frozen=True) -class TernaryOp: - @property - def name(self) -> str: - raise NotImplementedError("RowOp abstract base class has no implementation") - +class TernaryOp(NaryOp): @property def arguments(self) -> int: return 3 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, input1: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -142,11 +123,6 @@ def as_expr( ), ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - def _convert_expr_input( input: typing.Union[str, bigframes.core.expression.Expression] @@ -664,6 +640,46 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT clip_op = ClipOp() + +class CaseWhenOp(NaryOp): + name: typing.ClassVar[str] = "switch" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + assert len(input_types) % 2 == 0 + # predicate1, output1, predicate2, output2... 
+ if not all(map(lambda x: x == dtypes.BOOL_DTYPE, input_types[::2])): + raise TypeError(f"Case inputs {input_types[::2]} must be boolean-valued") + output_expr_types = input_types[1::2] + return functools.reduce( + lambda t1, t2: dtypes.coerce_to_common(t1, t2), + output_expr_types, + ) + + def as_expr( + self, + *case_output_pairs: Tuple[ + Union[str | bigframes.core.expression.Expression], + Union[str | bigframes.core.expression.Expression], + ], + ) -> bigframes.core.expression.Expression: + import bigframes.core.expression + + # Keep this in sync with output_type and compilers + inputs: list[bigframes.core.expression.Expression] = [] + + for case, output in case_output_pairs: + inputs.append(_convert_expr_input(case)) + inputs.append(_convert_expr_input(output)) + + return bigframes.core.expression.OpExpression( + self, + tuple(inputs), + ) + + +case_when_op = CaseWhenOp() + + # Just parameterless unary ops for now # TODO: Parameter mappings NUMPY_TO_OP: typing.Final = { diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 7fef7a9dc7..4c598a682d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2987,10 +2987,14 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ord bf_result = bf_series.to_pandas(ordered=ordered) # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_series = pd_series.astype("Float64") # Pandas has object index type + pd_series.index = pd_series.index.astype(pd.StringDtype(storage="pyarrow")) assert_series_equal( - pd_series, bf_result, check_index_type=False, ignore_order=not ordered + pd_series, + bf_result, + check_index_type=False, + ignore_order=not ordered, + check_dtype=False, ) @@ -3079,7 +3083,7 @@ def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op pd_series = op(scalars_pandas_df_index).astype("boolean") bf_result = bf_series.to_pandas() - # Pandas has object index type + pd_series.index = pd_series.index.astype(bf_result.index.dtype) pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e894900646..c692bdbfec 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4327,16 +4327,16 @@ def min(self, axis=0, *, numeric_only: bool = False): Finding the minimum value in each column (the default behavior without an explicit axis parameter). >>> df.min() - A 1.0 - B 2.0 - dtype: Float64 + A 1 + B 2 + dtype: Int64 Finding the minimum value in each row. >>> df.min(axis=1) - 0 1.0 - 1 3.0 - dtype: Float64 + 0 1 + 1 3 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4372,16 +4372,16 @@ def max(self, axis=0, *, numeric_only: bool = False): Finding the maximum value in each column (the default behavior without an explicit axis parameter). >>> df.max() - A 3.0 - B 4.0 - dtype: Float64 + A 3 + B 4 + dtype: Int64 Finding the maximum value in each row. >>> df.max(axis=1) - 0 2.0 - 1 4.0 - dtype: Float64 + 0 2 + 1 4 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4416,16 +4416,16 @@ def sum(self, axis=0, *, numeric_only: bool = False): Calculating the sum of each column (the default behavior without an explicit axis parameter). >>> df.sum() - A 4.0 - B 6.0 - dtype: Float64 + A 4 + B 6 + dtype: Int64 Calculating the sum of each row. 
>>> df.sum(axis=1) - 0 3.0 - 1 7.0 - dtype: Float64 + 0 3 + 1 7 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4500,9 +4500,9 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Finding the median value of each column. >>> df.median() - A 1.0 - B 2.0 - dtype: Float64 + A 1 + B 2 + dtype: Int64 Args: numeric_only (bool. default False): @@ -4748,10 +4748,10 @@ def count(self, *, numeric_only: bool = False): Counting non-NA values for each column: >>> df.count() - A 4.0 - B 5.0 - C 3.0 - dtype: Float64 + A 4 + B 5 + C 3 + dtype: Int64 Args: numeric_only (bool, default False): @@ -5051,17 +5051,17 @@ def melt(self, id_vars, value_vars, var_name, value_name): Using `melt` with `id_vars` and `value_vars`: >>> df.melt(id_vars='A', value_vars=['B', 'C']) - A variable value - 0 1.0 B 1 - 1 B 2 - 2 3.0 B 3 - 3 4.0 B 4 - 4 5.0 B 5 - 5 1.0 C - 6 C 3 - 7 3.0 C - 8 4.0 C 4 - 9 5.0 C 5 + A variable value + 0 1.0 B 1.0 + 1 B 2.0 + 2 3.0 B 3.0 + 3 4.0 B 4.0 + 4 5.0 B 5.0 + 5 1.0 C + 6 C 3.5 + 7 3.0 C + 8 4.0 C 4.5 + 9 5.0 C 5.0 [10 rows x 3 columns] @@ -5102,9 +5102,9 @@ def nunique(self): [3 rows x 2 columns] >>> df.nunique() - A 3.0 - B 2.0 - dtype: Float64 + A 3 + B 2 + dtype: Int64 Returns: bigframes.series.Series: Series with number of distinct elements. @@ -5313,9 +5313,9 @@ def agg(self, func): Using a single function: >>> df.agg('sum') - A 6.0 - B 6.0 - dtype: Float64 + A 6 + B 6 + dtype: Int64 Using a list of functions: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5e3b4c46ef..edefb334b3 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -593,9 +593,9 @@ def agg(self, func): 1 >>> s.agg(['min', 'max']) - min 1.0 - max 4.0 - dtype: Float64 + min 1 + max 4 + dtype: Int64 Args: func (function): From 9c106bd24482620ef5ff3c85f94be9da76c49716 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 18 Apr 2024 09:56:38 -0700 Subject: [PATCH 06/15] feat: Add fine tuning `fit()` for Palm2TextGenerator (#616) * feat: support list of numerics in pandas.cut (#580) An internal user encountered this missing overload * move the tests to load-testing * add predict tests * address comments * address comments --------- Co-authored-by: Henry Solberg --- bigframes/ml/core.py | 40 +++++++++++++++++ bigframes/ml/llm.py | 71 ++++++++++++++++++++++++++++++- bigframes/ml/sql.py | 17 ++++++++ tests/system/load/test_llm.py | 68 +++++++++++++++++++++++++++++ tests/system/small/ml/test_llm.py | 2 +- tests/unit/ml/test_sql.py | 23 ++++++++++ 6 files changed, 219 insertions(+), 2 deletions(-) create mode 100644 tests/system/load/test_llm.py diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 04aaeec1bc..b94ae39687 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -321,6 +321,46 @@ def create_model( return self._create_model_with_sql(session=session, sql=sql) + def create_llm_remote_model( + self, + X_train: bpd.DataFrame, + y_train: bpd.DataFrame, + connection_name: str, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + """Create a session-temporary BQML model with the CREATE OR REPLACE MODEL statement + + Args: + X_train: features columns for training + y_train: labels columns for training + options: a dict of options to configure the model. 
Generates a BQML OPTIONS + clause + connection_name: + a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection + + Returns: a BqmlModel, wrapping a trained model in BigQuery + """ + options = dict(options) + # Cache dataframes to make sure base table is not a snapshot + # cached dataframe creates a full copy, never uses snapshot + input_data = X_train._cached(force=True).join( + y_train._cached(force=True), how="outer" + ) + options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) + + session = X_train._session + + model_ref = self._create_model_ref(session._anonymous_dataset) + + sql = self._model_creation_sql_generator.create_llm_remote_model( + source_df=input_data, + model_ref=model_ref, + options=options, + connection_name=connection_name, + ) + + return self._create_model_with_sql(session=session, sql=sql) + def create_time_series_model( self, X_train: bpd.DataFrame, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 31c691fd51..37a38cdd5c 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -27,6 +27,10 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +_BQML_PARAMS_MAPPING = { + "max_iterations": "maxIterations", +} + _TEXT_GENERATOR_BISON_ENDPOINT = "text-bison" _TEXT_GENERATOR_BISON_32K_ENDPOINT = "text-bison-32k" _TEXT_GENERATOR_ENDPOINTS = ( @@ -62,6 +66,8 @@ class PaLM2TextGenerator(base.BaseEstimator): Connection to connect with remote service. str of the format ... if None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach permission if the connection isn't fully setup. + max_iterations (Optional[int], Default to 300): + The number of steps to run when performing supervised tuning. """ def __init__( @@ -70,9 +76,11 @@ def __init__( model_name: Literal["text-bison", "text-bison-32k"] = "text-bison", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, + max_iterations: int = 300, ): self.model_name = model_name self.session = session or bpd.get_global_session() + self.max_iterations = max_iterations self._bq_connection_manager = self.session.bqconnectionmanager connection_name = connection_name or self.session._bq_connection @@ -132,12 +140,73 @@ def _from_bq( model_connection = model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] + # Get the optional params + kwargs: dict = {} + last_fitting = model.training_runs[-1]["trainingOptions"] + + dummy_text_generator = cls() + for bf_param, _ in dummy_text_generator.__dict__.items(): + bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) + if bqml_param in last_fitting: + # Convert types + if bf_param in ["max_iterations"]: + kwargs[bf_param] = int(last_fitting[bqml_param]) + text_generator_model = cls( - session=session, model_name=model_endpoint, connection_name=model_connection + **kwargs, + session=session, + model_name=model_endpoint, + connection_name=model_connection, ) text_generator_model._bqml_model = core.BqmlModel(session, model) return text_generator_model + @property + def _bqml_options(self) -> dict: + """The model options as they will be set for BQML""" + options = { + "max_iterations": self.max_iterations, + "data_split_method": "NO_SPLIT", + } + return options + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> PaLM2TextGenerator: + """Fine tune PaLM2TextGenerator model. + + .. 
note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples, n_features). Training data. + y (bigframes.dataframe.DataFrame or bigframes.series.Series: + Training labels. + + Returns: + PaLM2TextGenerator: Fitted Estimator. + """ + X, y = utils.convert_to_dataframe(X, y) + + options = self._bqml_options + options["endpoint"] = self.model_name + "@001" + options["prompt_col"] = X.columns.tolist()[0] + + self._bqml_model = self._bqml_model_factory.create_llm_remote_model( + X, + y, + options=options, + connection_name=self.connection_name, + ) + return self + def predict( self, X: Union[bpd.DataFrame, bpd.Series], diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index fab358cce3..59c768ce81 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -177,6 +177,23 @@ def create_model( parts.append(f"AS {source_sql}") return "\n".join(parts) + def create_llm_remote_model( + self, + source_df: bpd.DataFrame, + connection_name: str, + model_ref: google.cloud.bigquery.ModelReference, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> str: + """Encode the CREATE OR REPLACE MODEL statement for BQML""" + source_sql = source_df.sql + + parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] + parts.append(self.connection(connection_name)) + if options: + parts.append(self.options(**options)) + parts.append(f"AS {source_sql}") + return "\n".join(parts) + def create_remote_model( self, connection_name: str, diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py new file mode 100644 index 0000000000..62ef7d5c72 --- /dev/null +++ b/tests/system/load/test_llm.py @@ -0,0 +1,68 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +import bigframes.ml.llm + + +@pytest.fixture(scope="session") +def llm_fine_tune_df_default_index( + session: bigframes.Session, +) -> bigframes.dataframe.DataFrame: + sql = """ +SELECT + CONCAT("Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. 
Text: ", text) as prompt, + CAST(label AS STRING) as label +FROM `llm_tuning.emotion_classification_train` +""" + return session.read_gbq(sql) + + +@pytest.fixture(scope="session") +def llm_remote_text_pandas_df(): + """Additional data matching the penguins dataset, with a new index""" + return pd.DataFrame( + { + "prompt": [ + "Please do sentiment analysis on the following text and only output a number from 0 to 5where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey", + "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: i was feeling a little vain when i did this one", + "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: a father of children killed in an accident", + ], + } + ) + + +def test_llm_palm_configure_fit( + llm_fine_tune_df_default_index, llm_remote_text_pandas_df +): + model = bigframes.ml.llm.PaLM2TextGenerator( + model_name="text-bison", max_iterations=1 + ) + + df = llm_fine_tune_df_default_index.dropna() + X_train = df[["prompt"]] + y_train = df[["label"]] + model.fit(X_train, y_train) + + assert model is not None + + df = model.predict(llm_remote_text_pandas_df).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() == 1) + + # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b9e4889801..6f6b67597a 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 5b1ff37775..3560f05cb6 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -181,6 +181,29 @@ def test_create_model_transform_correct( ) +def test_create_llm_remote_model_correct( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_creation_sql_generator.create_llm_remote_model( + source_df=mock_df, + connection_name="my_project.us.my_connection", + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_remote_model" + ), + options={"option_key1": "option_value1", "option_key2": 2}, + ) + assert ( + sql + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` +REMOTE WITH CONNECTION `my_project.us.my_connection` +OPTIONS( + option_key1="option_value1", + option_key2=2) +AS input_X_y_sql""" + ) + + def test_create_remote_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): From 9665e39ef288841f03a9d823bd2210ef58394ad3 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 18 Apr 2024 10:10:30 -0700 Subject: [PATCH 07/15] docs: Fix rendering of examples for multiple apis (#620) --- third_party/bigframes_vendored/pandas/core/frame.py | 1 + third_party/bigframes_vendored/pandas/core/generic.py | 1 + .../bigframes_vendored/pandas/core/groupby/__init__.py | 1 + third_party/bigframes_vendored/pandas/core/series.py | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c692bdbfec..f06128f150 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4523,6 +4523,7 @@ def quantile( Return values at the given quantile over requested axis. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 9c6120fd6c..54c876ef3c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -61,6 +61,7 @@ def __iter__(self) -> Iterator: iterator **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 6310d7e271..7347963d17 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -90,6 +90,7 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): Return group values at the given quantile, a la numpy.percentile. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([ diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index edefb334b3..a5e14c5b1c 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -862,6 +862,7 @@ def autocorr(self, lag: int = 1) -> float: the Series and its shifted self. 
**Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -2812,6 +2813,7 @@ def combine_first(self, other) -> Series: of the two indexes. **Examples:** + >>> import bigframes.pandas as bpd >>> import numpy as np >>> bpd.options.display.progress_bar = None @@ -2852,6 +2854,7 @@ def update(self, other) -> None: on index. **Examples:** + >>> import bigframes.pandas as bpd >>> import pandas as pd >>> import numpy as np @@ -3168,6 +3171,7 @@ def quantile( Return value at the given quantile. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) From 3706b4f9dde65788b5e6343a6428fb1866499461 Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:29:22 -0500 Subject: [PATCH 08/15] feat: warn if location is set to unknown location (#609) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: warn if location is set to unknown location * tests error message --------- Co-authored-by: Shobhit Singh Co-authored-by: Tim Swe帽a (Swast) --- bigframes/_config/bigquery_options.py | 23 ++++++++++ bigframes/exceptions.py | 17 +++++++ tests/unit/_config/test_bigquery_options.py | 51 +++++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 bigframes/exceptions.py diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 50e14eaf28..74561e6f24 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -22,11 +22,33 @@ import google.api_core.exceptions import google.auth.credentials +import bigframes.constants +import bigframes.exceptions + SESSION_STARTED_MESSAGE = ( "Cannot change '{attribute}' once a session has started. " "Call bigframes.pandas.close_session() first, if you are using the bigframes.pandas API." ) +UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value." + + +def _validate_location(value: Optional[str]): + + if value is None: + return + + if value not in bigframes.constants.ALL_BIGQUERY_LOCATIONS: + warnings.warn( + UNKNOWN_LOCATION_MESSAGE.format(location=value), + # There are many layers before we get to (possibly) the user's code: + # -> bpd.options.bigquery.location = "us-central-1" + # -> location.setter + # -> _validate_location + stacklevel=3, + category=bigframes.exceptions.UnknownLocationWarning, + ) + class BigQueryOptions: """Encapsulates configuration for working with a session.""" @@ -93,6 +115,7 @@ def location(self) -> Optional[str]: def location(self, value: Optional[str]): if self._session_started and self._location != value: raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="location")) + _validate_location(value) self._location = value @property diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py new file mode 100644 index 0000000000..62122e79d2 --- /dev/null +++ b/bigframes/exceptions.py @@ -0,0 +1,17 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +class UnknownLocationWarning(Warning): + """The location is set to an unknown value.""" diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index cf13084610..7d9a452f42 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -13,10 +13,13 @@ # limitations under the License. import re +import warnings import pytest +import bigframes import bigframes._config.bigquery_options as bigquery_options +import bigframes.exceptions @pytest.mark.parametrize( @@ -78,3 +81,51 @@ def test_setter_if_session_started_but_setting_the_same_value(attribute): setattr(options, attribute, original_object) assert getattr(options, attribute) is original_object + + +@pytest.mark.parametrize( + [ + "valid_location", + ], + [ + (None,), + ("us-central1",), + ], +) +def test_location_set_to_valid_no_warning(valid_location): + options = bigquery_options.BigQueryOptions() + # Ensure that no warnings are emitted. + # https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html#additional-use-cases-of-warnings-in-tests + with warnings.catch_warnings(): + # Turn matching UnknownLocationWarning into exceptions. + # https://docs.python.org/3/library/warnings.html#warning-filter + warnings.simplefilter( + "error", category=bigframes.exceptions.UnknownLocationWarning + ) + options.location = valid_location + + +@pytest.mark.parametrize( + [ + "invalid_location", + ], + [ + # Test with common mistakes, see article. + # https://en.wikipedia.org/wiki/Edit_distance#Formal_definition_and_properties + # Substitution + ("us-wist-3",), + # Insertion + ("us-central-1",), + # Deletion + ("asia-suth2",), + ], +) +def test_location_set_to_invalid_warning(invalid_location): + options = bigquery_options.BigQueryOptions() + with pytest.warns( + bigframes.exceptions.UnknownLocationWarning, + match=re.escape( + f"The location '{invalid_location}' is set to an unknown value." 
+ ), + ): + options.location = invalid_location From 9d205aecb77f35baeec82a8f6e1b72c2d852ca46 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 18 Apr 2024 15:31:03 -0700 Subject: [PATCH 09/15] fix: Use exact median implementation by default (#619) --- bigframes/core/block_transforms.py | 3 +- bigframes/core/groupby/__init__.py | 8 ++--- bigframes/dataframe.py | 10 +++---- bigframes/series.py | 2 +- tests/system/small/test_series.py | 29 +++++++++++++++---- .../bigframes_vendored/pandas/core/frame.py | 12 ++++---- .../pandas/core/groupby/__init__.py | 7 ++--- .../bigframes_vendored/pandas/core/series.py | 8 ++--- 8 files changed, 48 insertions(+), 31 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 562689a736..a221b343a5 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -111,6 +111,7 @@ def quantile( columns: Sequence[str], qs: Sequence[float], grouping_column_ids: Sequence[str] = (), + dropna: bool = False, ) -> blocks.Block: # TODO: handle windowing and more interpolation methods window = core.WindowSpec( @@ -134,7 +135,7 @@ def quantile( block, results = block.aggregate( grouping_column_ids, tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), - dropna=True, + dropna=dropna, ) return block.select_columns(results).with_column_labels(labels) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 0f53342352..05b1cc7f41 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: self._raise_on_non_numeric("mean") return self._aggregate_all(agg_ops.mean_op, numeric_only=True) - def median( - self, numeric_only: bool = False, *, exact: bool = False - ) -> df.DataFrame: + def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("median") if exact: @@ -138,6 +136,7 @@ def quantile( q_cols, qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) result_df = df.DataFrame(result) if multi_q: @@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series: def median( self, *args, - exact: bool = False, + exact: bool = True, **kwargs, ) -> series.Series: if exact: @@ -508,6 +507,7 @@ def quantile( (self._value_column,), qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) if multi_q: return series.Series(result.stack()) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 11e592542c..ff8404761c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1995,18 +1995,16 @@ def mean( return bigframes.series.Series(block.select_column("values")) def median( - self, *, numeric_only: bool = False, exact: bool = False + self, *, numeric_only: bool = False, exact: bool = True ) -> bigframes.series.Series: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" - ) if not numeric_only: frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() if exact: - return self.quantile() + result = frame.quantile() + result.name = None + return result else: block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) diff --git a/bigframes/series.py b/bigframes/series.py index b834411bce..47acfd0afb 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -966,7 +966,7 @@ def mode(self) -> Series: def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) - def median(self, *, exact: bool = False) -> float: + def median(self, *, exact: bool = True) -> float: if exact: return typing.cast(float, self.quantile(0.5)) else: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 87267696ba..9cb615fdcb 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) - bf_result = scalars_df[col_name] - scalars_df[col_name].median() + bf_result = scalars_df[col_name] + 42 assert bf_result.size == scalars_df[col_name].size - # TODO(b/323387826): The precision increased by 1 unexpectedly. - # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) def test_repr(scalars_dfs): @@ -1523,12 +1522,32 @@ def test_groupby_mean(scalars_dfs): ) -def test_groupby_median(scalars_dfs): +def test_groupby_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_series = ( + bf_result = ( scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() ) + pd_result = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .median() + ) + + assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_groupby_median_inexact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby(scalars_df["string_col"], dropna=False) + .median(exact=False) + ) pd_max = ( scalars_pandas_df[col_name] .groupby(scalars_pandas_df["string_col"], dropna=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f06128f150..0515f690e3 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, numeric_only: bool = False, exact: bool = False): + def median(self, *, numeric_only: bool = False, exact: bool = True): """Return the median of the values over colunms. **Examples:** @@ -4500,15 +4500,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Finding the median value of each column. >>> df.median() - A 1 - B 2 - dtype: Int64 + A 2.0 + B 3.0 + dtype: Float64 Args: numeric_only (bool. default False): Default False. Include only float, int, boolean columns. - exact (bool. default False): - Default False. Get the exact median instead of an approximate + exact (bool. default True): + Default True. 
Get the exact median instead of an approximate one. Returns: diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 7347963d17..f3f7748e34 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -68,7 +68,7 @@ def median( self, numeric_only: bool = False, *, - exact: bool = False, + exact: bool = True, ): """ Compute median of groups, excluding missing values. @@ -76,9 +76,8 @@ def median( Args: numeric_only (bool, default False): Include only float, int, boolean columns. - exact (bool, default False): - Calculate the exact median instead of an approximation. Note: - ``exact=True`` is not supported. + exact (bool, default True): + Calculate the exact median instead of an approximation. Returns: pandas.Series or pandas.DataFrame: Median of groups. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a5e14c5b1c..0c5b8d4521 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3150,13 +3150,13 @@ def mean(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, exact: bool = False): + def median(self, *, exact: bool = True): """Return the median of the values over the requested axis. Args: - exact (bool. default False): - Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + exact (bool. default True): + Default True. Get the exact median instead of an approximate + one. Returns: scalar: Scalar. From 240a1ac6fa914550bb6216cd5d179a36009f2657 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 19 Apr 2024 21:54:50 +0000 Subject: [PATCH 10/15] feat: expose `max_batching_rows` in `remote_function` (#622) * feat: expose `max_batching_rows` in `remote_function` * fix option formation, add tests * fix type annotation * assert max_batching_rows after routing creation * add forgotten assert --- bigframes/functions/remote_function.py | 46 +++++++++++++++++++--- bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 11 ++++++ tests/system/large/test_remote_function.py | 36 +++++++++++++++++ 4 files changed, 89 insertions(+), 6 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 178c911591..f866575a26 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -145,7 +145,13 @@ def __init__( self._cloud_function_docker_repository = cloud_function_docker_repository def create_bq_remote_function( - self, input_args, input_types, output_type, endpoint, bq_function_name + self, + input_args, + input_types, + output_type, + endpoint, + bq_function_name, + max_batching_rows, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" @@ -169,14 +175,25 @@ def create_bq_remote_function( bq_function_args.append( f"{name} {third_party_ibis_bqtypes.BigQueryType.from_ibis(input_types[idx])}" ) + + remote_function_options = { + "endpoint": endpoint, + "max_batching_rows": max_batching_rows, + } + + remote_function_options_str = ", ".join( + [ + f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}" + for key, val in remote_function_options.items() + if val is not None + ] + ) + 
create_function_ddl = f""" CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) RETURNS {bq_function_return_type} REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` - OPTIONS ( - endpoint = "{endpoint}", - max_batching_rows = 1000 - )""" + OPTIONS ({remote_function_options_str})""" logger.info(f"Creating BQ remote function: {create_function_ddl}") @@ -438,6 +455,7 @@ def provision_bq_remote_function( reuse, name, package_requirements, + max_batching_rows, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -485,7 +503,12 @@ def provision_bq_remote_function( "Exactly one type should be provided for every input arg." ) self.create_bq_remote_function( - input_args, input_types, output_type, cf_endpoint, remote_function_name + input_args, + input_types, + output_type, + cf_endpoint, + remote_function_name, + max_batching_rows, ) else: logger.info(f"Remote function {remote_function_name} already exists.") @@ -607,6 +630,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -723,6 +747,15 @@ def remote_function( projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. 
""" import bigframes.pandas as bpd @@ -846,6 +879,7 @@ def wrapper(f): reuse, name, packages, + max_batching_rows, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 91c3eb603b..96af6ab1b3 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -643,6 +643,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -656,6 +657,7 @@ def remote_function( cloud_function_service_account=cloud_function_service_account, cloud_function_kms_key_name=cloud_function_kms_key_name, cloud_function_docker_repository=cloud_function_docker_repository, + max_batching_rows=max_batching_rows, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b6d56006be..64bcebb6cc 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1541,6 +1541,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1635,6 +1636,15 @@ def remote_function( projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. 
The cloud assets can be @@ -1656,6 +1666,7 @@ def remote_function( cloud_function_service_account=cloud_function_service_account, cloud_function_kms_key_name=cloud_function_kms_key_name, cloud_function_docker_repository=cloud_function_docker_repository, + max_batching_rows=max_batching_rows, ) def read_gbq_function( diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index cf6b2a01f8..ec9acc292e 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1300,3 +1300,39 @@ def square_num(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square_num ) + + +@pytest.mark.parametrize( + ("max_batching_rows"), + [ + 10_000, + None, + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_max_batching_rows(session, scalars_dfs, max_batching_rows): + try: + + def square(x): + return x * x + + square_remote = session.remote_function( + [int], int, reuse=False, max_batching_rows=max_batching_rows + )(square) + + bq_routine = session.bqclient.get_routine( + square_remote.bigframes_remote_function + ) + assert bq_routine.remote_function_options.max_batching_rows == max_batching_rows + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_too"].apply(square_remote).to_pandas() + pd_result = scalars_pandas_df["int64_too"].apply(square) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square_remote + ) From b66e3e6b221ea18d944ac478330bb009fe1a2c93 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 22 Apr 2024 18:03:25 +0000 Subject: [PATCH 11/15] chore: add synthetic data generation notebook (#615) * chore: add synthetic data generation notebook * markdown improvements * add copyright header * add a title --- .../apps/synthetic_data_generation.ipynb | 1133 +++++++++++++++++ noxfile.py | 4 + 2 files changed, 1137 insertions(+) create mode 100644 notebooks/apps/synthetic_data_generation.ipynb diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb new file mode 100644 index 0000000000..a6e8444aac --- /dev/null +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -0,0 +1,1133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery DataFrames: Synthetic Data Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to BigQuery DataFrames (installing which also installs `pandas` as a dependency) we will use\n", + "`faker` library as a building block for synthetic data generation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "suoG7eWDZARj", + "outputId": "b5c620a9-8f5b-413f-dd38-93448f941846" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting faker\n", + " Downloading Faker-24.9.0-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.4 in /usr/local/lib/python3.10/dist-packages (from faker) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.4->faker) (1.16.0)\n", + "Installing collected packages: faker\n", + "Successfully installed faker-24.9.0\n" + ] + } + ], + "source": [ + "!pip install faker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "m3q1oeJALhsG" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "bpd.options.bigquery.project = PROJECT_ID" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use `GeminiTextGenerator` for our purpose, which is BigQuery DataFrame's state-of-the-art LLM integration at the time of writing this notebook (Apr 16 2024)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 69 + }, + "id": "lIYdn1woOS1n", + "outputId": "be474338-44c2-4ce0-955e-d525b8b9c84b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/session/__init__.py:1907: UserWarning: No explicit location is set, so using location US for the session.\n", + " return Session(context)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 3e8423da-737c-42e2-a3d2-d2180ca18579 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from bigframes.ml.llm import GeminiTextGenerator\n", + "\n", + "model = GeminiTextGenerator()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Craft a prompt for the LLM to indicate the schema of the desired data and hints for the code that could generate such data. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "id": "SSR-lLScLa95", + "outputId": "cbaec34e-6fa6-45b4-e54a-f11ca06b61e1" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d651d0bf-300c-4b1d-9e3c-03310b71287c is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c67b9bb9-2f3e-4b9e-b680-0b7b6e9d2279 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prompt
0Write python code to generate a pandas datafra...
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]"
      ],
      "text/plain": [
       " prompt\n",
       "0 Write python code to generate a pandas datafra...\n",
       "\n",
       "[1 rows x 1 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompt = \"\"\"\\\n",
    "Write python code to generate a pandas dataframe based on the requirements:\n",
    "  Column name: Name, type: string, Description: Latin American Names\n",
    "  Column name: Age, type: int\n",
    "  Column name: Gender, type: string, Description: Inclusive\n",
    "\n",
    "Note:\n",
    "  - Return the code only, no additional texts or comments\n",
    "  - Use faker library\n",
    "  - Generate 100 rows\n",
    "  - The final dataframe should be named 'result_df'.\n",
    "\"\"\"\n",
    "\n",
    "df_prompt = bpd.DataFrame({\"prompt\" : [prompt]})\n",
    "df_prompt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Keep in mind that the LLM may not produce runnable code on the first try and may need some nudging. We will retry by adding the failing code and the exception it throws as additional context in the prompt."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 277
    },
    "id": "miDe3K4GNvOo",
    "outputId": "f2039e80-5ad7-4551-f8b2-7ef714a89d63"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job d5c0725d-9070-4712-adfd-8a9bd86eefc3 is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 4eb581a3-7f97-411a-bee1-91e8c150cef4 is DONE. 8 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job f3d5503d-a3e7-49ce-b985-5ffbdbd856e3 is DONE. 2 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 8ef76041-f077-4a05-bc03-63e6983ef853 is DONE. 332 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "import pandas as pd\n", + "from faker import Faker\n", + "\n", + "fake = Faker('es_ES')\n", + "result_df = pd.DataFrame({\n", + " 'Name': [fake.name() for _ in range(100)],\n", + " 'Age': [fake.random_int(min=18, max=65) for _ in range(100)],\n", + " 'Gender': [fake.random_element(elements=['Male', 'Female', 'Non-binary']) for _ in range(100)]\n", + "})\n", + "\n" + ] + } + ], + "source": [ + "max_tries = 5\n", + "for i in range(max_tries):\n", + " # Get LLM generated code\n", + " df_result = model.predict(df_prompt)\n", + " llm_result = df_result['ml_generate_text_llm_result'].iloc[0]\n", + "\n", + " # Python code comes back as a markdown code block,\n", + " # remove the prefix \"```python\" and suffix \"```\"\n", + " code = llm_result[9:-3]\n", + " print(code)\n", + "\n", + " # Check if the generated code is runnable\n", + " try:\n", + " exec(code)\n", + " break\n", + " except Exception as ex:\n", + " print(ex)\n", + " error_context = f\"\"\"\n", + "Previous code:\n", + "{code}\n", + "\n", + "Had this exception:\n", + "{ex}\"\"\"\n", + "\n", + " # Update the prompt to help LLM correct error\n", + " df_prompt[\"prompt\"] += error_context\n", + "\n", + " # If we have exhausted max tries then stop trying\n", + " if i+1 == max_tries:\n", + " raise Exception(\"Failed to generate runnable code\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the generated code and verify that it produced the desired data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "GODcPwX2PBEu", + "outputId": "dec4c872-c464-49e4-cd7f-9442fc977d18" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"execution_context\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"Renata Pla Cases\",\n \"Guiomar Carnero-Paz\",\n \"Luciano Garmendia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 18,\n \"max\": 64,\n \"num_unique_values\": 39,\n \"samples\": [\n 56,\n 31,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Male\",\n \"Non-binary\",\n \"Female\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Pastora Acuña Company21Male
1León Reig-Salom39Non-binary
2Aura Tomás Llobet30Female
3Vicente Correa Palomar64Female
4Benito del Fuster34Female
............
95Eduardo Cabrera27Non-binary
96Nazaret de Izaguirre40Non-binary
97Manuela Agullo Bustamante27Female
98Eugenio Mateo Naranjo Blazquez36Non-binary
99Heriberto Vicens Baeza53Female
\n", + "

100 rows × 3 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n",
       "
"
      ],
      "text/plain": [
       " Name Age Gender\n",
       "0 Pastora Acuña Company 21 Male\n",
       "1 León Reig-Salom 39 Non-binary\n",
       "2 Aura Tomás Llobet 30 Female\n",
       "3 Vicente Correa Palomar 64 Female\n",
       "4 Benito del Fuster 34 Female\n",
       ".. ... ... ...\n",
       "95 Eduardo Cabrera 27 Non-binary\n",
       "96 Nazaret de Izaguirre 40 Non-binary\n",
       "97 Manuela Agullo Bustamante 27 Female\n",
       "98 Eugenio Mateo Naranjo Blazquez 36 Non-binary\n",
       "99 Heriberto Vicens Baeza 53 Female\n",
       "\n",
       "[100 rows x 3 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "execution_context = {}\n",
    "exec(code, execution_context)\n",
    "execution_context.get(\"result_df\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We want to run this code at scale to generate a large amount of data. Let's deploy a `remote_function` for this purpose."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 36
    },
    "id": "n-BsGciNqSwU",
    "outputId": "996e5639-a49c-4542-a0dc-ede450e0eb6d"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      },
      "text/plain": [
       "'projects/bigframes-dev/locations/us-central1/functions/bigframes-19f2f35637098969770261a2974bef32'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "@bpd.remote_function([int], str, packages=['faker', 'pandas'])\n",
    "def data_generator(id):\n",
    "    context = {}\n",
    "    exec(code, context)\n",
    "    result_df = context.get(\"result_df\")\n",
    "    return result_df.to_json(orient=\"records\")\n",
    "\n",
    "data_generator.bigframes_cloud_function"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's say we want to generate 1 million rows of synthetic data. Since our generated code produces 100 rows in one run, we can initialize an indicator dataframe with 1M/100 = 10K indicator rows. Then we can apply the remote function to produce 100 synthetic data rows for each indicator row."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "id": "Odkmev9nsYqA",
    "outputId": "4aa7a1fd-0c0d-4412-f326-a20e19f583b5"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Load job 40b9c3a8-27fc-40a8-9edf-4aa2e0fec332 is DONE. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "desired_num_rows = 1_000_000  # 1 million rows\n",
    "batch_size = 100  # used in the prompt\n",
    "num_batches = int(desired_num_rows/batch_size)\n",
    "\n",
    "df = bpd.DataFrame({\"row_id\": range(num_batches)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "id": "UyBhlJFVsmQC",
    "outputId": "29748df5-673b-4320-bb1f-53abaace3b81"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job 9dd49b50-2dbf-4351-b9ad-b17aeb627caf is DONE. 240.0 kB processed. 
Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df[\"json_data\"] = df[\"row_id\"].apply(data_generator)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At this point each item in `df[\"json_data\"]` is a JSON-serialized array of 100 records. Let's flatten that into 1 record per row using SQL directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 932
    },
    "id": "6p3eM21qvRvy",
    "outputId": "333f4e49-a555-4d2f-b527-02142782b3a7"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job 3f8d2133-b01d-402d-a731-79592810ca1c is DONE. 63.7 MB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 4a613aa3-6323-4914-8e34-93323885d458 is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 0deb03be-725b-40b4-a7a1-1023b0477f35 is DONE. 40.1 MB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Eloy Santiago-Aragón31Male
1Amanda Mata Abril20Non-binary
2Danilo Velázquez Salcedo58Male
3Leyre Alba España61Female
4Paulina Amores Pastor41Male
5Jorge Cuadrado Mena50Female
6Chucho Catalán36Non-binary
7Vidal Benavente Lerma38Male
8Clementina Álamo32Female
9Petrona Roselló-Valls61Male
10Luís Camilo Sastre Marin45Male
11Gil Baudelio Carbajo Ordóñez58Non-binary
12David del Donoso44Female
13Dolores Arnau Ros21Non-binary
14Febe de León46Non-binary
15Ariadna Almazán34Female
16Blas Serna Aguiló24Non-binary
17Paulino Barreda Almeida59Female
18Eligio Valcárcel Tormo35Non-binary
19Toño Amador Torres Portillo48Female
20Florencia del Bejarano65Non-binary
21Clímaco Andreu Gómez18Male
22Xiomara Dominguez Solana35Female
23Leire Castilla Borrego19Non-binary
24Angelita Garmendia Carpio21Non-binary
\n", + "

25 rows × 3 columns

\n", + "
[1000000 rows x 3 columns in total]"
      ],
      "text/plain": [
       " Name Age Gender\n",
       "0 Eloy Santiago-Aragón 31 Male\n",
       "1 Amanda Mata Abril 20 Non-binary\n",
       "2 Danilo Velázquez Salcedo 58 Male\n",
       "3 Leyre Alba España 61 Female\n",
       "4 Paulina Amores Pastor 41 Male\n",
       "5 Jorge Cuadrado Mena 50 Female\n",
       "6 Chucho Catalán 36 Non-binary\n",
       "7 Vidal Benavente Lerma 38 Male\n",
       "8 Clementina Álamo 32 Female\n",
       "9 Petrona Roselló-Valls 61 Male\n",
       "10 Luís Camilo Sastre Marin 45 Male\n",
       "11 Gil Baudelio Carbajo Ordóñez 58 Non-binary\n",
       "12 David del Donoso 44 Female\n",
       "13 Dolores Arnau Ros 21 Non-binary\n",
       "14 Febe de León 46 Non-binary\n",
       "15 Ariadna Almazán 34 Female\n",
       "16 Blas Serna Aguiló 24 Non-binary\n",
       "17 Paulino Barreda Almeida 59 Female\n",
       "18 Eligio Valcárcel Tormo 35 Non-binary\n",
       "19 Toño Amador Torres Portillo 48 Female\n",
       "20 Florencia del Bejarano 65 Non-binary\n",
       "21 Clímaco Andreu Gómez 18 Male\n",
       "22 Xiomara Dominguez Solana 35 Female\n",
       "23 Leire Castilla Borrego 19 Non-binary\n",
       "24 Angelita Garmendia Carpio 21 Non-binary\n",
       "...\n",
       "\n",
       "[1000000 rows x 3 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sql = f\"\"\"\n",
    "WITH T0 AS ({df.sql}),\n",
    "T1 AS (\n",
    "  SELECT PARSE_JSON(json_row) AS json_row\n",
    "  FROM T0, UNNEST(JSON_EXTRACT_ARRAY(json_data)) AS json_row\n",
    ")\n",
    "SELECT STRING(json_row.Name) AS Name,\n",
    "  INT64(json_row.Age) AS Age,\n",
    "  STRING(json_row.Gender) AS Gender\n",
    "FROM T1\n",
    "\"\"\"\n",
    "df_result = bpd.read_gbq(sql)\n",
    "df_result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There you have it: 1 million synthetic data rows ready to use, or to save in a BigQuery table for future use."
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
+}
diff --git a/noxfile.py b/noxfile.py
index 9479a7a318..91ad6bc0e6 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -723,6 +723,10 @@ def notebook(session: nox.Session):
         # The experimental notebooks imagine features that don't yet
         # exist or only exist as temporary prototypes.
         "notebooks/experimental/longer_ml_demo.ipynb",
+        # Notebooks added for more use cases, such as backing a
+        # blog post; these may take longer to execute and need not be
+        # continuously tested.
+        "notebooks/apps/synthetic_data_generation.ipynb",
     ]
 
     # Convert each Path notebook object to a string using a list comprehension.
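The notebook above hinges on one pattern: each remote function call returns a batch of rows as a JSON-serialized array, and a final query flattens those arrays into one record per row. Below is a minimal local sketch of that same pattern, assuming only `pandas` and `faker` are installed; plain Python stands in for the BigQuery remote function and the `UNNEST` query, and the batch size and column spec mirror the notebook.

```python
import json

import pandas as pd
from faker import Faker


def generate_batch(batch_size: int = 100) -> str:
    # Mirrors the remote function: build one batch of synthetic rows and
    # serialize it as a JSON array of records.
    fake = Faker("es_ES")
    batch = pd.DataFrame(
        {
            "Name": [fake.name() for _ in range(batch_size)],
            "Age": [fake.random_int(min=18, max=65) for _ in range(batch_size)],
            "Gender": [
                fake.random_element(elements=["Male", "Female", "Non-binary"])
                for _ in range(batch_size)
            ],
        }
    )
    return batch.to_json(orient="records")


# Mirrors the JSON_EXTRACT_ARRAY + UNNEST step: flatten the per-batch JSON
# arrays into one record per row. 5 batches here; the notebook uses 10,000.
records = [
    record
    for json_data in (generate_batch() for _ in range(5))
    for record in json.loads(json_data)
]
result_df = pd.DataFrame.from_records(records)
print(result_df.shape)  # (500, 3)
```

This decomposition is what makes the BigQuery version scale: generation is embarrassingly parallel across indicator rows, and the flattening is a single set-based query.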
From d924ec2937c158644b5d1bbae4f82476de2c1655 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:20:14 -0700 Subject: [PATCH 12/15] feat: add `Series.struct.dtypes` property (#599) --- bigframes/operations/structs.py | 12 ++++++++ .../pandas/core/arrays/arrow/accessors.py | 29 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index e8a1af9602..d222f0993b 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -15,9 +15,11 @@ from __future__ import annotations import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors +import pandas as pd from bigframes.core import log_adapter import bigframes.dataframe +import bigframes.dtypes import bigframes.operations import bigframes.operations.base import bigframes.series @@ -45,3 +47,13 @@ def explode(self) -> bigframes.dataframe.DataFrame: return bigframes.pandas.concat( [self.field(i) for i in range(pa_type.num_fields)], axis="columns" ) + + def dtypes(self) -> pd.Series: + pa_type = self._dtype.pyarrow_dtype + return pd.Series( + data=[ + bigframes.dtypes.arrow_dtype_to_bigframes_dtype(pa_type.field(i).type) + for i in range(pa_type.num_fields) + ], + index=[pa_type.field(i).name for i in range(pa_type.num_fields)], + ) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 8e3ea06a3d..bd6e50d096 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -92,3 +92,32 @@ def explode(self): The data corresponding to all child fields. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def dtypes(self): + """ + Return the dtype object of each child field of the struct. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.dtypes() + version Int64 + project string[pyarrow] + dtype: object + + Returns: + A *pandas* Series with the data type of all child fields. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 70015b79e8cff16ff1b36c5e3f019fe099750a9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 22 Apr 2024 15:05:33 -0500 Subject: [PATCH 13/15] docs: set `index_cols` in `read_gbq` as a best practice (#624) --- .../bigframes_vendored/pandas/io/gbq.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index b5feeb13c5..c60a276338 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -27,13 +27,17 @@ def read_gbq( ): """Loads a DataFrame from BigQuery. - BigQuery tables are an unordered, unindexed data source. By default, - the DataFrame will have an arbitrary index and ordering. - - Set the `index_col` argument to one or more columns to choose an - index. The resulting DataFrame is sorted by the index columns. 
For the
-    best performance, ensure the index columns don't contain duplicate
-    values.
+    BigQuery tables are an unordered, unindexed data source. To support
+    pandas compatibility, the following indexing options are supported:
+
+    * (Default behavior) Add an arbitrary sequential index and ordering
+      using an analytic windowed operation that prevents filtering
+      push down.
+    * (Recommended) Set the ``index_col`` argument to one or more columns.
+      Unique values for the row labels are recommended. Duplicate labels
+      are possible, but note that joins on a non-unique index can duplicate
+      rows and operations like ``cumsum()`` that window across a non-unique
+      index can have some non-determinism.
 
     .. note::
         By default, even SQL query inputs with an ORDER BY clause create a

From 75bb2409532e80de742030d05ffcbacacf5ffba2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Mon, 22 Apr 2024 17:00:36 -0500
Subject: [PATCH 14/15] feat: support primary key(s) in `read_gbq` by using as
 the `index_col` by default (#625)

* feat: support primary key(s) in `read_gbq` by using as the `index_col` by
default

* revert WIP commit

* address type error in tests

---
 bigframes/session/__init__.py | 25 ++++++------
 setup.py | 2 +-
 testing/constraints-3.9.txt | 2 +-
 tests/system/small/test_session.py | 13 +++----
 tests/unit/resources.py | 7 ++--
 tests/unit/session/test_session.py | 39 +++++++++++++++++++
 .../bigframes_vendored/pandas/io/gbq.py | 3 ++
 7 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index 64bcebb6cc..f3f1ffce16 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -708,13 +708,15 @@ def _get_snapshot_sql_and_primary_key(
                 f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}"
             )
 
-        # TODO(b/305264153): Use public properties to fetch primary keys once
-        # added to google-cloud-bigquery.
-        primary_keys = (
-            table._properties.get("tableConstraints", {})
-            .get("primaryKey", {})
-            .get("columns")
-        )
+        primary_keys = None
+        if (
+            (table_constraints := getattr(table, "table_constraints", None)) is not None
+            and (primary_key := table_constraints.primary_key) is not None
+            # This will be False for either None or empty list.
+            # We want primary_keys = None if no primary keys are set. 
+ and (columns := primary_key.columns) + ): + primary_keys = columns job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name @@ -777,12 +779,13 @@ def _read_gbq_table( query, default_project=self.bqclient.project ) - ( - table_expression, - total_ordering_cols, - ) = self._get_snapshot_sql_and_primary_key( + (table_expression, primary_keys,) = self._get_snapshot_sql_and_primary_key( table_ref, api_name=api_name, use_cache=use_cache ) + total_ordering_cols = primary_keys + + if not index_col and primary_keys is not None: + index_col = primary_keys for key in columns: if key not in table_expression.columns: diff --git a/setup.py b/setup.py index 83049f9715..2ccf63259c 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "gcsfs >=2023.3.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0dev", - "google-cloud-bigquery[bqstorage,pandas] >=3.10.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.16.0", "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-iam >=2.12.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 1e1f3a3e66..f5007ed564 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -4,7 +4,7 @@ fsspec==2023.3.0 gcsfs==2023.3.0 geopandas==0.12.2 google-auth==2.15.0 -google-cloud-bigquery==3.10.0 +google-cloud-bigquery==3.16.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index ce415f9324..1e76a8bd8b 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -236,14 +236,13 @@ def test_read_gbq_w_anonymous_query_results_table(session: bigframes.Session): def test_read_gbq_w_primary_keys_table( session: bigframes.Session, usa_names_grouped_table: bigquery.Table ): + # Validate that the table we're querying has a primary key. table = usa_names_grouped_table - # TODO(b/305264153): Use public properties to fetch primary keys once - # added to google-cloud-bigquery. - primary_keys = ( - table._properties.get("tableConstraints", {}) - .get("primaryKey", {}) - .get("columns") - ) + table_constraints = table.table_constraints + assert table_constraints is not None + primary_key = table_constraints.primary_key + assert primary_key is not None + primary_keys = primary_key.columns assert len(primary_keys) != 0 df = session.read_gbq(f"{table.project}.{table.dataset_id}.{table.table_id}") diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 6846659930..28b08e49dc 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -13,7 +13,7 @@ # limitations under the License. 
import datetime -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence import unittest.mock as mock import google.auth.credentials @@ -37,6 +37,7 @@ def create_bigquery_session( bqclient: Optional[mock.Mock] = None, session_id: str = "abcxyz", + table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA, anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None, ) -> bigframes.Session: credentials = mock.create_autospec( @@ -51,7 +52,7 @@ def create_bigquery_session( table = mock.create_autospec(google.cloud.bigquery.Table, instance=True) table._properties = {} type(table).location = mock.PropertyMock(return_value="test-region") - type(table).schema = mock.PropertyMock(return_value=TEST_SCHEMA) + type(table).schema = mock.PropertyMock(return_value=table_schema) bqclient.get_table.return_value = table if anonymous_dataset is None: @@ -72,7 +73,7 @@ def query_mock(query, *args, **kwargs): if query.startswith("SELECT CURRENT_TIMESTAMP()"): query_job.result = mock.MagicMock(return_value=[[datetime.datetime.now()]]) else: - type(query_job).schema = mock.PropertyMock(return_value=TEST_SCHEMA) + type(query_job).schema = mock.PropertyMock(return_value=table_schema) return query_job diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 3e2b28c200..543196066a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -19,9 +19,11 @@ import google.api_core.exceptions import google.cloud.bigquery +import google.cloud.bigquery.table import pytest import bigframes +import bigframes.exceptions from .. import resources @@ -50,6 +52,43 @@ def test_read_gbq_cached_table(): assert "1999-01-02T03:04:05.678901" in df.sql +def test_read_gbq_clustered_table_ok_default_index_with_primary_key(): + """If a primary key is set on the table, we use that as the index column + by default, no error should be raised in this case. + + See internal issue 335727141. + """ + table = google.cloud.bigquery.Table("my-project.my_dataset.my_table") + table.clustering_fields = ["col1", "col2"] + table.schema = ( + google.cloud.bigquery.SchemaField("pk_1", "INT64"), + google.cloud.bigquery.SchemaField("pk_2", "INT64"), + google.cloud.bigquery.SchemaField("col_1", "INT64"), + google.cloud.bigquery.SchemaField("col_2", "INT64"), + ) + + # TODO(b/305264153): use setter for table_constraints in client library + # when available. + table._properties["tableConstraints"] = { + "primaryKey": { + "columns": ["pk_1", "pk_2"], + }, + } + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.return_value = table + session = resources.create_bigquery_session( + bqclient=bqclient, table_schema=table.schema + ) + table._properties["location"] = session._location + + df = session.read_gbq("my-project.my_dataset.my_table") + + # There should be no analytic operators to prevent row filtering pushdown. 
+        assert "OVER" not in df.sql
+        assert tuple(df.index.names) == ("pk_1", "pk_2")
+
+
 @pytest.mark.parametrize(
     "not_found_table_id",
     [("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")],
diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py
index c60a276338..93cee71289 100644
--- a/third_party/bigframes_vendored/pandas/io/gbq.py
+++ b/third_party/bigframes_vendored/pandas/io/gbq.py
@@ -109,6 +109,9 @@ def read_gbq(
             In that case, all the matched tables will be read as one DataFrame.
         index_col (Iterable[str] or str):
             Name of result column(s) to use for index in results DataFrame.
+
+            **New in bigframes version 1.3.0**: If ``index_col`` is not
+            set, the primary key(s) of the table are used as the index.
         columns (Iterable[str]):
             List of BigQuery column names in the desired order for results
             DataFrame.

From 7227a6af37a3c0553db2d9a5a6d86c1e37d33b21 Mon Sep 17 00:00:00 2001
From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com>
Date: Mon, 22 Apr 2024 23:16:17 +0000
Subject: [PATCH 15/15] chore(main): release 1.3.0 (#617)

:robot: I have created a release *beep* *boop*
---


## [1.3.0](https://togithub.com/googleapis/python-bigquery-dataframes/compare/v1.2.0...v1.3.0) (2024-04-22)


### Features

* Add `Series.struct.dtypes` property ([#599](https://togithub.com/googleapis/python-bigquery-dataframes/issues/599)) ([d924ec2](https://togithub.com/googleapis/python-bigquery-dataframes/commit/d924ec2937c158644b5d1bbae4f82476de2c1655))
* Add fine tuning `fit()` for Palm2TextGenerator ([#616](https://togithub.com/googleapis/python-bigquery-dataframes/issues/616)) ([9c106bd](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9c106bd24482620ef5ff3c85f94be9da76c49716))
* Add quantile statistic ([#613](https://togithub.com/googleapis/python-bigquery-dataframes/issues/613)) ([bc82804](https://togithub.com/googleapis/python-bigquery-dataframes/commit/bc82804da43c03c2311cd56f47a2316d3aae93d2))
* Expose `max_batching_rows` in `remote_function` ([#622](https://togithub.com/googleapis/python-bigquery-dataframes/issues/622)) ([240a1ac](https://togithub.com/googleapis/python-bigquery-dataframes/commit/240a1ac6fa914550bb6216cd5d179a36009f2657))
* Support primary key(s) in `read_gbq` by using as the `index_col` by default ([#625](https://togithub.com/googleapis/python-bigquery-dataframes/issues/625)) ([75bb240](https://togithub.com/googleapis/python-bigquery-dataframes/commit/75bb2409532e80de742030d05ffcbacacf5ffba2))
* Warn if location is set to unknown location ([#609](https://togithub.com/googleapis/python-bigquery-dataframes/issues/609)) ([3706b4f](https://togithub.com/googleapis/python-bigquery-dataframes/commit/3706b4f9dde65788b5e6343a6428fb1866499461))


### Bug Fixes

* Address technical writers fb ([#611](https://togithub.com/googleapis/python-bigquery-dataframes/issues/611)) ([9f8f181](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9f8f181279133abdb7da3aa045df6fa278587013))
* Infer narrowest numeric type when combining numeric columns
([#602](https://togithub.com/googleapis/python-bigquery-dataframes/issues/602)) ([8f9ece6](https://togithub.com/googleapis/python-bigquery-dataframes/commit/8f9ece6d13f57f02d677bf0e3fea97dea94ae240)) * Use exact median implementation by default ([#619](https://togithub.com/googleapis/python-bigquery-dataframes/issues/619)) ([9d205ae](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9d205aecb77f35baeec82a8f6e1b72c2d852ca46)) ### Documentation * Fix rendering of examples for multiple apis ([#620](https://togithub.com/googleapis/python-bigquery-dataframes/issues/620)) ([9665e39](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9665e39ef288841f03a9d823bd2210ef58394ad3)) * Set `index_cols` in `read_gbq` as a best practice ([#624](https://togithub.com/googleapis/python-bigquery-dataframes/issues/624)) ([70015b7](https://togithub.com/googleapis/python-bigquery-dataframes/commit/70015b79e8cff16ff1b36c5e3f019fe099750a9d)) --- This PR was generated with [Release Please](https://togithub.com/googleapis/release-please). See [documentation](https://togithub.com/googleapis/release-please#release-please). --- CHANGELOG.md | 25 +++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3314c976e..a96c902835 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,31 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.3.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.2.0...v1.3.0) (2024-04-22) + + +### Features + +* Add `Series.struct.dtypes` property ([#599](https://github.com/googleapis/python-bigquery-dataframes/issues/599)) ([d924ec2](https://github.com/googleapis/python-bigquery-dataframes/commit/d924ec2937c158644b5d1bbae4f82476de2c1655)) +* Add fine tuning `fit()` for Palm2TextGenerator ([#616](https://github.com/googleapis/python-bigquery-dataframes/issues/616)) ([9c106bd](https://github.com/googleapis/python-bigquery-dataframes/commit/9c106bd24482620ef5ff3c85f94be9da76c49716)) +* Add quantile statistic ([#613](https://github.com/googleapis/python-bigquery-dataframes/issues/613)) ([bc82804](https://github.com/googleapis/python-bigquery-dataframes/commit/bc82804da43c03c2311cd56f47a2316d3aae93d2)) +* Expose `max_batching_rows` in `remote_function` ([#622](https://github.com/googleapis/python-bigquery-dataframes/issues/622)) ([240a1ac](https://github.com/googleapis/python-bigquery-dataframes/commit/240a1ac6fa914550bb6216cd5d179a36009f2657)) +* Support primary key(s) in `read_gbq` by using as the `index_col` by default ([#625](https://github.com/googleapis/python-bigquery-dataframes/issues/625)) ([75bb240](https://github.com/googleapis/python-bigquery-dataframes/commit/75bb2409532e80de742030d05ffcbacacf5ffba2)) +* Warn if location is set to unknown location ([#609](https://github.com/googleapis/python-bigquery-dataframes/issues/609)) 
([3706b4f](https://github.com/googleapis/python-bigquery-dataframes/commit/3706b4f9dde65788b5e6343a6428fb1866499461)) + + +### Bug Fixes + +* Address technical writers fb ([#611](https://github.com/googleapis/python-bigquery-dataframes/issues/611)) ([9f8f181](https://github.com/googleapis/python-bigquery-dataframes/commit/9f8f181279133abdb7da3aa045df6fa278587013)) +* Infer narrowest numeric type when combining numeric columns ([#602](https://github.com/googleapis/python-bigquery-dataframes/issues/602)) ([8f9ece6](https://github.com/googleapis/python-bigquery-dataframes/commit/8f9ece6d13f57f02d677bf0e3fea97dea94ae240)) +* Use exact median implementation by default ([#619](https://github.com/googleapis/python-bigquery-dataframes/issues/619)) ([9d205ae](https://github.com/googleapis/python-bigquery-dataframes/commit/9d205aecb77f35baeec82a8f6e1b72c2d852ca46)) + + +### Documentation + +* Fix rendering of examples for multiple apis ([#620](https://github.com/googleapis/python-bigquery-dataframes/issues/620)) ([9665e39](https://github.com/googleapis/python-bigquery-dataframes/commit/9665e39ef288841f03a9d823bd2210ef58394ad3)) +* Set `index_cols` in `read_gbq` as a best practice ([#624](https://github.com/googleapis/python-bigquery-dataframes/issues/624)) ([70015b7](https://github.com/googleapis/python-bigquery-dataframes/commit/70015b79e8cff16ff1b36c5e3f019fe099750a9d)) + ## [1.2.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.1.0...v1.2.0) (2024-04-15) diff --git a/bigframes/version.py b/bigframes/version.py index ec2105b648..1f103401e4 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.2.0" +__version__ = "1.3.0"
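Taken together, the features in this release can be exercised along the following lines. This is a hedged sketch rather than anything taken from the repository's tests: the table `my-project.my_dataset.my_table`, its primary keys `pk_1`/`pk_2`, and the columns `col_1`/`col_2` are hypothetical (borrowed from the unit test in #625 above), and the quantile call shapes are assumed to mirror pandas.

```python
import pyarrow as pa

import bigframes.pandas as bpd

# Assumes bpd.options.bigquery.project is already configured and that the
# hypothetical table below exists with primary keys (pk_1, pk_2).

# #625: read_gbq now defaults index_col to the table's primary key(s),
# so no analytic (OVER) clause is needed to synthesize an index.
df = bpd.read_gbq("my-project.my_dataset.my_table")
print(df.index.names)  # ['pk_1', 'pk_2'] for the hypothetical table

# #613: quantile statistic, on a Series and on a groupby.
print(df["col_1"].quantile(0.5))
print(df.groupby("col_2")["col_1"].quantile([0.25, 0.75]))

# #599: Series.struct.dtypes, shown on the struct series from its docstring.
s = bpd.Series(
    [
        {"version": 1, "project": "pandas"},
        {"version": 2, "project": "pandas"},
    ],
    dtype=bpd.ArrowDtype(
        pa.struct([("version", pa.int64()), ("project", pa.string())])
    ),
)
print(s.struct.dtypes())  # version: Int64, project: string[pyarrow]
```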