From bbbd21ea0d8c5fa13ba66877ce28d20247884afe Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 18 Sep 2023 16:38:16 +0000 Subject: [PATCH 01/24] chore: use correct SCM name for release trigger (#29) Closes #19 --- .github/release-trigger.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/release-trigger.yml b/.github/release-trigger.yml index b0a6cadefc..4fbd4aa427 100644 --- a/.github/release-trigger.yml +++ b/.github/release-trigger.yml @@ -1,2 +1,2 @@ enabled: true -multiScmName: bigframes +multiScmName: python-bigquery-dataframes From 69e51a67fb4aba9001fd5ea2b25cfcd6a6fdb80f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 18 Sep 2023 23:12:13 +0000 Subject: [PATCH 02/24] chore: enforce use of conventional commits (#31) This will prevent accidental merging of commits that release-please can't handle. --- .github/sync-repo-settings.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index b7dae76ba3..cfa62f787c 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -7,6 +7,7 @@ branchProtectionRules: requiresCodeOwnerReviews: true requiresStrictStatusChecks: true requiredStatusCheckContexts: + - 'conventionalcommits.org' - 'cla/google' - 'OwlBot Post Processor' - 'docs' From c1900c29a44199d5d8d036d6d842b4f00448fa79 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 20 Sep 2023 16:26:56 +0000 Subject: [PATCH 03/24] chore: sync internal changes to GitHub (#34) feat: support `optimize_strategy` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `l2_reg` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `max_iterations` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `learn_rate_strategy` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `early_stop` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `min_rel_progress` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `ls_init_learn_rate` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `calculate_p_values` parameter in `bigframes.ml.linear_model.LinearRegression` feat: support `enable_global_explain` parameter in `bigframes.ml.linear_model.LinearRegression` test: add golden SQL test for logistic model test: extend ml golden sql test linear_reg docs: link to Remote Functions code samples from README and API reference feat: support `df[column_name] = df_only_one_column` feat: add `DataFrame.rolling` and `DataFrame.expanding` methods feat: add `DataFrame.kurtosis` / `DF.kurt` method feat: support `class_weights="balanced"` in `LogisticRegression` model --- README.rst | 4 +- bigframes/core/__init__.py | 21 +-- bigframes/core/block_transforms.py | 101 +++++++++++++- bigframes/core/blocks.py | 12 +- bigframes/core/groupby/__init__.py | 70 +++++++++- bigframes/core/window/__init__.py | 50 ++++--- bigframes/dataframe.py | 37 +++++ bigframes/ml/core.py | 4 +- bigframes/ml/ensemble.py | 4 +- bigframes/ml/linear_model.py | 79 +++++++++-- bigframes/operations/__init__.py | 11 ++ bigframes/series.py | 4 +- bigframes/session.py | 3 +- tests/system/large/ml/test_linear_model.py | 93 ++++++++----- tests/system/small/test_dataframe.py | 48 +++++++ tests/system/small/test_groupby.py | 24 ++++ tests/system/small/test_window.py | 42 +++++- tests/unit/ml/test_golden_sql.py | 130 ++++++++++++++++-- .../bigframes_vendored/pandas/core/frame.py | 15 ++ 
.../bigframes_vendored/pandas/core/generic.py | 55 ++++++++ .../pandas/core/groupby/__init__.py | 21 +++ .../bigframes_vendored/pandas/core/series.py | 55 -------- .../sklearn/linear_model/_base.py | 22 ++- .../sklearn/linear_model/_logistic.py | 11 +- 24 files changed, 745 insertions(+), 171 deletions(-) diff --git a/README.rst b/README.rst index 23aea446ff..77c42e4325 100644 --- a/README.rst +++ b/README.rst @@ -241,7 +241,9 @@ Remote functions BigQuery DataFrames gives you the ability to turn your custom scalar functions into `BigQuery remote functions `_ . Creating a remote -function in BigQuery DataFrames creates a BigQuery remote function, a `BigQuery +function in BigQuery DataFrames (See `code samples +`_) +creates a BigQuery remote function, a `BigQuery connection `_ , and a `Cloud Functions (2nd gen) function diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 27fe4a4fe6..8e7beb73db 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -600,7 +600,7 @@ def project_window_op( window_spec: WindowSpec, output_name=None, *, - skip_null_groups=False, + never_skip_nulls=False, skip_reproject_unsafe: bool = False, ) -> ArrayValue: """ @@ -609,7 +609,7 @@ def project_window_op( op: the windowable operator to apply to the input column window_spec: a specification of the window over which to apply the operator output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided - skip_null_groups: will filter out any rows where any of the grouping keys is null + never_skip_nulls: will disable null skipping for operators that would otherwise do so skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection """ column = typing.cast(ibis_types.Column, self.get_column(column_name)) @@ -618,20 +618,23 @@ def project_window_op( window_op = op._as_ibis(column, window) clauses = [] - if op.skips_nulls: + if op.skips_nulls and not never_skip_nulls: clauses.append((column.isnull(), ibis.NA)) - if skip_null_groups: - for key in window_spec.grouping_keys: - clauses.append((self.get_column(key).isnull(), ibis.NA)) if window_spec.min_periods: + if op.skips_nulls: + # Most operations do not count NULL values towards min_periods + observation_count = agg_ops.count_op._as_ibis(column, window) + else: + # Operations like count treat even NULLs as valid observations for the sake of min_periods + # notnull is just used to convert null values to non-null (FALSE) values to be counted + denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) + observation_count = agg_ops.count_op._as_ibis(denulled_value, window) clauses.append( ( - agg_ops.count_op._as_ibis(column, window) - < ibis_types.literal(window_spec.min_periods), + observation_count < ibis_types.literal(window_spec.min_periods), ibis.NA, ) ) - if clauses: case_statement = ibis.case() for clause in clauses: diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 5dcd9fe753..da6ba65b8a 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -355,6 +355,46 @@ def skew( return block +def kurt( + block: blocks.Block, + skew_column_ids: typing.Sequence[str], + grouping_column_ids: typing.Sequence[str] = (), +) -> blocks.Block: + original_columns = skew_column_ids + column_labels = 
block.select_columns(original_columns).column_labels + + block, delta4_ids = _mean_delta_to_power( + block, 4, original_columns, grouping_column_ids + ) + # counts, moment4 for each column + aggregations = [] + for i, col in enumerate(original_columns): + count_agg = (col, agg_ops.count_op) + moment4_agg = (delta4_ids[i], agg_ops.mean_op) + variance_agg = (col, agg_ops.PopVarOp()) + aggregations.extend([count_agg, moment4_agg, variance_agg]) + + block, agg_ids = block.aggregate( + by_column_ids=grouping_column_ids, aggregations=aggregations + ) + + kurt_ids = [] + for i, col in enumerate(original_columns): + # Corresponds to order of aggregations in preceding loop + count_id, moment4_id, var_id = agg_ids[i * 3 : (i * 3) + 3] + block, kurt_id = _kurt_from_moments_and_count( + block, count_id, moment4_id, var_id + ) + kurt_ids.append(kurt_id) + + block = block.select_columns(kurt_ids).with_column_labels(column_labels) + if not grouping_column_ids: + # When ungrouped, stack everything into single column so can be returned as series + block = block.stack() + block = block.drop_levels([block.index_columns[0]]) + return block + + def _mean_delta_to_power( block: blocks.Block, n_power, @@ -375,13 +415,13 @@ def _mean_delta_to_power( def _skew_from_moments_and_count( - block: blocks.Block, count_id: str, moment3_id: str, var_id: str + block: blocks.Block, count_id: str, moment3_id: str, moment2_id: str ) -> typing.Tuple[blocks.Block, str]: # Calculate skew using count, third moment and population variance # See G1 estimator: # https://en.wikipedia.org/wiki/Skewness#Sample_skewness block, denominator_id = block.apply_unary_op( - var_id, ops.partial_right(ops.pow_op, 3 / 2) + moment2_id, ops.partial_right(ops.unsafe_pow_op, 3 / 2) ) block, base_id = block.apply_binary_op(moment3_id, denominator_id, ops.div_op) block, countminus1_id = block.apply_unary_op( @@ -392,7 +432,7 @@ def _skew_from_moments_and_count( ) block, adjustment_id = block.apply_binary_op(count_id, countminus1_id, ops.mul_op) block, adjustment_id = block.apply_unary_op( - adjustment_id, ops.partial_right(ops.pow_op, 1 / 2) + adjustment_id, ops.partial_right(ops.unsafe_pow_op, 1 / 2) ) block, adjustment_id = block.apply_binary_op( adjustment_id, countminus2_id, ops.div_op @@ -405,3 +445,58 @@ def _skew_from_moments_and_count( skew_id, na_cond_id, ops.partial_arg3(ops.where_op, None) ) return block, skew_id + + +def _kurt_from_moments_and_count( + block: blocks.Block, count_id: str, moment4_id: str, moment2_id: str +) -> typing.Tuple[blocks.Block, str]: + # Kurtosis is often defined as the second standardize moment: moment(4)/moment(2)**2 + # Pandas however uses Fisher’s estimator, implemented below + # numerator = (count + 1) * (count - 1) * moment4 + # denominator = (count - 2) * (count - 3) * moment2**2 + # adjustment = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) + # kurtosis = (numerator / denominator) - adjustment + + # Numerator + block, countminus1_id = block.apply_unary_op( + count_id, ops.partial_right(ops.sub_op, 1) + ) + block, countplus1_id = block.apply_unary_op( + count_id, ops.partial_right(ops.add_op, 1) + ) + block, num_adj = block.apply_binary_op(countplus1_id, countminus1_id, ops.mul_op) + block, numerator_id = block.apply_binary_op(moment4_id, num_adj, ops.mul_op) + + # Denominator + block, countminus2_id = block.apply_unary_op( + count_id, ops.partial_right(ops.sub_op, 2) + ) + block, countminus3_id = block.apply_unary_op( + count_id, ops.partial_right(ops.sub_op, 3) + ) + block, 
denom_adj = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op) + block, popvar_squared = block.apply_unary_op( + moment2_id, ops.partial_right(ops.unsafe_pow_op, 2) + ) + block, denominator_id = block.apply_binary_op(popvar_squared, denom_adj, ops.mul_op) + + # Adjustment + block, countminus1_square = block.apply_unary_op( + countminus1_id, ops.partial_right(ops.unsafe_pow_op, 2) + ) + block, adj_num = block.apply_unary_op( + countminus1_square, ops.partial_right(ops.mul_op, 3) + ) + block, adj_denom = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op) + block, adjustment_id = block.apply_binary_op(adj_num, adj_denom, ops.div_op) + + # Combine + block, base_id = block.apply_binary_op(numerator_id, denominator_id, ops.div_op) + block, kurt_id = block.apply_binary_op(base_id, adjustment_id, ops.sub_op) + + # Need to produce NA if have less than 4 data points + block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 4)) + block, kurt_id = block.apply_binary_op( + kurt_id, na_cond_id, ops.partial_arg3(ops.where_op, None) + ) + return block, kurt_id diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 5b414252ee..fb9ede9f4c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -709,6 +709,7 @@ def multi_apply_window_op( window_spec: core.WindowSpec, *, skip_null_groups: bool = False, + never_skip_nulls: bool = False, ) -> typing.Tuple[Block, typing.Sequence[str]]: block = self result_ids = [] @@ -721,6 +722,7 @@ def multi_apply_window_op( skip_reproject_unsafe=(i + 1) < len(columns), result_label=label, skip_null_groups=skip_null_groups, + never_skip_nulls=never_skip_nulls, ) result_ids.append(result_id) return block, result_ids @@ -751,15 +753,21 @@ def apply_window_op( result_label: Label = None, skip_null_groups: bool = False, skip_reproject_unsafe: bool = False, + never_skip_nulls: bool = False, ) -> typing.Tuple[Block, str]: + block = self + if skip_null_groups: + for key in window_spec.grouping_keys: + block, not_null_id = block.apply_unary_op(key, ops.notnull_op) + block = block.filter(not_null_id).drop_columns([not_null_id]) result_id = guid.generate_guid() - expr = self._expr.project_window_op( + expr = block._expr.project_window_op( column, op, window_spec, result_id, - skip_null_groups=skip_null_groups, skip_reproject_unsafe=skip_reproject_unsafe, + never_skip_nulls=never_skip_nulls, ) block = Block( expr, diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 810e145d33..9be7f22a71 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -156,6 +156,18 @@ def skew( block = block_ops.skew(self._block, self._selected_cols, self._by_col_ids) return df.DataFrame(block) + def kurt( + self, + *, + numeric_only: bool = False, + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("kurt") + block = block_ops.kurt(self._block, self._selected_cols, self._by_col_ids) + return df.DataFrame(block) + + kurtosis = kurt + def all(self) -> df.DataFrame: return self._aggregate_all(agg_ops.all_op) @@ -195,6 +207,36 @@ def diff(self, periods=1) -> series.Series: ) return self._apply_window_op(agg_ops.DiffOp(periods), window=window) + def rolling(self, window: int, min_periods=None) -> windows.Window: + # To get n size window, need current row and n-1 preceding rows. 
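+        # e.g. window=3 becomes preceding=2, following=0, so each window covers the current row and the 2 rows before it.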
+ window_spec = core.WindowSpec( + grouping_keys=self._by_col_ids, + preceding=window - 1, + following=0, + min_periods=min_periods or window, + ) + block = self._block.order_by( + [order.OrderingColumnReference(col) for col in self._by_col_ids], + stable=True, + ) + return windows.Window( + block, window_spec, self._selected_cols, drop_null_groups=self._dropna + ) + + def expanding(self, min_periods: int = 1) -> windows.Window: + window_spec = core.WindowSpec( + grouping_keys=self._by_col_ids, + following=0, + min_periods=min_periods, + ) + block = self._block.order_by( + [order.OrderingColumnReference(col) for col in self._by_col_ids], + stable=True, + ) + return windows.Window( + block, window_spec, self._selected_cols, drop_null_groups=self._dropna + ) + def agg(self, func=None, **kwargs) -> df.DataFrame: if func: if isinstance(func, str): @@ -351,7 +393,7 @@ def _apply_window_op( ) columns = self._aggregated_columns(numeric_only=numeric_only) block, result_ids = self._block.multi_apply_window_op( - columns, op, window_spec=window_spec, skip_null_groups=self._dropna + columns, op, window_spec=window_spec ) block = block.select_columns(result_ids) return df.DataFrame(block) @@ -422,6 +464,12 @@ def skew(self, *args, **kwargs) -> series.Series: block = block_ops.skew(self._block, [self._value_column], self._by_col_ids) return series.Series(block) + def kurt(self, *args, **kwargs) -> series.Series: + block = block_ops.kurt(self._block, [self._value_column], self._by_col_ids) + return series.Series(block) + + kurtosis = kurt + def prod(self, *args) -> series.Series: return self._aggregate(agg_ops.product_op) @@ -510,7 +558,13 @@ def rolling(self, window: int, min_periods=None) -> windows.Window: [order.OrderingColumnReference(col) for col in self._by_col_ids], stable=True, ) - return windows.Window(block, window_spec, self._value_column) + return windows.Window( + block, + window_spec, + [self._value_column], + drop_null_groups=self._dropna, + is_series=True, + ) def expanding(self, min_periods: int = 1) -> windows.Window: window_spec = core.WindowSpec( @@ -522,10 +576,13 @@ def expanding(self, min_periods: int = 1) -> windows.Window: [order.OrderingColumnReference(col) for col in self._by_col_ids], stable=True, ) - return windows.Window(block, window_spec, self._value_column) - - def _ungroup(self) -> series.Series: - return series.Series(self._block.select_column(self._value_column)) + return windows.Window( + block, + window_spec, + [self._value_column], + drop_null_groups=self._dropna, + is_series=True, + ) def _aggregate(self, aggregate_op: agg_ops.AggregateOp) -> series.Series: result_block, _ = self._block.aggregate( @@ -553,6 +610,5 @@ def _apply_window_op( op, result_label=label, window_spec=window_spec, - skip_null_groups=self._dropna, ) return series.Series(block.select_column(result_id)) diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index 8994004e0b..d3d081124e 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -21,54 +21,57 @@ import bigframes.operations.aggregations as agg_ops import third_party.bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling -if typing.TYPE_CHECKING: - from bigframes.series import Series - class Window(vendored_pandas_rolling.Window): __doc__ = vendored_pandas_rolling.Window.__doc__ - # TODO(tbergeron): Windows with groupings should create multi-indexed results - def __init__( self, block: blocks.Block, window_spec: core.WindowSpec, - value_column_id: str, + 
value_column_ids: typing.Sequence[str], + drop_null_groups: bool = True, + is_series: bool = False, ): self._block = block self._window_spec = window_spec - self._value_column_id = value_column_id + self._value_column_ids = value_column_ids + self._drop_null_groups = drop_null_groups + self._is_series = is_series - def count(self) -> Series: + def count(self): return self._apply_aggregate(agg_ops.count_op) - def sum(self) -> Series: + def sum(self): return self._apply_aggregate(agg_ops.sum_op) - def mean(self) -> Series: + def mean(self): return self._apply_aggregate(agg_ops.mean_op) - def var(self) -> Series: + def var(self): return self._apply_aggregate(agg_ops.var_op) - def std(self) -> Series: + def std(self): return self._apply_aggregate(agg_ops.std_op) - def max(self) -> Series: + def max(self): return self._apply_aggregate(agg_ops.max_op) - def min(self) -> Series: + def min(self): return self._apply_aggregate(agg_ops.min_op) def _apply_aggregate( self, op: agg_ops.AggregateOp, - ) -> Series: + ): block = self._block - label = block.col_id_to_label[self._value_column_id] - block, result_id = block.apply_window_op( - self._value_column_id, op, self._window_spec, result_label=label + labels = [block.col_id_to_label[col] for col in self._value_column_ids] + block, result_ids = block.multi_apply_window_op( + self._value_column_ids, + op, + self._window_spec, + skip_null_groups=self._drop_null_groups, + never_skip_nulls=True, ) if self._window_spec.grouping_keys: @@ -80,6 +83,13 @@ def _apply_aggregate( ) block = block.set_index(col_ids=index_ids) - from bigframes.series import Series + if self._is_series: + from bigframes.series import Series - return Series(block.select_column(result_id)) + return Series(block.select_columns(result_ids).with_column_labels(labels)) + else: + from bigframes.dataframe import DataFrame + + return DataFrame( + block.select_columns(result_ids).with_column_labels(labels) + ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d65d4ce344..de4adb912e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -49,6 +49,7 @@ import bigframes.core.joins as joins import bigframes.core.ordering as order import bigframes.core.utils as utils +import bigframes.core.window import bigframes.dtypes import bigframes.formatting_helpers as formatter import bigframes.operations as ops @@ -282,6 +283,10 @@ def empty(self) -> bool: def values(self) -> numpy.ndarray: return self.to_numpy() + @property + def _session(self) -> bigframes.Session: + return self._get_block().expr._session + def __len__(self): rows, _ = self.shape return rows @@ -1056,6 +1061,13 @@ def _assign_single_item( ) -> DataFrame: if isinstance(v, bigframes.series.Series): return self._assign_series_join_on_index(k, v) + elif isinstance(v, bigframes.dataframe.DataFrame): + v_df_col_count = len(v._block.value_columns) + if v_df_col_count != 1: + raise ValueError( + f"Cannot set a DataFrame with {v_df_col_count} columns to the single column {k}" + ) + return self._assign_series_join_on_index(k, v[v.columns[0]]) elif callable(v): copy = self.copy() copy[k] = v(copy) @@ -1627,6 +1639,16 @@ def skew(self, *, numeric_only: bool = False): result_block = block_ops.skew(frame._block, frame._block.value_columns) return bigframes.series.Series(result_block) + def kurt(self, *, numeric_only: bool = False): + if not numeric_only: + frame = self._raise_on_non_numeric("kurt") + else: + frame = self._drop_non_numeric() + result_block = block_ops.kurt(frame._block, frame._block.value_columns) + return 
bigframes.series.Series(result_block) + + kurtosis = kurt + def pivot( self, *, @@ -1882,6 +1904,21 @@ def _perform_join_by_index(self, other: DataFrame, *, how: str = "left"): ) return DataFrame(combined_index._block) + def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window: + # To get n size window, need current row and n-1 preceding rows. + window_spec = bigframes.core.WindowSpec( + preceding=window - 1, following=0, min_periods=min_periods or window + ) + return bigframes.core.window.Window( + self._block, window_spec, self._block.value_columns + ) + + def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: + window_spec = bigframes.core.WindowSpec(following=0, min_periods=min_periods) + return bigframes.core.window.Window( + self._block, window_spec, self._block.value_columns + ) + def groupby( self, by: typing.Union[ diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 57f610c4c4..667d42f7ee 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -221,7 +221,7 @@ def create_bqml_model( input_data = X_train.join(y_train, how="outer") options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) - session = X_train._get_block().expr._session + session = X_train._session source_sql = input_data.sql options_sql = ml_sql.options(**options) @@ -255,7 +255,7 @@ def create_bqml_time_series_model( options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) - session = X_train._get_block().expr._session + session = X_train._session source_sql = input_data.sql options_sql = ml_sql.options(**options) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 142edaa00f..b0f3e5f081 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -110,7 +110,7 @@ def _from_bq( dummy_regressor = cls() for bf_param, bf_value in dummy_regressor.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param is not None: + if bqml_param in last_fitting: kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) new_xgb_regressor = cls(**kwargs) @@ -431,7 +431,7 @@ def _from_bq( dummy_model = cls() for bf_param, bf_value in dummy_model.__dict__.items(): bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) - if bqml_param is not None: + if bqml_param in last_fitting: kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) new_random_forest_regressor = cls(**kwargs) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 1606a15d73..f27b798eea 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import cast, Dict, List, Optional, Union +from typing import cast, Dict, List, Literal, Optional, Union from google.cloud import bigquery @@ -28,6 +28,23 @@ import third_party.bigframes_vendored.sklearn.linear_model._base import third_party.bigframes_vendored.sklearn.linear_model._logistic +_BQML_PARAMS_MAPPING = { + "optimize_strategy": "optimizationStrategy", + "fit_intercept": "fitIntercept", + "l1_reg": "l1Regularization", + "l2_reg": "l2Regularization", + "max_iterations": "maxIterations", + "learn_rate_strategy": "learnRateStrategy", + "learn_rate": "learnRate", + "early_stop": "earlyStop", + "min_rel_progress": "minRelativeProgress", + "ls_init_learn_rate": "initialLearnRate", + "warm_start": "warmStart", + "calculate_p_values": "calculatePValues", + "enable_global_explain": "enableGlobalExplain", + 
"category_encoding_method": "categoryEncodingMethod", +} + class LinearRegression( base.SupervisedTrainablePredictor, @@ -39,9 +56,29 @@ class LinearRegression( def __init__( self, + optimize_strategy: Literal[ + "auto_strategy", "batch_gradient_descent", "normal_equation" + ] = "normal_equation", fit_intercept: bool = True, + l2_reg: float = 0.0, + max_iterations: int = 20, + learn_rate_strategy: Literal["line_search", "constant"] = "line_search", + early_stop: bool = True, + min_rel_progress: float = 0.01, + ls_init_learn_rate: float = 0.1, + calculate_p_values: bool = False, + enable_global_explain: bool = False, ): + self.optimize_strategy = optimize_strategy self.fit_intercept = fit_intercept + self.l2_reg = l2_reg + self.max_iterations = max_iterations + self.learn_rate_strategy = learn_rate_strategy + self.early_stop = early_stop + self.min_rel_progress = min_rel_progress + self.ls_init_learn_rate = ls_init_learn_rate + self.calculate_p_values = calculate_p_values + self.enable_global_explain = enable_global_explain self._bqml_model: Optional[core.BqmlModel] = None @classmethod @@ -55,8 +92,12 @@ def _from_bq( # See https://cloud.google.com/bigquery/docs/reference/rest/v2/models#trainingrun last_fitting = model.training_runs[-1]["trainingOptions"] - if "fitIntercept" in last_fitting: - kwargs["fit_intercept"] = last_fitting["fitIntercept"] + + dummy_linear = cls() + for bf_param, bf_value in dummy_linear.__dict__.items(): + bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) + if bqml_param in last_fitting: + kwargs[bf_param] = type(bf_value)(last_fitting[bqml_param]) new_linear_regression = cls(**kwargs) new_linear_regression._bqml_model = core.BqmlModel(session, model) @@ -65,10 +106,20 @@ def _from_bq( @property def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: """The model options as they will be set for BQML""" + # TODO: Support l1_reg, warm_start, and learn_rate with error catching. return { "model_type": "LINEAR_REG", "data_split_method": "NO_SPLIT", + "optimize_strategy": self.optimize_strategy, "fit_intercept": self.fit_intercept, + "l2_reg": self.l2_reg, + "max_iterations": self.max_iterations, + "learn_rate_strategy": self.learn_rate_strategy, + "early_stop": self.early_stop, + "min_rel_progress": self.min_rel_progress, + "ls_init_learn_rate": self.ls_init_learn_rate, + "calculate_p_values": self.calculate_p_values, + "enable_global_explain": self.enable_global_explain, } def _fit( @@ -147,10 +198,11 @@ class LogisticRegression( def __init__( self, fit_intercept: bool = True, - auto_class_weights: bool = False, + class_weights: Optional[Union[Literal["balanced"], Dict[str, float]]] = None, ): self.fit_intercept = fit_intercept - self.auto_class_weights = auto_class_weights + self.class_weights = class_weights + self._auto_class_weight = class_weights == "balanced" self._bqml_model: Optional[core.BqmlModel] = None @classmethod @@ -165,10 +217,8 @@ def _from_bq( last_fitting = model.training_runs[-1]["trainingOptions"] if "fitIntercept" in last_fitting: kwargs["fit_intercept"] = last_fitting["fitIntercept"] - # TODO(ashleyxu): b/285162045 support auto_class_weights once the API is - # fixed and enable the tests. - if "autoClassWeights" in last_fitting: - kwargs["auto_class_weights"] = last_fitting["autoClassWeights"] + if last_fitting["autoClassWeights"]: + kwargs["class_weights"] = "balanced" # TODO(ashleyxu) support class_weights in the constructor. 
# if "labelClassWeights" in last_fitting: # kwargs["class_weights"] = last_fitting["labelClassWeights"] @@ -184,8 +234,8 @@ def _bqml_options(self) -> Dict[str, str | int | float | List[str]]: "model_type": "LOGISTIC_REG", "data_split_method": "NO_SPLIT", "fit_intercept": self.fit_intercept, - "auto_class_weights": self.auto_class_weights, - # TODO(ashleyxu): support class_weights (struct array) + "auto_class_weights": self._auto_class_weight, + # TODO(ashleyxu): support class_weights (struct array as dict in our API) # "class_weights": self.class_weights, } @@ -253,11 +303,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LogisticRegression: if not self._bqml_model: raise RuntimeError("A model must be fitted before it can be saved") - # TODO(ashleyxu): b/285162045 support auto_class_weights once the API is - # fixed and enable the tests. - if self.auto_class_weights is True: + # TODO(ashleyxu): support class_weights (struct array as dict in our API) + if self.class_weights not in (None, "balanced"): raise NotImplementedError( - f"auto_class_weight is not supported yet. {constants.FEEDBACK_LINK}" + f"class_weights is not supported yet. {constants.FEEDBACK_LINK}" ) new_model = self._bqml_model.copy(model_name, replace) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index fa43f725f6..c5c55607ae 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -773,6 +773,17 @@ def pow_op( return _float_pow_op(x, y) +@short_circuit_nulls(ibis_dtypes.float) +def unsafe_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + """For internal use only - where domain and overflow checks are not needed.""" + return typing.cast(ibis_types.NumericValue, x) ** typing.cast( + ibis_types.NumericValue, y + ) + + def _int_pow_op( x: ibis_types.Value, y: ibis_types.Value, diff --git a/bigframes/series.py b/bigframes/series.py index 12e72c58b6..9db64fae9c 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1016,13 +1016,13 @@ def rolling(self, window: int, min_periods=None) -> bigframes.core.window.Window preceding=window - 1, following=0, min_periods=min_periods or window ) return bigframes.core.window.Window( - self._block, window_spec, self._value_column + self._block, window_spec, self._block.value_columns, is_series=True ) def expanding(self, min_periods: int = 1) -> bigframes.core.window.Window: window_spec = WindowSpec(following=0, min_periods=min_periods) return bigframes.core.window.Window( - self._block, window_spec, self._value_column + self._block, window_spec, self._block.value_columns, is_series=True ) def groupby( diff --git a/bigframes/session.py b/bigframes/session.py index 3ca79a7b53..04ae6ba454 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -1324,7 +1324,8 @@ def remote_function( reuse: bool = True, name: Optional[str] = None, ): - """Decorator to turn a user defined function into a BigQuery remote function. + """Decorator to turn a user defined function into a BigQuery remote function. Check out + the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. .. 
note:: Please make sure following is setup before using this API: diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 3b90568450..a0f4182e6f 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -18,7 +18,7 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, dataset_id): - model = bigframes.ml.linear_model.LinearRegression(fit_intercept=False) + model = bigframes.ml.linear_model.LinearRegression() df = penguins_df_default_index.dropna() X_train = df[ @@ -55,15 +55,24 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase assert ( f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) - - # TODO(yunmengxie): enable this once b/277242951 (fit_intercept missing from API) is fixed - # assert reloaded_model.fit_intercept == False + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.fit_intercept is True + assert reloaded_model.calculate_p_values is False + assert reloaded_model.early_stop is True + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learn_rate_strategy == "line_search" + assert reloaded_model.ls_init_learn_rate == 0.1 + assert reloaded_model.max_iterations == 20 + assert reloaded_model.min_rel_progress == 0.01 -def test_linear_regression_manual_split_configure_fit_score( +def test_linear_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): - model = bigframes.ml.linear_model.LinearRegression(fit_intercept=True) + model = bigframes.ml.linear_model.LinearRegression( + fit_intercept=False, l2_reg=0.1, min_rel_progress=0.01 + ) df = penguins_df_default_index.dropna() X_train = df[ @@ -83,12 +92,12 @@ def test_linear_regression_manual_split_configure_fit_score( result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { - "mean_absolute_error": [225.735767], - "mean_squared_error": [80417.461828], - "mean_squared_log_error": [0.004967], - "median_absolute_error": [172.543702], - "r2_score": [0.87548], - "explained_variance": [0.87548], + "mean_absolute_error": [226.108411], + "mean_squared_error": [80459.668456], + "mean_squared_log_error": [0.00497], + "median_absolute_error": [171.618872], + "r2_score": [0.875415], + "explained_variance": [0.875417], }, dtype="Float64", ) @@ -100,13 +109,21 @@ def test_linear_regression_manual_split_configure_fit_score( assert ( f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) - assert reloaded_model.fit_intercept is True - - -def test_logistic_regression_auto_class_weights_configure_fit_score( - penguins_df_default_index, dataset_id -): + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" + assert reloaded_model.fit_intercept is False + assert reloaded_model.calculate_p_values is False + assert reloaded_model.early_stop is True + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l2_reg == 0.1 + assert reloaded_model.learn_rate_strategy == "line_search" + assert reloaded_model.ls_init_learn_rate == 0.1 + assert reloaded_model.max_iterations == 20 + assert reloaded_model.min_rel_progress == 0.01 + + +def test_logistic_regression_configure_fit_score(penguins_df_default_index, dataset_id): model = bigframes.ml.linear_model.LogisticRegression() + df = penguins_df_default_index.dropna() X_train = df[ [ @@ -115,6 +132,7 @@ def 
test_logistic_regression_auto_class_weights_configure_fit_score( "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", + "body_mass_g", ] ] y_train = df[["sex"]] @@ -124,12 +142,12 @@ def test_logistic_regression_auto_class_weights_configure_fit_score( result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { - "precision": [0.58085], - "recall": [0.582576], - "accuracy": [0.871257], - "f1_score": [0.58171], - "log_loss": [1.59285], - "roc_auc": [0.9602], + "precision": [0.616753], + "recall": [0.618615], + "accuracy": [0.92515], + "f1_score": [0.617681], + "log_loss": [1.498832], + "roc_auc": [0.975807], }, dtype="Float64", ) @@ -145,15 +163,15 @@ def test_logistic_regression_auto_class_weights_configure_fit_score( in reloaded_model._bqml_model.model_name ) assert reloaded_model.fit_intercept is True - # TODO(gaotianxiang): enable this once (auto_class_weights missing from API) is fixed - # assert reloaded_model.auto_class_weights is True + assert reloaded_model.class_weights is None -def test_logistic_regression_manual_split_configure_fit_score( +def test_logistic_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): - model = bigframes.ml.linear_model.LogisticRegression(fit_intercept=True) - + model = bigframes.ml.linear_model.LogisticRegression( + fit_intercept=False, class_weights="balanced" + ) df = penguins_df_default_index.dropna() X_train = df[ [ @@ -162,7 +180,6 @@ def test_logistic_regression_manual_split_configure_fit_score( "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", - "body_mass_g", ] ] y_train = df[["sex"]] @@ -172,12 +189,12 @@ def test_logistic_regression_manual_split_configure_fit_score( result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { - "precision": [0.616753], - "recall": [0.618615], - "accuracy": [0.92515], - "f1_score": [0.617681], - "log_loss": [1.498832], - "roc_auc": [0.975807], + "precision": [0.58483], + "recall": [0.586616], + "accuracy": [0.877246], + "f1_score": [0.58571], + "log_loss": [1.032699], + "roc_auc": [0.924132], }, dtype="Float64", ) @@ -192,5 +209,5 @@ def test_logistic_regression_manual_split_configure_fit_score( f"{dataset_id}.temp_configured_logistic_reg_model" in reloaded_model._bqml_model.model_name ) - assert reloaded_model.fit_intercept is True - assert reloaded_model.auto_class_weights is False + assert reloaded_model.fit_intercept is False + assert reloaded_model.class_weights == "balanced" diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a85777c59d..ed682c855b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -411,6 +411,30 @@ def test_assign_new_column_w_setitem(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) +def test_assign_new_column_w_setitem_dataframe(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["int64_col"] = bf_df["int64_too"].to_frame() + pd_df["int64_col"] = pd_df["int64_too"].to_frame() + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
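+    # (BigQuery DataFrames represents integer columns with the nullable pandas Int64 extension dtype.)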
+ pd_df["int64_col"] = pd_df["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + bf_df["impossible_col"] = bf_df[["int64_too", "string_col"]] + with pytest.raises(ValueError): + pd_df["impossible_col"] = pd_df[["int64_too", "string_col"]] + + def test_assign_new_column_w_setitem_list(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_df = scalars_df.copy() @@ -1996,6 +2020,30 @@ def test_df_skew(scalars_dfs): pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) +def test_df_kurt_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).kurt().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_df_kurt(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].kurt().to_pandas() + pd_result = scalars_pandas_df[columns].kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + @pytest.mark.parametrize( ("frac", "n", "random_state"), [ diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 18741468c5..05154f7ab7 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -238,6 +238,16 @@ def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) +def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.groupby("bool_col")["int64_too"].kurt().to_pandas() + # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 + pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply( + pd.Series.kurt + ) + + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index): col_names = ["float64_col", "int64_col", "bool_col"] bf_result = scalars_df_index[col_names].groupby("bool_col").skew().to_pandas() @@ -246,6 +256,20 @@ def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +def test_dataframe_groupby_kurt(scalars_df_index, scalars_pandas_df_index): + col_names = ["float64_col", "int64_col", "bool_col"] + bf_result = scalars_df_index[col_names].groupby("bool_col").kurt().to_pandas() + # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 + pd_result = ( + scalars_pandas_df_index[col_names] + .groupby("bool_col") + .apply(pd.Series.kurt) + .drop("bool_col", axis=1) + ) + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + def test_dataframe_groupby_diff(scalars_df_index, scalars_pandas_df_index): col_names = 
["float64_col", "int64_col", "string_col"] bf_result = scalars_df_index[col_names].groupby("string_col").diff(-1) diff --git a/tests/system/small/test_window.py b/tests/system/small/test_window.py index e2f0fe999b..2b9ec1a3c0 100644 --- a/tests/system/small/test_window.py +++ b/tests/system/small/test_window.py @@ -41,7 +41,9 @@ pytest.param(lambda x: x.var(), id="var"), ], ) -def test_window_agg_ops(scalars_df_index, scalars_pandas_df_index, windowing, agg_op): +def test_series_window_agg_ops( + scalars_df_index, scalars_pandas_df_index, windowing, agg_op +): col_name = "int64_too" bf_series = agg_op(windowing(scalars_df_index[col_name])).to_pandas() pd_series = agg_op(windowing(scalars_pandas_df_index[col_name])) @@ -53,3 +55,41 @@ def test_window_agg_ops(scalars_df_index, scalars_pandas_df_index, windowing, ag pd_series, bf_series, ) + + +@pytest.mark.parametrize( + ("windowing"), + [ + pytest.param(lambda x: x.expanding(), id="expanding"), + pytest.param(lambda x: x.rolling(3, min_periods=3), id="rolling"), + pytest.param( + lambda x: x.groupby(level=0).rolling(3, min_periods=3), id="rollinggroupby" + ), + pytest.param( + lambda x: x.groupby("int64_too").expanding(min_periods=2), + id="expandinggroupby", + ), + ], +) +@pytest.mark.parametrize( + ("agg_op"), + [ + pytest.param(lambda x: x.sum(), id="sum"), + pytest.param(lambda x: x.min(), id="min"), + pytest.param(lambda x: x.max(), id="max"), + pytest.param(lambda x: x.mean(), id="mean"), + pytest.param(lambda x: x.count(), id="count"), + pytest.param(lambda x: x.std(), id="std"), + pytest.param(lambda x: x.var(), id="var"), + ], +) +def test_dataframe_window_agg_ops( + scalars_df_index, scalars_pandas_df_index, windowing, agg_op +): + scalars_df_index = scalars_df_index.set_index("bool_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("bool_col") + col_names = ["int64_too", "float64_col"] + bf_result = agg_op(windowing(scalars_df_index[col_names])).to_pandas() + pd_result = agg_op(windowing(scalars_pandas_df_index[col_names])) + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 8d4932a3c3..584d080d42 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -14,34 +14,146 @@ from unittest import mock +from google.cloud import bigquery +import pandas as pd +import pytest import pytest_mock import bigframes -from bigframes.ml import linear_model +from bigframes.ml import core, linear_model import bigframes.pandas as bpd -def test_linear_regression_default_fit(mocker: pytest_mock.MockerFixture): +@pytest.fixture +def mock_session(): mock_session = mock.create_autospec(spec=bigframes.Session) - mock_X = mock.create_autospec(spec=bpd.DataFrame) - mock_X._get_block().expr._session = mock_session + # return values we don't care about, but need to provide to continue the program when calling session._start_query() + mock_session._start_query.return_value = (None, mock.MagicMock()) + + return mock_session + +@pytest.fixture +def mock_y(): mock_y = mock.create_autospec(spec=bpd.DataFrame) - mock_y.columns.tolist.return_value = ["input_label_column"] + mock_y.columns = pd.Index(["input_column_label"]) - mock_X.join(mock_y).sql = "input_dataframe_sql" + return mock_y - # return values we don't care about, but need to provide to continue the program - mock_session._start_query.return_value = (None, mock.MagicMock()) +@pytest.fixture +def mock_X(mock_y, mock_session): + mock_X = 
mock.create_autospec(spec=bpd.DataFrame) + mock_X._session = mock_session + mock_X._to_sql_query.return_value = ( + "input_X_sql", + ["index_column_id"], + ["index_column_label"], + ) + mock_X.join(mock_y).sql = "input_X_y_sql" + mock_X.join(mock_y)._to_sql_query.return_value = ( + "input_X_y_sql", + ["index_column_id"], + ["index_column_label"], + ) + + return mock_X + + +@pytest.fixture +def bqml_model(mock_session): + bqml_model = core.BqmlModel( + mock_session, bigquery.Model("model_project.model_dataset.model_name") + ) + + return bqml_model + + +@pytest.fixture +def ml_mocker(mocker: pytest_mock.MockerFixture): mocker.patch( "bigframes.ml.core._create_temp_model_name", return_value="temp_model_name" ) + return mocker + + +def test_linear_regression_default_fit(ml_mocker, mock_session, mock_X, mock_y): + model = linear_model.LinearRegression() + model.fit(mock_X, mock_y) + + mock_session._start_query.assert_called_once_with( + 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + ) + + +def test_linear_regression_params_fit(ml_mocker, mock_session, mock_X, mock_y): + model = linear_model.LinearRegression(fit_intercept=False) + model.fit(mock_X, mock_y) + + mock_session._start_query.assert_called_once_with( + 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + ) + + +def test_linear_regression_predict(mock_session, bqml_model, mock_X): + model = linear_model.LinearRegression() + model._bqml_model = bqml_model + model.predict(mock_X) + + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_name`,\n (input_X_sql))", + index_col=["index_column_id"], + ) + + +def test_linear_regression_score(mock_session, bqml_model, mock_X, mock_y): model = linear_model.LinearRegression() + model._bqml_model = bqml_model + model.score(mock_X, mock_y) + + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_name`,\n (input_X_y_sql))" + ) + + +def test_logistic_regression_default_fit(ml_mocker, mock_session, mock_X, mock_y): + model = linear_model.LogisticRegression() model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n INPUT_LABEL_COLS=["input_label_column"])\nAS input_dataframe_sql' + 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + ) + + +def test_logistic_regression_params_fit(ml_mocker, mock_session, mock_X, mock_y): + model = linear_model.LogisticRegression( + fit_intercept=False, class_weights="balanced" + ) 
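+    # class_weights="balanced" should surface as auto_class_weights=True in the generated CREATE MODEL SQL below.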
+ model.fit(mock_X, mock_y) + + mock_session._start_query.assert_called_once_with( + 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + ) + + +def test_logistic_regression_predict(mock_session, bqml_model, mock_X): + model = linear_model.LogisticRegression() + model._bqml_model = bqml_model + model.predict(mock_X) + + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_name`,\n (input_X_sql))", + index_col=["index_column_id"], + ) + + +def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): + model = linear_model.LogisticRegression() + model._bqml_model = bqml_model + model.score(mock_X, mock_y) + + mock_session.read_gbq.assert_called_once_with( + "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_name`,\n (input_X_y_sql))" ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 113c6547a0..27cc2144e0 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1569,6 +1569,21 @@ def skew(self, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def kurt(self, *, numeric_only: bool = False): + """Return unbiased kurtosis over requested axis. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Args: + numeric_only (bool, default False): + Include only float, int, boolean columns. + + Returns: + Series + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def std(self, *, numeric_only: bool = False): """Return sample standard deviation over requested axis. diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 7d496891b0..27d2e84537 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -511,6 +511,61 @@ def rank( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def rolling( + self, + window, + min_periods: int | None = None, + ): + """ + Provide rolling window calculations. + + Args: + window (int, timedelta, str, offset, or BaseIndexer subclass): + Size of the moving window. + + If an integer, the fixed number of observations used for + each window. + + If a timedelta, str, or offset, the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetime-like indexes. + To learn more about the offsets & frequency strings, please see `this link + `__. + + If a BaseIndexer subclass, the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely ``min_periods``, ``center``, ``closed`` and + ``step`` will be passed to ``get_window_bounds``. + + min_periods (int, default None): + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + For a window that is specified by an offset, ``min_periods`` will default to 1. + + For a window that is specified by an integer, ``min_periods`` will default + to the size of the window. 
+ + Returns: + bigframes.core.window.Window: ``Window`` subclass if a ``win_type`` is passed. + ``Rolling`` subclass if ``win_type`` is not passed. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def expanding(self, min_periods=1): + """ + Provide expanding window calculations. + + Args: + min_periods (int, default 1): + Minimum number of observations in window required to have a value; + otherwise, result is ``np.nan``. + + Returns: + bigframes.core.window.Window: ``Expanding`` subclass. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def __nonzero__(self): raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 7849a3afd5..b05319b4f7 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -144,6 +144,27 @@ def skew( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def kurt( + self, + *, + numeric_only: bool = False, + ): + """ + Return unbiased kurtosis over requested axis. + + Kurtosis obtained using Fisher's definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Args: + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame + Variance of values within each group. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def sum( self, numeric_only: bool = False, diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c6dd973372..d58c1ccc3b 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1673,61 +1673,6 @@ def rename_axis(self, mapper, **kwargs): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rolling( - self, - window, - min_periods: int | None = None, - ): - """ - Provide rolling window calculations. - - Args: - window (int, timedelta, str, offset, or BaseIndexer subclass): - Size of the moving window. - - If an integer, the fixed number of observations used for - each window. - - If a timedelta, str, or offset, the time period of each window. Each - window will be a variable sized based on the observations included in - the time-period. This is only valid for datetime-like indexes. - To learn more about the offsets & frequency strings, please see `this link - `__. - - If a BaseIndexer subclass, the window boundaries - based on the defined ``get_window_bounds`` method. Additional rolling - keyword arguments, namely ``min_periods``, ``center``, ``closed`` and - ``step`` will be passed to ``get_window_bounds``. - - min_periods (int, default None): - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. - - For a window that is specified by an offset, ``min_periods`` will default to 1. - - For a window that is specified by an integer, ``min_periods`` will default - to the size of the window. - - Returns: - bigframes.core.window.Window: ``Window`` subclass if a ``win_type`` is passed. - ``Rolling`` subclass if ``win_type`` is not passed. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - - def expanding(self, min_periods=1): - """ - Provide expanding window calculations. 
- - Args: - min_periods (int, default 1): - Minimum number of observations in window required to have a value; - otherwise, result is ``np.nan``. - - Returns: - bigframes.core.window.Window: ``Expanding`` subclass. - """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def value_counts( self, normalize: bool = False, diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 81b4fca157..8dc3b6280a 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -64,10 +64,30 @@ class LinearRegression(RegressorMixin, LinearModel): the dataset, and the targets predicted by the linear approximation. Args: - fit_intercept (default True): + optimize_strategy (str, default "normal_equation"): + The strategy to train linear regression models. Possible values are + "auto_strategy", "batch_gradient_descent", "normal_equation". Default + to "normal_equation". + fit_intercept (bool, default True): Default ``True``. Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). + l2_reg (float, default 0.0): + The amount of L2 regularization applied. Default to 0. + max_iterations (int, default 20): + The maximum number of training iterations or steps. Default to 20. + learn_rate_strategy (str, default "line_search"): + The strategy for specifying the learning rate during training. Default to "line_search". + early_stop (bool, default True): + Whether training should stop after the first iteration in which the relative loss improvement is less than the value specified for min_rel_progress. Default to True. + min_rel_progress (float, default 0.01): + The minimum relative loss improvement that is necessary to continue training when EARLY_STOP is set to true. For example, a value of 0.01 specifies that each iteration must reduce the loss by 1% for training to continue. Default to 0.01. + ls_init_learn_rate (float, default 0.1): + Sets the initial learning rate that learn_rate_strategy='line_search' uses. This option can only be used if line_search is specified. Default to 0.1. + calculate_p_values (bool, default False): + Specifies whether to compute p-values and standard errors during training. Default to False. + enable_global_explain (bool, default False): + Whether to compute global explanations using explainable AI to evaluate global feature importance to the model. Default to False. """ def fit( diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 133dc4498e..989ca03c82 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -28,9 +28,14 @@ class LogisticRegression(LinearClassifierMixin, BaseEstimator): fit_intercept (default True): Default True. Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - auto_class_weights (default False): - Default False. If True, balance class labels using weights for each - class in inverse proportion to the frequency of that class. + class_weights (dict or 'balanced', default None): + Default None. Weights associated with classes in the form + ``{class_label: weight}``.If not given, all classes are supposed + to have weight one. 
The "balanced" mode uses the values of y to + automatically adjust weights inversely proportional to class + frequencies in the input data as + ``n_samples / (n_classes * np.bincount(y))``. Dict isn't + supported now. """ def fit( From 5056da6b385dbcfc179d2bcbb6549fa539428cda Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 20 Sep 2023 15:16:38 -0700 Subject: [PATCH 04/24] perf: simplify join order to use multiple order keys instead of string. (#36) Change-Id: I8c37e9296b2e4e0ea87f6a7e836d48988d161d37 --- bigframes/core/joins/single_column.py | 118 +++++++++++--------------- 1 file changed, 50 insertions(+), 68 deletions(-) diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py index 4c865fffdf..7aba71fd95 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/joins/single_column.py @@ -63,7 +63,6 @@ def join_by_column( allow_row_identity_join (bool): If True, allow matching by row identity. Set to False to always perform a true JOIN in generated SQL. - Returns: The joined expression and the objects needed to interpret it. @@ -123,13 +122,13 @@ def join_by_column( ), ) else: - # Generate offsets if non-default ordering is applied - # Assumption, both sides are totally ordered, otherwise offsets will be nondeterministic left_table = left.to_ibis_expr( - ordering_mode="string_encoded", order_col_name=core.ORDER_ID_COLUMN + ordering_mode="unordered", + expose_hidden_cols=True, ) right_table = right.to_ibis_expr( - ordering_mode="string_encoded", order_col_name=core.ORDER_ID_COLUMN + ordering_mode="unordered", + expose_hidden_cols=True, ) join_conditions = [ value_to_join_key(left_table[left_index]) @@ -178,41 +177,13 @@ def get_column_right(key: str) -> str: return key - left_ordering_encoding_size = ( - left._ordering.string_encoding.length - if left._ordering.is_string_encoded - else bigframes.core.ordering.DEFAULT_ORDERING_ID_LENGTH - ) - right_ordering_encoding_size = ( - right._ordering.string_encoding.length - if right._ordering.is_string_encoded - else bigframes.core.ordering.DEFAULT_ORDERING_ID_LENGTH - ) - - # Preserve original ordering accross joins. - left_order_id = get_column_left(core.ORDER_ID_COLUMN) - right_order_id = get_column_right(core.ORDER_ID_COLUMN) - new_order_id_col = _merge_order_ids( - typing.cast(ibis_types.StringColumn, combined_table[left_order_id]), - left_ordering_encoding_size, - typing.cast(ibis_types.StringColumn, combined_table[right_order_id]), - right_ordering_encoding_size, - how, - ) - new_order_id = new_order_id_col.get_name() - if new_order_id is None: - raise ValueError("new_order_id unexpectedly has no name") - - hidden_columns = (new_order_id_col,) - ordering = core.ExpressionOrdering( - # Order id is non-nullable but na_last=False generates simpler sql with current impl - ordering_value_columns=[ - core.OrderingColumnReference(new_order_id, na_last=False) - ], - total_ordering_columns=frozenset([new_order_id]), - string_encoding=core.StringEncoding( - True, left_ordering_encoding_size + right_ordering_encoding_size - ), + # Preserve ordering accross joins. 
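# Illustrative sketch (editorial, not from the diff): the idea behind the
# join_orderings() helper called below, reduced to plain lists. The joined
# ordering is simply the left side's remapped order keys followed by the
# right side's (or the reverse for a right join), instead of one
# string-concatenated order id.
def sketch_join_orderings(left_keys, right_keys, left_map, right_map, left_dominates=True):
    left = [left_map(key) for key in left_keys]
    right = [right_map(key) for key in right_keys]
    return left + right if left_dominates else right + left

assert sketch_join_orderings(["a"], ["b", "c"], "l_{}".format, "r_{}".format) == ["l_a", "r_b", "r_c"]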
+ ordering = join_orderings( + left._ordering, + right._ordering, + get_column_left, + get_column_right, + left_order_dominates=(how != "right"), ) left_join_keys = [ @@ -234,11 +205,21 @@ def get_column_right(key: str) -> str: for col in right.columns ] ) + hidden_ordering_columns = [ + *[ + combined_table[get_column_left(col.get_name())] + for col in left.hidden_ordering_columns + ], + *[ + combined_table[get_column_right(col.get_name())] + for col in right.hidden_ordering_columns + ], + ] combined_expr = core.ArrayValue( left._session, combined_table, columns=columns, - hidden_ordering_columns=hidden_columns, + hidden_ordering_columns=hidden_ordering_columns, ordering=ordering, ) if sort: @@ -313,32 +294,33 @@ def value_to_join_key(value: ibis_types.Value): return value.fillna(ibis_types.literal("$NULL_SENTINEL$")) -def _merge_order_ids( - left_id: ibis_types.StringColumn, - left_encoding_size: int, - right_id: ibis_types.StringColumn, - right_encoding_size: int, - how: str, -) -> ibis_types.StringColumn: - if how == "right": - return _merge_order_ids( - right_id, right_encoding_size, left_id, left_encoding_size, "left" - ) +def join_orderings( + left: core.ExpressionOrdering, + right: core.ExpressionOrdering, + left_id_mapping: Callable[[str], str], + right_id_mapping: Callable[[str], str], + left_order_dominates: bool = True, +) -> core.ExpressionOrdering: + left_ordering_refs = [ + ref.with_name(left_id_mapping(ref.column_id)) + for ref in left.all_ordering_columns + ] + right_ordering_refs = [ + ref.with_name(right_id_mapping(ref.column_id)) + for ref in right.all_ordering_columns + ] + if left_order_dominates: + joined_refs = [*left_ordering_refs, *right_ordering_refs] + else: + joined_refs = [*right_ordering_refs, *left_ordering_refs] - if how == "left": - right_id = typing.cast( - ibis_types.StringColumn, - right_id.fillna(ibis_types.literal(":" * right_encoding_size)), - ) - elif how != "inner": # outer join - left_id = typing.cast( - ibis_types.StringColumn, - left_id.fillna(ibis_types.literal(":" * left_encoding_size)), - ) - right_id = typing.cast( - ibis_types.StringColumn, - right_id.fillna(ibis_types.literal(":" * right_encoding_size)), - ) - return (left_id + right_id).name( - bigframes.core.guid.generate_guid(prefix="bigframes_ordering_id_") + left_total_order_cols = frozenset( + [left_id_mapping(id) for id in left.total_ordering_columns] + ) + right_total_order_cols = frozenset( + [right_id_mapping(id) for id in right.total_ordering_columns] + ) + return core.ExpressionOrdering( + ordering_value_columns=joined_refs, + total_ordering_columns=left_total_order_cols | right_total_order_cols, ) From edabdbb131150707ea9211292cacbb60b8d076dd Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 20 Sep 2023 17:22:07 -0700 Subject: [PATCH 05/24] fix: loosen filter items tests to accomodate shifting pandas impl (#41) --- tests/system/small/test_dataframe.py | 10 ++++++---- tests/system/small/test_series.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ed682c855b..6c96387e97 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2129,10 +2129,10 @@ def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas() pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"]) - + # Ignore column ordering as pandas order 
differently depending on version pd.testing.assert_frame_equal( - bf_result, - pd_result, + bf_result.sort_index(axis=1), + pd_result.sort_index(axis=1), ) @@ -2167,9 +2167,11 @@ def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( + # Ignore ordering as pandas order differently depending on version + assert_pandas_df_equal_ignore_ordering( bf_result, pd_result, + check_names=False, ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d3560540cc..d702049e68 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1950,9 +1950,11 @@ def test_series_filter_items(scalars_df_index, scalars_pandas_df_index): # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_series_equal( + # Ignore ordering as pandas order differently depending on version + assert_series_equal_ignoring_order( bf_result, pd_result, + check_names=False, ) From 109ee24108875389a654674569583e2c7a32d853 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 20 Sep 2023 19:52:36 -0700 Subject: [PATCH 06/24] refactor: remove ibis references outside of arrayvalue code. (#37) Change-Id: I1386355446e90f89a43cee8a9f447f0775639902 --- bigframes/core/__init__.py | 59 ++++++++++++++----- bigframes/core/blocks.py | 84 ++++++++++++++++++--------- bigframes/core/indexers.py | 44 +++++--------- bigframes/core/joins/single_column.py | 4 +- bigframes/core/scalar.py | 50 +--------------- bigframes/dataframe.py | 46 +++------------ bigframes/operations/__init__.py | 34 ++++++++--- bigframes/series.py | 16 +---- tests/unit/test_core.py | 12 ++-- 9 files changed, 160 insertions(+), 189 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 8e7beb73db..dd91f80e63 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -269,7 +269,7 @@ def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) def apply_limit(self, max_results: int) -> ArrayValue: - table = self.to_ibis_expr( + table = self._to_ibis_expr( ordering_mode="order_by", expose_hidden_cols=True, ).limit(max_results) @@ -285,11 +285,23 @@ def apply_limit(self, max_results: int) -> ArrayValue: ordering=self._ordering, ) - def filter(self, predicate: ibis_types.BooleanValue) -> ArrayValue: + def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + condition = typing.cast(ibis_types.BooleanValue, self.get_column(predicate_id)) + if keep_null: + condition = typing.cast( + ibis_types.BooleanValue, + condition.fillna( + typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) + ), + ) + return self._filter(condition) + + def _filter(self, predicate_value: ibis_types.BooleanValue) -> ArrayValue: """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" expr = self.builder() expr.ordering = expr.ordering.with_non_sequential() - expr.predicates = [*self._predicates, predicate] + expr.predicates = [*self._predicates, predicate_value] return expr.build() def order_by( @@ -310,7 +322,7 @@ def _uniform_sampling(self, fraction: float) -> 
ArrayValue: .. warning:: The row numbers of result is non-deterministic, avoid to use. """ - table = self.to_ibis_expr( + table = self._to_ibis_expr( ordering_mode="order_by", expose_hidden_cols=True, fraction=fraction ) columns = [table[column_name] for column_name in self._column_names] @@ -342,7 +354,7 @@ def project_offsets(self) -> ArrayValue: if self._ordering.is_sequential: return self # TODO(tbergeron): Enforce total ordering - table = self.to_ibis_expr( + table = self._to_ibis_expr( ordering_mode="offset_col", order_col_name=ORDER_ID_COLUMN ) columns = [table[column_name] for column_name in self._column_names] @@ -412,7 +424,7 @@ def projection(self, columns: Iterable[ibis_types.Value]) -> ArrayValue: def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" width = len(self.columns) - count_expr = self.to_ibis_expr(ordering_mode="unordered").count() + count_expr = self._to_ibis_expr(ordering_mode="unordered").count() sql = self._session.ibis_client.compile(count_expr) row_iterator, _ = self._session._start_query( sql=sql, @@ -435,7 +447,7 @@ def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: ) for i, expr in enumerate([self, *other]): ordering_prefix = str(i).zfill(prefix_size) - table = expr.to_ibis_expr( + table = expr._to_ibis_expr( ordering_mode="string_encoded", order_col_name=ORDER_ID_COLUMN ) # Rename the value columns based on horizontal offset before applying union. @@ -522,7 +534,7 @@ def aggregate( by_column_id: column id of the aggregation key, this is preserved through the transform dropna: whether null keys should be dropped """ - table = self.to_ibis_expr(ordering_mode="unordered") + table = self._to_ibis_expr(ordering_mode="unordered") stats = { col_out: agg_op._as_ibis(table[col_in]) for col_in, agg_op, col_out in aggregations @@ -541,7 +553,7 @@ def aggregate( expr = ArrayValue(self._session, result, columns=columns, ordering=ordering) if dropna: for column_id in by_column_ids: - expr = expr.filter( + expr = expr._filter( ops.notnull_op._as_ibis(expr.get_column(column_id)) ) # Can maybe remove this as Ordering id is redundant as by_column is unique after aggregation @@ -572,7 +584,7 @@ def corr_aggregate( Arguments: corr_aggregations: left_column_id, right_column_id, output_column_id tuples """ - table = self.to_ibis_expr(ordering_mode="unordered") + table = self._to_ibis_expr(ordering_mode="unordered") stats = { col_out: table[col_left].corr(table[col_right], how="pop") for col_left, col_right, col_out in corr_aggregations @@ -646,7 +658,24 @@ def project_window_op( # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. return result._reproject_to_table() if not skip_reproject_unsafe else result - def to_ibis_expr( + def to_sql( + self, + ordering_mode: Literal[ + "order_by", "string_encoded", "offset_col", "unordered" + ] = "order_by", + order_col_name: Optional[str] = ORDER_ID_COLUMN, + col_id_overrides: typing.Mapping[str, str] = {}, + ) -> str: + sql = self._session.ibis_client.compile( + self._to_ibis_expr( + ordering_mode=ordering_mode, + order_col_name=order_col_name, + col_id_overrides=col_id_overrides, + ) + ) + return typing.cast(str, sql) + + def _to_ibis_expr( self, ordering_mode: Literal[ "order_by", "string_encoded", "offset_col", "unordered" @@ -814,7 +843,7 @@ def start_query( # a LocalSession for unit testing. # TODO(swast): Add a timeout here? 
If the query is taking a long time, # maybe we just print the job metadata that we have so far? - table = self.to_ibis_expr(expose_hidden_cols=expose_extra_columns) + table = self._to_ibis_expr(expose_hidden_cols=expose_extra_columns) sql = self._session.ibis_client.compile(table) # type:ignore return self._session._start_query( sql=sql, @@ -833,7 +862,7 @@ def _reproject_to_table(self) -> ArrayValue: some operations such as window operations that cannot be used recursively in projections. """ - table = self.to_ibis_expr( + table = self._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True, ) @@ -912,7 +941,7 @@ def unpivot( Returns: ArrayValue: The unpivoted ArrayValue """ - table = self.to_ibis_expr(ordering_mode="offset_col") + table = self._to_ibis_expr(ordering_mode="offset_col") sub_expressions = [] # Use ibis memtable to infer type of rowlabels (if possible) @@ -1054,7 +1083,7 @@ def slice( start = start if (start is not None) else last_offset cond_list.append((start - expr_with_offsets.offsets) % (-step) == 0) - sliced_expr = expr_with_offsets.filter( + sliced_expr = expr_with_offsets._filter( functools.reduce(lambda x, y: x & y, cond_list) ) return sliced_expr if step > 0 else sliced_expr.reversed() diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index fb9ede9f4c..e691a30f9c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -30,8 +30,6 @@ import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import ibis.expr.schema as ibis_schema -import ibis.expr.types as ibis_types import numpy import pandas as pd import pyarrow as pa # type: ignore @@ -42,6 +40,7 @@ import bigframes.core.indexes as indexes import bigframes.core.ordering as ordering import bigframes.core.utils +import bigframes.core.utils as utils import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -368,7 +367,10 @@ def reorder_levels(self, ids: typing.Sequence[str]): level_names = [self.col_id_to_index_name[index_id] for index_id in ids] return Block(self.expr, ids, self.column_labels, level_names) - def _to_dataframe(self, result, schema: ibis_schema.Schema) -> pd.DataFrame: + @classmethod + def _to_dataframe( + cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype] + ) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" df = result.to_dataframe( bool_dtype=pd.BooleanDtype(), @@ -382,8 +384,8 @@ def _to_dataframe(self, result, schema: ibis_schema.Schema) -> pd.DataFrame: ) # Convert Geography column from StringDType to GeometryDtype. 
- for column_name, ibis_dtype in schema.items(): - if ibis_dtype.is_geospatial(): + for column_name, dtype in schema.items(): + if dtype == gpd.array.GeometryDtype(): df[column_name] = gpd.GeoSeries.from_wkt( # https://github.com/geopandas/geopandas/issues/1879 df[column_name].replace({numpy.nan: None}), @@ -473,7 +475,8 @@ def _compute_and_count( if sampling_method == _HEAD: total_rows = int(results_iterator.total_rows * fraction) results_iterator.max_results = total_rows - df = self._to_dataframe(results_iterator, expr.to_ibis_expr().schema()) + schema = dict(zip(self.value_columns, self.dtypes)) + df = self._to_dataframe(results_iterator, schema) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) @@ -508,7 +511,8 @@ def _compute_and_count( ) else: total_rows = results_iterator.total_rows - df = self._to_dataframe(results_iterator, expr.to_ibis_expr().schema()) + schema = dict(zip(self.value_columns, self.dtypes)) + df = self._to_dataframe(results_iterator, schema) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) @@ -639,13 +643,6 @@ def with_index_labels(self, value: typing.Sequence[Label]) -> Block: index_labels=tuple(value), ) - def get_value_col_exprs( - self, column_names: Optional[Sequence[str]] = None - ) -> List[ibis_types.Value]: - """Retrive value column expressions.""" - column_names = self.value_columns if column_names is None else column_names - return [self._expr.get_column(column_name) for column_name in column_names] - def apply_unary_op( self, column: str, op: ops.UnaryOp, result_label: Label = None ) -> typing.Tuple[Block, str]: @@ -816,20 +813,9 @@ def assign_label(self, column_id: str, new_label: Label) -> Block: ) return self.with_column_labels(new_labels) - def filter(self, column_name: str, keep_null: bool = False): - condition = typing.cast( - ibis_types.BooleanValue, self._expr.get_column(column_name) - ) - if keep_null: - condition = typing.cast( - ibis_types.BooleanValue, - condition.fillna( - typing.cast(ibis_types.BooleanScalar, ibis_types.literal(True)) - ), - ) - filtered_expr = self.expr.filter(condition) + def filter(self, column_id: str, keep_null: bool = False): return Block( - filtered_expr, + self._expr.filter(column_id, keep_null), index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self.index.names, @@ -1436,6 +1422,50 @@ def is_monotonic_decreasing( ) -> bool: return self._is_monotonic(column_id, increasing=False) + def to_sql_query( + self, include_index: bool + ) -> typing.Tuple[str, list[str], list[Label]]: + """ + Compiles this DataFrame's expression tree to SQL, optionally + including index columns. + + Args: + include_index (bool): + whether to include index columns. + + Returns: + a tuple of (sql_string, index_column_id_list, index_column_label_list). + If include_index is set to False, index_column_id_list and index_column_label_list + return empty lists. 
+ """ + array_value = self._expr + col_labels, idx_labels = list(self.column_labels), list(self.index_labels) + old_col_ids, old_idx_ids = list(self.value_columns), list(self.index_columns) + + if not include_index: + idx_labels, old_idx_ids = [], [] + array_value = array_value.drop_columns(self.index_columns) + + old_ids = old_idx_ids + old_col_ids + + new_col_ids, new_idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + new_ids = new_idx_ids + new_col_ids + + substitutions = {} + for old_id, new_id in zip(old_ids, new_ids): + # TODO(swast): Do we need to further escape this, or can we rely on + # the BigQuery unicode column name feature? + substitutions[old_id] = new_id + + sql = array_value.to_sql( + ordering_mode="unordered", col_id_overrides=substitutions + ) + return ( + sql, + new_ids[: len(idx_labels)], + idx_labels, + ) + def _is_monotonic( self, column_ids: typing.Union[str, Sequence[str]], increasing: bool ) -> bool: diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 28bce05338..a538c80711 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -21,11 +21,11 @@ import pandas as pd import bigframes.constants as constants -import bigframes.core as core import bigframes.core.guid as guid import bigframes.core.indexes as indexes import bigframes.core.scalar import bigframes.dataframe +import bigframes.operations as ops import bigframes.series if typing.TYPE_CHECKING: @@ -59,35 +59,23 @@ def __setitem__(self, key, value) -> None: # Assume the key is for the index label. block = self._series._block - value_column = self._series._value - index_column = block.expr.get_column(block.index_columns[0]) - new_value = ( - ibis.case() - .when( - index_column == ibis.literal(key, index_column.type()), - ibis.literal(value, value_column.type()), - ) - .else_(value_column) - .end() - .name(value_column.get_name()) + value_column = self._series._value_column + index_column = block.index_columns[0] + + # if index == key return value else value_colum + block, insert_cond = block.apply_unary_op( + index_column, ops.partial_right(ops.eq_op, key) ) - all_columns = [] - for column in block.expr.columns: - if column.get_name() != value_column.get_name(): - all_columns.append(column) - else: - all_columns.append(new_value) - new_expr = block.expr.projection(all_columns) - - # TODO(tbergeron): Use block operators rather than directly building desired ibis expressions. 
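# Illustrative sketch (editorial, not from the diff): the user-level behaviour
# this indexer rewrite preserves. Label-based scalar assignment is now
# composed from block operations (eq -> where -> copy_values) rather than
# hand-built ibis expressions; the data is invented and a configured session
# is assumed.
import bigframes.pandas as bpd

s = bpd.Series([10, 20, 30])
s.loc[1] = 99  # rows whose index label equals 1 receive the new value
print(s.to_pandas())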
- self._series._set_block( - core.blocks.Block( - new_expr, - self._series._block.index_columns, - self._series._block.column_labels, - self._series._block.index.names, - ) + block, result_id = block.apply_binary_op( + insert_cond, + self._series._value_column, + ops.partial_arg1(ops.where_op, value), ) + block = block.copy_values(result_id, value_column).drop_columns( + [insert_cond, result_id] + ) + + self._series._set_block(block) class IlocSeriesIndexer: diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py index 7aba71fd95..434cc2cd79 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/joins/single_column.py @@ -122,11 +122,11 @@ def join_by_column( ), ) else: - left_table = left.to_ibis_expr( + left_table = left._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True, ) - right_table = right.to_ibis_expr( + right_table = right._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True, ) diff --git a/bigframes/core/scalar.py b/bigframes/core/scalar.py index 6dfbd31b77..5db83b4a62 100644 --- a/bigframes/core/scalar.py +++ b/bigframes/core/scalar.py @@ -14,55 +14,7 @@ from __future__ import annotations -import typing -from typing import Any, Optional - -import google.cloud.bigquery as bigquery -import ibis.expr.types as ibis_types - -import bigframes -import bigframes.formatting_helpers as formatter - -if typing.TYPE_CHECKING: - import bigframes.session - - -class DeferredScalar: - """A deferred scalar object.""" - - def __init__(self, value: ibis_types.Scalar, session: bigframes.session.Session): - self._value = value - self._session = session - self._query_job: Optional[bigquery.QueryJob] = None - - @property - def query_job(self) -> Optional[bigquery.QueryJob]: - """BigQuery job metadata for the most recent query.""" - if self._query_job is None: - self._query_job = self._compute_dry_run() - return self._query_job - - def __repr__(self) -> str: - """Converts a Series to a string.""" - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? - opts = bigframes.options.display - if opts.repr_mode == "deferred": - return formatter.repr_query_job(self.query_job) - else: - return repr(self.to_pandas()) - - def to_pandas(self) -> Any: - """Executes deferred operations and downloads the resulting scalar.""" - result, query_job = self._session._start_query(self._value.compile()) - self._query_job = query_job - df = self._session._rows_to_dataframe(result) - return df.iloc[0, 0] - - def _compute_dry_run(self): - job_config = bigquery.QueryJobConfig(dry_run=True) - return self._session._start_query(self._value.compile(), job_config=job_config) - +from typing import Any # All public APIs return Any at present # Later implementation may sometimes return a lazy scalar diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index de4adb912e..46c27eaccb 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -312,37 +312,7 @@ def _to_sql_query( If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. """ - # Has to be unordered as it is impossible to order the sql without - # including metadata columns in selection with ibis. 
- ibis_expr = self._block.expr.to_ibis_expr(ordering_mode="unordered") - col_labels, idx_labels = list(self._block.column_labels), list( - self._block.index_labels - ) - old_col_ids, old_idx_ids = list(self._block.value_columns), list( - self._block.index_columns - ) - - if not include_index: - idx_labels, old_idx_ids = [], [] - ibis_expr = ibis_expr.drop(*self._block.index_columns) - - old_ids = old_idx_ids + old_col_ids - - new_col_ids, new_idx_ids = utils.get_standardized_ids(col_labels, idx_labels) - new_ids = new_idx_ids + new_col_ids - - substitutions = {} - for old_id, new_id in zip(old_ids, new_ids): - # TODO(swast): Do we need to further escape this, or can we rely on - # the BigQuery unicode column name feature? - substitutions[old_id] = new_id - - ibis_expr = ibis_expr.relabel(substitutions) - return ( - typing.cast(str, ibis_expr.compile()), - new_ids[: len(idx_labels)], - idx_labels, - ) + return self._block.to_sql_query(include_index) @property def sql(self) -> str: @@ -2340,8 +2310,7 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: """Create query text representing this dataframe for I/O.""" - expr = self._block.expr - session = expr._session + array_value = self._block.expr columns = list(self._block.value_columns) column_labels = list(self._block.column_labels) # This code drops unnamed indexes to keep consistent with the behavior of @@ -2352,7 +2321,7 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: columns.extend(self._block.index_columns) column_labels.extend(self.index.names) else: - expr = expr.drop_columns(self._block.index_columns) + array_value = array_value.drop_columns(self._block.index_columns) # Make columns in SQL reflect _labels_ not _ids_. 
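# Hedged sketch (editorial, not from the diff): the user-visible path that now
# goes through Block.to_sql_query(), assuming a configured BigQuery session;
# the column names are invented.
import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
print(df.sql)  # the compiled SELECT statement for this DataFrame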
Note: This may use # the arbitrary unicode column labels feature in BigQuery, which is @@ -2365,19 +2334,17 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: } if ordering_id is not None: - ibis_expr = expr.to_ibis_expr( + return array_value.to_sql( ordering_mode="offset_col", col_id_overrides=id_overrides, order_col_name=ordering_id, ) else: - ibis_expr = expr.to_ibis_expr( + return array_value.to_sql( ordering_mode="unordered", col_id_overrides=id_overrides, ) - return session.ibis_client.compile(ibis_expr) # type: ignore - def _run_io_query( self, index: bool, @@ -2458,6 +2425,9 @@ def rank( df = self._drop_non_numeric() if numeric_only else self return DataFrame(block_ops.rank(df._block, method, na_option, ascending)) + def first_valid_index(self): + return + applymap = map def _slice( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index c5c55607ae..bc08298eb7 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -206,7 +206,7 @@ def _as_ibis(self, x: ibis_types.Value): class LenOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).length() + return typing.cast(ibis_types.StringValue, x).length().cast(ibis_dtypes.int64) class NotNullOp(UnaryOp): @@ -443,7 +443,7 @@ def _as_ibis(self, x: ibis_types.Value): ## Datetime Ops class DayOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).day() + return typing.cast(ibis_types.TimestampValue, x).day().cast(ibis_dtypes.int64) class DateOp(UnaryOp): @@ -453,32 +453,42 @@ def _as_ibis(self, x: ibis_types.Value): class DayofweekOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).day_of_week.index() + return ( + typing.cast(ibis_types.TimestampValue, x) + .day_of_week.index() + .cast(ibis_dtypes.int64) + ) class HourOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).hour() + return typing.cast(ibis_types.TimestampValue, x).hour().cast(ibis_dtypes.int64) class MinuteOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).minute() + return ( + typing.cast(ibis_types.TimestampValue, x).minute().cast(ibis_dtypes.int64) + ) class MonthOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).month() + return typing.cast(ibis_types.TimestampValue, x).month().cast(ibis_dtypes.int64) class QuarterOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).quarter() + return ( + typing.cast(ibis_types.TimestampValue, x).quarter().cast(ibis_dtypes.int64) + ) class SecondOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).second() + return ( + typing.cast(ibis_types.TimestampValue, x).second().cast(ibis_dtypes.int64) + ) class TimeOp(UnaryOp): @@ -488,7 +498,7 @@ def _as_ibis(self, x: ibis_types.Value): class YearOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.TimestampValue, x).year() + return typing.cast(ibis_types.TimestampValue, x).year().cast(ibis_dtypes.int64) # Parameterized ops @@ -969,6 +979,12 @@ def fillna_op( return x.fillna(typing.cast(ibis_types.Scalar, y)) +def round_op(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).round( + digits=typing.cast(ibis_types.IntegerValue, y) + ) 
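# Illustrative sketch (editorial, not from the diff): the Series-level entry
# point that delegates to round_op above; the data is invented and a
# configured session is assumed.
import bigframes.pandas as bpd

s = bpd.Series([1.234, 2.345, 3.456])
print(s.round(2).to_pandas())  # Series.round(decimals) now routes through ops.round_op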
+ + def clip_lower( value: ibis_types.Value, lower: ibis_types.Value, diff --git a/bigframes/series.py b/bigframes/series.py index 9db64fae9c..8f3a24698b 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -22,7 +22,6 @@ from typing import Any, Mapping, Optional, Tuple, Union import google.cloud.bigquery as bigquery -import ibis.expr.types as ibis_types import numpy import pandas import pandas.core.dtypes.common @@ -223,14 +222,6 @@ def __repr__(self) -> str: return repr(pandas_df.iloc[:, 0]) - def _to_ibis_expr(self): - """Creates an Ibis table expression representing the Series.""" - expr = self._block.expr.projection([self._value]) - ibis_expr = expr.to_ibis_expr()[self._value_column] - if self._name: - return ibis_expr.name(self._name) - return ibis_expr - def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], @@ -661,12 +652,7 @@ def abs(self) -> Series: return self._apply_unary_op(ops.abs_op) def round(self, decimals=0) -> "Series": - def round_op(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).round( - digits=typing.cast(ibis_types.IntegerValue, y) - ) - - return self._apply_binary_op(decimals, round_op) + return self._apply_binary_op(decimals, ops.round_op) def corr(self, other: Series, method="pearson", min_periods=None) -> float: """ diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 8f3e0beb0e..ee0cefb3d2 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -103,7 +103,7 @@ def test_arrayvalue_to_ibis_expr_with_projection(): value.table["col2"].name("string_col"), ] ) - actual = expr.to_ibis_expr() + actual = expr._to_ibis_expr() assert len(actual.columns) == 3 assert actual.columns[0] == "int64_col" assert actual.columns[1] == "literals" @@ -138,7 +138,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr.to_ibis_expr() + actual = expr._to_ibis_expr() assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" @@ -175,7 +175,7 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ) expr = value.project_binary_op("col2", "col3", ops.add_op, "col4") assert expr.columns[3].type().is_float64() - actual = expr.to_ibis_expr() + actual = expr._to_ibis_expr() assert len(expr.columns) == 4 assert actual.columns[3] == "col4" @@ -194,7 +194,7 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ) expr = value.project_ternary_op("col2", "col3", "col4", ops.where_op, "col5") assert expr.columns[4].type().is_float64() - actual = expr.to_ibis_expr() + actual = expr._to_ibis_expr() assert len(expr.columns) == 5 assert actual.columns[4] == "col5" @@ -215,7 +215,7 @@ def test_arrayvalue_to_ibis_expr_with_aggregate(): by_column_ids=["col1"], dropna=False, ) - actual = expr.to_ibis_expr() + actual = expr._to_ibis_expr() assert len(expr.columns) == 2 assert actual.columns[0] == "col1" assert actual.columns[1] == "col4" @@ -234,7 +234,7 @@ def test_arrayvalue_to_ibis_expr_with_corr_aggregate(): total_ordering_columns=["col1"], ) expr = value.corr_aggregate(corr_aggregations=[("col1", "col3", "col4")]) - actual = expr.to_ibis_expr() + actual = expr._to_ibis_expr() assert len(expr.columns) == 1 assert actual.columns[0] == "col4" assert expr.columns[0].type().is_float64() From 3adc1b3aa3e2b218d4fa5debdaa4298276bdf801 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 21 Sep 2023 09:56:41 -0700 Subject: [PATCH 
07/24] feat: add `items`, `apply` methods to `DataFrame`. (#43) Change-Id: Id3a0e78da3bb9ccce64e190f7797f737b239c33f Co-authored-by: Tim Swast --- bigframes/dataframe.py | 18 +++++++ tests/system/small/test_dataframe.py | 51 +++++++++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 34 +++++++++++++ 3 files changed, 103 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 46c27eaccb..0b741feff6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1418,6 +1418,12 @@ def isin(self, values) -> DataFrame: f"isin(), you passed a [{type(values).__name__}]" ) + def items(self): + column_ids = self._block.value_columns + column_labels = self._block.column_labels + for col_id, col_label in zip(column_ids, column_labels): + yield col_label, bigframes.series.Series(self._block.select_column(col_id)) + def dropna( self, *, @@ -2382,6 +2388,18 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: ops.RemoteFunctionOp(func, apply_on_null=(na_action is None)) ) + def apply(self, func, *, args: typing.Tuple = (), **kwargs): + results = {name: func(col, *args, **kwargs) for name, col in self.items()} + if all( + [ + isinstance(val, bigframes.series.Series) or utils.is_list_like(val) + for val in results.values() + ] + ): + return DataFrame(data=results) + else: + return pandas.Series(data=results) + def drop_duplicates( self, subset: typing.Union[blocks.Label, typing.Sequence[blocks.Label]] = None, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 6c96387e97..3eeb368ad2 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -663,6 +663,57 @@ def test_df_bfill(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_apply_series_series_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0): + return series**2 + (arg1 * arg2 % 4) + (kwarg1 * kwarg2 % 7) + + bf_result = ( + scalars_df_index[columns] + .apply(foo, args=(33, 61), kwarg1=52, kwarg2=21) + .to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply( + foo, args=(33, 61), kwarg1=52, kwarg2=21 + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_listlike_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = ( + scalars_df_index[columns].apply(lambda x: [len(x), x.min(), 24]).to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: [len(x), x.min(), 24]) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
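# Illustrative sketch (editorial, not from the diff): the DataFrame.items /
# DataFrame.apply surface these tests exercise, on a small invented frame
# with a configured session assumed.
import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
for label, column in df.items():  # yields (label, Series) pairs
    print(label, column.dtype)
as_frame = df.apply(lambda col: col**2)       # Series results are reassembled into a DataFrame
as_series = df.apply(lambda col: col.sum())   # scalar results come back as a local pandas Series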
+ pd_result.index = pd_result.index.astype("Int64") + pd_result = pd_result.astype("Int64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_scalar_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = scalars_df_index[columns].apply(lambda x: x.sum()) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: x.sum()) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + def test_df_isin_list(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs values = ["Hello, World!", 55555, 2.51, pd.NA, True] diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 27cc2144e0..9d26938e08 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -734,6 +734,18 @@ def isin(self, values): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def items(self): + """ + Iterate over (column name, Series) pairs. + + Iterates over the DataFrame columns, returning a tuple with + the column name and the content as a Series. + + Returns: + Iterator: Iterator of label, Series for each column. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Sorting @@ -1420,6 +1432,28 @@ def merge( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def apply(self, func, *, args=(), **kwargs): + """Apply a function along an axis of the DataFrame. + + Objects passed to the function are Series objects whose index is + the DataFrame's index (``axis=0``) the final return type + is inferred from the return type of the applied function. + + Args: + func (function): + Function to apply to each column or row. + args (tuple): + Positional arguments to pass to `func` in addition to the + array/series. + **kwargs: + Additional keyword arguments to pass as keywords arguments to + `func`. + + Returns: + pandas.Series or bigframes.DataFrame: Result of applying ``func`` along the given axis of the DataFrame. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # ndarray-like stats methods From 1a254a496633957b9506dd8392dcc6fd10762201 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 21 Sep 2023 10:42:59 -0700 Subject: [PATCH 08/24] feat: add index `dtype`, `astype`, `drop`, `fillna`, aggregate attributes. 
(#38) Change-Id: I4af249d10b2fcd779ad05d1f1d95049893e40135 --- bigframes/core/indexes/index.py | 155 ++++++++++++++++-- bigframes/series.py | 4 +- tests/system/small/test_index.py | 110 +++++++++++++ .../pandas/core/indexes/base.py | 120 ++++++++++++++ 4 files changed, 373 insertions(+), 16 deletions(-) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 748a68c944..c08c851c91 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -17,7 +17,7 @@ from __future__ import annotations import typing -from typing import Callable, Tuple +from typing import Callable, Sequence, Tuple, Union import numpy as np import pandas @@ -26,7 +26,11 @@ import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.joins as joins +import bigframes.core.utils as utils +import bigframes.dtypes import bigframes.dtypes as bf_dtypes +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops import third_party.bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index @@ -51,16 +55,34 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): - return self._data._set_block(self._data._get_block().with_index_labels(values)) + return self._data._set_block(self._block.with_index_labels(values)) @property def nlevels(self) -> int: return len(self._data._get_block().index_columns) + @property + def values(self) -> np.ndarray: + return self.to_numpy() + + @property + def ndim(self) -> int: + return 1 + @property def shape(self) -> typing.Tuple[int]: return (self._data._get_block().shape[0],) + @property + def dtype(self): + return self._block.index_dtypes[0] if self.nlevels == 1 else np.dtype("O") + + @property + def dtypes(self) -> pandas.Series: + return pandas.Series( + data=self._block.index_dtypes, index=self._block.index_labels # type:ignore + ) + @property def size(self) -> int: """Returns the size of the Index.""" @@ -103,23 +125,120 @@ def is_monotonic_decreasing(self) -> bool: @property def is_unique(self) -> bool: + # TODO: Cache this at block level + # Avoid circular imports + return not self.has_duplicates + + @property + def has_duplicates(self) -> bool: # TODO: Cache this at block level # Avoid circular imports import bigframes.core.block_transforms as block_ops import bigframes.dataframe as df - duplicates_block, _ = block_ops.indicate_duplicates( - self._data._get_block(), self._data._get_block().index_columns - ) - duplicates_block = duplicates_block.with_column_labels( - ["values", "is_duplicate"] + duplicates_block, indicator = block_ops.indicate_duplicates( + self._block, self._block.index_columns ) + duplicates_block = duplicates_block.select_columns( + [indicator] + ).with_column_labels(["is_duplicate"]) duplicates_df = df.DataFrame(duplicates_block) - return not duplicates_df["is_duplicate"].any() + return duplicates_df["is_duplicate"].any() + + @property + def _block(self) -> blocks.Block: + return self._data._get_block() + + def astype( + self, + dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + ) -> Index: + if self.nlevels > 1: + raise TypeError("Multiindex does not support 'astype'") + return self._apply_unary_op(ops.AsTypeOp(dtype)) + + def all(self) -> bool: + if self.nlevels > 1: + raise TypeError("Multiindex does not support 'all'") + return typing.cast(bool, self._apply_aggregation(agg_ops.all_op)) + + def any(self) -> bool: + if self.nlevels > 1: + raise TypeError("Multiindex 
does not support 'any'") + return typing.cast(bool, self._apply_aggregation(agg_ops.any_op)) + + def nunique(self) -> int: + return typing.cast(int, self._apply_aggregation(agg_ops.nunique_op)) + + def max(self) -> typing.Any: + return self._apply_aggregation(agg_ops.max_op) + + def min(self) -> typing.Any: + return self._apply_aggregation(agg_ops.min_op) + + def fillna(self, value=None) -> Index: + if self.nlevels > 1: + raise TypeError("Multiindex does not support 'fillna'") + return self._apply_unary_op(ops.partial_right(ops.fillna_op, value)) + + def rename(self, name: Union[str, Sequence[str]]) -> Index: + names = [name] if isinstance(name, str) else list(name) + if len(names) != self.nlevels: + raise ValueError("'name' must be same length as levels") + + import bigframes.dataframe as df + + return Index(df.DataFrame(self._block.with_index_labels(names))) + + def drop( + self, + labels: typing.Any, + ) -> Index: + # ignore axis, columns params + block = self._block + level_id = self._block.index_columns[0] + if utils.is_list_like(labels): + block, inverse_condition_id = block.apply_unary_op( + level_id, ops.IsInOp(labels, match_nulls=True) + ) + block, condition_id = block.apply_unary_op( + inverse_condition_id, ops.invert_op + ) + else: + block, condition_id = block.apply_unary_op( + level_id, ops.partial_right(ops.ne_op, labels) + ) + block = block.filter(condition_id, keep_null=True) + block = block.drop_columns([condition_id]) + import bigframes.dataframe as df + + return Index(df.DataFrame(block.select_columns([]))) + + def _apply_unary_op( + self, + op: ops.UnaryOp, + ) -> Index: + """Applies a unary operator to the index.""" + block = self._block + result_ids = [] + for col in self._block.index_columns: + block, result_id = block.apply_unary_op(col, op) + result_ids.append(result_id) + + block = block.set_index(result_ids, index_labels=self._block.index_labels) + import bigframes.dataframe as df + + return Index(df.DataFrame(block)) + + def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any: + if self.nlevels > 1: + raise NotImplementedError(f"Multiindex does not yet support {op.name}") + column_id = self._block.index_columns[0] + return self._block.get_stat(column_id, op) def __getitem__(self, key: int) -> typing.Any: if isinstance(key, int): - result_pd_df, _ = self._data._get_block().slice(key, key + 1, 1).to_pandas() + result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas() if result_pd_df.empty: raise IndexError("single positional indexer is out-of-bounds") return result_pd_df.index[0] @@ -133,7 +252,7 @@ def to_pandas(self) -> pandas.Index: pandas.Index: A pandas Index with all of the labels from this Index. """ - return IndexValue(self._data._get_block()).to_pandas() + return IndexValue(self._block).to_pandas() def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: return self.to_pandas().to_numpy(dtype, **kwargs) @@ -184,13 +303,15 @@ def __repr__(self) -> str: def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. 
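# Illustrative sketch (editorial, not from the diff): the multi-level round
# trip that the IndexValue.to_pandas() change that follows now preserves;
# column names are invented and a configured session is assumed.
import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2], "b": ["x", "y"], "v": [10, 20]}).set_index(["a", "b"])
pd_index = df.index.to_pandas()  # keeps every index level and its name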
- index_column = self._block.index_columns[0] - expr = self._expr.projection([self._expr.get_any_column(index_column)]) + index_columns = list(self._block.index_columns) + expr = self._expr.projection( + [self._expr.get_any_column(col) for col in index_columns] + ) results, _ = expr.start_query() df = expr._session._rows_to_dataframe(results) - df.set_index(index_column) + df = df.set_index(index_columns) index = df.index - index.name = self._block._index_labels[0] + index.names = list(self._block._index_labels) return index def join( @@ -235,6 +356,12 @@ def resolve_level_name(self: IndexValue, label: blocks.Label) -> str: def is_uniquely_named(self: IndexValue): return len(set(self.names)) == len(self.names) + def _set_block(self, block: blocks.Block): + self._block = block + + def _get_block(self) -> blocks.Block: + return self._block + def join_mono_indexed( left: IndexValue, diff --git a/bigframes/series.py b/bigframes/series.py index 8f3a24698b..8e47088c14 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -287,7 +287,7 @@ def drop( # ignore axis, columns params block = self._block level_id = self._resolve_levels(level or 0)[0] - if _is_list_like(labels): + if _is_list_like(index): block, inverse_condition_id = block.apply_unary_op( level_id, ops.IsInOp(index, match_nulls=True) ) @@ -296,7 +296,7 @@ def drop( ) else: block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, labels) + level_id, ops.partial_right(ops.ne_op, index) ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 558dd12e69..7f09e3a9d5 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -13,6 +13,7 @@ # limitations under the License. 
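# Illustrative sketch (editorial, not from the diff): the new Index surface
# covered by the tests below, on a small invented frame with a configured
# session assumed.
import bigframes.pandas as bpd

idx = bpd.DataFrame({"k": [3, 1, 2], "v": ["a", "b", "c"]}).set_index("k").index
print(idx.dtype, idx.has_duplicates, idx.nunique(), idx.min(), idx.max())
renamed = idx.rename("key")
as_float = idx.astype("Float64")
without_one = idx.drop([1])
filled = idx.fillna(0)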
import numpy +import pandas as pd from tests.system.utils import assert_pandas_index_equal_ignore_index_type @@ -25,6 +26,44 @@ def test_get_index(scalars_df_index, scalars_pandas_df_index): assert_pandas_index_equal_ignore_index_type(bf_result, pd_result) +def test_index_has_duplicates(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.has_duplicates + pd_result = scalars_pandas_df_index.set_index("int64_col").index.has_duplicates + assert bf_result == pd_result + + +def test_index_values(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.index.values + pd_result = scalars_pandas_df_index.index.values + + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_series_equal( + pd.Series(bf_result), pd.Series(pd_result), check_dtype=False + ) + + +def test_index_ndim(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.index.ndim + pd_result = scalars_pandas_df_index.index.ndim + + assert pd_result == bf_result + + +def test_index_dtype(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.index.dtype + pd_result = scalars_pandas_df_index.index.dtype + + assert pd_result == bf_result + + +def test_index_dtypes(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index(["string_col", "int64_too"]).index.dtypes + pd_result = scalars_pandas_df_index.set_index( + ["string_col", "int64_too"] + ).index.dtypes + pd.testing.assert_series_equal(bf_result, pd_result) + + def test_index_shape(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.index.shape pd_result = scalars_pandas_df_index.index.shape @@ -32,6 +71,77 @@ def test_index_shape(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_index_astype(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.set_index("int64_col").index.astype("Float64").to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index("int64_col").index.astype("Float64") + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_any(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.any() + pd_result = scalars_pandas_df_index.set_index("int64_col").index.any() + assert bf_result == pd_result + + +def test_index_all(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.all() + pd_result = scalars_pandas_df_index.set_index("int64_col").index.all() + assert bf_result == pd_result + + +def test_index_max(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.max() + pd_result = scalars_pandas_df_index.set_index("int64_col").index.max() + assert bf_result == pd_result + + +def test_index_min(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.min() + pd_result = scalars_pandas_df_index.set_index("int64_col").index.min() + assert bf_result == pd_result + + +def test_index_nunique(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.nunique() + pd_result = scalars_pandas_df_index.set_index("int64_col").index.nunique() + assert bf_result == pd_result + + +def test_index_fillna(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.fillna(42).to_pandas() + pd_result = 
scalars_pandas_df_index.set_index("int64_col").index.fillna(42) + + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_drop(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.set_index("int64_col").index.drop([2, 314159]).to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index("int64_col").index.drop([2, 314159]) + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_rename(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.rename("name").to_pandas() + pd_result = scalars_pandas_df_index.set_index("int64_col").index.rename("name") + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_multi_rename(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.set_index(["int64_col", "int64_too"]) + .index.rename(["new", "names"]) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index( + ["int64_col", "int64_too"] + ).index.rename(["new", "names"]) + pd.testing.assert_index_equal(bf_result, pd_result) + + def test_index_len(scalars_df_index, scalars_pandas_df_index): bf_result = len(scalars_df_index.index) pd_result = len(scalars_pandas_df_index.index) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index 864007b774..f89964e220 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1,4 +1,5 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py +from __future__ import annotations from bigframes import constants @@ -14,6 +15,11 @@ def name(self): """Returns Index name.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def values(self): + """Return an array representing the data in the Index.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def shape(self): """ @@ -31,6 +37,120 @@ def is_unique(self) -> bool: """Return if the index has unique values.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def has_duplicates(self) -> bool: + """Check if the Index has duplicate values.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def dtype(self): + """Return the dtype object of the underlying data.""" + + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def dtypes(self): + """Return the dtypes as a Series for the underlying MultiIndex.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def astype(self, dtype): + """Create an Index with values cast to dtypes. + + The class of a new Index is determined by dtype. When conversion is + impossible, a TypeError exception is raised. + + Args: + dtype (numpy dtype or pandas type): + + Returns: + Index: Index with values cast to specified dtype. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def all(self) -> bool: + """Return whether all elements are Truthy. + + Returns: + bool: A single element array-like may be converted to bool. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def any(self) -> bool: + """Return whether any element is Truthy. + + Returns: + bool: A single element array-like may be converted to bool. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def min(self): + """Return the minimum value of the Index. + + Returns: + scalar: Minimum value. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def max(self): + """Return the maximum value of the Index. + + Returns: + scalar: Maximum value. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def nunique(self) -> int: + """Return number of unique elements in the object. + + Excludes NA values by default. + + Returns: + int + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def fillna(self, value) -> Index: + """ + Fill NA/NaN values with the specified value. + + Args: + value (scalar): + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + + Returns: + Index + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def rename(self, name) -> Index: + """ + Alter Index or MultiIndex name. + + Able to set new names without level. Defaults to returning new index. + Length of names must match number of levels in MultiIndex. + + Args: + name (label or list of labels): + Name(s) to set. + + Returns: + Index: The same type as the caller. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def drop(self, labels) -> Index: + """ + Make new Index with passed list of labels deleted. + + Args: + labels (array-like or scalar): + + Returns: + Index: Will be same type as self + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype): """ A NumPy ndarray representing the values in this Series or Index. From 5e199ecf1ecf13a68a2ed0dd4464afd9db977ab1 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 21 Sep 2023 14:28:13 -0500 Subject: [PATCH 09/24] perf: inline small `Series` and `DataFrames` in query text (#45) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prevents unnecessary load and query jobs. Towards internal issue 296474170 🦕 --- bigframes/core/__init__.py | 62 ++++++++++++++++++++----- bigframes/core/blocks.py | 55 +++++++++++----------- bigframes/dataframe.py | 4 +- bigframes/dtypes.py | 21 +++++---- bigframes/operations/base.py | 4 +- setup.py | 4 +- testing/constraints-3.9.txt | 2 +- tests/unit/core/__init__.py | 13 ++++++ tests/unit/core/test_blocks.py | 85 ++++++++++++++++++++++++++++++++++ 9 files changed, 194 insertions(+), 56 deletions(-) create mode 100644 tests/unit/core/__init__.py create mode 100644 tests/unit/core/test_blocks.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index dd91f80e63..3b3754642e 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -144,21 +144,56 @@ def mem_expr_from_pandas( """ Builds an in-memory only (SQL only) expr from a pandas dataframe. - Caution: If session is None, only a subset of expr functionality will be available (null Session is usually not supported). + Caution: If session is None, only a subset of expr functionality will + be available (null Session is usually not supported). """ - # must set non-null column labels. these are not the user-facing labels - pd_df = pd_df.set_axis( - [column or bigframes.core.guid.generate_guid() for column in pd_df.columns], - axis="columns", - ) + # We can't include any hidden columns in the ArrayValue constructor, so + # grab the column names before we add the hidden ordering column. 
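# Illustrative sketch (editorial, not from the diff): the failure mode the
# explicit schema below guards against. An all-None column would otherwise be
# inferred by ibis as NULL type, which BigQuery cannot accept. The data and
# column names here are invented.
import ibis
import pandas as pd

local = pd.DataFrame({"col_0": [None, None], "order_id": [0, 1]})
memtable = ibis.memtable(
    local, schema=ibis.schema([("col_0", "string"), ("order_id", "int64")])
)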
+ column_names = [str(column) for column in pd_df.columns] + # Make sure column names are all strings. + pd_df = pd_df.set_axis(column_names, axis="columns") pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) + # ibis memtable cannot handle NA, must convert to None pd_df = pd_df.astype("object") # type: ignore pd_df = pd_df.where(pandas.notnull(pd_df), None) + + # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases. keys_memtable = ibis.memtable(pd_df) + schema = keys_memtable.schema() + new_schema = [] + for column_index, column in enumerate(schema): + if column == ORDER_ID_COLUMN: + new_type: ibis_dtypes.DataType = ibis_dtypes.int64 + else: + column_type = schema[column] + # The autodetected type might not be one we can support, such + # as NULL type for empty rows, so convert to a type we do + # support. + new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype( + bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type) + ) + # TODO(swast): Ibis memtable doesn't use backticks in struct + # field names, so spaces and other characters aren't allowed in + # the memtable context. Blocked by + # https://github.com/ibis-project/ibis/issues/7187 + column = f"col_{column_index}" + new_schema.append((column, new_type)) + + # must set non-null column labels. these are not the user-facing labels + pd_df = pd_df.set_axis( + [column for column, _ in new_schema], + axis="columns", + ) + keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema)) + return cls( session, # type: ignore # Session cannot normally be none, see "caution" above keys_memtable, + columns=[ + keys_memtable[f"col_{column_index}"].name(column) + for column_index, column in enumerate(column_names) + ], ordering=ExpressionOrdering( ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], total_ordering_columns=frozenset([ORDER_ID_COLUMN]), @@ -426,11 +461,16 @@ def shape(self) -> typing.Tuple[int, int]: width = len(self.columns) count_expr = self._to_ibis_expr(ordering_mode="unordered").count() sql = self._session.ibis_client.compile(count_expr) - row_iterator, _ = self._session._start_query( - sql=sql, - max_results=1, - ) - length = next(row_iterator)[0] + + # Support in-memory engines for hermetic unit tests. + if not isinstance(sql, str): + length = self._session.ibis_client.execute(count_expr) + else: + row_iterator, _ = self._session._start_query( + sql=sql, + max_results=1, + ) + length = next(row_iterator)[0] return (length, width) def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e691a30f9c..ad4f72070f 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -44,6 +44,7 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common # Type constraint for wherever column labels are used Label = typing.Hashable @@ -1522,37 +1523,35 @@ def _is_monotonic( return result -def block_from_local(data, session=None, use_index=True) -> Block: - # TODO(tbergeron): Handle duplicate column labels +def block_from_local(data, session=None) -> Block: pd_data = pd.DataFrame(data) + columns = pd_data.columns - column_labels = list(pd_data.columns) - if not all((label is None) or isinstance(label, str) for label in column_labels): - raise NotImplementedError( - f"Only string column labels supported. 
{constants.FEEDBACK_LINK}" - ) + # Make a flattened version to treat as a table. + if len(pd_data.columns.names) > 1: + pd_data.columns = columns.to_flat_index() - if use_index: - if pd_data.index.nlevels > 1: - raise NotImplementedError( - f"multi-indices not supported. {constants.FEEDBACK_LINK}" - ) - index_label = pd_data.index.name - - index_id = guid.generate_guid() - pd_data = pd_data.reset_index(names=index_id) - keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session) - return Block( - keys_expr, - column_labels=column_labels, - index_columns=[index_id], - index_labels=[index_label], - ) - else: - keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session) - keys_expr, offsets_id = keys_expr.promote_offsets() - # Constructor will create default range index - return Block(keys_expr, index_columns=[offsets_id], column_labels=column_labels) + index_labels = list(pd_data.index.names) + # The ArrayValue layer doesn't know about indexes, so make sure indexes + # are real columns with unique IDs. + pd_data = pd_data.reset_index( + names=[f"level_{level}" for level in range(len(index_labels))] + ) + pd_data = pd_data.set_axis( + vendored_pandas_io_common.dedup_names( + list(pd_data.columns), is_potential_multiindex=False + ), + axis="columns", + ) + index_ids = pd_data.columns[: len(index_labels)] + + keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session) + return Block( + keys_expr, + column_labels=columns, + index_columns=index_ids, + index_labels=index_labels, + ) def _align_block_to_schema( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0b741feff6..76377cd477 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -65,9 +65,7 @@ # BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. # TODO(tbergeron): Convert to bytes-based limit -# TODO(swast): Address issues with string escaping and empty tables before -# re-enabling inline data (ibis.memtable) feature. -MAX_INLINE_DF_SIZE = -1 +MAX_INLINE_DF_SIZE = 5000 LevelType = typing.Union[str, int] LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index af3209b0e1..271b8aa2f2 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -96,13 +96,13 @@ ), ) -BIGFRAMES_TO_IBIS: Dict[Dtype, IbisDtype] = { +BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = { pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } -IBIS_TO_BIGFRAMES: Dict[ - Union[IbisDtype, ReadOnlyIbisDtype], Union[Dtype, np.dtype[Any]] -] = {ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS} +IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, Union[Dtype, np.dtype[Any]]] = { + ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS +} # Allow REQUIRED fields to map correctly. IBIS_TO_BIGFRAMES.update( {ibis.copy(nullable=False): pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS} @@ -130,7 +130,7 @@ def ibis_dtype_to_bigframes_dtype( - ibis_dtype: Union[IbisDtype, ReadOnlyIbisDtype] + ibis_dtype: ibis_dtypes.DataType, ) -> Union[Dtype, np.dtype[Any]]: """Converts an Ibis dtype to a BigQuery DataFrames dtype @@ -155,6 +155,9 @@ def ibis_dtype_to_bigframes_dtype( if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] + elif isinstance(ibis_dtype, ibis_dtypes.Null): + # Fallback to STRING for NULL values for most flexibility in SQL. + return IBIS_TO_BIGFRAMES[ibis_dtypes.string] else: raise ValueError( f"Unexpected Ibis data type {ibis_dtype}. 
{constants.FEEDBACK_LINK}" @@ -185,8 +188,8 @@ def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: def bigframes_dtype_to_ibis_dtype( - bigframes_dtype: Union[DtypeString, Dtype] -) -> IbisDtype: + bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]] +) -> ibis_dtypes.DataType: """Converts a BigQuery DataFrames supported dtype to an Ibis dtype. Args: @@ -281,7 +284,9 @@ def literal_to_ibis_scalar( return scalar_expr -def cast_ibis_value(value: ibis_types.Value, to_type: IbisDtype) -> ibis_types.Value: +def cast_ibis_value( + value: ibis_types.Value, to_type: ibis_dtypes.DataType +) -> ibis_types.Value: """Perform compatible type casts of ibis values Args: diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 81a5bc4c41..add6af57f4 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -30,9 +30,7 @@ # BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. # TODO(tbergeron): Convert to bytes-based limit -# TODO(swast): Address issues with string escaping and empty tables before -# re-enabling inline data (ibis.memtable) feature. -MAX_INLINE_SERIES_SIZE = -1 +MAX_INLINE_SERIES_SIZE = 5000 class SeriesMethods: diff --git a/setup.py b/setup.py index 69b71c88f1..29eacb74a9 100644 --- a/setup.py +++ b/setup.py @@ -44,12 +44,12 @@ "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. - "ibis-framework[bigquery] >=6.0.0,<=6.1.0", + "ibis-framework[bigquery] >=6.2.0,<7.0.0dev", "pandas >=1.5.0", "pydata-google-auth >=1.8.2", "requests >=2.27.1", "scikit-learn >=1.2.2", - "sqlalchemy >=1.4,<3.0", + "sqlalchemy >=1.4,<3.0dev", "ipywidgets >=7.7.1", "humanize >= 4.6.0", ] diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index cd69d45dc9..f43d3b4ca0 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==6.0.0 +ibis-framework==6.2.0 humanize==4.6.0 identify==2.5.22 idna==3.4 diff --git a/tests/unit/core/__init__.py b/tests/unit/core/__init__.py new file mode 100644 index 0000000000..1dc90d1848 --- /dev/null +++ b/tests/unit/core/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py new file mode 100644 index 0000000000..a7e9b5a84b --- /dev/null +++ b/tests/unit/core/test_blocks.py @@ -0,0 +1,85 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas +import pandas.testing +import pytest + +import bigframes.core.blocks as blocks + +from .. import resources + + +@pytest.mark.parametrize( + ("data",), + ( + pytest.param( + {"test 1": [1, 2, 3], "test 2": [0.25, 0.5, 0.75]}, + id="dict_spaces_in_column_names", + ), + pytest.param( + [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], + id="nested_list", + ), + pytest.param( + pandas.concat( + [ + pandas.Series([1, 2, 3], name="some col"), + pandas.Series([2, 3, 4], name="some col"), + ], + axis="columns", + ), + id="duplicate_column_names", + ), + pytest.param( + pandas.DataFrame( + {"test": [1, 2, 3]}, + index=pandas.Index(["a", "b", "c"], name="string index"), + ), + id="string_index", + ), + pytest.param( + pandas.DataFrame( + [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], + columns=pandas.MultiIndex.from_tuples( + [(1, 1), (1, 2), (0, 0), (0, 1)], + names=["some level", "another level"], + ), + ), + marks=[ + pytest.mark.skipif( + tuple(pandas.__version__.split()) < ("2", "0", "0"), + reason="pandas 1.5.3 treats column MultiIndex as Index of tuples", + ), + ], + id="multiindex_columns", + ), + pytest.param( + pandas.DataFrame( + {"test": [1, 2, 3]}, + index=pandas.MultiIndex.from_tuples([(1, 1), (1, 2), (0, 0)]), + ), + id="multiindex_rows", + ), + ), +) +def test_block_from_local(data): + expected = pandas.DataFrame(data) + session = resources.create_pandas_session({}) + + block = blocks.block_from_local(data, session=session) + + pandas.testing.assert_index_equal(block.column_labels, expected.columns) + assert tuple(block.index_labels) == tuple(expected.index.names) + assert block.shape == expected.shape From 33274c2fc5035541fe9eb035fb5a1d92b35af4e5 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 21 Sep 2023 15:14:44 -0700 Subject: [PATCH 10/24] refactor: ml.sql to Object (#44) * refactor: ml.sql to Object Change-Id: Ibf795b81619778eaf28572fccd95a09b65f8ad58 --- bigframes/ml/cluster.py | 5 +- bigframes/ml/compose.py | 5 +- bigframes/ml/core.py | 301 +++++++++++++------------- bigframes/ml/decomposition.py | 5 +- bigframes/ml/ensemble.py | 14 +- bigframes/ml/forecasting.py | 12 +- bigframes/ml/globals.py | 30 +++ bigframes/ml/imported.py | 12 +- bigframes/ml/linear_model.py | 8 +- bigframes/ml/llm.py | 8 +- bigframes/ml/preprocessing.py | 16 +- bigframes/ml/sql.py | 326 +++++++++++++++-------------- tests/system/large/ml/test_core.py | 16 +- tests/system/small/ml/conftest.py | 3 +- tests/unit/ml/test_golden_sql.py | 52 +++-- tests/unit/ml/test_sql.py | 156 +++++++++----- 16 files changed, 559 insertions(+), 410 deletions(-) create mode 100644 bigframes/ml/globals.py diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 14cce2879e..772b90f666 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -22,7 +22,7 @@ from google.cloud import bigquery import bigframes -from bigframes.ml import base, core, utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.cluster._kmeans @@ -37,6 
+37,7 @@ class KMeans( def __init__(self, n_clusters: int = 8): self.n_clusters = n_clusters self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> KMeans: @@ -66,7 +67,7 @@ def _fit( ) -> KMeans: (X,) = utils.convert_to_dataframe(X) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X_train=X, transforms=transforms, options=self._bqml_options, diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index a3d3503ad0..02365f261c 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -22,7 +22,7 @@ from typing import List, Optional, Tuple, Union from bigframes import constants -from bigframes.ml import base, core, preprocessing, utils +from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.compose._column_transformer @@ -53,6 +53,7 @@ def __init__( # TODO: if any(transformers) has fitted raise warning self.transformers = transformers self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() # call self.transformers_ to check chained transformers self.transformers_ @@ -114,7 +115,7 @@ def fit( compiled_transforms = self._compile_to_sql(X.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 667d42f7ee..70be0d35ee 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -36,6 +36,9 @@ class BqmlModel: def __init__(self, session: bigframes.Session, model: bigquery.Model): self._session = session self._model = model + self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator( + self.model_name + ) @property def session(self) -> bigframes.Session: @@ -85,18 +88,14 @@ def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( input_data, - lambda source_sql: ml_sql.ml_predict( - model_name=self.model_name, source_sql=source_sql - ), + self._model_manipulation_sql_generator.ml_predict, ) def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( input_data, - lambda source_sql: ml_sql.ml_transform( - model_name=self.model_name, source_sql=source_sql - ), + self._model_manipulation_sql_generator.ml_transform, ) def generate_text( @@ -107,10 +106,11 @@ def generate_text( # TODO: validate input data schema return self._apply_sql( input_data, - lambda source_sql: ml_sql.ml_generate_text( - model_name=self.model_name, + lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text( source_sql=source_sql, - struct_options=ml_sql.struct_options(**options), + struct_options=self._model_manipulation_sql_generator.struct_options( + **options + ), ), ) @@ -122,15 +122,16 @@ def generate_text_embedding( # TODO: validate input data schema return self._apply_sql( input_data, - lambda source_sql: ml_sql.ml_generate_text_embedding( - model_name=self.model_name, + lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text_embedding( source_sql=source_sql, - struct_options=ml_sql.struct_options(**options), + 
struct_options=self._model_manipulation_sql_generator.struct_options( + **options + ), ), ) def forecast(self) -> bpd.DataFrame: - sql = ml_sql.ml_forecast(self.model_name) + sql = self._model_manipulation_sql_generator.ml_forecast() return self._session.read_gbq(sql) def evaluate(self, input_data: Optional[bpd.DataFrame] = None): @@ -141,28 +142,28 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None): if (input_data is not None) else (None, None, None) ) - sql = ml_sql.ml_evaluate(self.model_name, source_sql) + sql = self._model_manipulation_sql_generator.ml_evaluate(source_sql) return self._session.read_gbq(sql) - def centroids(self): + def centroids(self) -> bpd.DataFrame: assert self._model.model_type == "KMEANS" - sql = ml_sql.ml_centroids(self.model_name) + sql = self._model_manipulation_sql_generator.ml_centroids() return self._session.read_gbq(sql) - def principal_components(self): + def principal_components(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" - sql = ml_sql.ml_principal_components(self.model_name) + sql = self._model_manipulation_sql_generator.ml_principal_components() return self._session.read_gbq(sql) - def principal_component_info(self): + def principal_component_info(self) -> bpd.DataFrame: assert self._model.model_type == "PCA" - sql = ml_sql.ml_principal_component_info(self.model_name) + sql = self._model_manipulation_sql_generator.ml_principal_component_info() return self._session.read_gbq(sql) @@ -187,8 +188,12 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: # truncate as Vertex ID only accepts 63 characters, easily exceeding the limit for temp models. # The possibility of conflicts should be low. vertex_ai_model_id = vertex_ai_model_id[:63] - options_sql = ml_sql.options(**{"vertex_ai_model_id": vertex_ai_model_id}) - sql = ml_sql.alter_model(self.model_name, options_sql=options_sql) + options_sql = self._model_manipulation_sql_generator.options( + **{"vertex_ai_model_id": vertex_ai_model_id} + ) + sql = self._model_manipulation_sql_generator.alter_model( + options_sql=options_sql + ) # Register the model and wait it to finish self._session._start_query(sql) @@ -196,137 +201,149 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: return self -def create_bqml_model( - X_train: bpd.DataFrame, - y_train: Optional[bpd.DataFrame] = None, - transforms: Optional[Iterable[str]] = None, - options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, -) -> BqmlModel: - """Create a session-temporary BQML model with the CREATE MODEL statement +class BqmlModelFactory: + def __init__(self): + model_id = self._create_temp_model_id() + self._model_creation_sql_generator = ml_sql.ModelCreationSqlGenerator(model_id) - Args: - X_train: features columns for training - y_train: labels columns for training, if applicable - transforms: an optional list of SQL expressions that implement preprocessing - on top of the input data. Generates a BQML TRANSFORM clause - options: a dict of options to configure the model. 
Generates a BQML OPTIONS - clause + def _create_temp_model_id(self) -> str: + return uuid.uuid4().hex - Returns: a BqmlModel, wrapping a trained model in BigQuery - """ - options = dict(options) - if y_train is None: - input_data = X_train - else: + def _reset_model_id(self): + self._model_creation_sql_generator._model_id = self._create_temp_model_id() + + def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: + # fit the model, synchronously + _, job = session._start_query(sql) + + # real model path in the session specific hidden dataset and table prefix + model_name_full = f"{job.destination.dataset_id}.{job.destination.table_id}" + model = session.bqclient.get_model(model_name_full) + + self._reset_model_id() + return BqmlModel(session, model) + + def create_model( + self, + X_train: bpd.DataFrame, + y_train: Optional[bpd.DataFrame] = None, + transforms: Optional[Iterable[str]] = None, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + """Create a session-temporary BQML model with the CREATE MODEL statement + + Args: + X_train: features columns for training + y_train: labels columns for training, if applicable + transforms: an optional list of SQL expressions that implement preprocessing + on top of the input data. Generates a BQML TRANSFORM clause + options: a dict of options to configure the model. Generates a BQML OPTIONS + clause + + Returns: a BqmlModel, wrapping a trained model in BigQuery + """ + options = dict(options) + if y_train is None: + input_data = X_train + else: + input_data = X_train.join(y_train, how="outer") + options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) + + session = X_train._session + + source_sql = input_data.sql + options_sql = self._model_creation_sql_generator.options(**options) + transform_sql = ( + self._model_creation_sql_generator.transform(*transforms) + if transforms is not None + else None + ) + sql = self._model_creation_sql_generator.create_model( + source_sql=source_sql, + transform_sql=transform_sql, + options_sql=options_sql, + ) + + return self._create_model_with_sql(session=session, sql=sql) + + def create_time_series_model( + self, + X_train: bpd.DataFrame, + y_train: bpd.DataFrame, + transforms: Optional[Iterable[str]] = None, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + assert ( + X_train.columns.size == 1 + ), "Time series timestamp input must only contain 1 column." + assert ( + y_train.columns.size == 1 + ), "Time stamp data input must only contain 1 column." + + options = dict(options) input_data = X_train.join(y_train, how="outer") - options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) - - session = X_train._session - - source_sql = input_data.sql - options_sql = ml_sql.options(**options) - transform_sql = ml_sql.transform(*transforms) if transforms is not None else None - sql = ml_sql.create_model( - model_name=_create_temp_model_name(), - source_sql=source_sql, - transform_sql=transform_sql, - options_sql=options_sql, - ) - - return _create_bqml_model_with_sql(session=session, sql=sql) - - -def create_bqml_time_series_model( - X_train: bpd.DataFrame, - y_train: bpd.DataFrame, - transforms: Optional[Iterable[str]] = None, - options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, -) -> BqmlModel: - - assert ( - X_train.columns.size == 1 - ), "Time series timestamp input must only contain 1 column." 
- assert ( - y_train.columns.size == 1 - ), "Time stamp data input must only contain 1 column." - - options = dict(options) - input_data = X_train.join(y_train, how="outer") - options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) - options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) - - session = X_train._session - - source_sql = input_data.sql - options_sql = ml_sql.options(**options) - - transform_sql = ml_sql.transform(*transforms) if transforms is not None else None - sql = ml_sql.create_model( - model_name=_create_temp_model_name(), - source_sql=source_sql, - transform_sql=transform_sql, - options_sql=options_sql, - ) - - return _create_bqml_model_with_sql(session=session, sql=sql) - - -def create_bqml_remote_model( - session: bigframes.Session, - connection_name: str, - options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, -) -> BqmlModel: - """Create a session-temporary BQML remote model with the CREATE MODEL statement - - Args: - connection_name: - a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection - options: - a dict of options to configure the model. Generates a BQML OPTIONS clause - - Returns: - BqmlModel: a BqmlModel wrapping a trained model in BigQuery - """ - options_sql = ml_sql.options(**options) - sql = ml_sql.create_remote_model( - model_name=_create_temp_model_name(), - connection_name=connection_name, - options_sql=options_sql, - ) + options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) + options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) - return _create_bqml_model_with_sql(session=session, sql=sql) + session = X_train._session + source_sql = input_data.sql + options_sql = self._model_creation_sql_generator.options(**options) -def create_bqml_imported_model( - session: bigframes.Session, - options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, -) -> BqmlModel: - """Create a session-temporary BQML imported model with the CREATE MODEL statement + transform_sql = ( + self._model_creation_sql_generator.transform(*transforms) + if transforms is not None + else None + ) + sql = self._model_creation_sql_generator.create_model( + source_sql=source_sql, + transform_sql=transform_sql, + options_sql=options_sql, + ) - Args: - options: a dict of options to configure the model. Generates a BQML OPTIONS - clause + return self._create_model_with_sql(session=session, sql=sql) - Returns: a BqmlModel, wrapping a trained model in BigQuery - """ - options_sql = ml_sql.options(**options) - sql = ml_sql.create_imported_model( - model_name=_create_temp_model_name(), - options_sql=options_sql, - ) + def create_remote_model( + self, + session: bigframes.Session, + connection_name: str, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + """Create a session-temporary BQML remote model with the CREATE MODEL statement - return _create_bqml_model_with_sql(session=session, sql=sql) + Args: + connection_name: + a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection + options: + a dict of options to configure the model. 
Generates a BQML OPTIONS clause + + Returns: + BqmlModel: a BqmlModel wrapping a trained model in BigQuery + """ + options_sql = self._model_creation_sql_generator.options(**options) + sql = self._model_creation_sql_generator.create_remote_model( + connection_name=connection_name, + options_sql=options_sql, + ) + return self._create_model_with_sql(session=session, sql=sql) -def _create_temp_model_name() -> str: - return uuid.uuid4().hex + def create_imported_model( + self, + session: bigframes.Session, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + """Create a session-temporary BQML imported model with the CREATE MODEL statement + Args: + options: a dict of options to configure the model. Generates a BQML OPTIONS + clause -def _create_bqml_model_with_sql(session: bigframes.Session, sql: str) -> BqmlModel: - # fit the model, synchronously - _, job = session._start_query(sql) + Returns: a BqmlModel, wrapping a trained model in BigQuery + """ + options_sql = self._model_creation_sql_generator.options(**options) + sql = self._model_creation_sql_generator.create_imported_model( + options_sql=options_sql, + ) - # real model path in the session specific hidden dataset and table prefix - model_name_full = f"{job.destination.dataset_id}.{job.destination.table_id}" - model = session.bqclient.get_model(model_name_full) - return BqmlModel(session, model) + return self._create_model_with_sql(session=session, sql=sql) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 0cfe3b3ddf..8e6be6d28c 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -22,7 +22,7 @@ from google.cloud import bigquery import bigframes -from bigframes.ml import base, core, utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.decomposition._pca @@ -36,6 +36,7 @@ class PCA( def __init__(self, n_components: int = 3): self.n_components = n_components self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> PCA: @@ -60,7 +61,7 @@ def _fit( ) -> PCA: (X,) = utils.convert_to_dataframe(X) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X_train=X, transforms=transforms, options={ diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index b0f3e5f081..113ad872b5 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -22,7 +22,7 @@ from google.cloud import bigquery import bigframes -from bigframes.ml import base, core, utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.ensemble._forest import third_party.bigframes_vendored.xgboost.sklearn @@ -95,6 +95,7 @@ def __init__( self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq( @@ -151,7 +152,7 @@ def _fit( ) -> XGBRegressor: X, y = utils.convert_to_dataframe(X, y) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, @@ -259,6 +260,7 @@ def __init__( self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: 
Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq( @@ -315,7 +317,7 @@ def _fit( ) -> XGBClassifier: X, y = utils.convert_to_dataframe(X, y) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, @@ -416,6 +418,7 @@ def __init__( self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq( @@ -469,7 +472,7 @@ def _fit( ) -> RandomForestRegressor: X, y = utils.convert_to_dataframe(X, y) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, @@ -584,6 +587,7 @@ def __init__( self.enable_global_explain = enable_global_explain self.xgboost_version = xgboost_version self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq( @@ -637,7 +641,7 @@ def _fit( ) -> RandomForestClassifier: X, y = utils.convert_to_dataframe(X, y) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 22d81294fc..8a6de1dd81 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -21,7 +21,7 @@ from google.cloud import bigquery import bigframes -from bigframes.ml import base, core, utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd _PREDICT_OUTPUT_COLUMNS = ["forecast_timestamp", "forecast_value"] @@ -32,6 +32,7 @@ class ARIMAPlus(base.SupervisedTrainablePredictor): def __init__(self): self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ARIMAPlus: @@ -69,9 +70,16 @@ def _fit( Returns: ARIMAPlus: Fitted estimator. """ + if X.columns.size != 1: + raise ValueError( + "Time series timestamp input X must only contain 1 column." + ) + if y.columns.size != 1: + raise ValueError("Time series data input y must only contain 1 column.") + X, y = utils.convert_to_dataframe(X, y) - self._bqml_model = core.create_bqml_time_series_model( + self._bqml_model = self._bqml_model_factory.create_time_series_model( X, y, transforms=transforms, diff --git a/bigframes/ml/globals.py b/bigframes/ml/globals.py new file mode 100644 index 0000000000..c139476daa --- /dev/null +++ b/bigframes/ml/globals.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Global Singletons for BigQuery DataFrame ML.""" + +from bigframes.ml import core, sql + +_BASE_SQL_GENERATOR = sql.BaseSqlGenerator() +_BQML_MODEL_FACTORY = core.BqmlModelFactory() + + +def base_sql_generator() -> sql.BaseSqlGenerator: + """Base SQL Generator.""" + return _BASE_SQL_GENERATOR + + +def bqml_model_factory() -> core.BqmlModelFactory: + """BQML Model Factory""" + return _BQML_MODEL_FACTORY diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index d4571eb3e5..fb8aa98bef 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -21,7 +21,7 @@ from google.cloud import bigquery import bigframes -from bigframes.ml import base, core, utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd @@ -42,10 +42,13 @@ def __init__( self.session = session or bpd.get_global_session() self.model_path = model_path self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() def _create_bqml_model(self): options = {"model_type": "TENSORFLOW", "model_path": self.model_path} - return core.create_bqml_imported_model(session=self.session, options=options) + return self._bqml_model_factory.create_imported_model( + session=self.session, options=options + ) @classmethod def _from_bq( @@ -124,10 +127,13 @@ def __init__( self.session = session or bpd.get_global_session() self.model_path = model_path self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() def _create_bqml_model(self): options = {"model_type": "ONNX", "model_path": self.model_path} - return core.create_bqml_imported_model(session=self.session, options=options) + return self._bqml_model_factory.create_imported_model( + session=self.session, options=options + ) @classmethod def _from_bq(cls, session: bigframes.Session, model: bigquery.Model) -> ONNXModel: diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index f27b798eea..f11879500b 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -23,7 +23,7 @@ import bigframes import bigframes.constants as constants -from bigframes.ml import base, core, utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.linear_model._base import third_party.bigframes_vendored.sklearn.linear_model._logistic @@ -80,6 +80,7 @@ def __init__( self.calculate_p_values = calculate_p_values self.enable_global_explain = enable_global_explain self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq( @@ -130,7 +131,7 @@ def _fit( ) -> LinearRegression: X, y = utils.convert_to_dataframe(X, y) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, @@ -204,6 +205,7 @@ def __init__( self.class_weights = class_weights self._auto_class_weight = class_weights == "balanced" self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() @classmethod def _from_bq( @@ -248,7 +250,7 @@ def _fit( """Fit model with transforms.""" X, y = utils.convert_to_dataframe(X, y) - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, y, transforms=transforms, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 973fbf2ad9..c86e5fb3b6 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -21,7 +21,7 
@@ import bigframes from bigframes import clients, constants from bigframes.core import blocks -from bigframes.ml import base, core, utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd _REMOTE_TEXT_GENERATOR_MODEL_CODE = "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1" @@ -52,6 +52,7 @@ def __init__( self._bq_connection_manager = clients.BqConnectionManager( self.session.bqconnectionclient, self.session.resourcemanagerclient ) + self._bqml_model_factory = globals.bqml_model_factory() self._bqml_model: core.BqmlModel = self._create_bqml_model() def _create_bqml_model(self): @@ -76,7 +77,7 @@ def _create_bqml_model(self): "remote_service_type": _REMOTE_TEXT_GENERATOR_MODEL_CODE, } - return core.create_bqml_remote_model( + return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options ) @@ -183,6 +184,7 @@ def __init__( self._bq_connection_manager = clients.BqConnectionManager( self.session.bqconnectionclient, self.session.resourcemanagerclient ) + self._bqml_model_factory = globals.bqml_model_factory() self._bqml_model: core.BqmlModel = self._create_bqml_model() def _create_bqml_model(self): @@ -207,7 +209,7 @@ def _create_bqml_model(self): "remote_service_type": _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE, } - return core.create_bqml_remote_model( + return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options ) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 8add7bdd76..cd4ae27b8c 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -20,9 +20,7 @@ import typing from typing import Any, cast, List, Literal, Optional, Tuple, Union -from bigframes.ml import base, core -from bigframes.ml import sql as ml_sql -from bigframes.ml import utils +from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.preprocessing._data import third_party.bigframes_vendored.sklearn.preprocessing._encoder @@ -38,6 +36,8 @@ class StandardScaler( def __init__(self): self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() # TODO(garrettwu): implement __hash__ def __eq__(self, other: Any) -> bool: @@ -53,7 +53,7 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: Returns: a list of tuples of (sql_expression, output_name)""" return [ ( - ml_sql.ml_standard_scaler(column, f"scaled_{column}"), + self._base_sql_generator.ml_standard_scaler(column, f"scaled_{column}"), f"scaled_{column}", ) for column in columns @@ -81,7 +81,7 @@ def fit( compiled_transforms = self._compile_to_sql(X.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, @@ -132,6 +132,8 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() # TODO(garrettwu): implement __hash__ def __eq__(self, other: Any) -> bool: @@ -167,7 +169,7 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: ) return [ ( - 
ml_sql.ml_one_hot_encoder( + self._base_sql_generator.ml_one_hot_encoder( column, drop, top_k, frequency_threshold, f"onehotencoded_{column}" ), f"onehotencoded_{column}", @@ -206,7 +208,7 @@ def fit( compiled_transforms = self._compile_to_sql(X.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] - self._bqml_model = core.create_bqml_model( + self._bqml_model = self._bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=transform_sqls, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index feb7ff7835..7cf030485b 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -21,174 +21,180 @@ import bigframes.constants as constants -def _encode_value(v: Union[str, int, float, Iterable[str]]) -> str: - """Encode a parameter value for SQL""" - if isinstance(v, str): - return f'"{v}"' - elif isinstance(v, int) or isinstance(v, float): - return f"{v}" - elif isinstance(v, Iterable): - inner = ", ".join([_encode_value(x) for x in v]) - return f"[{inner}]" - else: - raise ValueError(f"Unexpected value type. {constants.FEEDBACK_LINK}") - - -def _build_parameters(**kwargs: Union[str, int, float, Iterable[str]]) -> str: - """Encode a dict of values into a formatted Iterable of key-value pairs for SQL""" - indent_str = " " - param_strs = [f"{k}={_encode_value(v)}" for k, v in kwargs.items()] - return "\n" + indent_str + f",\n{indent_str}".join(param_strs) - - -def options(**kwargs: Union[str, int, float, Iterable[str]]) -> str: - """Encode the OPTIONS clause for BQML""" - return f"OPTIONS({_build_parameters(**kwargs)})" - - -def _build_structs(**kwargs: Union[int, float]) -> str: - """Encode a dict of values into a formatted STRUCT items for SQL""" - indent_str = " " - param_strs = [f"{v} AS {k}" for k, v in kwargs.items()] - return "\n" + indent_str + f",\n{indent_str}".join(param_strs) - - -def struct_options(**kwargs: Union[int, float]) -> str: - """Encode a BQ STRUCT as options.""" - return f"STRUCT({_build_structs(**kwargs)})" - - -def _build_expressions(*expr_sqls: str) -> str: - """Encode a Iterable of SQL expressions into a formatted Iterable for SQL""" - indent_str = " " - return "\n" + indent_str + f",\n{indent_str}".join(expr_sqls) - - -def transform(*expr_sqls: str) -> str: - """Encode the TRANSFORM clause for BQML""" - return f"TRANSFORM({_build_expressions(*expr_sqls)})" - - -def connection(conn_name: str) -> str: - """Encode the REMOTE WITH CONNECTION clause for BQML. conn_name is of the format ...""" - return f"REMOTE WITH CONNECTION `{conn_name}`" - - -def ml_standard_scaler(numeric_expr_sql: str, name: str) -> str: - """Encode ML.STANDARD_SCALER for BQML""" - return f"""ML.STANDARD_SCALER({numeric_expr_sql}) OVER() AS {name}""" - - -def ml_one_hot_encoder( - numeric_expr_sql: str, drop: str, top_k: int, frequency_threshold: int, name: str -) -> str: - """Encode ML.ONE_HOT_ENCODER for BQML. 
- https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params.""" - return f"""ML.ONE_HOT_ENCODER({numeric_expr_sql}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {name}""" - - -def create_model( - model_name: str, - source_sql: str, - transform_sql: Optional[str] = None, - options_sql: Optional[str] = None, -) -> str: - """Encode the CREATE TEMP MODEL statement for BQML""" - parts = [f"CREATE TEMP MODEL `{model_name}`"] - if transform_sql: - parts.append(transform_sql) - if options_sql: - parts.append(options_sql) - parts.append(f"AS {source_sql}") - return "\n".join(parts) - - -def create_remote_model( - model_name: str, - connection_name: str, - options_sql: Optional[str] = None, -) -> str: - """Encode the CREATE TEMP MODEL statement for BQML remote model.""" - parts = [f"CREATE TEMP MODEL `{model_name}`"] - parts.append(connection(connection_name)) - if options_sql: - parts.append(options_sql) - return "\n".join(parts) - - -def create_imported_model( - model_name: str, - options_sql: Optional[str] = None, -) -> str: - """Encode the CREATE TEMP MODEL statement for BQML remote model.""" - parts = [f"CREATE TEMP MODEL `{model_name}`"] - if options_sql: - parts.append(options_sql) - return "\n".join(parts) - - -def alter_model( - model_name: str, - options_sql: str, -) -> str: - """Encode the ALTER MODEL statement for BQML""" - parts = [f"ALTER MODEL `{model_name}`"] - parts.append(f"SET {options_sql}") - return "\n".join(parts) - - -def ml_evaluate(model_name: str, source_sql: Optional[str] = None) -> str: - """Encode ML.EVALUATE for BQML""" - if source_sql is None: - return f"""SELECT * FROM ML.EVALUATE(MODEL `{model_name}`)""" - else: - return f"""SELECT * FROM ML.EVALUATE(MODEL `{model_name}`, +class BaseSqlGenerator: + """Generate base SQL strings for ML. Model name isn't needed in this class.""" + + # General methods + def encode_value(self, v: Union[str, int, float, Iterable[str]]) -> str: + """Encode a parameter value for SQL""" + if isinstance(v, str): + return f'"{v}"' + elif isinstance(v, int) or isinstance(v, float): + return f"{v}" + elif isinstance(v, Iterable): + inner = ", ".join([self.encode_value(x) for x in v]) + return f"[{inner}]" + else: + raise ValueError(f"Unexpected value type. 
{constants.FEEDBACK_LINK}") + + def build_parameters(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: + """Encode a dict of values into a formatted Iterable of key-value pairs for SQL""" + indent_str = " " + param_strs = [f"{k}={self.encode_value(v)}" for k, v in kwargs.items()] + return "\n" + indent_str + f",\n{indent_str}".join(param_strs) + + def build_structs(self, **kwargs: Union[int, float]) -> str: + """Encode a dict of values into a formatted STRUCT items for SQL""" + indent_str = " " + param_strs = [f"{v} AS {k}" for k, v in kwargs.items()] + return "\n" + indent_str + f",\n{indent_str}".join(param_strs) + + def build_expressions(self, *expr_sqls: str) -> str: + """Encode a Iterable of SQL expressions into a formatted Iterable for SQL""" + indent_str = " " + return "\n" + indent_str + f",\n{indent_str}".join(expr_sqls) + + def options(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: + """Encode the OPTIONS clause for BQML""" + return f"OPTIONS({self.build_parameters(**kwargs)})" + + def struct_options(self, **kwargs: Union[int, float]) -> str: + """Encode a BQ STRUCT as options.""" + return f"STRUCT({self.build_structs(**kwargs)})" + + # Connection + def connection(self, conn_name: str) -> str: + """Encode the REMOTE WITH CONNECTION clause for BQML. conn_name is of the format ...""" + return f"REMOTE WITH CONNECTION `{conn_name}`" + + # Transformers + def transform(self, *expr_sqls: str) -> str: + """Encode the TRANSFORM clause for BQML""" + return f"TRANSFORM({self.build_expressions(*expr_sqls)})" + + def ml_standard_scaler(self, numeric_expr_sql: str, name: str) -> str: + """Encode ML.STANDARD_SCALER for BQML""" + return f"""ML.STANDARD_SCALER({numeric_expr_sql}) OVER() AS {name}""" + + def ml_one_hot_encoder( + self, + numeric_expr_sql: str, + drop: str, + top_k: int, + frequency_threshold: int, + name: str, + ) -> str: + """Encode ML.ONE_HOT_ENCODER for BQML. + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params.""" + return f"""ML.ONE_HOT_ENCODER({numeric_expr_sql}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {name}""" + + +class ModelCreationSqlGenerator(BaseSqlGenerator): + """Sql generator for creating a model entity. 
Model id is the standalone id without project id and dataset id.""" + + def __init__(self, model_id: str): + self._model_id = model_id + + # Model create and alter + def create_model( + self, + source_sql: str, + transform_sql: Optional[str] = None, + options_sql: Optional[str] = None, + ) -> str: + """Encode the CREATE TEMP MODEL statement for BQML""" + parts = [f"CREATE TEMP MODEL `{self._model_id}`"] + if transform_sql: + parts.append(transform_sql) + if options_sql: + parts.append(options_sql) + parts.append(f"AS {source_sql}") + return "\n".join(parts) + + def create_remote_model( + self, + connection_name: str, + options_sql: Optional[str] = None, + ) -> str: + """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + parts = [f"CREATE TEMP MODEL `{self._model_id}`"] + parts.append(self.connection(connection_name)) + if options_sql: + parts.append(options_sql) + return "\n".join(parts) + + def create_imported_model( + self, + options_sql: Optional[str] = None, + ) -> str: + """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + parts = [f"CREATE TEMP MODEL `{self._model_id}`"] + if options_sql: + parts.append(options_sql) + return "\n".join(parts) + + +class ModelManipulationSqlGenerator(BaseSqlGenerator): + """Sql generator for manipulating a model entity. Model name is the fully model path of project_id.dataset_id.model_id.""" + + def __init__(self, model_name: str): + self._model_name = model_name + + # Alter model + def alter_model( + self, + options_sql: str, + ) -> str: + """Encode the ALTER MODEL statement for BQML""" + parts = [f"ALTER MODEL `{self._model_name}`"] + parts.append(f"SET {options_sql}") + return "\n".join(parts) + + # ML prediction TVFs + def ml_predict(self, source_sql: str) -> str: + """Encode ML.PREDICT for BQML""" + return f"""SELECT * FROM ML.PREDICT(MODEL `{self._model_name}`, ({source_sql}))""" + def ml_forecast(self) -> str: + """Encode ML.FORECAST for BQML""" + return f"""SELECT * FROM ML.FORECAST(MODEL `{self._model_name}`)""" -def ml_centroids( - model_name: str, -) -> str: - """Encode ML.CENTROIDS for BQML""" - return f"""SELECT * FROM ML.CENTROIDS(MODEL `{model_name}`)""" - - -def ml_predict(model_name: str, source_sql: str) -> str: - """Encode ML.PREDICT for BQML""" - return f"""SELECT * FROM ML.PREDICT(MODEL `{model_name}`, - ({source_sql}))""" - - -def ml_transform(model_name: str, source_sql: str) -> str: - """Encode ML.TRANSFORM for BQML""" - return f"""SELECT * FROM ML.TRANSFORM(MODEL `{model_name}`, - ({source_sql}))""" - - -def ml_generate_text(model_name: str, source_sql: str, struct_options: str) -> str: - """Encode ML.GENERATE_TEXT for BQML""" - return f"""SELECT * FROM ML.GENERATE_TEXT(MODEL `{model_name}`, + def ml_generate_text(self, source_sql: str, struct_options: str) -> str: + """Encode ML.GENERATE_TEXT for BQML""" + return f"""SELECT * FROM ML.GENERATE_TEXT(MODEL `{self._model_name}`, ({source_sql}), {struct_options})""" - -def ml_generate_text_embedding( - model_name: str, source_sql: str, struct_options: str -) -> str: - """Encode ML.GENERATE_TEXT_EMBEDDING for BQML""" - return f"""SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `{model_name}`, + def ml_generate_text_embedding(self, source_sql: str, struct_options: str) -> str: + """Encode ML.GENERATE_TEXT_EMBEDDING for BQML""" + return f"""SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `{self._model_name}`, ({source_sql}), {struct_options})""" + # ML evaluation TVFs + def ml_evaluate(self, source_sql: Optional[str] = None) -> str: + """Encode 
ML.EVALUATE for BQML""" + if source_sql is None: + return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`)""" + else: + return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`, + ({source_sql}))""" -def ml_forecast(model_name: str) -> str: - """Encode ML.FORECAST for BQML""" - return f"""SELECT * FROM ML.FORECAST(MODEL `{model_name}`)""" - + def ml_centroids(self) -> str: + """Encode ML.CENTROIDS for BQML""" + return f"""SELECT * FROM ML.CENTROIDS(MODEL `{self._model_name}`)""" -def ml_principal_components(model_name: str) -> str: - """Encode ML.PRINCIPAL_COMPONENTS for BQML""" - return f"""SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `{model_name}`)""" + def ml_principal_components(self) -> str: + """Encode ML.PRINCIPAL_COMPONENTS for BQML""" + return f"""SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `{self._model_name}`)""" + def ml_principal_component_info(self) -> str: + """Encode ML.PRINCIPAL_COMPONENT_INFO for BQML""" + return ( + f"""SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `{self._model_name}`)""" + ) -def ml_principal_component_info(model_name: str) -> str: - """Encode ML.PRINCIPAL_COMPONENT_INFO for BQML""" - return f"""SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `{model_name}`)""" + # ML transform TVF, that require a transform_only type model + def ml_transform(self, source_sql: str) -> str: + """Encode ML.TRANSFORM for BQML""" + return f"""SELECT * FROM ML.TRANSFORM(MODEL `{self._model_name}`, + ({source_sql}))""" diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index 133af2dae4..3b30d7eb1d 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -14,8 +14,7 @@ import pandas -import bigframes.ml.core -import bigframes.ml.sql +from bigframes.ml import globals def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df): @@ -32,7 +31,7 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d ] y_train = df[["body_mass_g"]] - model = bigframes.ml.core.create_bqml_model( + model = globals.bqml_model_factory().create_model( X_train, y_train, options={"model_type": "linear_reg"} ) @@ -84,6 +83,9 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d def test_bqml_manual_preprocessing_e2e( session, dataset_id, penguins_df_default_index, new_penguins_df ): + base_sql_generator = globals.base_sql_generator() + bqml_model_factory = globals.bqml_model_factory() + df = penguins_df_default_index.dropna() X_train = df[ [ @@ -94,12 +96,12 @@ def test_bqml_manual_preprocessing_e2e( ] y_train = df[["body_mass_g"]] transforms = [ - bigframes.ml.sql.ml_standard_scaler(column, column) + base_sql_generator.ml_standard_scaler(column, column) for column in X_train.columns.tolist() ] transforms.extend(y_train.columns.tolist()) options = {"model_type": "linear_reg"} - model = bigframes.ml.core.create_bqml_model( + model = bqml_model_factory.create_model( X_train, y_train, transforms=transforms, options=options ) @@ -150,8 +152,10 @@ def test_bqml_manual_preprocessing_e2e( def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): + bqml_model_factory = globals.bqml_model_factory() + X = penguins_df_default_index[["culmen_length_mm", "species"]] - model = bigframes.ml.core.create_bqml_model( + model = bqml_model_factory.create_model( X, options={"model_type": "transform_only"}, transforms=[ diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 9ca5a2fd0e..1dd1c813b8 100644 --- 
a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -25,6 +25,7 @@ decomposition, ensemble, forecasting, + globals, imported, linear_model, llm, @@ -202,7 +203,7 @@ def bqml_palm2_text_generator_model(session, bq_connection) -> core.BqmlModel: options = { "remote_service_type": "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1", } - return core.create_bqml_remote_model( + return globals.bqml_model_factory().create_remote_model( session=session, connection_name=bq_connection, options=options ) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 584d080d42..3ca7e144a5 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -62,38 +62,44 @@ def mock_X(mock_y, mock_session): @pytest.fixture -def bqml_model(mock_session): - bqml_model = core.BqmlModel( - mock_session, bigquery.Model("model_project.model_dataset.model_name") +def bqml_model_factory(mocker: pytest_mock.MockerFixture): + mocker.patch( + "bigframes.ml.core.BqmlModelFactory._create_temp_model_id", + return_value="temp_model_id", ) + bqml_model_factory = core.BqmlModelFactory() - return bqml_model + return bqml_model_factory @pytest.fixture -def ml_mocker(mocker: pytest_mock.MockerFixture): - mocker.patch( - "bigframes.ml.core._create_temp_model_name", return_value="temp_model_name" +def bqml_model(mock_session): + bqml_model = core.BqmlModel( + mock_session, bigquery.Model("model_project.model_dataset.model_id") ) - return mocker + return bqml_model -def test_linear_regression_default_fit(ml_mocker, mock_session, mock_X, mock_y): +def test_linear_regression_default_fit( + bqml_model_factory, mock_session, mock_X, mock_y +): model = linear_model.LinearRegression() + model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) -def test_linear_regression_params_fit(ml_mocker, mock_session, mock_X, mock_y): +def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, mock_y): model = linear_model.LinearRegression(fit_intercept=False) + model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n 
model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -103,7 +109,7 @@ def test_linear_regression_predict(mock_session, bqml_model, mock_X): model.predict(mock_X) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_name`,\n (input_X_sql))", + "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_id`,\n (input_X_sql))", index_col=["index_column_id"], ) @@ -114,27 +120,33 @@ def test_linear_regression_score(mock_session, bqml_model, mock_X, mock_y): model.score(mock_X, mock_y) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_name`,\n (input_X_y_sql))" + "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_id`,\n (input_X_y_sql))" ) -def test_logistic_regression_default_fit(ml_mocker, mock_session, mock_X, mock_y): +def test_logistic_regression_default_fit( + bqml_model_factory, mock_session, mock_X, mock_y +): model = linear_model.LogisticRegression() + model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) -def test_logistic_regression_params_fit(ml_mocker, mock_session, mock_X, mock_y): +def test_logistic_regression_params_fit( + bqml_model_factory, mock_session, mock_X, mock_y +): model = linear_model.LogisticRegression( fit_intercept=False, class_weights="balanced" ) + model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) mock_session._start_query.assert_called_once_with( - 'CREATE TEMP MODEL `temp_model_name`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + 'CREATE TEMP MODEL `temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -144,7 +156,7 @@ def test_logistic_regression_predict(mock_session, bqml_model, mock_X): model.predict(mock_X) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_name`,\n (input_X_sql))", + "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_id`,\n (input_X_sql))", index_col=["index_column_id"], ) @@ -155,5 +167,5 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): model.score(mock_X, mock_y) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_name`,\n (input_X_y_sql))" + "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_id`,\n (input_X_y_sql))" ) diff --git a/tests/unit/ml/test_sql.py 
b/tests/unit/ml/test_sql.py index c20a17f7d6..495e8759e8 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -12,11 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest + import bigframes.ml.sql as ml_sql -def test_options_produces_correct_sql(): - sql = ml_sql.options(model_type="lin_reg", input_label_cols=["col_a"], l1_reg=0.6) +@pytest.fixture(scope="session") +def base_sql_generator() -> ml_sql.BaseSqlGenerator: + return ml_sql.BaseSqlGenerator() + + +@pytest.fixture(scope="session") +def model_creation_sql_generator() -> ml_sql.ModelCreationSqlGenerator: + return ml_sql.ModelCreationSqlGenerator(model_id="my_model_id") + + +@pytest.fixture(scope="session") +def model_manipulation_sql_generator() -> ml_sql.ModelManipulationSqlGenerator: + return ml_sql.ModelManipulationSqlGenerator( + model_name="my_project_id.my_dataset_id.my_model_id" + ) + + +def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator): + sql = base_sql_generator.options( + model_type="lin_reg", input_label_cols=["col_a"], l1_reg=0.6 + ) assert ( sql == """OPTIONS( @@ -26,8 +47,8 @@ def test_options_produces_correct_sql(): ) -def test_transform_produces_correct_sql(): - sql = ml_sql.transform( +def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator): + sql = base_sql_generator.transform( "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a", "ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b", ) @@ -39,141 +60,172 @@ def test_transform_produces_correct_sql(): ) -def test_standard_scaler_produces_correct_sql(): - sql = ml_sql.ml_standard_scaler("col_a", "scaled_col_a") +def test_standard_scaler_produces_correct_sql( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_standard_scaler("col_a", "scaled_col_a") assert sql == "ML.STANDARD_SCALER(col_a) OVER() AS scaled_col_a" -def test_one_hot_encoder_produces_correct_sql(): - sql = ml_sql.ml_one_hot_encoder("col_a", "none", 1000000, 0, "encoded_col_a") +def test_one_hot_encoder_produces_correct_sql( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_one_hot_encoder( + "col_a", "none", 1000000, 0, "encoded_col_a" + ) assert ( sql == "ML.ONE_HOT_ENCODER(col_a, 'none', 1000000, 0) OVER() AS encoded_col_a" ) -def test_create_model_produces_correct_sql(): - sql = ml_sql.create_model( - model_name="my_dataset.my_model", +def test_create_model_produces_correct_sql( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, +): + sql = model_creation_sql_generator.create_model( source_sql="my_source_sql", options_sql="my_options_sql", ) assert ( sql - == """CREATE TEMP MODEL `my_dataset.my_model` + == """CREATE TEMP MODEL `my_model_id` my_options_sql AS my_source_sql""" ) -def test_create_model_transform_produces_correct_sql(): - sql = ml_sql.create_model( - model_name="my_model", +def test_create_model_transform_produces_correct_sql( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, +): + sql = model_creation_sql_generator.create_model( source_sql="my_source_sql", options_sql="my_options_sql", transform_sql="my_transform_sql", ) assert ( sql - == """CREATE TEMP MODEL `my_model` + == """CREATE TEMP MODEL `my_model_id` my_transform_sql my_options_sql AS my_source_sql""" ) -def test_create_remote_model_produces_correct_sql(): - sql = ml_sql.create_remote_model( - model_name="my_model", +def test_create_remote_model_produces_correct_sql( + 
model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, +): + sql = model_creation_sql_generator.create_remote_model( connection_name="my_project.us.my_connection", options_sql="my_options_sql", ) assert ( sql - == """CREATE TEMP MODEL `my_model` + == """CREATE TEMP MODEL `my_model_id` REMOTE WITH CONNECTION `my_project.us.my_connection` my_options_sql""" ) -def test_create_imported_model_produces_correct_sql(): - sql = ml_sql.create_imported_model( - model_name="my_model", +def test_create_imported_model_produces_correct_sql( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, +): + sql = model_creation_sql_generator.create_imported_model( options_sql="my_options_sql", ) assert ( sql - == """CREATE TEMP MODEL `my_model` + == """CREATE TEMP MODEL `my_model_id` my_options_sql""" ) -def test_alter_model_correct_sql(): - sql = ml_sql.alter_model( - model_name="my_dataset.my_model", +def test_alter_model_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.alter_model( options_sql="my_options_sql", ) assert ( sql - == """ALTER MODEL `my_dataset.my_model` + == """ALTER MODEL `my_project_id.my_dataset_id.my_model_id` SET my_options_sql""" ) -def test_ml_predict_produces_correct_sql(): - sql = ml_sql.ml_predict( - model_name="my_dataset.my_model", source_sql="SELECT * FROM my_table" +def test_ml_predict_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_predict( + source_sql="SELECT * FROM my_table" ) assert ( sql - == """SELECT * FROM ML.PREDICT(MODEL `my_dataset.my_model`, + == """SELECT * FROM ML.PREDICT(MODEL `my_project_id.my_dataset_id.my_model_id`, (SELECT * FROM my_table))""" ) -def test_ml_evaluate_produces_correct_sql(): - sql = ml_sql.ml_evaluate( - model_name="my_dataset.my_model", source_sql="SELECT * FROM my_table" +def test_ml_evaluate_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_evaluate( + source_sql="SELECT * FROM my_table" ) assert ( sql - == """SELECT * FROM ML.EVALUATE(MODEL `my_dataset.my_model`, + == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`, (SELECT * FROM my_table))""" ) -def test_ml_evaluate_no_source_produces_correct_sql(): - sql = ml_sql.ml_evaluate(model_name="my_dataset.my_model") - assert sql == """SELECT * FROM ML.EVALUATE(MODEL `my_dataset.my_model`)""" +def test_ml_evaluate_no_source_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_evaluate() + assert ( + sql + == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`)""" + ) -def test_ml_centroids_produces_correct_sql(): - sql = ml_sql.ml_centroids(model_name="my_dataset.my_model") - assert sql == """SELECT * FROM ML.CENTROIDS(MODEL `my_dataset.my_model`)""" +def test_ml_centroids_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_centroids() + assert ( + sql + == """SELECT * FROM ML.CENTROIDS(MODEL `my_project_id.my_dataset_id.my_model_id`)""" + ) -def test_ml_generate_text_produces_correct_sql(): - sql = ml_sql.ml_generate_text( - model_name="my_dataset.my_model", +def test_ml_generate_text_produces_correct_sql( + model_manipulation_sql_generator: 
ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_generate_text( source_sql="SELECT * FROM my_table", struct_options="STRUCT(value AS item)", ) assert ( sql - == """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_dataset.my_model`, + == """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_project_id.my_dataset_id.my_model_id`, (SELECT * FROM my_table), STRUCT(value AS item))""" ) -def test_ml_principal_components_produces_correct_sql(): - sql = ml_sql.ml_principal_components(model_name="my_dataset.my_model") +def test_ml_principal_components_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_principal_components() assert ( - sql == """SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `my_dataset.my_model`)""" + sql + == """SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `my_project_id.my_dataset_id.my_model_id`)""" ) -def test_ml_principal_component_info_produces_correct_sql(): - sql = ml_sql.ml_principal_component_info(model_name="my_dataset.my_model") +def test_ml_principal_component_info_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_principal_component_info() assert ( sql - == """SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `my_dataset.my_model`)""" + == """SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `my_project_id.my_dataset_id.my_model_id`)""" ) From 25104610e5ffe526315923946533a66713c1d155 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 22 Sep 2023 11:50:47 -0700 Subject: [PATCH 11/24] feat: add ml.preprocessing.LabelEncoder (#50) --- bigframes/ml/compose.py | 1 + bigframes/ml/pipeline.py | 15 +- bigframes/ml/preprocessing.py | 119 ++++++++++++++ bigframes/ml/sql.py | 11 ++ tests/system/large/ml/test_pipeline.py | 61 ++++++- tests/system/small/ml/test_preprocessing.py | 149 ++++++++++++++++++ tests/unit/ml/test_compose.py | 3 + tests/unit/ml/test_sql.py | 11 +- .../sklearn/preprocessing/_label.py | 52 ++++++ 9 files changed, 415 insertions(+), 7 deletions(-) create mode 100644 third_party/bigframes_vendored/sklearn/preprocessing/_label.py diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 02365f261c..db5d8cf260 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -29,6 +29,7 @@ CompilablePreprocessorType = Union[ preprocessing.OneHotEncoder, preprocessing.StandardScaler, + preprocessing.LabelEncoder, ] diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index bff0bf36ad..71c21d565a 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -50,6 +50,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): compose.ColumnTransformer, preprocessing.StandardScaler, preprocessing.OneHotEncoder, + preprocessing.LabelEncoder, ), ): self._transform = transform @@ -143,7 +144,11 @@ def _extract_as_column_transformer( transformers: List[ Tuple[ str, - Union[preprocessing.OneHotEncoder, preprocessing.StandardScaler], + Union[ + preprocessing.OneHotEncoder, + preprocessing.StandardScaler, + preprocessing.LabelEncoder, + ], Union[str, List[str]], ] ] = [] @@ -167,6 +172,13 @@ def _extract_as_column_transformer( *preprocessing.OneHotEncoder._parse_from_sql(transform_sql), ) ) + elif transform_sql.startswith("ML.LABEL_ENCODER"): + transformers.append( + ( + "label_encoder", + *preprocessing.LabelEncoder._parse_from_sql(transform_sql), + ) + ) else: raise 
NotImplementedError( f"Unsupported transformer type. {constants.FEEDBACK_LINK}" @@ -181,6 +193,7 @@ def _merge_column_transformer( compose.ColumnTransformer, preprocessing.StandardScaler, preprocessing.OneHotEncoder, + preprocessing.LabelEncoder, ]: """Try to merge the column transformer to a simple transformer.""" transformers = column_transformer.transformers_ diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index cd4ae27b8c..6ee17751df 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -24,6 +24,7 @@ import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.preprocessing._data import third_party.bigframes_vendored.sklearn.preprocessing._encoder +import third_party.bigframes_vendored.sklearn.preprocessing._label class StandardScaler( @@ -229,3 +230,121 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: bpd.DataFrame, df[self._output_names], ) + + +class LabelEncoder( + base.Transformer, + third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder, +): + # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax + TOP_K_DEFAULT = 1000000 + FREQUENCY_THRESHOLD_DEFAULT = 0 + + __doc__ = ( + third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder.__doc__ + ) + + # All estimators must implement __init__ to document their parameters, even + # if they don't have any + def __init__( + self, + min_frequency: Optional[int] = None, + max_categories: Optional[int] = None, + ): + if max_categories is not None and max_categories < 2: + raise ValueError( + f"max_categories has to be larger than or equal to 2, input is {max_categories}." + ) + self.min_frequency = min_frequency + self.max_categories = max_categories + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return ( + type(other) is LabelEncoder + and self._bqml_model == other._bqml_model + and self.min_frequency == other.min_frequency + and self.max_categories == other.max_categories + ) + + def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + """Compile this transformer to a list of SQL expressions that can be included in + a BQML TRANSFORM clause + + Args: + columns: + a list of column names to transform + + Returns: a list of tuples of (sql_expression, output_name)""" + + # minus one here since BQML's implementation always includes index 0, and top_k is on top of that. + top_k = ( + (self.max_categories - 1) + if self.max_categories is not None + else LabelEncoder.TOP_K_DEFAULT + ) + frequency_threshold = ( + self.min_frequency + if self.min_frequency is not None + else LabelEncoder.FREQUENCY_THRESHOLD_DEFAULT + ) + return [ + ( + self._base_sql_generator.ml_label_encoder( + column, top_k, frequency_threshold, f"labelencoded_{column}" + ), + f"labelencoded_{column}", + ) + for column in columns + ] + + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]: + """Parse SQL to tuple(LabelEncoder, column_label).
+ + Args: + sql: SQL string of format "ML.LABEL_ENCODER({col_label}, {top_k}, {frequency_threshold}) OVER() " + + Returns: + tuple(LabelEncoder, column_label)""" + s = sql[sql.find("(") + 1 : sql.find(")")] + col_label, top_k, frequency_threshold = s.split(", ") + max_categories = int(top_k) + 1 + min_frequency = int(frequency_threshold) + + return cls(min_frequency, max_categories), col_label + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> LabelEncoder: + (X,) = utils.convert_to_dataframe(X) + + compiled_transforms = self._compile_to_sql(X.columns.tolist()) + transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] + + self._bqml_model = self._bqml_model_factory.create_model( + X, + options={"model_type": "transform_only"}, + transforms=transform_sqls, + ) + + # The schema of TRANSFORM output is not available in the model API, so save it during fitting + self._output_names = [name for _, name in compiled_transforms] + return self + + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("Must be fitted before transform") + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.transform(X) + return typing.cast( + bpd.DataFrame, + df[self._output_names], + ) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 7cf030485b..5d35a10b96 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -88,6 +88,17 @@ def ml_one_hot_encoder( https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params.""" return f"""ML.ONE_HOT_ENCODER({numeric_expr_sql}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {name}""" + def ml_label_encoder( + self, + numeric_expr_sql: str, + top_k: int, + frequency_threshold: int, + name: str, + ) -> str: + """Encode ML.LABEL_ENCODER for BQML. + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params.""" + return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}""" + class ModelCreationSqlGenerator(BaseSqlGenerator): """Sql generator for creating a model entity.
Model id is the standalone id without project id and dataset id.""" diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index bec1a51a99..87664b4c3d 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -570,6 +570,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "label", + preprocessing.LabelEncoder(), + "species", + ), ] ), ), @@ -632,6 +637,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "label", + preprocessing.LabelEncoder(), + "species", + ), ] ), ), @@ -650,7 +660,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id assert isinstance(pl_loaded._transform, compose.ColumnTransformer) transformers = pl_loaded._transform.transformers_ - assert len(transformers) == 3 + assert len(transformers) == 4 assert transformers[0][0] == "ont_hot_encoder" assert isinstance(transformers[0][1], preprocessing.OneHotEncoder) @@ -660,13 +670,20 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id assert one_hot_encoder.max_categories == 100 assert transformers[0][2] == "species" - assert transformers[1][0] == "standard_scaler" - assert isinstance(transformers[1][1], preprocessing.StandardScaler) - assert transformers[1][2] == "culmen_length_mm" + assert transformers[1][0] == "label_encoder" + assert isinstance(transformers[1][1], preprocessing.LabelEncoder) + one_hot_encoder = transformers[1][1] + assert one_hot_encoder.min_frequency == 0 + assert one_hot_encoder.max_categories == 1000001 + assert transformers[1][2] == "species" assert transformers[2][0] == "standard_scaler" assert isinstance(transformers[2][1], preprocessing.StandardScaler) - assert transformers[2][2] == "flipper_length_mm" + assert transformers[2][2] == "culmen_length_mm" + + assert transformers[3][0] == "standard_scaler" + assert isinstance(transformers[2][1], preprocessing.StandardScaler) + assert transformers[3][2] == "flipper_length_mm" assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) assert pl_loaded._estimator.fit_intercept is False @@ -735,3 +752,37 @@ def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id): assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) assert pl_loaded._estimator.fit_intercept is False + + +def test_pipeline_label_encoder_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ( + "transform", + preprocessing.LabelEncoder(min_frequency=5, max_categories=100), + ), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "sex", + "species", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_label_encoder", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.LabelEncoder) + + label_encoder = pl_loaded._transform + assert label_encoder.min_frequency == 5 + assert label_encoder.max_categories == 100 + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 
57b9900c48..1f08ef2c2c 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -264,4 +264,153 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ pd.testing.assert_frame_equal(result, expected) +def test_label_encoder_default_params(new_penguins_df): + encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder.fit(new_penguins_df[["species", "sex"]]) + + result = encoder.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "labelencoded_sex": [ + 2, + 1, + 1, + ], + "labelencoded_species": [ + 1, + 1, + 2, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_label_encoder_default_params_fit_transform(new_penguins_df): + encoder = bigframes.ml.preprocessing.LabelEncoder() + + result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "labelencoded_sex": [ + 2, + 1, + 1, + ], + "labelencoded_species": [ + 1, + 1, + 2, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_label_encoder_series_default_params(new_penguins_df): + encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder.fit(new_penguins_df["species"]) + + result = encoder.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "labelencoded_species": [ + 1, + 1, + 2, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_label_encoder_params(new_penguins_df): + encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2) + encoder.fit(new_penguins_df[["species", "sex"]]) + + result = encoder.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "labelencoded_sex": [ + 0, + 0, + 0, + ], + "labelencoded_species": [ + 0, + 0, + 0, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + +def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df): + encoder = bigframes.ml.preprocessing.LabelEncoder() + encoder.fit(penguins_df_default_index[["species", "sex"]]) + + result = encoder.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. 
Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "labelencoded_sex": [ + 3, + 2, + 2, + ], + "labelencoded_species": [ + 1, + 1, + 2, + ], + }, + dtype="Int64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn. diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 772a148c95..c5b3b50876 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -23,10 +23,12 @@ def test_columntransformer_init_expectedtransforms(): onehot_transformer = bigframes.ml.preprocessing.OneHotEncoder() scaler_transformer = bigframes.ml.preprocessing.StandardScaler() + label_transformer = bigframes.ml.preprocessing.LabelEncoder() column_transformer = bigframes.ml.compose.ColumnTransformer( [ ("onehot", onehot_transformer, "species"), ("scale", scaler_transformer, ["culmen_length_mm", "flipper_length_mm"]), + ("onehot", label_transformer, "species"), ] ) @@ -34,6 +36,7 @@ def test_columntransformer_init_expectedtransforms(): ("onehot", onehot_transformer, "species"), ("scale", scaler_transformer, "culmen_length_mm"), ("scale", scaler_transformer, "flipper_length_mm"), + ("onehot", label_transformer, "species"), ] diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 495e8759e8..23b68aa150 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -51,12 +51,14 @@ def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenera sql = base_sql_generator.transform( "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a", "ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b", + "ML.LABEL_ENCODER(col_c) OVER(col_c) AS encoded_col_c", ) assert ( sql == """TRANSFORM( ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a, - ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b)""" + ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b, + ML.LABEL_ENCODER(col_c) OVER(col_c) AS encoded_col_c)""" ) @@ -78,6 +80,13 @@ def test_one_hot_encoder_produces_correct_sql( ) +def test_label_encoder_produces_correct_sql( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_label_encoder("col_a", 1000000, 0, "encoded_col_a") + assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a" + + def test_create_model_produces_correct_sql( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py new file mode 100644 index 0000000000..7e60c846d4 --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -0,0 +1,52 @@ +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Olivier Grisel +# Andreas Mueller +# Joel Nothman +# Hamzeh Alsalhi +# License: BSD 3 clause + +from bigframes import constants +from third_party.bigframes_vendored.sklearn.base import BaseEstimator + + +class LabelEncoder(BaseEstimator): + """Encode target labels with value between 0 and n_classes-1. + + This transformer should be used to encode target values, *i.e.* `y`, and + not the input `X`. + + Args: + min_frequency (Optional[int], default None): + Specifies the minimum frequency below which a category will be considered infrequent. + Default None. 
+ int: categories with a smaller cardinality will be considered infrequent as index 0. + max_categories (Optional[int], default None): + Specifies an upper limit to the number of output features for each input feature + when considering infrequent categories. If there are infrequent categories, + max_categories includes the category representing the infrequent categories along with the frequent categories. + Default None, set limit to 1,000,000. + """ + + def fit(self, X): + """Fit LabelEncoder to X. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series with training data. + + Returns: + LabelEncoder: Fitted encoder. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transform(self, X): + """Transform X using label encoding. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. + + Returns: + bigframes.dataframe.DataFrame: The result is an array-like of values.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From f9a93ce71d053aa17b1e3a2946c90e0227076184 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 22 Sep 2023 14:00:16 -0700 Subject: [PATCH 12/24] perf: reimplement unpivot to use cross join rather than union (#47) --- bigframes/core/__init__.py | 97 ++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 3b3754642e..ce3b1aa630 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -982,61 +982,78 @@ def unpivot( ArrayValue: The unpivoted ArrayValue """ table = self._to_ibis_expr(ordering_mode="offset_col") - sub_expressions = [] - - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type() - labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type) - row_n = len(row_labels) if not all( len(source_columns) == row_n for _, source_columns in unpivot_columns ): raise ValueError("Columns and row labels must all be same length.") - for i in range(row_n): - values = [] - for j in range(len(unpivot_columns)): - result_col, source_cols = unpivot_columns[j] - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - if source_cols[i] is not None: - values.append( - ops.AsTypeOp(col_dtype) - ._as_ibis(table[source_cols[i]]) - .name(result_col) - ) - else: - values.append( - bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ).name(result_col) - ) - offsets_value = ( - ((table[ORDER_ID_COLUMN] * row_n) + i) - .cast(ibis_dtypes.int64) - .name(ORDER_ID_COLUMN), + unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + unpivot_table = table.cross_join( + ibis.memtable({unpivot_offset_id: range(row_n)}) + ) + unpivot_offsets_value = ( + ( + (unpivot_table[ORDER_ID_COLUMN] * row_n) + + unpivot_table[unpivot_offset_id] ) - sub_expr = table.select( - passthrough_columns, + .cast(ibis_dtypes.int64) + .name(ORDER_ID_COLUMN), + ) + + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type() + labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type) + cases = [ + ( + i, bigframes.dtypes.literal_to_ibis_scalar( row_labels[i], force_dtype=labels_dtype # type:ignore - ).name(index_col_id), - *values,
- offsets_value, + ), + ) + for i in range(len(row_labels)) + ] + labels_value = ( + typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) + .cases(cases, default=None) # type:ignore + .name(index_col_id) + ) + + unpivot_values = [] + for j in range(len(unpivot_columns)): + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + result_col, source_cols = unpivot_columns[j] + null_value = bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ) + ibis_values = [ + ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col]) + if col is not None + else null_value + for col in source_cols + ] + cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] + unpivot_value = typing.cast( + ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] + ).cases( + cases, default=null_value # type:ignore ) - sub_expressions.append(sub_expr) - rotated_table = ibis.union(*sub_expressions) + unpivot_values.append(unpivot_value.name(result_col)) + + unpivot_table = unpivot_table.select( + passthrough_columns, labels_value, *unpivot_values, unpivot_offsets_value + ) value_columns = [ - rotated_table[value_col_id] for value_col_id, _ in unpivot_columns + unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns ] - passthrough_values = [rotated_table[col] for col in passthrough_columns] + passthrough_values = [unpivot_table[col] for col in passthrough_columns] return ArrayValue( session=self._session, - table=rotated_table, - columns=[rotated_table[index_col_id], *value_columns, *passthrough_values], - hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]], + table=unpivot_table, + columns=[unpivot_table[index_col_id], *value_columns, *passthrough_values], + hidden_ordering_columns=[unpivot_table[ORDER_ID_COLUMN]], ordering=ExpressionOrdering( ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), From 416d7cb9b560d7e33dcc0227f03a00d43f55ba0d Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 22 Sep 2023 17:48:02 -0700 Subject: [PATCH 13/24] perf: add ability to cache dataframe and series to session table (#51) --- bigframes/core/__init__.py | 23 +++++++++++++++++++++++ bigframes/core/blocks.py | 9 +++++++++ bigframes/dataframe.py | 3 +++ bigframes/series.py | 3 +++ tests/system/small/test_dataframe.py | 10 ++++++++++ 5 files changed, 48 insertions(+) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index ce3b1aa630..c529f83351 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -1145,6 +1145,29 @@ def slice( ) return sliced_expr if step > 0 else sliced_expr.reversed() + def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: + """Write the ArrayValue to a session table and create a new block object that references it.""" + ibis_expr = self._to_ibis_expr( + ordering_mode="unordered", expose_hidden_cols=True + ) + destination = self._session._ibis_to_session_table( + ibis_expr, cluster_cols=cluster_cols, api_name="cache" + ) + table_expression = self._session.ibis_client.sql( + f"SELECT * FROM `_SESSION`.`{destination.table_id}`" + ) + new_columns = [table_expression[column] for column in self.column_names] + new_hidden_columns = [ + table_expression[column] for column in self._hidden_ordering_column_names + ] + return ArrayValue( + self._session, + table_expression, + columns=new_columns, + hidden_ordering_columns=new_hidden_columns, + ordering=self._ordering, + ) + class ArrayValueBuilder: """Mutable expression 
class. diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index ad4f72070f..c4127c5fd5 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1467,6 +1467,15 @@ def to_sql_query( idx_labels, ) + def cached(self) -> Block: + """Write the block to a session table and create a new block object that references it.""" + return Block( + self.expr.cached(cluster_cols=self.index_columns), + index_columns=self.index_columns, + column_labels=self.column_labels, + index_labels=self.index_labels, + ) + def _is_monotonic( self, column_ids: typing.Union[str, Sequence[str]], increasing: bool ) -> bool: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 76377cd477..0d357e7c3d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2480,3 +2480,6 @@ def _set_block(self, block: blocks.Block): def _get_block(self) -> blocks.Block: return self._block + + def _cached(self) -> DataFrame: + return DataFrame(self._block.cached()) diff --git a/bigframes/series.py b/bigframes/series.py index 8e47088c14..c1c0cb0537 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1389,6 +1389,9 @@ def _slice( ), ) + def _cached(self) -> Series: + return Series(self._block.cached()) + def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: return pandas.api.types.is_list_like(obj) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3eeb368ad2..b6ca958c03 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2717,3 +2717,13 @@ def test_query_job_setters(scalars_df_default_index: dataframe.DataFrame): job_ids.add(scalars_df_default_index.query_job.job_id) assert len(job_ids) == 2 + + +def test_df_cached(scalars_df_index): + df = scalars_df_index.set_index(["int64_too", "int64_col"]).sort_values( + "string_col" + ) + df = df[df["rowindex_2"] % 2 == 0] + + df_cached_copy = df._cached() + pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) From 14b262bde2bb86093bf4df63862e369c5a84b0ad Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Mon, 25 Sep 2023 14:33:40 -0700 Subject: [PATCH 14/24] feat: add ml.preprocessing.MaxAbsScaler (#56) --- bigframes/clients.py | 3 +- bigframes/ml/compose.py | 1 + bigframes/ml/pipeline.py | 10 ++ bigframes/ml/preprocessing.py | 86 +++++++++++++- bigframes/ml/sql.py | 4 + tests/system/large/ml/test_compose.py | 24 ++-- tests/system/large/ml/test_pipeline.py | 82 ++++++++++---- tests/system/small/ml/test_preprocessing.py | 105 ++++++++++++++++-- tests/unit/ml/test_compose.py | 72 ++++++++---- tests/unit/ml/test_sql.py | 7 ++ .../sklearn/preprocessing/_data.py | 42 ++++++- .../sklearn/preprocessing/_encoder.py | 5 +- 12 files changed, 370 insertions(+), 71 deletions(-) diff --git a/bigframes/clients.py b/bigframes/clients.py index 5c019e0fc8..b60fcba04a 100644 --- a/bigframes/clients.py +++ b/bigframes/clients.py @@ -18,7 +18,7 @@ import logging import time -from typing import Optional +from typing import cast, Optional import google.api_core.exceptions from google.cloud import bigquery_connection_v1, resourcemanager_v3 @@ -80,6 +80,7 @@ def create_bq_connection( logger.info( f"Created BQ connection {connection_name} with service account id: {service_account_id}" ) + service_account_id = cast(str, service_account_id) # Ensure IAM role on the BQ connection # 
https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function self._ensure_iam_binding(project_id, service_account_id, iam_role) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index db5d8cf260..a1075c2398 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -29,6 +29,7 @@ CompilablePreprocessorType = Union[ preprocessing.OneHotEncoder, preprocessing.StandardScaler, + preprocessing.MaxAbsScaler, preprocessing.LabelEncoder, ] diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 71c21d565a..86b2099619 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -50,6 +50,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): compose.ColumnTransformer, preprocessing.StandardScaler, preprocessing.OneHotEncoder, + preprocessing.MaxAbsScaler, preprocessing.LabelEncoder, ), ): @@ -147,6 +148,7 @@ def _extract_as_column_transformer( Union[ preprocessing.OneHotEncoder, preprocessing.StandardScaler, + preprocessing.MaxAbsScaler, preprocessing.LabelEncoder, ], Union[str, List[str]], @@ -172,6 +174,13 @@ def _extract_as_column_transformer( *preprocessing.OneHotEncoder._parse_from_sql(transform_sql), ) ) + elif transform_sql.startswith("ML.MAX_ABS_SCALER"): + transformers.append( + ( + "max_abs_encoder", + *preprocessing.MaxAbsScaler._parse_from_sql(transform_sql), + ) + ) elif transform_sql.startswith("ML.LABEL_ENCODER"): transformers.append( ( @@ -193,6 +202,7 @@ def _merge_column_transformer( compose.ColumnTransformer, preprocessing.StandardScaler, preprocessing.OneHotEncoder, + preprocessing.MaxAbsScaler, preprocessing.LabelEncoder, ]: """Try to merge the column transformer to a simple transformer.""" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 6ee17751df..f4f5446651 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -54,8 +54,10 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: Returns: a list of tuples of (sql_expression, output_name)""" return [ ( - self._base_sql_generator.ml_standard_scaler(column, f"scaled_{column}"), - f"scaled_{column}", + self._base_sql_generator.ml_standard_scaler( + column, f"standard_scaled_{column}" + ), + f"standard_scaled_{column}", ) for column in columns ] @@ -105,6 +107,86 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +class MaxAbsScaler( + base.Transformer, + third_party.bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler, +): + __doc__ = ( + third_party.bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler.__doc__ + ) + + def __init__(self): + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return type(other) is MaxAbsScaler and self._bqml_model == other._bqml_model + + def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + """Compile this transformer to a list of SQL expressions that can be included in + a BQML TRANSFORM clause + + Args: + columns: a list of column names to transform + + Returns: a list of tuples of (sql_expression, output_name)""" + return [ + ( + self._base_sql_generator.ml_max_abs_scaler( + column, f"max_abs_scaled_{column}" + ), + f"max_abs_scaled_{column}", + ) + for column in columns + ] + + @classmethod + def _parse_from_sql(cls, sql: 
str) -> tuple[MaxAbsScaler, str]: + """Parse SQL to tuple(StandardScaler, column_label). + + Args: + sql: SQL string of format "ML.MAX_ABS_SCALER({col_label}) OVER()" + + Returns: + tuple(StandardScaler, column_label)""" + col_label = sql[sql.find("(") + 1 : sql.find(")")] + return cls(), col_label + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> MaxAbsScaler: + (X,) = utils.convert_to_dataframe(X) + + compiled_transforms = self._compile_to_sql(X.columns.tolist()) + transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] + + self._bqml_model = self._bqml_model_factory.create_model( + X, + options={"model_type": "transform_only"}, + transforms=transform_sqls, + ) + + # The schema of TRANSFORM output is not available in the model API, so save it during fitting + self._output_names = [name for _, name in compiled_transforms] + return self + + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("Must be fitted before transform") + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.transform(X) + return typing.cast( + bpd.DataFrame, + df[self._output_names], + ) + + class OneHotEncoder( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 5d35a10b96..a54d39e6b2 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -76,6 +76,10 @@ def ml_standard_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.STANDARD_SCALER for BQML""" return f"""ML.STANDARD_SCALER({numeric_expr_sql}) OVER() AS {name}""" + def ml_max_abs_scaler(self, numeric_expr_sql: str, name: str) -> str: + """Encode ML.MAX_ABS_SCALER for BQML""" + return f"""ML.MAX_ABS_SCALER({numeric_expr_sql}) OVER() AS {name}""" + def ml_one_hot_encoder( self, numeric_expr_sql: str, diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index b65baa63eb..0c280e5d02 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -56,20 +56,20 @@ def test_columntransformer_standalone_fit_and_transform( [{"index": 1, "value": 1.0}], [{"index": 2, "value": 1.0}], ], - "scaled_culmen_length_mm": [ + "standard_scaled_culmen_length_mm": [ -0.811119671289163, -0.9945520581113803, -1.104611490204711, ], - "scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], + "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), ) - expected.scaled_culmen_length_mm = expected.scaled_culmen_length_mm.astype( - "Float64" + expected.standard_scaled_culmen_length_mm = ( + expected.standard_scaled_culmen_length_mm.astype("Float64") ) - expected.scaled_flipper_length_mm = expected.scaled_flipper_length_mm.astype( - "Float64" + expected.standard_scaled_flipper_length_mm = ( + expected.standard_scaled_flipper_length_mm.astype("Float64") ) pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) @@ -107,20 +107,20 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): [{"index": 1, "value": 1.0}], [{"index": 2, "value": 1.0}], ], - "scaled_culmen_length_mm": [ + "standard_scaled_culmen_length_mm": [ 1.313249, -0.20198, -1.111118, ], - "scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], + "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], }, index=pandas.Index([1633, 1672, 1690], dtype="Int64", 
name="tag_number"), ) - expected.scaled_culmen_length_mm = expected.scaled_culmen_length_mm.astype( - "Float64" + expected.standard_scaled_culmen_length_mm = ( + expected.standard_scaled_culmen_length_mm.astype("Float64") ) - expected.scaled_flipper_length_mm = expected.scaled_flipper_length_mm.astype( - "Float64" + expected.standard_scaled_flipper_length_mm = ( + expected.standard_scaled_flipper_length_mm.astype("Float64") ) pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 87664b4c3d..c69a00b81c 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -566,10 +566,15 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind "species", ), ( - "scale", + "standard_scale", preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "max_abs_scale", + preprocessing.MaxAbsScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -637,6 +642,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "max_abs_scale", + preprocessing.MaxAbsScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -660,30 +670,26 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id assert isinstance(pl_loaded._transform, compose.ColumnTransformer) transformers = pl_loaded._transform.transformers_ - assert len(transformers) == 4 - - assert transformers[0][0] == "ont_hot_encoder" - assert isinstance(transformers[0][1], preprocessing.OneHotEncoder) - one_hot_encoder = transformers[0][1] - assert one_hot_encoder.drop == "most_frequent" - assert one_hot_encoder.min_frequency == 5 - assert one_hot_encoder.max_categories == 100 - assert transformers[0][2] == "species" - - assert transformers[1][0] == "label_encoder" - assert isinstance(transformers[1][1], preprocessing.LabelEncoder) - one_hot_encoder = transformers[1][1] - assert one_hot_encoder.min_frequency == 0 - assert one_hot_encoder.max_categories == 1000001 - assert transformers[1][2] == "species" - - assert transformers[2][0] == "standard_scaler" - assert isinstance(transformers[2][1], preprocessing.StandardScaler) - assert transformers[2][2] == "culmen_length_mm" + expected = [ + ( + "ont_hot_encoder", + preprocessing.OneHotEncoder( + drop="most_frequent", max_categories=100, min_frequency=5 + ), + "species", + ), + ( + "label_encoder", + preprocessing.LabelEncoder(max_categories=1000001, min_frequency=0), + "species", + ), + ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), + ("max_abs_encoder", preprocessing.MaxAbsScaler(), "culmen_length_mm"), + ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), + ("max_abs_encoder", preprocessing.MaxAbsScaler(), "flipper_length_mm"), + ] - assert transformers[3][0] == "standard_scaler" - assert isinstance(transformers[2][1], preprocessing.StandardScaler) - assert transformers[3][2] == "flipper_length_mm" + assert transformers == expected assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) assert pl_loaded._estimator.fit_intercept is False @@ -717,6 +723,34 @@ def test_pipeline_standard_scaler_to_gbq(penguins_df_default_index, dataset_id): assert pl_loaded._estimator.fit_intercept is False +def 
test_pipeline_max_abs_scaler_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ("transform", preprocessing.MaxAbsScaler()), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_standard_scaler", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.MaxAbsScaler) + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False + + def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id): pl = pipeline.Pipeline( [ diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 1f08ef2c2c..7779eb8f6e 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -47,9 +47,9 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): expected = pd.DataFrame( { - "scaled_culmen_depth_mm": [0.836148, 0.024748, 0.48116], - "scaled_culmen_length_mm": [-0.81112, -0.994552, -1.104611], - "scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], + "standard_scaled_culmen_depth_mm": [0.836148, 0.024748, 0.48116], + "standard_scaled_culmen_length_mm": [-0.81112, -0.994552, -1.104611], + "standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), @@ -76,9 +76,9 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): expected = pd.DataFrame( { - "scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], - "scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], - "scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], + "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], + "standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], + "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], }, dtype="Float64", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), @@ -107,7 +107,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui expected = pd.DataFrame( { - "scaled_culmen_length_mm": [ + "standard_scaled_culmen_length_mm": [ -0.811119671289163, -0.9945520581113803, -1.104611490204711, @@ -120,6 +120,97 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler.fit( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ) + + result = scaler.transform( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ).to_pandas() + + # If maxabs-scaled correctly, max should be 1.0 + for column in result.columns: + assert math.isclose(result[column].max(), 1.0, abs_tol=1e-3) + + result = scaler.transform(new_penguins_df).to_pandas() + + # TODO: bug? 
feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "max_abs_scaled_culmen_depth_mm": [0.874419, 0.8, 0.84186], + "max_abs_scaled_culmen_length_mm": [0.662752, 0.645973, 0.635906], + "max_abs_scaled_flipper_length_mm": [0.848485, 0.78355, 0.813853], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df): + scaler = bigframes.ml.preprocessing.MaxAbsScaler() + result = scaler.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766], + "max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494], + "max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): + scaler = bigframes.ml.preprocessing.MaxAbsScaler() + scaler.fit(penguins_df_default_index["culmen_length_mm"]) + + result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() + + # If maxabs-scaled correctly, max should be 1.0 + for column in result.columns: + assert math.isclose(result[column].max(), 1.0, abs_tol=1e-3) + + result = scaler.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "max_abs_scaled_culmen_length_mm": [0.662752, 0.645973, 0.635906], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + def test_one_hot_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index c5b3b50876..24cf0a333e 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -12,45 +12,60 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytest import sklearn.compose as sklearn_compose # type: ignore import sklearn.preprocessing as sklearn_preprocessing # type: ignore -import bigframes.ml.compose -import bigframes.ml.preprocessing +from bigframes.ml import compose, preprocessing def test_columntransformer_init_expectedtransforms(): - onehot_transformer = bigframes.ml.preprocessing.OneHotEncoder() - scaler_transformer = bigframes.ml.preprocessing.StandardScaler() - label_transformer = bigframes.ml.preprocessing.LabelEncoder() - column_transformer = bigframes.ml.compose.ColumnTransformer( + onehot_transformer = preprocessing.OneHotEncoder() + standard_scaler_transformer = preprocessing.StandardScaler() + max_abs_scaler_transformer = preprocessing.MaxAbsScaler() + label_transformer = preprocessing.LabelEncoder() + column_transformer = compose.ColumnTransformer( [ ("onehot", onehot_transformer, "species"), - ("scale", scaler_transformer, ["culmen_length_mm", "flipper_length_mm"]), - ("onehot", label_transformer, "species"), + ( + "standard_scale", + standard_scaler_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), + ( + "max_abs_scale", + max_abs_scaler_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), + ("label", label_transformer, "species"), ] ) assert column_transformer.transformers_ == [ ("onehot", onehot_transformer, "species"), - ("scale", scaler_transformer, "culmen_length_mm"), - ("scale", scaler_transformer, "flipper_length_mm"), - ("onehot", label_transformer, "species"), + ("standard_scale", standard_scaler_transformer, "culmen_length_mm"), + ("standard_scale", standard_scaler_transformer, "flipper_length_mm"), + ("max_abs_scale", max_abs_scaler_transformer, "culmen_length_mm"), + ("max_abs_scale", max_abs_scaler_transformer, "flipper_length_mm"), + ("label", label_transformer, "species"), ] def test_columntransformer_repr(): - column_transformer = bigframes.ml.compose.ColumnTransformer( + column_transformer = compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( - "scale", - bigframes.ml.preprocessing.StandardScaler(), + "standard_scale", + preprocessing.StandardScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), + ( + "max_abs_scale", + preprocessing.MaxAbsScaler(), ["culmen_length_mm", "flipper_length_mm"], ), ] @@ -59,23 +74,29 @@ def test_columntransformer_repr(): assert ( column_transformer.__repr__() == """ColumnTransformer(transformers=[('onehot', OneHotEncoder(), 'species'), - ('scale', StandardScaler(), + ('standard_scale', StandardScaler(), + ['culmen_length_mm', 'flipper_length_mm']), + ('max_abs_scale', MaxAbsScaler(), ['culmen_length_mm', 'flipper_length_mm'])])""" ) -@pytest.mark.skipif(sklearn_compose is None, reason="requires sklearn") def test_columntransformer_repr_matches_sklearn(): - bf_column_transformer = bigframes.ml.compose.ColumnTransformer( + bf_column_transformer = compose.ColumnTransformer( [ ( "onehot", - bigframes.ml.preprocessing.OneHotEncoder(), + preprocessing.OneHotEncoder(), "species", ), ( - "scale", - bigframes.ml.preprocessing.StandardScaler(), + "standard_scale", + preprocessing.StandardScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), + ( + "max_abs_scale", + preprocessing.MaxAbsScaler(), ["culmen_length_mm", "flipper_length_mm"], ), ] @@ -88,10 +109,15 @@ def test_columntransformer_repr_matches_sklearn(): "species", ), ( - "scale", + "standard_scale", sklearn_preprocessing.StandardScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + 
"max_abs_scale", + sklearn_preprocessing.MaxAbsScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 23b68aa150..c1b29c5e52 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -69,6 +69,13 @@ def test_standard_scaler_produces_correct_sql( assert sql == "ML.STANDARD_SCALER(col_a) OVER() AS scaled_col_a" +def test_max_abs_scaler_produces_correct_sql( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_max_abs_scaler("col_a", "scaled_col_a") + assert sql == "ML.MAX_ABS_SCALER(col_a) OVER() AS scaled_col_a" + + def test_one_hot_encoder_produces_correct_sql( base_sql_generator: ml_sql.BaseSqlGenerator, ): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index 89981e34c0..40b4f76ab7 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -43,13 +43,16 @@ class StandardScaler(BaseEstimator, TransformerMixin): print(scaler.transform(bpd.DataFrame({"a": [2], "b":[2]}))) """ - def fit(self, X): + def fit(self, X, y=None): """Compute the mean and std to be used for later scaling. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): The Dataframe or Series with training data. + y (default None): + Ignored. + Returns: StandardScaler: Fitted scaler. """ @@ -66,3 +69,40 @@ def transform(self, X): bigframes.dataframe.DataFrame: Transformed result. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +class MaxAbsScaler(BaseEstimator, TransformerMixin): + """Scale each feature by its maximum absolute value. + + This estimator scales and translates each feature individually such + that the maximal absolute value of each feature in the + training set will be 1.0. It does not shift/center the data, and + thus does not destroy any sparsity. + """ + + def fit(self, X, y=None): + """Compute the maximum absolute value to be used for later scaling. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The Dataframe or Series with training data. + + y (default None): + Ignored. + + Returns: + MaxAbsScaler: Fitted scaler. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transform(self, X): + """Scale the data. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. + + Returns: + bigframes.dataframe.DataFrame: Transformed result. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index b0f0df8e15..cf660ece5d 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -54,13 +54,16 @@ class OneHotEncoder(BaseEstimator): print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) """ - def fit(self, X): + def fit(self, X, y=None): """Fit OneHotEncoder to X. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): The DataFrame or Series with training data. + y (default None): + Ignored. + Returns: OneHotEncoder: Fitted encoder. 
""" From 9cf99721ed83704e6ee28b15c699326c431eb252 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 25 Sep 2023 15:46:10 -0700 Subject: [PATCH 15/24] feat: add axis param to simple df aggregations (#52) --- bigframes/core/blocks.py | 50 ++++++++++++--- bigframes/dataframe.py | 63 ++++++++++++------- tests/system/small/test_dataframe.py | 30 ++++++++- .../bigframes_vendored/pandas/core/frame.py | 42 ++++++++++--- 4 files changed, 143 insertions(+), 42 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c4127c5fd5..b53c2212c1 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -822,22 +822,54 @@ def filter(self, column_id: str, keep_null: bool = False): index_labels=self.index.names, ) - def aggregate_all_and_pivot( + def aggregate_all_and_stack( self, operation: agg_ops.AggregateOp, *, + axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, dtype=pd.Float64Dtype(), ) -> Block: - aggregations = [(col_id, operation, col_id) for col_id in self.value_columns] - result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( - row_labels=self.column_labels.to_list(), - index_col_id="index", - unpivot_columns=[(value_col_id, self.value_columns)], - dtype=dtype, - ) - return Block(result_expr, index_columns=["index"], column_labels=[None]) + axis_n = utils.get_axis_number(axis) + if axis_n == 0: + aggregations = [ + (col_id, operation, col_id) for col_id in self.value_columns + ] + result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( + row_labels=self.column_labels.to_list(), + index_col_id="index", + unpivot_columns=[(value_col_id, self.value_columns)], + dtype=dtype, + ) + return Block(result_expr, index_columns=["index"], column_labels=[None]) + else: # axis_n == 1 + # using offsets as identity to group on. 
+ # TODO: Allow to promote identity/total_order columns instead for better perf + expr_with_offsets, offset_col = self.expr.promote_offsets() + stacked_expr = expr_with_offsets.unpivot( + row_labels=self.column_labels.to_list(), + index_col_id=guid.generate_guid(), + unpivot_columns=[(value_col_id, self.value_columns)], + passthrough_columns=[*self.index_columns, offset_col], + dtype=dtype, + ) + index_aggregations = [ + (col_id, agg_ops.AnyValueOp(), col_id) + for col_id in [*self.index_columns] + ] + main_aggregation = (value_col_id, operation, value_col_id) + result_expr = stacked_expr.aggregate( + [*index_aggregations, main_aggregation], + by_column_ids=[offset_col], + dropna=dropna, + ) + return Block( + result_expr.drop_columns([offset_col]), + self.index_columns, + column_labels=[None], + index_labels=self.index_labels, + ) def select_column(self, id: str) -> Block: return self.select_columns([id]) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0d357e7c3d..e4e22e0306 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1462,41 +1462,48 @@ def dropna( def any( self, *, + axis: typing.Union[str, int] = 0, bool_only: bool = False, ) -> bigframes.series.Series: if not bool_only: frame = self._raise_on_non_boolean("any") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_pivot( - agg_ops.any_op, dtype=pandas.BooleanDtype() + block = frame._block.aggregate_all_and_stack( + agg_ops.any_op, dtype=pandas.BooleanDtype(), axis=axis ) return bigframes.series.Series(block.select_column("values")) - def all(self, *, bool_only: bool = False) -> bigframes.series.Series: + def all( + self, axis: typing.Union[str, int] = 0, *, bool_only: bool = False + ) -> bigframes.series.Series: if not bool_only: frame = self._raise_on_non_boolean("all") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_pivot( - agg_ops.all_op, dtype=pandas.BooleanDtype() + block = frame._block.aggregate_all_and_stack( + agg_ops.all_op, dtype=pandas.BooleanDtype(), axis=axis ) return bigframes.series.Series(block.select_column("values")) - def sum(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def sum( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("sum") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.sum_op) + block = frame._block.aggregate_all_and_stack(agg_ops.sum_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def mean(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def mean( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("mean") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.mean_op) + block = frame._block.aggregate_all_and_stack(agg_ops.mean_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def median( @@ -1510,47 +1517,57 @@ def median( frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.median_op) + block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) - def std(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def std( + self, axis: typing.Union[str, 
int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("std") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.std_op) + block = frame._block.aggregate_all_and_stack(agg_ops.std_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def var(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def var( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("var") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.var_op) + block = frame._block.aggregate_all_and_stack(agg_ops.var_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def min(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def min( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("min") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.min_op) + block = frame._block.aggregate_all_and_stack(agg_ops.min_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def max(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def max( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("max") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.max_op) + block = frame._block.aggregate_all_and_stack(agg_ops.max_op, axis=axis) return bigframes.series.Series(block.select_column("values")) - def prod(self, *, numeric_only: bool = False) -> bigframes.series.Series: + def prod( + self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False + ) -> bigframes.series.Series: if not numeric_only: frame = self._raise_on_non_numeric("prod") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.product_op) + block = frame._block.aggregate_all_and_stack(agg_ops.product_op, axis=axis) return bigframes.series.Series(block.select_column("values")) product = prod @@ -1560,11 +1577,11 @@ def count(self, *, numeric_only: bool = False) -> bigframes.series.Series: frame = self else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_pivot(agg_ops.count_op) + block = frame._block.aggregate_all_and_stack(agg_ops.count_op) return bigframes.series.Series(block.select_column("values")) def nunique(self) -> bigframes.series.Series: - block = self._block.aggregate_all_and_pivot(agg_ops.nunique_op) + block = self._block.aggregate_all_and_stack(agg_ops.nunique_op) return bigframes.series.Series(block.select_column("values")) def agg( @@ -1587,7 +1604,7 @@ def agg( ) else: return bigframes.series.Series( - self._block.aggregate_all_and_pivot( + self._block.aggregate_all_and_stack( agg_ops.lookup_agg_func(typing.cast(str, func)) ) ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b6ca958c03..adf17848ee 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1999,6 +1999,29 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op): pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) 
+@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.sum(axis=1, numeric_only=True)), + (lambda x: x.mean(axis=1, numeric_only=True)), + (lambda x: x.min(axis=1, numeric_only=True)), + (lambda x: x.max(axis=1, numeric_only=True)), + (lambda x: x.std(axis=1, numeric_only=True)), + (lambda x: x.var(axis=1, numeric_only=True)), + ], + ids=["sum", "mean", "min", "max", "std", "var"], +) +def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): + col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] + bf_result = op(scalars_df_index[col_names]).to_pandas() + pd_result = op(scalars_pandas_df_index[col_names]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] bf_result = scalars_df_index[col_names].median(numeric_only=True).to_pandas() @@ -2019,11 +2042,16 @@ def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): [ (lambda x: x.all(bool_only=True)), (lambda x: x.any(bool_only=True)), + (lambda x: x.all(axis=1, bool_only=True)), + (lambda x: x.any(axis=1, bool_only=True)), ], - ids=["all", "any"], + ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], ) def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later + scalars_df_index = scalars_df_index.assign( + bool_col=scalars_df_index.bool_col.fillna(False) + ) scalars_pandas_df_index = scalars_pandas_df_index.assign( bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 9d26938e08..6ce11cd7e9 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -11,7 +11,7 @@ """ from __future__ import annotations -from typing import Iterable, Literal, Mapping, Optional, Sequence, Union +from typing import Literal, Mapping, Optional, Sequence, Union import numpy as np @@ -1457,7 +1457,7 @@ def apply(self, func, *, args=(), **kwargs): # ---------------------------------------------------------------------- # ndarray-like stats methods - def any(self, *, bool_only: bool = False): + def any(self, *, axis=0, bool_only: bool = False): """ Return whether any element is True, potentially over an axis. @@ -1466,6 +1466,9 @@ def any(self, *, bool_only: bool = False): non-empty). Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. bool_only (bool. default False): Include only boolean columns. @@ -1474,7 +1477,7 @@ def any(self, *, bool_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def all(self, *, bool_only: bool = False): + def all(self, axis=0, *, bool_only: bool = False): """ Return whether all elements are True, potentially over an axis. @@ -1483,6 +1486,9 @@ def all(self, *, bool_only: bool = False): empty). Args: + axis ({index (0), columns (1)}): + Axis for the function to be applied on. + For Series this parameter is unused and defaults to 0. bool_only (bool. 
default False):
                 Include only boolean columns.

@@ -1491,11 +1497,14 @@ def all(self, *, bool_only: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

-    def prod(self, *, numeric_only: bool = False):
+    def prod(self, axis=0, *, numeric_only: bool = False):
         """
         Return the product of the values over the requested axis.

         Args:
+            axis ({index (0), columns (1)}):
+                Axis for the function to be applied on.
+                For Series this parameter is unused and defaults to 0.
             numeric_only (bool. default False):
                 Include only float, int, boolean columns.

@@ -1504,13 +1513,16 @@ def prod(self, *, numeric_only: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

-    def min(self, *, numeric_only: bool = False):
+    def min(self, axis=0, *, numeric_only: bool = False):
         """Return the minimum of the values over the requested axis.

         If you want the *index* of the minimum, use ``idxmin``. This is
         the equivalent of the ``numpy.ndarray`` method ``argmin``.

         Args:
+            axis ({index (0), columns (1)}):
+                Axis for the function to be applied on.
+                For Series this parameter is unused and defaults to 0.
             numeric_only (bool, default False):
                 Default False. Include only float, int, boolean columns.

@@ -1519,13 +1531,16 @@ def min(self, *, numeric_only: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

-    def max(self, *, numeric_only: bool = False):
+    def max(self, axis=0, *, numeric_only: bool = False):
         """Return the maximum of the values over the requested axis.

         If you want the *index* of the maximum, use ``idxmax``. This is
         the equivalent of the ``numpy.ndarray`` method ``argmax``.

         Args:
+            axis ({index (0), columns (1)}):
+                Axis for the function to be applied on.
+                For Series this parameter is unused and defaults to 0.
             numeric_only (bool. default False):
                 Default False. Include only float, int, boolean columns.

@@ -1534,12 +1549,15 @@ def max(self, *, numeric_only: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

-    def sum(self, *, numeric_only: bool = False):
+    def sum(self, axis=0, *, numeric_only: bool = False):
         """Return the sum of the values over the requested axis.

         This is equivalent to the method ``numpy.sum``.

         Args:
+            axis ({index (0), columns (1)}):
+                Axis for the function to be applied on.
+                For Series this parameter is unused and defaults to 0.
             numeric_only (bool. default False):
                 Default False. Include only float, int, boolean columns.

@@ -1548,10 +1566,13 @@ def sum(self, *, numeric_only: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

-    def mean(self, *, numeric_only: bool = False):
+    def mean(self, axis=0, *, numeric_only: bool = False):
         """Return the mean of the values over the requested axis.

         Args:
+            axis ({index (0), columns (1)}):
+                Axis for the function to be applied on.
+                For Series this parameter is unused and defaults to 0.
             numeric_only (bool. default False):
                 Default False. Include only float, int, boolean columns.

@@ -1575,12 +1596,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

-    def var(self, *, numeric_only: bool = False):
+    def var(self, axis=0, *, numeric_only: bool = False):
         """Return unbiased variance over requested axis.

         Normalized by N-1 by default.

         Args:
+            axis ({index (0), columns (1)}):
+                Axis for the function to be applied on.
+                For Series this parameter is unused and defaults to 0.
             numeric_only (bool.
default False): Default False. Include only float, int, boolean columns. From d56258cbfcda168cb9e437a021e282818d622d6a Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 26 Sep 2023 01:03:24 +0000 Subject: [PATCH 16/24] fix: Fix header skipping logic in `read_csv` (#49) Change-Id: Ib575e2c2b07f819d1dc499a271fea91107fbb8b4 --- bigframes/session.py | 7 +++---- tests/system/small/test_session.py | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/bigframes/session.py b/bigframes/session.py index 04ae6ba454..7b827c7dcf 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -1050,11 +1050,10 @@ def read_csv( # We want to match pandas behavior. If header is 0, no rows should be skipped, so we # do not need to set `skip_leading_rows`. If header is None, then there is no header. # Setting skip_leading_rows to 0 does that. If header=N and N>0, we want to skip N rows. - # `skip_leading_rows` skips N-1 rows, so we set it to header+1. - if header is not None and header > 0: - job_config.skip_leading_rows = header + 1 - elif header is None: + if header is None: job_config.skip_leading_rows = 0 + elif header > 0: + job_config.skip_leading_rows = header return self._read_bigquery_load_job( filepath_or_buffer, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index b7bee16ffd..614c953764 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -578,9 +578,12 @@ def test_read_csv_gcs_bq_engine_w_header(session, scalars_df_index, gcs_folder): path = gcs_folder + "test_read_csv_gcs_bq_engine_w_header*.csv" scalars_df_index.to_csv(path, index=False) - # Skip the header and the first 2 data rows. Without provided schema, the column names - # would be like `bool_field_0`, `string_field_1` and etc. - df = session.read_csv(path, header=2, engine="bigquery") + # Skip the header and the first 2 data rows. Note that one line of header + # also got added while writing the csv through `to_csv`, so we would have to + # pass headers=3 in the `read_csv` to skip reading the header and two rows. + # Without provided schema, the column names would be like `bool_field_0`, + # `string_field_1` and etc. + df = session.read_csv(path, header=3, engine="bigquery") assert df.shape[0] == scalars_df_index.shape[0] - 2 assert len(df.columns) == len(scalars_df_index.columns) @@ -609,9 +612,12 @@ def test_read_csv_local_bq_engine_w_header(session, scalars_pandas_df_index): # Using the pandas to_csv method because the BQ one does not support local write. scalars_pandas_df_index.to_csv(path, index=False) - # Skip the header and the first 2 data rows. Without provided schema, the column names - # would be like `bool_field_0`, `string_field_1` and etc. - df = session.read_csv(path, header=2, engine="bigquery") + # Skip the header and the first 2 data rows. Note that one line of + # header also got added while writing the csv through `to_csv`, so we + # would have to pass headers=3 in the `read_csv` to skip reading the + # header and two rows. Without provided schema, the column names would + # be like `bool_field_0`, `string_field_1` and etc. 
+ df = session.read_csv(path, header=3, engine="bigquery") assert df.shape[0] == scalars_pandas_df_index.shape[0] - 2 assert len(df.columns) == len(scalars_pandas_df_index.columns) From 632caec420a7e23188f01b96a00c354d205da74e Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 25 Sep 2023 21:17:11 -0700 Subject: [PATCH 17/24] fix: LabelEncoder params consistent with Sklearn (#60) * fix: LabelEncoder params consistent with Sklearn * fix:add LabelTransformer * fix: address comments for base LabelTransformer * fix: type for params --- bigframes/ml/base.py | 20 +++++++++++++ bigframes/ml/preprocessing.py | 17 +++++------ tests/system/small/ml/test_preprocessing.py | 30 ++++--------------- .../sklearn/preprocessing/_label.py | 12 ++++---- 4 files changed, 39 insertions(+), 40 deletions(-) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index f07274f8fc..f899ac7119 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -195,3 +195,23 @@ def fit_transform( y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, ) -> bpd.DataFrame: return self.fit(X, y).transform(X) + + +class LabelTransformer(BaseEstimator): + """A BigQuery DataFrames Label Transformer base class that transforms data. + + Also the transformers can be attached to a pipeline with a predictor.""" + + @abc.abstractmethod + def fit(self, y): + pass + + @abc.abstractmethod + def transform(self, y): + pass + + def fit_transform( + self, + y: Union[bpd.DataFrame, bpd.Series], + ) -> bpd.DataFrame: + return self.fit(y).transform(y) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index f4f5446651..ed0b36deef 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -315,7 +315,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: class LabelEncoder( - base.Transformer, + base.LabelTransformer, third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder, ): # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax @@ -401,16 +401,15 @@ def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y=None, # ignored + y: Union[bpd.DataFrame, bpd.Series], ) -> LabelEncoder: - (X,) = utils.convert_to_dataframe(X) + (y,) = utils.convert_to_dataframe(y) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) + compiled_transforms = self._compile_to_sql(y.columns.tolist()) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] self._bqml_model = self._bqml_model_factory.create_model( - X, + y, options={"model_type": "transform_only"}, transforms=transform_sqls, ) @@ -419,13 +418,13 @@ def fit( self._output_names = [name for _, name in compiled_transforms] return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (y,) = utils.convert_to_dataframe(y) - df = self._bqml_model.transform(X) + df = self._bqml_model.transform(y) return typing.cast( bpd.DataFrame, df[self._output_names], diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 7779eb8f6e..61bddb144d 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ 
b/tests/system/small/ml/test_preprocessing.py @@ -357,9 +357,9 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ def test_label_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - encoder.fit(new_penguins_df[["species", "sex"]]) + encoder.fit(new_penguins_df["species"]) - result = encoder.transform(new_penguins_df).to_pandas() + result = encoder.transform(new_penguins_df["species"]).to_pandas() # TODO: bug? feature columns seem to be in nondeterministic random order # workaround: sort columns by name. Can't repro it in pantheon, so could @@ -368,11 +368,6 @@ def test_label_encoder_default_params(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 2, - 1, - 1, - ], "labelencoded_species": [ 1, 1, @@ -389,7 +384,7 @@ def test_label_encoder_default_params(new_penguins_df): def test_label_encoder_default_params_fit_transform(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() + result = encoder.fit_transform(new_penguins_df[["species"]]).to_pandas() # TODO: bug? feature columns seem to be in nondeterministic random order # workaround: sort columns by name. Can't repro it in pantheon, so could @@ -398,11 +393,6 @@ def test_label_encoder_default_params_fit_transform(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 2, - 1, - 1, - ], "labelencoded_species": [ 1, 1, @@ -444,7 +434,7 @@ def test_label_encoder_series_default_params(new_penguins_df): def test_label_encoder_params(new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder(100, 2) - encoder.fit(new_penguins_df[["species", "sex"]]) + encoder.fit(new_penguins_df[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -455,11 +445,6 @@ def test_label_encoder_params(new_penguins_df): expected = pd.DataFrame( { - "labelencoded_sex": [ - 0, - 0, - 0, - ], "labelencoded_species": [ 0, 0, @@ -475,7 +460,7 @@ def test_label_encoder_params(new_penguins_df): def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df): encoder = bigframes.ml.preprocessing.LabelEncoder() - encoder.fit(penguins_df_default_index[["species", "sex"]]) + encoder.fit(penguins_df_default_index[["species"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -486,11 +471,6 @@ def test_label_encoder_different_data(penguins_df_default_index, new_penguins_df expected = pd.DataFrame( { - "labelencoded_sex": [ - 3, - 2, - 2, - ], "labelencoded_species": [ 1, 1, diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index 7e60c846d4..83f8eb0f9c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -28,11 +28,11 @@ class LabelEncoder(BaseEstimator): Default None, set limit to 1,000,000. """ - def fit(self, X): - """Fit LabelEncoder to X. + def fit(self, y): + """Fit label encoder. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series): The DataFrame or Series with training data. Returns: @@ -40,11 +40,11 @@ def fit(self, X): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def transform(self, X): - """Transform X using label encoding. + def transform(self, y): + """Transform y using label encoding. 
Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series): The DataFrame or Series to be transformed. Returns: From 3502f835b35c437933430698e7a1c9badaddcb99 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 26 Sep 2023 12:09:25 -0700 Subject: [PATCH 18/24] feat: support casting string to integer or float (#59) --- bigframes/dtypes.py | 2 +- tests/system/small/test_series.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 271b8aa2f2..59d3007fab 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -313,7 +313,7 @@ def cast_ibis_value( ibis_dtypes.string, ), ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), - ibis_dtypes.string: (), + ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), ibis_dtypes.date: (), ibis_dtypes.time: (), ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),), diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d702049e68..588dcc2c83 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2389,6 +2389,29 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): pd.testing.assert_series_equal(bf_result, pd_result) +def test_string_astype_int(): + pd_series = pd.Series(["4", "-7", "0", " -03"]) + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype("Int64") + bf_result = bf_series.astype("Int64").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_string_astype_float(): + pd_series = pd.Series( + ["1", "-1", "-0", "000", " -03.235", "naN", "-inf", "INf", ".33", "7.235e-8"] + ) + + bf_series = series.Series(pd_series) + + pd_result = pd_series.astype("Float64") + bf_result = bf_series.astype("Float64").to_pandas() + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + @pytest.mark.parametrize( "index", [0, 5, -2], From a6e32aa875370063c48ce7922c2aa369a770bd30 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 26 Sep 2023 13:40:07 -0700 Subject: [PATCH 19/24] feat: Add more index methods (#54) --- bigframes/core/block_transforms.py | 10 +- bigframes/core/indexes/index.py | 118 +++++++++++++-- bigframes/dataframe.py | 7 +- bigframes/series.py | 6 +- tests/system/small/test_index.py | 120 ++++++++++++++++ .../pandas/core/indexes/base.py | 135 ++++++++++++++++++ 6 files changed, 373 insertions(+), 23 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index da6ba65b8a..d22112417c 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -218,13 +218,17 @@ def rank( return block.select_columns(rownum_col_ids).with_column_labels(labels) -def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"): +def dropna( + block: blocks.Block, + column_ids: typing.Sequence[str], + how: typing.Literal["all", "any"] = "any", +): """ Drop na entries from block """ if how == "any": filtered_block = block - for column in block.value_columns: + for column in column_ids: filtered_block, result_id = filtered_block.apply_unary_op( column, ops.notnull_op ) @@ -234,7 +238,7 @@ def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"): else: # "all" filtered_block = block predicate = None - for column in block.value_columns: + for column in column_ids: filtered_block, 
partial_predicate = filtered_block.apply_unary_op( column, ops.notnull_op ) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index c08c851c91..f211afe4d5 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -24,8 +24,10 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.joins as joins +import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dtypes import bigframes.dtypes as bf_dtypes @@ -149,6 +151,27 @@ def has_duplicates(self) -> bool: def _block(self) -> blocks.Block: return self._data._get_block() + @property + def T(self) -> Index: + return self.transpose() + + def transpose(self) -> Index: + return self + + def sort_values(self, *, ascending: bool = True, na_position: str = "last"): + if na_position not in ["first", "last"]: + raise ValueError("Param na_position must be one of 'first' or 'last'") + direction = ( + order.OrderingDirection.ASC if ascending else order.OrderingDirection.DESC + ) + na_last = na_position == "last" + index_columns = self._block.index_columns + ordering = [ + order.OrderingColumnReference(column, direction=direction, na_last=na_last) + for column in index_columns + ] + return Index._from_block(self._block.order_by(ordering)) + def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], @@ -176,6 +199,57 @@ def max(self) -> typing.Any: def min(self) -> typing.Any: return self._apply_aggregation(agg_ops.min_op) + def argmax(self) -> int: + block, row_nums = self._block.promote_offsets() + block = block.order_by( + [ + *[ + order.OrderingColumnReference( + col, direction=order.OrderingDirection.DESC + ) + for col in self._block.index_columns + ], + order.OrderingColumnReference(row_nums), + ] + ) + import bigframes.series as series + + return typing.cast(int, series.Series(block.select_column(row_nums)).iloc[0]) + + def argmin(self) -> int: + block, row_nums = self._block.promote_offsets() + block = block.order_by( + [ + *[ + order.OrderingColumnReference(col) + for col in self._block.index_columns + ], + order.OrderingColumnReference(row_nums), + ] + ) + import bigframes.series as series + + return typing.cast(int, series.Series(block.select_column(row_nums)).iloc[0]) + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + *, + dropna: bool = True, + ): + block = block_ops.value_counts( + self._block, + self._block.index_columns, + normalize=normalize, + ascending=ascending, + dropna=dropna, + ) + import bigframes.series as series + + return series.Series(block) + def fillna(self, value=None) -> Index: if self.nlevels > 1: raise TypeError("Multiindex does not support 'fillna'") @@ -185,10 +259,7 @@ def rename(self, name: Union[str, Sequence[str]]) -> Index: names = [name] if isinstance(name, str) else list(name) if len(names) != self.nlevels: raise ValueError("'name' must be same length as levels") - - import bigframes.dataframe as df - - return Index(df.DataFrame(self._block.with_index_labels(names))) + return Index._from_block(self._block.with_index_labels(names)) def drop( self, @@ -210,9 +281,28 @@ def drop( ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) - import bigframes.dataframe as df + return Index._from_block(block) + + def dropna(self, how: str = "any") -> Index: + if how not in ("any", 
"all"): + raise ValueError("'how' must be one of 'any', 'all'") + result = block_ops.dropna(self._block, self._block.index_columns, how=how) # type: ignore + return Index._from_block(result) + + def drop_duplicates(self, *, keep: str = "first") -> Index: + block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep) + return Index._from_block(block) + + def isin(self, values) -> Index: + if not utils.is_list_like(values): + raise TypeError( + "only list-like objects are allowed to be passed to " + f"isin(), you passed a [{type(values).__name__}]" + ) - return Index(df.DataFrame(block.select_columns([]))) + return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna( + value=False + ) def _apply_unary_op( self, @@ -226,9 +316,7 @@ def _apply_unary_op( result_ids.append(result_id) block = block.set_index(result_ids, index_labels=self._block.index_labels) - import bigframes.dataframe as df - - return Index(df.DataFrame(block)) + return Index._from_block(block) def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any: if self.nlevels > 1: @@ -262,6 +350,12 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: def __len__(self): return self.shape[0] + @classmethod + def _from_block(cls, block: blocks.Block) -> Index: + import bigframes.dataframe as df + + return Index(df.DataFrame(block)) + class IndexValue: """An immutable index.""" @@ -356,12 +450,6 @@ def resolve_level_name(self: IndexValue, label: blocks.Label) -> str: def is_uniquely_named(self: IndexValue): return len(set(self.names)) == len(self.names) - def _set_block(self, block: blocks.Block): - self._block = block - - def _get_block(self) -> blocks.Block: - return self._block - def join_mono_indexed( left: IndexValue, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index e4e22e0306..113355589b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1440,7 +1440,7 @@ def dropna( axis_n = utils.get_axis_number(axis) if axis_n == 0: - result = block_ops.dropna(self._block, how=how) # type: ignore + result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore if ignore_index: result = result.reset_index() return DataFrame(result) @@ -1674,7 +1674,10 @@ def pivot( def stack(self): # TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack. 
# TODO: match impl to pandas future_stack as described in pandas 2.1 release notes - result_block = block_ops.dropna(self._block.stack(), how="all") + stack_block = self._block.stack() + result_block = block_ops.dropna( + stack_block, stack_block.value_columns, how="all" + ) if not isinstance(self.columns, pandas.MultiIndex): return bigframes.series.Series(result_block) return DataFrame(result_block) diff --git a/bigframes/series.py b/bigframes/series.py index c1c0cb0537..47298d59f5 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -459,7 +459,7 @@ def dropna( ) -> Series: if inplace: raise NotImplementedError("'inplace'=True not supported") - result = block_ops.dropna(self._block, how="any") + result = block_ops.dropna(self._block, [self._value_column], how="any") if ignore_index: result = result.reset_index() return Series(result) @@ -856,7 +856,7 @@ def clip(self, lower, upper): ) return Series(block.select_column(result_id).with_column_labels([self.name])) - def argmax(self) -> scalars.Scalar: + def argmax(self) -> int: block, row_nums = self._block.promote_offsets() block = block.order_by( [ @@ -870,7 +870,7 @@ def argmax(self) -> scalars.Scalar: scalars.Scalar, Series(block.select_column(row_nums)).iloc[0] ) - def argmin(self) -> scalars.Scalar: + def argmin(self) -> int: block, row_nums = self._block.promote_offsets() block = block.order_by( [ diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 7f09e3a9d5..f7fa0f0855 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -14,6 +14,7 @@ import numpy import pandas as pd +import pytest from tests.system.utils import assert_pandas_index_equal_ignore_index_type @@ -174,3 +175,122 @@ def test_is_monotonic_decreasing(scalars_df_index, scalars_pandas_df_index): scalars_df_index.index.is_monotonic_increasing == scalars_pandas_df_index.index.is_monotonic_increasing ) + + +def test_index_argmin(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + pytest.skip("doesn't work in pandas 1.x.") + bf_result = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index.argmin() + pd_result = scalars_pandas_df_index.set_index( + ["int64_too", "rowindex_2"] + ).index.argmin() + assert bf_result == pd_result + + +def test_index_argmax(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + pytest.skip("doesn't work in pandas 1.x.") + bf_result = scalars_df_index.set_index(["int64_too", "rowindex_2"]).index.argmax() + pd_result = scalars_pandas_df_index.set_index( + ["int64_too", "rowindex_2"] + ).index.argmax() + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("ascending", "na_position"), + [ + (True, "first"), + (True, "last"), + (False, "first"), + (False, "last"), + ], +) +def test_index_sort_values( + scalars_df_index, scalars_pandas_df_index, ascending, na_position +): + # Test needs values to be unique + bf_result = ( + scalars_df_index.set_index(["int64_too", "rowindex_2"]) + .index.sort_values(ascending=ascending, na_position=na_position) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index( + ["int64_too", "rowindex_2"] + ).index.sort_values(ascending=ascending, na_position=na_position) + + pd.testing.assert_index_equal( + bf_result, + pd_result, + ) + + +def test_index_value_counts(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + pytest.skip("value_counts results different in pandas 1.x.") + bf_result = ( + 
scalars_df_index.set_index(["int64_too", "rowindex_2"]) + .index.value_counts() + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index( + ["int64_too", "rowindex_2"] + ).index.value_counts() + + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("how",), + [ + ("any",), + ("all",), + ], +) +def test_index_dropna(scalars_df_index, scalars_pandas_df_index, how): + bf_result = ( + scalars_df_index.set_index(["int64_col", "float64_col"]) + .index.dropna(how=how) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index( + ["int64_col", "float64_col"] + ).index.dropna(how=how) + pd.testing.assert_index_equal(pd_result, bf_result) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_index_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep): + bf_series = ( + scalars_df_index.set_index("int64_col") + .index.drop_duplicates(keep=keep) + .to_pandas() + ) + pd_series = scalars_pandas_df_index.set_index("int64_col").index.drop_duplicates( + keep=keep + ) + pd.testing.assert_index_equal( + pd_series, + bf_series, + ) + + +def test_index_isin(scalars_df_index, scalars_pandas_df_index): + bf_series = ( + scalars_df_index.set_index("int64_col").index.isin([2, 55555, 4]).to_pandas() + ) + pd_result_array = scalars_pandas_df_index.set_index("int64_col").index.isin( + [2, 55555, 4] + ) + pd.testing.assert_index_equal( + pd.Index(pd_result_array), + bf_series, + check_names=False, + ) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index f89964e220..e8737341a3 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -53,6 +53,20 @@ def dtypes(self): """Return the dtypes as a Series for the underlying MultiIndex.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def T(self) -> Index: + """Return the transpose, which is by definition self.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transpose(self) -> Index: + """ + Return the transpose, which is by definition self. + + Returns: + Index + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def astype(self, dtype): """Create an Index with values cast to dtypes. @@ -67,6 +81,23 @@ def astype(self, dtype): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def isin(self, values): + """ + Return a boolean array where the index values are in `values`. + + Compute boolean array of whether each index value is found in the + passed set of values. The length of the returned boolean array matches + the length of the index. + + Args: + values (set or list-like): + Sought values. + + Returns: + Series: Series of boolean values. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def all(self) -> bool: """Return whether all elements are Truthy. @@ -99,6 +130,30 @@ def max(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def argmin(self) -> int: + """ + Return int position of the smallest value in the Series. + + If the minimum is achieved in multiple locations, + the first row position is returned. + + Returns: + int: Row position of the minimum value. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def argmax(self) -> int: + """ + Return int position of the largest value in the Series. + + If the maximum is achieved in multiple locations, + the first row position is returned. + + Returns: + int: Row position of the maximum value. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self) -> int: """Return number of unique elements in the object. @@ -109,6 +164,57 @@ def nunique(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def sort_values( + self, *, ascending: bool = True, na_position: str = "last" + ) -> Index: + """ + Return a sorted copy of the index. + + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. + + Args: + ascending (bool, default True): + Should the index values be sorted in an ascending order. + na_position ({'first' or 'last'}, default 'last'): + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + Returns: + pandas.Index: Sorted copy of the index. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def value_counts( + self, + normalize: bool = True, + sort: bool = True, + ascending: bool = False, + *, + dropna: bool = True, + ): + """Return a Series containing counts of unique values. + + The resulting object will be in descending order so that the + first element is the most frequently-occurring element. + Excludes NA values by default. + + Args: + normalize (bool, default False): + If True then the object returned will contain the relative + frequencies of the unique values. + sort (bool, default True): + Sort by frequencies. + ascending (bool, default False): + Sort in ascending order. + dropna (bool, default True): + Don't include counts of NaN. + + Returns: + Series + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def fillna(self, value) -> Index: """ Fill NA/NaN values with the specified value. @@ -151,6 +257,35 @@ def drop(self, labels) -> Index: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def dropna(self, how: str = "any"): + """Return Index without NA/NaN values. + + Args: + how ({'any', 'all'}, default 'any'): + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns: + Index + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def drop_duplicates(self, *, keep: str = "first"): + """ + Return Index with duplicate values removed. + + Args: + keep ({'first', 'last', ``False``}, default 'first'): + One of: + 'first' : Drop duplicates except for the first occurrence. + 'last' : Drop duplicates except for the last occurrence. + ``False`` : Drop all duplicates. + + Returns: + Index + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_numpy(self, dtype): """ A NumPy ndarray representing the values in this Series or Index. 
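The Index methods added in the patch above (argmin/argmax, sort_values, value_counts, dropna, drop_duplicates, isin) follow their pandas counterparts. A minimal usage sketch, not part of the patch itself; it assumes an active BigQuery session and uses the public penguins sample table purely for illustration:

    import bigframes.pandas as bpd

    # Load sample data and build an index over a numeric column.
    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
    idx = df.set_index("body_mass_g").index

    idx.sort_values(ascending=False)   # sorted copy of the index
    idx.argmax()                       # row position of the largest value
    idx.value_counts(dropna=False)     # counts per unique index value
    idx.dropna(how="any")              # drop NA entries
    idx.drop_duplicates(keep="first")  # remove duplicate values
    idx.isin([3750, 3800])             # boolean membership per value

From e804e130c218d8ac81a8fc0a853eeb7a93884a50 Mon Sep 17 00:00:00 2001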
From e804e130c218d8ac81a8fc0a853eeb7a93884a50 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 26 Sep 2023 14:42:40 -0700 Subject: [PATCH 20/24] refactor: push down SQL generate logic in core.BqmlModelFactory (#62) --- bigframes/ml/core.py | 33 ++++++------------------ bigframes/ml/sql.py | 23 ++++++++++++----- tests/unit/ml/test_sql.py | 54 +++++++++++++++++++++++++++++---------- 3 files changed, 64 insertions(+), 46 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 70be0d35ee..37478d8baf 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -251,17 +251,10 @@ def create_model( session = X_train._session - source_sql = input_data.sql - options_sql = self._model_creation_sql_generator.options(**options) - transform_sql = ( - self._model_creation_sql_generator.transform(*transforms) - if transforms is not None - else None - ) sql = self._model_creation_sql_generator.create_model( - source_sql=source_sql, - transform_sql=transform_sql, - options_sql=options_sql, + source=input_data, + transforms=transforms, + options=options, ) return self._create_model_with_sql(session=session, sql=sql) @@ -287,18 +280,10 @@ def create_time_series_model( session = X_train._session - source_sql = input_data.sql - options_sql = self._model_creation_sql_generator.options(**options) - - transform_sql = ( - self._model_creation_sql_generator.transform(*transforms) - if transforms is not None - else None - ) sql = self._model_creation_sql_generator.create_model( - source_sql=source_sql, - transform_sql=transform_sql, - options_sql=options_sql, + source=input_data, + transforms=transforms, + options=options, ) return self._create_model_with_sql(session=session, sql=sql) @@ -320,10 +305,9 @@ def create_remote_model( Returns: BqmlModel: a BqmlModel wrapping a trained model in BigQuery """ - options_sql = self._model_creation_sql_generator.options(**options) sql = self._model_creation_sql_generator.create_remote_model( connection_name=connection_name, - options_sql=options_sql, + options=options, ) return self._create_model_with_sql(session=session, sql=sql) @@ -341,9 +325,8 @@ def create_imported_model( Returns: a BqmlModel, wrapping a trained model in BigQuery """ - options_sql = self._model_creation_sql_generator.options(**options) sql = self._model_creation_sql_generator.create_imported_model( - options_sql=options_sql, + options=options, ) return self._create_model_with_sql(session=session, sql=sql) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index a54d39e6b2..c1b4f46e18 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -16,9 +16,10 @@ Generates SQL queries needed for BigQuery DataFrames ML """ -from typing import Iterable, Optional, Union +from typing import Iterable, Mapping, Optional, Union import bigframes.constants as constants +import bigframes.pandas as bpd class BaseSqlGenerator: @@ -113,11 +114,15 @@ def __init__(self, model_id: str): # Model create and alter def create_model( self, - source_sql: str, - transform_sql: Optional[str] = None, - options_sql: Optional[str] = None, + source: bpd.DataFrame, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + transforms: Optional[Iterable[str]] = None, ) -> str: """Encode the CREATE TEMP MODEL statement for BQML""" + source_sql = source.sql + transform_sql = self.transform(*transforms) if transforms is not None else None + options_sql = self.options(**options) + parts = [f"CREATE TEMP MODEL `{self._model_id}`"] if transform_sql: 
parts.append(transform_sql) @@ -129,9 +134,11 @@ def create_model( def create_remote_model( self, connection_name: str, - options_sql: Optional[str] = None, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> str: """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + options_sql = self.options(**options) + parts = [f"CREATE TEMP MODEL `{self._model_id}`"] parts.append(self.connection(connection_name)) if options_sql: @@ -140,9 +147,11 @@ def create_remote_model( def create_imported_model( self, - options_sql: Optional[str] = None, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> str: """Encode the CREATE TEMP MODEL statement for BQML remote model.""" + options_sql = self.options(**options) + parts = [f"CREATE TEMP MODEL `{self._model_id}`"] if options_sql: parts.append(options_sql) @@ -150,7 +159,7 @@ def create_imported_model( class ModelManipulationSqlGenerator(BaseSqlGenerator): - """Sql generator for manipulating a model entity. Model name is the fully model path of project_id.dataset_id.model_id.""" + """Sql generator for manipulating a model entity. Model name is the full model path of project_id.dataset_id.model_id.""" def __init__(self, model_name: str): self._model_name = model_name diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index c1b29c5e52..b88523c7ef 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import pytest import bigframes.ml.sql as ml_sql +import bigframes.pandas as bpd @pytest.fixture(scope="session") @@ -34,6 +37,14 @@ def model_manipulation_sql_generator() -> ml_sql.ModelManipulationSqlGenerator: ) +@pytest.fixture(scope="session") +def mock_df(): + mock_df = mock.create_autospec(spec=bpd.DataFrame) + mock_df.sql = "input_X_y_sql" + + return mock_df + + def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator): sql = base_sql_generator.options( model_type="lin_reg", input_label_cols=["col_a"], l1_reg=0.6 @@ -96,33 +107,44 @@ def test_label_encoder_produces_correct_sql( def test_create_model_produces_correct_sql( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, + mock_df: bpd.DataFrame, ): sql = model_creation_sql_generator.create_model( - source_sql="my_source_sql", - options_sql="my_options_sql", + source=mock_df, + options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( sql == """CREATE TEMP MODEL `my_model_id` -my_options_sql -AS my_source_sql""" +OPTIONS( + option_key1="option_value1", + option_key2=2) +AS input_X_y_sql""" ) def test_create_model_transform_produces_correct_sql( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, + mock_df: bpd.DataFrame, ): sql = model_creation_sql_generator.create_model( - source_sql="my_source_sql", - options_sql="my_options_sql", - transform_sql="my_transform_sql", + source=mock_df, + options={"option_key1": "option_value1", "option_key2": 2}, + transforms=[ + "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a", + "ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b", + ], ) assert ( sql == """CREATE TEMP MODEL `my_model_id` -my_transform_sql -my_options_sql -AS my_source_sql""" +TRANSFORM( + ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a, + ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b) +OPTIONS( + option_key1="option_value1", + option_key2=2) +AS input_X_y_sql""" ) @@ -131,13 
+153,15 @@ def test_create_remote_model_produces_correct_sql( ): sql = model_creation_sql_generator.create_remote_model( connection_name="my_project.us.my_connection", - options_sql="my_options_sql", + options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( sql == """CREATE TEMP MODEL `my_model_id` REMOTE WITH CONNECTION `my_project.us.my_connection` -my_options_sql""" +OPTIONS( + option_key1="option_value1", + option_key2=2)""" ) @@ -145,12 +169,14 @@ def test_create_imported_model_produces_correct_sql( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_imported_model( - options_sql="my_options_sql", + options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( sql == """CREATE TEMP MODEL `my_model_id` -my_options_sql""" +OPTIONS( + option_key1="option_value1", + option_key2=2)""" ) From 392113b70d6a8c407accbb6684d75b31261e3741 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Tue, 26 Sep 2023 16:25:59 -0700 Subject: [PATCH 21/24] feat: add ml.preprocessing.MinMaxScaler (#64) * feat: add ml.preprocessing.MinMaxScaler * fix comments and typo * add test check for min value * nit fix --- bigframes/ml/compose.py | 1 + bigframes/ml/pipeline.py | 12 ++- bigframes/ml/preprocessing.py | 84 ++++++++++++++++- bigframes/ml/sql.py | 4 + tests/system/large/ml/test_pipeline.py | 46 ++++++++- tests/system/small/ml/test_preprocessing.py | 93 +++++++++++++++++++ tests/unit/ml/test_compose.py | 25 +++++ tests/unit/ml/test_sql.py | 7 ++ .../sklearn/preprocessing/_data.py | 36 +++++++ 9 files changed, 302 insertions(+), 6 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index a1075c2398..9effbf1968 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -30,6 +30,7 @@ preprocessing.OneHotEncoder, preprocessing.StandardScaler, preprocessing.MaxAbsScaler, + preprocessing.MinMaxScaler, preprocessing.LabelEncoder, ] diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 86b2099619..ac02c39112 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -51,6 +51,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): preprocessing.StandardScaler, preprocessing.OneHotEncoder, preprocessing.MaxAbsScaler, + preprocessing.MinMaxScaler, preprocessing.LabelEncoder, ), ): @@ -149,6 +150,7 @@ def _extract_as_column_transformer( preprocessing.OneHotEncoder, preprocessing.StandardScaler, preprocessing.MaxAbsScaler, + preprocessing.MinMaxScaler, preprocessing.LabelEncoder, ], Union[str, List[str]], @@ -177,10 +179,17 @@ def _extract_as_column_transformer( elif transform_sql.startswith("ML.MAX_ABS_SCALER"): transformers.append( ( - "max_abs_encoder", + "max_abs_scaler", *preprocessing.MaxAbsScaler._parse_from_sql(transform_sql), ) ) + elif transform_sql.startswith("ML.MIN_MAX_SCALER"): + transformers.append( + ( + "min_max_scaler", + *preprocessing.MinMaxScaler._parse_from_sql(transform_sql), + ) + ) elif transform_sql.startswith("ML.LABEL_ENCODER"): transformers.append( ( @@ -203,6 +212,7 @@ def _merge_column_transformer( preprocessing.StandardScaler, preprocessing.OneHotEncoder, preprocessing.MaxAbsScaler, + preprocessing.MinMaxScaler, preprocessing.LabelEncoder, ]: """Try to merge the column transformer to a simple transformer.""" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index ed0b36deef..caf4657a63 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ 
-144,13 +144,13 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: @classmethod def _parse_from_sql(cls, sql: str) -> tuple[MaxAbsScaler, str]: - """Parse SQL to tuple(StandardScaler, column_label). + """Parse SQL to tuple(MaxAbsScaler, column_label). Args: sql: SQL string of format "ML.MAX_ABS_SCALER({col_label}) OVER()" Returns: - tuple(StandardScaler, column_label)""" + tuple(MaxAbsScaler, column_label)""" col_label = sql[sql.find("(") + 1 : sql.find(")")] return cls(), col_label @@ -187,6 +187,86 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +class MinMaxScaler( + base.Transformer, + third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler, +): + __doc__ = ( + third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler.__doc__ + ) + + def __init__(self): + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return type(other) is MinMaxScaler and self._bqml_model == other._bqml_model + + def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + """Compile this transformer to a list of SQL expressions that can be included in + a BQML TRANSFORM clause + + Args: + columns: a list of column names to transform + + Returns: a list of tuples of (sql_expression, output_name)""" + return [ + ( + self._base_sql_generator.ml_min_max_scaler( + column, f"min_max_scaled_{column}" + ), + f"min_max_scaled_{column}", + ) + for column in columns + ] + + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[MinMaxScaler, str]: + """Parse SQL to tuple(MinMaxScaler, column_label). 
+ + Args: + sql: SQL string of format "ML.MIN_MAX_SCALER({col_label}) OVER()" + + Returns: + tuple(MinMaxScaler, column_label)""" + col_label = sql[sql.find("(") + 1 : sql.find(")")] + return cls(), col_label + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> MinMaxScaler: + (X,) = utils.convert_to_dataframe(X) + + compiled_transforms = self._compile_to_sql(X.columns.tolist()) + transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] + + self._bqml_model = self._bqml_model_factory.create_model( + X, + options={"model_type": "transform_only"}, + transforms=transform_sqls, + ) + + # The schema of TRANSFORM output is not available in the model API, so save it during fitting + self._output_names = [name for _, name in compiled_transforms] + return self + + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("Must be fitted before transform") + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.transform(X) + return typing.cast( + bpd.DataFrame, + df[self._output_names], + ) + + class OneHotEncoder( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index c1b4f46e18..3897d1be39 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -81,6 +81,10 @@ def ml_max_abs_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.MAX_ABS_SCALER for BQML""" return f"""ML.MAX_ABS_SCALER({numeric_expr_sql}) OVER() AS {name}""" + def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str: + """Encode ML.MIN_MAX_SCALER for BQML""" + return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}""" + def ml_one_hot_encoder( self, numeric_expr_sql: str, diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index c69a00b81c..34a2ca0101 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -575,6 +575,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind preprocessing.MaxAbsScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + preprocessing.MinMaxScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -647,6 +652,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id preprocessing.MaxAbsScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + preprocessing.MinMaxScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -684,9 +694,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id "species", ), ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), - ("max_abs_encoder", preprocessing.MaxAbsScaler(), "culmen_length_mm"), + ("max_abs_scaler", preprocessing.MaxAbsScaler(), "culmen_length_mm"), + ("min_max_scaler", preprocessing.MinMaxScaler(), "culmen_length_mm"), ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), - ("max_abs_encoder", preprocessing.MaxAbsScaler(), "flipper_length_mm"), + ("max_abs_scaler", preprocessing.MaxAbsScaler(), "flipper_length_mm"), + ("min_max_scaler", preprocessing.MinMaxScaler(), "flipper_length_mm"), ] assert transformers == expected @@ -743,7 +755,7 @@ def test_pipeline_max_abs_scaler_to_gbq(penguins_df_default_index, dataset_id): pl.fit(X_train, y_train) 
pl_loaded = pl.to_gbq( - f"{dataset_id}.test_penguins_pipeline_standard_scaler", replace=True + f"{dataset_id}.test_penguins_pipeline_min_max_scaler", replace=True ) assert isinstance(pl_loaded._transform, preprocessing.MaxAbsScaler) @@ -751,6 +763,34 @@ def test_pipeline_max_abs_scaler_to_gbq(penguins_df_default_index, dataset_id): assert pl_loaded._estimator.fit_intercept is False +def test_pipeline_min_max_scaler_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ("transform", preprocessing.MinMaxScaler()), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_min_max_scaler", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.MinMaxScaler) + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False + + def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id): pl = pipeline.Pipeline( [ diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 61bddb144d..fc8f3251bd 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -211,6 +211,99 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_min_max_scaler_normalizeds_fit_transform(new_penguins_df): + scaler = bigframes.ml.preprocessing.MinMaxScaler() + result = scaler.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625], + "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0], + "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): + scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler.fit(penguins_df_default_index["culmen_length_mm"]) + + result = scaler.transform(penguins_df_default_index["culmen_length_mm"]).to_pandas() + + # If minmax-scaled correctly, min should be 0 and max should be 1. + for column in result.columns: + assert math.isclose(result[column].max(), 1.0, abs_tol=1e-3) + assert math.isclose(result[column].min(), 0.0, abs_tol=1e-3) + + result = scaler.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... 
+ result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + scaler = bigframes.ml.preprocessing.MinMaxScaler() + scaler.fit( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ) + + result = scaler.transform( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ).to_pandas() + + # If minmax-scaled correctly, min should be 0 and max should be 1. + for column in result.columns: + assert math.isclose(result[column].max(), 1.0, abs_tol=1e-3) + assert math.isclose(result[column].min(), 0.0, abs_tol=1e-3) + + result = scaler.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "min_max_scaled_culmen_depth_mm": [0.678571, 0.4880952, 0.595238], + "min_max_scaled_culmen_length_mm": [0.269091, 0.232727, 0.210909], + "min_max_scaled_flipper_length_mm": [0.40678, 0.152542, 0.271186], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + def test_one_hot_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 24cf0a333e..8c8fbd6ab5 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -22,6 +22,7 @@ def test_columntransformer_init_expectedtransforms(): onehot_transformer = preprocessing.OneHotEncoder() standard_scaler_transformer = preprocessing.StandardScaler() max_abs_scaler_transformer = preprocessing.MaxAbsScaler() + min_max_scaler_transformer = preprocessing.MinMaxScaler() label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ @@ -36,6 +37,11 @@ def test_columntransformer_init_expectedtransforms(): max_abs_scaler_transformer, ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + min_max_scaler_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), ("label", label_transformer, "species"), ] ) @@ -46,6 +52,8 @@ def test_columntransformer_init_expectedtransforms(): ("standard_scale", standard_scaler_transformer, "flipper_length_mm"), ("max_abs_scale", max_abs_scaler_transformer, "culmen_length_mm"), ("max_abs_scale", max_abs_scaler_transformer, "flipper_length_mm"), + ("min_max_scale", min_max_scaler_transformer, "culmen_length_mm"), + ("min_max_scale", min_max_scaler_transformer, "flipper_length_mm"), ("label", label_transformer, "species"), ] @@ -68,6 +76,11 @@ def test_columntransformer_repr(): preprocessing.MaxAbsScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + preprocessing.MinMaxScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] 
) @@ -77,6 +90,8 @@ def test_columntransformer_repr(): ('standard_scale', StandardScaler(), ['culmen_length_mm', 'flipper_length_mm']), ('max_abs_scale', MaxAbsScaler(), + ['culmen_length_mm', 'flipper_length_mm']), + ('min_max_scale', MinMaxScaler(), ['culmen_length_mm', 'flipper_length_mm'])])""" ) @@ -99,6 +114,11 @@ def test_columntransformer_repr_matches_sklearn(): preprocessing.MaxAbsScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + preprocessing.MinMaxScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) sk_column_transformer = sklearn_compose.ColumnTransformer( @@ -118,6 +138,11 @@ def test_columntransformer_repr_matches_sklearn(): sklearn_preprocessing.MaxAbsScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "min_max_scale", + sklearn_preprocessing.MinMaxScaler(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index b88523c7ef..f461dc76df 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -87,6 +87,13 @@ def test_max_abs_scaler_produces_correct_sql( assert sql == "ML.MAX_ABS_SCALER(col_a) OVER() AS scaled_col_a" +def test_min_max_scaler_produces_correct_sql( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_min_max_scaler("col_a", "scaled_col_a") + assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a" + + def test_one_hot_encoder_produces_correct_sql( base_sql_generator: ml_sql.BaseSqlGenerator, ): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index 40b4f76ab7..58e16e135b 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -106,3 +106,39 @@ def transform(self, X): bigframes.dataframe.DataFrame: Transformed result. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +class MinMaxScaler(BaseEstimator, TransformerMixin): + """Transform features by scaling each feature to a given range. + + This estimator scales and translates each feature individually such + that it is in the given range on the training set, e.g. between + zero and one. + """ + + def fit(self, X, y=None): + """Compute the minimum and maximum to be used for later scaling. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The Dataframe or Series with training data. + + y (default None): + Ignored. + + Returns: + MaxAbsScaler: Fitted scaler. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transform(self, X): + """Scale the data. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. + + Returns: + bigframes.dataframe.DataFrame: Transformed result. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 61200bd3ae08ddafcc5e59131ac0295188e81f53 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 27 Sep 2023 11:14:19 -0700 Subject: [PATCH 22/24] refactor: push down SQL generate logic in core.BqmlModel (#66) --- bigframes/ml/core.py | 41 ++++++++++--------------------- bigframes/ml/sql.py | 43 ++++++++++++++++++++++++--------- tests/unit/ml/test_sql.py | 51 +++++++++++++++++++++++++++------------ 3 files changed, 80 insertions(+), 55 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 37478d8baf..4c5a48cf62 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -58,7 +58,7 @@ def model(self) -> bigquery.Model: def _apply_sql( self, input_data: bpd.DataFrame, - func: Callable[[str], str], + func: Callable[[bpd.DataFrame], str], ) -> bpd.DataFrame: """Helper to wrap a dataframe in a SQL query, keeping the index intact. @@ -74,11 +74,9 @@ def _apply_sql( string from which to construct the output dataframe. It must include the index columns of the input SQL. """ - source_sql, index_col_ids, index_labels = input_data._to_sql_query( - include_index=True - ) + _, index_col_ids, index_labels = input_data._to_sql_query(include_index=True) - sql = func(source_sql) + sql = func(input_data) df = self._session.read_gbq(sql, index_col=index_col_ids) df.index.names = index_labels @@ -106,11 +104,9 @@ def generate_text( # TODO: validate input data schema return self._apply_sql( input_data, - lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text( - source_sql=source_sql, - struct_options=self._model_manipulation_sql_generator.struct_options( - **options - ), + lambda source_df: self._model_manipulation_sql_generator.ml_generate_text( + source_df=source_df, + struct_options=options, ), ) @@ -122,11 +118,9 @@ def generate_text_embedding( # TODO: validate input data schema return self._apply_sql( input_data, - lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text_embedding( - source_sql=source_sql, - struct_options=self._model_manipulation_sql_generator.struct_options( - **options - ), + lambda source_df: self._model_manipulation_sql_generator.ml_generate_text_embedding( + source_df=source_df, + struct_options=options, ), ) @@ -136,13 +130,7 @@ def forecast(self) -> bpd.DataFrame: def evaluate(self, input_data: Optional[bpd.DataFrame] = None): # TODO: validate input data schema - # Note: don't need index as evaluate returns a new table - source_sql, _, _ = ( - input_data._to_sql_query(include_index=False) - if (input_data is not None) - else (None, None, None) - ) - sql = self._model_manipulation_sql_generator.ml_evaluate(source_sql) + sql = self._model_manipulation_sql_generator.ml_evaluate(input_data) return self._session.read_gbq(sql) @@ -188,11 +176,8 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: # truncate as Vertex ID only accepts 63 characters, easily exceeding the limit for temp models. # The possibility of conflicts should be low. 
vertex_ai_model_id = vertex_ai_model_id[:63] - options_sql = self._model_manipulation_sql_generator.options( - **{"vertex_ai_model_id": vertex_ai_model_id} - ) sql = self._model_manipulation_sql_generator.alter_model( - options_sql=options_sql + options={"vertex_ai_model_id": vertex_ai_model_id} ) # Register the model and wait it to finish self._session._start_query(sql) @@ -252,7 +237,7 @@ def create_model( session = X_train._session sql = self._model_creation_sql_generator.create_model( - source=input_data, + source_df=input_data, transforms=transforms, options=options, ) @@ -281,7 +266,7 @@ def create_time_series_model( session = X_train._session sql = self._model_creation_sql_generator.create_model( - source=input_data, + source_df=input_data, transforms=transforms, options=options, ) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 3897d1be39..57c8ba672a 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -118,12 +118,12 @@ def __init__(self, model_id: str): # Model create and alter def create_model( self, - source: bpd.DataFrame, + source_df: bpd.DataFrame, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, transforms: Optional[Iterable[str]] = None, ) -> str: """Encode the CREATE TEMP MODEL statement for BQML""" - source_sql = source.sql + source_sql = source_df.sql transform_sql = self.transform(*transforms) if transforms is not None else None options_sql = self.options(**options) @@ -168,39 +168,58 @@ class ModelManipulationSqlGenerator(BaseSqlGenerator): def __init__(self, model_name: str): self._model_name = model_name + def _source_sql(self, source_df: bpd.DataFrame) -> str: + """Return DataFrame sql with index columns.""" + _source_sql, _, _ = source_df._to_sql_query(include_index=True) + return _source_sql + # Alter model def alter_model( self, - options_sql: str, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> str: """Encode the ALTER MODEL statement for BQML""" + options_sql = self.options(**options) + parts = [f"ALTER MODEL `{self._model_name}`"] parts.append(f"SET {options_sql}") return "\n".join(parts) # ML prediction TVFs - def ml_predict(self, source_sql: str) -> str: + def ml_predict(self, source_df: bpd.DataFrame) -> str: """Encode ML.PREDICT for BQML""" return f"""SELECT * FROM ML.PREDICT(MODEL `{self._model_name}`, - ({source_sql}))""" + ({self._source_sql(source_df)}))""" def ml_forecast(self) -> str: """Encode ML.FORECAST for BQML""" return f"""SELECT * FROM ML.FORECAST(MODEL `{self._model_name}`)""" - def ml_generate_text(self, source_sql: str, struct_options: str) -> str: + def ml_generate_text( + self, source_df: bpd.DataFrame, struct_options: Mapping[str, Union[int, float]] + ) -> str: """Encode ML.GENERATE_TEXT for BQML""" + struct_options_sql = self.struct_options(**struct_options) return f"""SELECT * FROM ML.GENERATE_TEXT(MODEL `{self._model_name}`, - ({source_sql}), {struct_options})""" + ({self._source_sql(source_df)}), {struct_options_sql})""" - def ml_generate_text_embedding(self, source_sql: str, struct_options: str) -> str: + def ml_generate_text_embedding( + self, source_df: bpd.DataFrame, struct_options: Mapping[str, Union[int, float]] + ) -> str: """Encode ML.GENERATE_TEXT_EMBEDDING for BQML""" + struct_options_sql = self.struct_options(**struct_options) return f"""SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `{self._model_name}`, - ({source_sql}), {struct_options})""" + ({self._source_sql(source_df)}), {struct_options_sql})""" # ML evaluation TVFs - def ml_evaluate(self, 
source_sql: Optional[str] = None) -> str: + def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str: """Encode ML.EVALUATE for BQML""" + if source_df is None: + source_sql = None + else: + # Note: don't need index as evaluate returns a new table + source_sql, _, _ = source_df._to_sql_query(include_index=False) + if source_sql is None: return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`)""" else: @@ -222,7 +241,7 @@ def ml_principal_component_info(self) -> str: ) # ML transform TVF, that require a transform_only type model - def ml_transform(self, source_sql: str) -> str: + def ml_transform(self, source_df: bpd.DataFrame) -> str: """Encode ML.TRANSFORM for BQML""" return f"""SELECT * FROM ML.TRANSFORM(MODEL `{self._model_name}`, - ({source_sql}))""" + ({self._source_sql(source_df)}))""" diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index f461dc76df..a3338e762d 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -41,6 +41,7 @@ def model_manipulation_sql_generator() -> ml_sql.ModelManipulationSqlGenerator: def mock_df(): mock_df = mock.create_autospec(spec=bpd.DataFrame) mock_df.sql = "input_X_y_sql" + mock_df._to_sql_query.return_value = "input_X_sql", None, None return mock_df @@ -117,7 +118,7 @@ def test_create_model_produces_correct_sql( mock_df: bpd.DataFrame, ): sql = model_creation_sql_generator.create_model( - source=mock_df, + source_df=mock_df, options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( @@ -135,7 +136,7 @@ def test_create_model_transform_produces_correct_sql( mock_df: bpd.DataFrame, ): sql = model_creation_sql_generator.create_model( - source=mock_df, + source_df=mock_df, options={"option_key1": "option_value1", "option_key2": 2}, transforms=[ "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a", @@ -191,38 +192,38 @@ def test_alter_model_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.alter_model( - options_sql="my_options_sql", + options={"option_key1": "option_value1", "option_key2": 2}, ) assert ( sql == """ALTER MODEL `my_project_id.my_dataset_id.my_model_id` -SET my_options_sql""" +SET OPTIONS( + option_key1="option_value1", + option_key2=2)""" ) def test_ml_predict_produces_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, ): - sql = model_manipulation_sql_generator.ml_predict( - source_sql="SELECT * FROM my_table" - ) + sql = model_manipulation_sql_generator.ml_predict(source_df=mock_df) assert ( sql == """SELECT * FROM ML.PREDICT(MODEL `my_project_id.my_dataset_id.my_model_id`, - (SELECT * FROM my_table))""" + (input_X_sql))""" ) def test_ml_evaluate_produces_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, ): - sql = model_manipulation_sql_generator.ml_evaluate( - source_sql="SELECT * FROM my_table" - ) + sql = model_manipulation_sql_generator.ml_evaluate(source_df=mock_df) assert ( sql == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`, - (SELECT * FROM my_table))""" + (input_X_sql))""" ) @@ -248,15 +249,35 @@ def test_ml_centroids_produces_correct_sql( def test_ml_generate_text_produces_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, ): sql = model_manipulation_sql_generator.ml_generate_text( - source_sql="SELECT * FROM my_table", - struct_options="STRUCT(value AS item)", + 
source_df=mock_df, + struct_options={"option_key1": 1, "option_key2": 2.2}, ) assert ( sql == """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_project_id.my_dataset_id.my_model_id`, - (SELECT * FROM my_table), STRUCT(value AS item))""" + (input_X_sql), STRUCT( + 1 AS option_key1, + 2.2 AS option_key2))""" + ) + + +def test_ml_generate_text_embedding_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_manipulation_sql_generator.ml_generate_text_embedding( + source_df=mock_df, + struct_options={"option_key1": 1, "option_key2": 2.2}, + ) + assert ( + sql + == """SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `my_project_id.my_dataset_id.my_model_id`, + (input_X_sql), STRUCT( + 1 AS option_key1, + 2.2 AS option_key2))""" ) From 7ab65e88deb0080e9c36c2709f8a5385ccaf8cf2 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 27 Sep 2023 21:00:26 -0700 Subject: [PATCH 23/24] fix: generate unique ids on join to avoid id collisions (#65) * fix: generate unique ids on join to avoid id collisions --- bigframes/core/joins/single_column.py | 80 +++++++++++---------------- 1 file changed, 31 insertions(+), 49 deletions(-) diff --git a/bigframes/core/joins/single_column.py b/bigframes/core/joins/single_column.py index 434cc2cd79..8a9825cf0b 100644 --- a/bigframes/core/joins/single_column.py +++ b/bigframes/core/joins/single_column.py @@ -16,6 +16,7 @@ from __future__ import annotations +import itertools import typing from typing import Callable, Literal, Tuple @@ -25,7 +26,7 @@ import bigframes.constants as constants import bigframes.core as core -import bigframes.core.guid +import bigframes.core.guid as guid import bigframes.core.joins.row_identity import bigframes.core.ordering @@ -122,17 +123,38 @@ def join_by_column( ), ) else: + lmapping = { + col_id: guid.generate_guid() + for col_id in itertools.chain( + left.column_names, left._hidden_ordering_column_names + ) + } + rmapping = { + col_id: guid.generate_guid() + for col_id in itertools.chain( + right.column_names, right._hidden_ordering_column_names + ) + } + + def get_column_left(col_id): + return lmapping[col_id] + + def get_column_right(col_id): + return rmapping[col_id] + left_table = left._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True, + col_id_overrides=lmapping, ) right_table = right._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True, + col_id_overrides=rmapping, ) join_conditions = [ - value_to_join_key(left_table[left_index]) - == value_to_join_key(right_table[right_index]) + value_to_join_key(left_table[lmapping[left_index]]) + == value_to_join_key(right_table[rmapping[right_index]]) for left_index, right_index in zip(left_column_ids, right_column_ids) ] @@ -145,38 +167,6 @@ def join_by_column( rname="{name}_y", ) - def get_column_left(key: str) -> str: - if ( - how == "inner" - and key in left_column_ids - and key in combined_table.columns - ): - # Ibis doesn't rename the column if the values are guaranteed - # to be equal on left and right (because they're part of an - # inner join condition). 
See: - # https://github.com/ibis-project/ibis/pull/4651 - pass - elif key in right_table.columns: - key = f"{key}_x" - - return key - - def get_column_right(key: str) -> str: - if ( - how == "inner" - and key in right_column_ids - and key in combined_table.columns - ): - # Ibis doesn't rename the column if the values are guaranteed - # to be equal on left and right (because they're part of an - # inner join condition). See: - # https://github.com/ibis-project/ibis/pull/4651 - pass - elif key in left_table.columns: - key = f"{key}_y" - - return key - # Preserve ordering accross joins. ordering = join_orderings( left._ordering, @@ -245,20 +235,14 @@ def get_join_cols( join_key_cols: list[ibis_types.Value] = [] for left_col, right_col in zip(left_join_cols, right_join_cols): if not coalesce_join_keys: - join_key_cols.append( - left_col.name(bigframes.core.guid.generate_guid(prefix="index_")) - ) - join_key_cols.append( - right_col.name(bigframes.core.guid.generate_guid(prefix="index_")) - ) + join_key_cols.append(left_col.name(guid.generate_guid(prefix="index_"))) + join_key_cols.append(right_col.name(guid.generate_guid(prefix="index_"))) else: if how == "left" or how == "inner": - join_key_cols.append( - left_col.name(bigframes.core.guid.generate_guid(prefix="index_")) - ) + join_key_cols.append(left_col.name(guid.generate_guid(prefix="index_"))) elif how == "right": join_key_cols.append( - right_col.name(bigframes.core.guid.generate_guid(prefix="index_")) + right_col.name(guid.generate_guid(prefix="index_")) ) elif how == "outer": # The left index and the right index might contain null values, for @@ -269,16 +253,14 @@ def get_join_cols( # Don't need to coalesce if they are exactly the same column. if left_col.name("index").equals(right_col.name("index")): join_key_cols.append( - left_col.name( - bigframes.core.guid.generate_guid(prefix="index_") - ) + left_col.name(guid.generate_guid(prefix="index_")) ) else: join_key_cols.append( ibis.coalesce( left_col, right_col, - ).name(bigframes.core.guid.generate_guid(prefix="index_")) + ).name(guid.generate_guid(prefix="index_")) ) else: raise ValueError( From 0e0493f6099e4084d1978afb3f60fb3e0a872379 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Thu, 28 Sep 2023 14:42:06 -0500 Subject: [PATCH 24/24] chore(main): release 0.5.0 (#35) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 48 ++++++++++++++++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de2edcf31e..e4b2bff3c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,54 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.5.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.4.0...v0.5.0) (2023-09-28) + + +### Features + +* Add `DataFrame.kurtosis` / `DF.kurt` method ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Add `DataFrame.rolling` and `DataFrame.expanding` methods ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Add `items`, `apply` methods to `DataFrame`. 
([#43](https://github.com/googleapis/python-bigquery-dataframes/issues/43)) ([3adc1b3](https://github.com/googleapis/python-bigquery-dataframes/commit/3adc1b3aa3e2b218d4fa5debdaa4298276bdf801)) +* Add axis param to simple df aggregations ([#52](https://github.com/googleapis/python-bigquery-dataframes/issues/52)) ([9cf9972](https://github.com/googleapis/python-bigquery-dataframes/commit/9cf99721ed83704e6ee28b15c699326c431eb252)) +* Add index `dtype`, `astype`, `drop`, `fillna`, aggregate attributes. ([#38](https://github.com/googleapis/python-bigquery-dataframes/issues/38)) ([1a254a4](https://github.com/googleapis/python-bigquery-dataframes/commit/1a254a496633957b9506dd8392dcc6fd10762201)) +* Add ml.preprocessing.LabelEncoder ([#50](https://github.com/googleapis/python-bigquery-dataframes/issues/50)) ([2510461](https://github.com/googleapis/python-bigquery-dataframes/commit/25104610e5ffe526315923946533a66713c1d155)) +* Add ml.preprocessing.MaxAbsScaler ([#56](https://github.com/googleapis/python-bigquery-dataframes/issues/56)) ([14b262b](https://github.com/googleapis/python-bigquery-dataframes/commit/14b262bde2bb86093bf4df63862e369c5a84b0ad)) +* Add ml.preprocessing.MinMaxScaler ([#64](https://github.com/googleapis/python-bigquery-dataframes/issues/64)) ([392113b](https://github.com/googleapis/python-bigquery-dataframes/commit/392113b70d6a8c407accbb6684d75b31261e3741)) +* Add more index methods ([#54](https://github.com/googleapis/python-bigquery-dataframes/issues/54)) ([a6e32aa](https://github.com/googleapis/python-bigquery-dataframes/commit/a6e32aa875370063c48ce7922c2aa369a770bd30)) +* Support `calculate_p_values` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `class_weights="balanced"` in `LogisticRegression` model ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `df[column_name] = df_only_one_column` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `early_stop` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `enable_global_explain` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `l2_reg` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `learn_rate_strategy` parameter in `bigframes.ml.linear_model.LinearRegression` 
([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `ls_init_learn_rate` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `max_iterations` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `min_rel_progress` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support `optimize_strategy` parameter in `bigframes.ml.linear_model.LinearRegression` ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) +* Support casting string to integer or float ([#59](https://github.com/googleapis/python-bigquery-dataframes/issues/59)) ([3502f83](https://github.com/googleapis/python-bigquery-dataframes/commit/3502f835b35c437933430698e7a1c9badaddcb99)) + + +### Bug Fixes + +* Fix header skipping logic in `read_csv` ([#49](https://github.com/googleapis/python-bigquery-dataframes/issues/49)) ([d56258c](https://github.com/googleapis/python-bigquery-dataframes/commit/d56258cbfcda168cb9e437a021e282818d622d6a)) +* Generate unique ids on join to avoid id collisions ([#65](https://github.com/googleapis/python-bigquery-dataframes/issues/65)) ([7ab65e8](https://github.com/googleapis/python-bigquery-dataframes/commit/7ab65e88deb0080e9c36c2709f8a5385ccaf8cf2)) +* LabelEncoder params consistent with Sklearn ([#60](https://github.com/googleapis/python-bigquery-dataframes/issues/60)) ([632caec](https://github.com/googleapis/python-bigquery-dataframes/commit/632caec420a7e23188f01b96a00c354d205da74e)) +* Loosen filter items tests to accomodate shifting pandas impl ([#41](https://github.com/googleapis/python-bigquery-dataframes/issues/41)) ([edabdbb](https://github.com/googleapis/python-bigquery-dataframes/commit/edabdbb131150707ea9211292cacbb60b8d076dd)) + + +### Performance Improvements + +* Add ability to cache dataframe and series to session table ([#51](https://github.com/googleapis/python-bigquery-dataframes/issues/51)) ([416d7cb](https://github.com/googleapis/python-bigquery-dataframes/commit/416d7cb9b560d7e33dcc0227f03a00d43f55ba0d)) +* Inline small `Series` and `DataFrames` in query text ([#45](https://github.com/googleapis/python-bigquery-dataframes/issues/45)) ([5e199ec](https://github.com/googleapis/python-bigquery-dataframes/commit/5e199ecf1ecf13a68a2ed0dd4464afd9db977ab1)) +* Reimplement unpivot to use cross join rather than union ([#47](https://github.com/googleapis/python-bigquery-dataframes/issues/47)) ([f9a93ce](https://github.com/googleapis/python-bigquery-dataframes/commit/f9a93ce71d053aa17b1e3a2946c90e0227076184)) +* 
Simplify join order to use multiple order keys instead of string. ([#36](https://github.com/googleapis/python-bigquery-dataframes/issues/36)) ([5056da6](https://github.com/googleapis/python-bigquery-dataframes/commit/5056da6b385dbcfc179d2bcbb6549fa539428cda)) + + +### Documentation + +* Link to Remote Functions code samples from README and API reference ([c1900c2](https://github.com/googleapis/python-bigquery-dataframes/commit/c1900c29a44199d5d8d036d6d842b4f00448fa79)) + ## [0.4.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.3.2...v0.4.0) (2023-09-16) diff --git a/bigframes/version.py b/bigframes/version.py index 65b984a0d7..ad3c3082c5 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.4.0" +__version__ = "0.5.0"
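
Patches 20 and 22 push SQL assembly out of core.BqmlModelFactory and core.BqmlModel and into the sql generators, which now accept a DataFrame plus plain option mappings rather than pre-rendered SQL fragments. A minimal sketch of the creation-side call pattern, mirroring tests/unit/ml/test_sql.py (the mocked DataFrame and option values are stand-ins, exactly as in those tests):

    from unittest import mock

    import bigframes.ml.sql as ml_sql
    import bigframes.pandas as bpd

    # Stand-in DataFrame: create_model only reads its .sql property.
    mock_df = mock.create_autospec(spec=bpd.DataFrame)
    mock_df.sql = "input_X_y_sql"

    generator = ml_sql.ModelCreationSqlGenerator(model_id="my_model_id")
    sql = generator.create_model(
        source_df=mock_df,
        options={"option_key1": "option_value1", "option_key2": 2},
    )
    # Per the unit test above, `sql` now reads:
    # CREATE TEMP MODEL `my_model_id`
    # OPTIONS(
    #   option_key1="option_value1",
    #   option_key2=2)
    # AS input_X_y_sql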
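
The manipulation-side generator follows the same pattern: the prediction and generation TVFs take the DataFrame itself and render the STRUCT options internally. A sketch under the same mocking assumptions as the unit tests (the `_to_sql_query` return value is a stand-in, not real query text):

    from unittest import mock

    import bigframes.ml.sql as ml_sql
    import bigframes.pandas as bpd

    generator = ml_sql.ModelManipulationSqlGenerator(
        model_name="my_project_id.my_dataset_id.my_model_id"
    )

    # The generator reads the DataFrame via _to_sql_query(include_index=True),
    # so the mock only needs to return a (sql, index_cols, index_labels) tuple.
    mock_df = mock.create_autospec(spec=bpd.DataFrame)
    mock_df._to_sql_query.return_value = ("input_X_sql", None, None)

    sql = generator.ml_generate_text(
        source_df=mock_df,
        struct_options={"option_key1": 1, "option_key2": 2.2},
    )
    # Per the unit test above, `sql` now reads:
    # SELECT * FROM ML.GENERATE_TEXT(MODEL `my_project_id.my_dataset_id.my_model_id`,
    #   (input_X_sql), STRUCT(
    #     1 AS option_key1,
    #     2.2 AS option_key2))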
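
Patch 21's MinMaxScaler plugs into the existing fit/transform surface and compiles to ML.MIN_MAX_SCALER(...) OVER() inside a transform_only model, with output columns named min_max_scaled_<column>. A short usage sketch in the spirit of the system tests (the table path here is illustrative; the tests use a preloaded penguins table):

    import bigframes.pandas as bpd
    from bigframes.ml import preprocessing

    # Illustrative source table standing in for the penguins fixture.
    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins").dropna()

    scaler = preprocessing.MinMaxScaler()
    scaled = scaler.fit_transform(df[["culmen_length_mm", "flipper_length_mm"]])

    # Each min_max_scaled_* column is rescaled into [0, 1].
    print(scaled.to_pandas().describe())

Like StandardScaler and MaxAbsScaler, the new transformer can also be used as a step in pipeline.Pipeline or compose.ColumnTransformer, and it survives a to_gbq round trip, as the pipeline system tests in this patch exercise.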
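
Patch 23 drops the `_x`/`_y` suffix bookkeeping in joins by remapping every column id on both sides to a fresh guid before handing the tables to ibis, so identical ids can never collide. The core idea reduced to a sketch (the column lists are hypothetical stand-ins for the left/right expression and hidden ordering columns):

    import itertools

    import bigframes.core.guid as guid

    left_columns = ["col_a", "col_b"]
    left_hidden_ordering = ["bigframes_ordering_0"]
    right_columns = ["col_a", "col_c"]
    right_hidden_ordering = ["bigframes_ordering_0"]

    lmapping = {
        col_id: guid.generate_guid()
        for col_id in itertools.chain(left_columns, left_hidden_ordering)
    }
    rmapping = {
        col_id: guid.generate_guid()
        for col_id in itertools.chain(right_columns, right_hidden_ordering)
    }

    # "col_a" and the hidden ordering id exist on both sides, but after the
    # remap their join-time ids are distinct, so no renaming heuristics are
    # needed when resolving get_column_left / get_column_right.
    assert lmapping["col_a"] != rmapping["col_a"]
    assert set(lmapping.values()).isdisjoint(rmapping.values())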