feat: support ml.SimpleImputer in bigframes (#708)

ashleyxuu · web-flow · commit 4c4415fb137e · 2024-05-21T18:31:23.000-07:00
* feat: support ml.Imputer in bigframes

* address comments

* address more comments

* address more comments
diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py
@@ -28,7 +28,7 @@
 
 from bigframes import constants
 from bigframes.core import log_adapter
-from bigframes.ml import base, core, globals, preprocessing, utils
+from bigframes.ml import base, core, globals, impute, preprocessing, utils
 import bigframes.pandas as bpd
 
 _BQML_TRANSFROM_TYPE_MAPPING = types.MappingProxyType(
@@ -40,6 +40,7 @@
         "ML.BUCKETIZE": preprocessing.KBinsDiscretizer,
         "ML.QUANTILE_BUCKETIZE": preprocessing.KBinsDiscretizer,
         "ML.LABEL_ENCODER": preprocessing.LabelEncoder,
+        "ML.IMPUTER": impute.SimpleImputer,
     }
 )
 
@@ -58,7 +59,7 @@ def __init__(
         transformers: List[
             Tuple[
                 str,
-                preprocessing.PreprocessingType,
+                Union[preprocessing.PreprocessingType, impute.SimpleImputer],
                 Union[str, List[str]],
             ]
         ],
@@ -73,12 +74,14 @@ def __init__(
     @property
     def transformers_(
         self,
-    ) -> List[Tuple[str, preprocessing.PreprocessingType, str,]]:
+    ) -> List[
+        Tuple[str, Union[preprocessing.PreprocessingType, impute.SimpleImputer], str]
+    ]:
         """The collection of transformers as tuples of (name, transformer, column)."""
         result: List[
             Tuple[
                 str,
-                preprocessing.PreprocessingType,
+                Union[preprocessing.PreprocessingType, impute.SimpleImputer],
                 str,
             ]
         ] = []
@@ -107,7 +110,7 @@ def _extract_from_bq_model(
         transformers: List[
             Tuple[
                 str,
-                preprocessing.PreprocessingType,
+                Union[preprocessing.PreprocessingType, impute.SimpleImputer],
                 Union[str, List[str]],
             ]
         ] = []
@@ -152,7 +155,9 @@ def camel_to_snake(name):
 
     def _merge(
         self, bq_model: bigquery.Model
-    ) -> Union[ColumnTransformer, preprocessing.PreprocessingType,]:
+    ) -> Union[
+        ColumnTransformer, Union[preprocessing.PreprocessingType, impute.SimpleImputer]
+    ]:
         """Try to merge the column transformer to a simple transformer. Depends on all the columns in bq_model are transformed with the same transformer."""
         transformers = self.transformers_
 
diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py
@@ -0,0 +1,123 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Transformers for missing value imputation. This module is styled after
+scikit-learn's impute module: https://scikit-learn.org/stable/modules/impute.html."""
+
+from __future__ import annotations
+
+import typing
+from typing import Any, List, Literal, Optional, Tuple, Union
+
+import bigframes_vendored.sklearn.impute._base
+
+from bigframes.core import log_adapter
+from bigframes.ml import base, core, globals, utils
+import bigframes.pandas as bpd
+
+
+@log_adapter.class_logger
+class SimpleImputer(
+    base.Transformer,
+    bigframes_vendored.sklearn.impute._base.SimpleImputer,
+):
+
+    __doc__ = bigframes_vendored.sklearn.impute._base.SimpleImputer.__doc__
+
+    def __init__(
+        self,
+        strategy: Literal["mean", "median", "most_frequent"] = "mean",
+    ):
+        self.strategy = strategy
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+        self._base_sql_generator = globals.base_sql_generator()
+
+    # TODO(garrettwu): implement __hash__
+    def __eq__(self, other: Any) -> bool:
+        return (
+            type(other) is SimpleImputer
+            and self.strategy == other.strategy
+            and self._bqml_model == other._bqml_model
+        )
+
+    def _compile_to_sql(
+        self,
+        columns: List[str],
+        X=None,
+    ) -> List[Tuple[str, str]]:
+        """Compile this transformer to a list of SQL expressions that can be included in
+        a BQML TRANSFORM clause
+
+        Args:
+            columns:
+                A list of column names to transform.
+            X:
+                The Dataframe with training data.
+
+        Returns: a list of tuples of (sql_expression, output_name)"""
+        return [
+            (
+                self._base_sql_generator.ml_imputer(
+                    column, self.strategy, f"imputer_{column}"
+                ),
+                f"imputer_{column}",
+            )
+            for column in columns
+        ]
+
+    @classmethod
+    def _parse_from_sql(cls, sql: str) -> tuple[SimpleImputer, str]:
+        """Parse SQL to tuple(SimpleImputer, column_label).
+
+        Args:
+            sql: SQL string of format "ML.IMPUTER({col_label}, '{strategy}') OVER()"
+
+        Returns:
+            tuple(SimpleImputer, column_label)"""
+        s = sql[sql.find("(") + 1 : sql.find(")")]
+        col_label, strategy = s.split(", ")
+        return cls(strategy[1:-1]), col_label  # type: ignore[arg-type]
+
+    def fit(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
+    ) -> SimpleImputer:
+        (X,) = utils.convert_to_dataframe(X)
+
+        compiled_transforms = self._compile_to_sql(X.columns.tolist(), X)
+        transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms]
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X,
+            options={"model_type": "transform_only"},
+            transforms=transform_sqls,
+        )
+
+        # The schema of TRANSFORM output is not available in the model API, so save it during fitting
+        self._output_names = [name for _, name in compiled_transforms]
+        return self
+
+    def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("Must be fitted before transform")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        df = self._bqml_model.transform(X)
+        return typing.cast(
+            bpd.DataFrame,
+            df[self._output_names],
+        )
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py
@@ -29,6 +29,7 @@
     ensemble,
     forecasting,
     imported,
+    impute,
     linear_model,
     llm,
     pipeline,
@@ -84,6 +85,7 @@ def from_bq(
     pipeline.Pipeline,
     compose.ColumnTransformer,
     preprocessing.PreprocessingType,
+    impute.SimpleImputer,
 ]:
     """Load a BQML model to BigQuery DataFrames ML.
 
diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py
@@ -26,7 +26,15 @@
 import bigframes
 import bigframes.constants as constants
 from bigframes.core import log_adapter
-from bigframes.ml import base, compose, forecasting, loader, preprocessing, utils
+from bigframes.ml import (
+    base,
+    compose,
+    forecasting,
+    impute,
+    loader,
+    preprocessing,
+    utils,
+)
 import bigframes.pandas as bpd
 
 
@@ -56,6 +64,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]):
                 preprocessing.MinMaxScaler,
                 preprocessing.KBinsDiscretizer,
                 preprocessing.LabelEncoder,
+                impute.SimpleImputer,
             ),
         ):
             self._transform = transform
diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py
@@ -305,6 +305,7 @@ def __eq__(self, other: Any) -> bool:
         return (
             type(other) is KBinsDiscretizer
             and self.n_bins == other.n_bins
+            and self.strategy == other.strategy
             and self._bqml_model == other._bqml_model
         )
 
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
@@ -103,6 +103,15 @@ def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str:
         """Encode ML.MIN_MAX_SCALER for BQML"""
         return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}"""
 
+    def ml_imputer(
+        self,
+        expr_sql: str,
+        strategy: str,
+        name: str,
+    ) -> str:
+        """Encode ML.IMPUTER for BQML"""
+        return f"""ML.IMPUTER({expr_sql}, '{strategy}') OVER() AS {name}"""
+
     def ml_bucketize(
         self,
         numeric_expr_sql: str,
diff --git a/docs/reference/bigframes.ml/impute.rst b/docs/reference/bigframes.ml/impute.rst
@@ -0,0 +1,7 @@
+bigframes.ml.impute
+==========================
+
+.. automodule:: bigframes.ml.impute
+    :members:
+    :inherited-members:
+    :undoc-members:
diff --git a/docs/reference/bigframes.ml/index.rst b/docs/reference/bigframes.ml/index.rst
@@ -19,6 +19,8 @@ API Reference
 
     imported
 
+    impute
+
     linear_model
 
     llm
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
@@ -134,6 +134,12 @@
       - name: XGBoostModel
         uid: bigframes.ml.imported.XGBoostModel
       name: imported
+    - items:
+      - name: Overview
+        uid: bigframes.ml.impute
+      - name: SimpleImputer
+        uid: bigframes.ml.impute.SimpleImputer
+      name: impute
     - items:
       - name: Overview
         uid: bigframes.ml.linear_model
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
@@ -29,13 +29,15 @@
 import google.cloud.resourcemanager_v3 as resourcemanager_v3
 import google.cloud.storage as storage  # type: ignore
 import ibis.backends.base
+import numpy as np
 import pandas as pd
 import pytest
 import pytz
 import test_utils.prefixer
 
 import bigframes
 import bigframes.dataframe
+import bigframes.pandas as bpd
 import tests.system.utils
 
 # Use this to control the number of cloud functions being deleted in a single
@@ -624,6 +626,18 @@ def new_penguins_pandas_df():
     ).set_index("tag_number")
 
 
+@pytest.fixture(scope="session")
+def missing_values_penguins_df():
+    """Additional data matching the missing values penguins dataset"""
+    return bpd.DataFrame(
+        {
+            "culmen_length_mm": [39.5, 38.5, 37.9],
+            "culmen_depth_mm": [np.nan, 17.2, 18.1],
+            "flipper_length_mm": [np.nan, 181.0, 188.0],
+        }
+    )
+
+
 @pytest.fixture(scope="session")
 def new_penguins_df(session, new_penguins_pandas_df):
     return session.read_pandas(new_penguins_pandas_df)
diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py
diff --git a/tests/system/small/ml/test_impute.py b/tests/system/small/ml/test_impute.py
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py

Original file line number	Diff line number	Diff line change
`@@ -305,6 +305,7 @@ def __eq__(self, other: Any) -> bool:`
`305`	`305`	`return (`
`306`	`306`	`type(other) is KBinsDiscretizer`
`307`	`307`	`and self.n_bins == other.n_bins`
	`308`	`+ and self.strategy == other.strategy`
`308`	`309`	`and self._bqml_model == other._bqml_model`
`309`	`310`	`)`
`310`	`311`