Skip to content

Commit c6c487f

Browse files
authored
feat: add strategy="quantile" in KBinsDiscretizer (#654)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #310685445 🦕
1 parent 5a7b1c9 commit c6c487f

File tree

6 files changed

+112
-18
lines changed

6 files changed

+112
-18
lines changed

bigframes/ml/compose.py

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"ML.MAX_ABS_SCALER": preprocessing.MaxAbsScaler,
3939
"ML.MIN_MAX_SCALER": preprocessing.MinMaxScaler,
4040
"ML.BUCKETIZE": preprocessing.KBinsDiscretizer,
41+
"ML.QUANTILE_BUCKETIZE": preprocessing.KBinsDiscretizer,
4142
"ML.LABEL_ENCODER": preprocessing.LabelEncoder,
4243
}
4344
)

bigframes/ml/preprocessing.py

+35-16
Original file line numberDiff line numberDiff line change
@@ -290,10 +290,6 @@ def __init__(
290290
n_bins: int = 5,
291291
strategy: Literal["uniform", "quantile"] = "quantile",
292292
):
293-
if strategy != "uniform":
294-
raise NotImplementedError(
295-
f"Only strategy = 'uniform' is supported now, input is {strategy}."
296-
)
297293
if n_bins < 2:
298294
raise ValueError(
299295
f"n_bins has to be larger than or equal to 2, input is {n_bins}."
@@ -337,30 +333,53 @@ def _compile_to_sql(
337333
min_value + i * bin_size for i in range(self.n_bins - 1)
338334
]
339335

340-
return [
341-
(
342-
self._base_sql_generator.ml_bucketize(
343-
column, array_split_points[column], f"kbinsdiscretizer_{column}"
344-
),
345-
f"kbinsdiscretizer_{column}",
336+
return [
337+
(
338+
self._base_sql_generator.ml_bucketize(
339+
column, array_split_points[column], f"kbinsdiscretizer_{column}"
340+
),
341+
f"kbinsdiscretizer_{column}",
342+
)
343+
for column in columns
344+
]
345+
346+
elif self.strategy == "quantile":
347+
348+
return [
349+
(
350+
self._base_sql_generator.ml_quantile_bucketize(
351+
column, self.n_bins, f"kbinsdiscretizer_{column}"
352+
),
353+
f"kbinsdiscretizer_{column}",
354+
)
355+
for column in columns
356+
]
357+
358+
else:
359+
raise ValueError(
360+
f"strategy should be set 'quantile' or 'uniform', but your input is {self.strategy}."
346361
)
347-
for column in columns
348-
]
349362

350363
@classmethod
351364
def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]:
352365
"""Parse SQL to tuple(KBinsDiscretizer, column_label).
353366
354367
Args:
355-
sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE) OVER()"
368+
sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE)"
369+
or "ML.QUANTILE_BUCKETIZE({col_label}, num_bucket) OVER()"
356370
357371
Returns:
358372
tuple(KBinsDiscretizer, column_label)"""
359373
s = sql[sql.find("(") + 1 : sql.find(")")]
360-
array_split_points = s[s.find("[") + 1 : s.find("]")]
361374
col_label = s[: s.find(",")]
362-
n_bins = array_split_points.count(",") + 2
363-
return cls(n_bins, "uniform"), col_label
375+
376+
if sql.startswith("ML.QUANTILE_BUCKETIZE"):
377+
num_bins = s.split(",")[1]
378+
return cls(int(num_bins), "quantile"), col_label
379+
else:
380+
array_split_points = s[s.find("[") + 1 : s.find("]")]
381+
n_bins = array_split_points.count(",") + 2
382+
return cls(n_bins, "uniform"), col_label
364383

365384
def fit(
366385
self,

bigframes/ml/sql.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,18 @@ def ml_bucketize(
109109
array_split_points: Iterable[Union[int, float]],
110110
name: str,
111111
) -> str:
112-
"""Encode ML.MIN_MAX_SCALER for BQML"""
112+
"""Encode ML.BUCKETIZE for BQML"""
113113
return f"""ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}"""
114114

115+
def ml_quantile_bucketize(
116+
self,
117+
numeric_expr_sql: str,
118+
num_bucket: int,
119+
name: str,
120+
) -> str:
121+
"""Encode ML.QUANTILE_BUCKETIZE for BQML"""
122+
return f"""ML.QUANTILE_BUCKETIZE({numeric_expr_sql}, {num_bucket}) OVER() AS {name}"""
123+
115124
def ml_one_hot_encoder(
116125
self,
117126
numeric_expr_sql: str,

tests/system/small/ml/test_preprocessing.py

+58
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,27 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins
373373
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
374374

375375

376+
def test_k_bins_discretizer_normalized_fit_transform_default_params_quantile(
377+
new_penguins_df,
378+
):
379+
discretizer = preprocessing.KBinsDiscretizer(strategy="quantile")
380+
result = discretizer.fit_transform(
381+
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
382+
).to_pandas()
383+
384+
expected = pd.DataFrame(
385+
{
386+
"kbinsdiscretizer_culmen_length_mm": ["bin_2", "bin_2", "bin_1"],
387+
"kbinsdiscretizer_culmen_depth_mm": ["bin_2", "bin_1", "bin_2"],
388+
"kbinsdiscretizer_flipper_length_mm": ["bin_2", "bin_1", "bin_2"],
389+
},
390+
dtype="string[pyarrow]",
391+
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
392+
)
393+
394+
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
395+
396+
376397
def test_k_bins_discretizer_series_normalizes(
377398
penguins_df_default_index, new_penguins_df
378399
):
@@ -395,6 +416,28 @@ def test_k_bins_discretizer_series_normalizes(
395416
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
396417

397418

419+
def test_k_bins_discretizer_series_normalizes_quantile(
420+
penguins_df_default_index, new_penguins_df
421+
):
422+
discretizer = preprocessing.KBinsDiscretizer(strategy="quantile")
423+
discretizer.fit(penguins_df_default_index["culmen_length_mm"])
424+
425+
result = discretizer.transform(
426+
penguins_df_default_index["culmen_length_mm"]
427+
).to_pandas()
428+
result = discretizer.transform(new_penguins_df).to_pandas()
429+
430+
expected = pd.DataFrame(
431+
{
432+
"kbinsdiscretizer_culmen_length_mm": ["bin_2", "bin_2", "bin_1"],
433+
},
434+
dtype="string[pyarrow]",
435+
index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
436+
)
437+
438+
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
439+
440+
398441
def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df):
399442
# TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod.
400443
discretizer = preprocessing.KBinsDiscretizer(strategy="uniform")
@@ -488,6 +531,21 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
488531
pd.testing.assert_frame_equal(result, expected, rtol=0.1)
489532

490533

534+
def test_k_bins_discretizer_save_load_quantile(new_penguins_df, dataset_id):
535+
transformer = preprocessing.KBinsDiscretizer(n_bins=6, strategy="quantile")
536+
transformer.fit(
537+
new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
538+
)
539+
540+
reloaded_transformer = transformer.to_gbq(
541+
f"{dataset_id}.temp_configured_model", replace=True
542+
)
543+
assert isinstance(reloaded_transformer, preprocessing.KBinsDiscretizer)
544+
assert reloaded_transformer.n_bins == transformer.n_bins
545+
assert reloaded_transformer.strategy == transformer.strategy
546+
assert reloaded_transformer._bqml_model is not None
547+
548+
491549
def test_one_hot_encoder_default_params(new_penguins_df):
492550
encoder = preprocessing.OneHotEncoder()
493551
encoder.fit(new_penguins_df[["species", "sex"]])

tests/unit/ml/test_sql.py

+7
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,13 @@ def test_k_bins_discretizer_correct(
113113
assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a"
114114

115115

116+
def test_k_bins_discretizer_quantile_correct(
117+
base_sql_generator: ml_sql.BaseSqlGenerator,
118+
):
119+
sql = base_sql_generator.ml_quantile_bucketize("col_a", 5, "scaled_col_a")
120+
assert sql == "ML.QUANTILE_BUCKETIZE(col_a, 5) OVER() AS scaled_col_a"
121+
122+
116123
def test_one_hot_encoder_correct(
117124
base_sql_generator: ml_sql.BaseSqlGenerator,
118125
):

third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
1818
strategy ({'uniform', 'quantile'}, default='quantile'):
1919
Strategy used to define the widths of the bins. 'uniform': All bins
2020
in each feature have identical widths. 'quantile': All bins in each
21-
feature have the same number of points. Only `uniform` is supported.
21+
feature have the same number of points.
2222
"""
2323

2424
def fit(self, X, y=None):

0 commit comments

Comments
 (0)