
Commit 39fe474

fix: reloaded transformer .transform error (#569)
* fix: reloaded transformer .transform error
* fix mypy
1 parent 098d444 commit 39fe474
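
Before this fix, a transformer reloaded from BigQuery could fail on `.transform`: the object rebuilt from the BQ model's `transformColumns` never received `_output_names`, so it did not know which output columns to produce. This commit records those names while parsing the model and copies them across in `_merge`. Below is a rough sketch of the flow the new save/load tests exercise; the penguins table, the dataset/model name, and the assumption that `to_gbq(..., replace=True)` returns the transformer reloaded from BigQuery are illustrative, not part of this diff.

# Sketch only: names and the to_gbq() round-trip behaviour are assumptions
# based on the save/load tests touched below, not shown in this commit.
import bigframes.pandas as bpd
from bigframes.ml import preprocessing

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
features = df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]

scaler = preprocessing.StandardScaler()
scaler.fit(features)

# Persist the fitted transformer as a BQML TRANSFORM-only model, then reload it.
reloaded = scaler.to_gbq("my_dataset.penguins_scaler", replace=True)  # hypothetical model name

# Previously this could error because the reloaded object lacked _output_names;
# with this commit it returns the standard_scaled_* columns.
result = reloaded.transform(features).to_pandas()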

File tree

3 files changed: +149 −16 lines changed

  bigframes/ml/compose.py
  tests/system/large/ml/test_compose.py
  tests/system/small/ml/test_preprocessing.py

bigframes/ml/compose.py (+10 −3)
@@ -115,14 +115,17 @@ def camel_to_snake(name):
             name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
             return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()
 
+        output_names = []
         for transform_col in bq_model._properties["transformColumns"]:
+            transform_col_dict = cast(dict, transform_col)
             # pass the columns that are not transformed
-            if "transformSql" not in transform_col:
+            if "transformSql" not in transform_col_dict:
                 continue
-            transform_sql: str = cast(dict, transform_col)["transformSql"]
+            transform_sql: str = transform_col_dict["transformSql"]
             if not transform_sql.startswith("ML."):
                 continue
 
+            output_names.append(transform_col_dict["name"])
             found_transformer = False
             for prefix in _BQML_TRANSFROM_TYPE_MAPPING:
                 if transform_sql.startswith(prefix):

@@ -141,7 +144,10 @@ def camel_to_snake(name):
                     f"Unsupported transformer type. {constants.FEEDBACK_LINK}"
                 )
 
-        return cls(transformers=transformers)
+        transformer = cls(transformers=transformers)
+        transformer._output_names = output_names
+
+        return transformer
 
     def _merge(
         self, bq_model: bigquery.Model

@@ -164,6 +170,7 @@ def _merge(
                 for feature_column in bq_model.feature_columns
             ]
         ) == sorted(columns):
+            transformer_0._output_names = self._output_names
             return transformer_0
 
         return self
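
For context on the parsing above: each entry of `bq_model._properties["transformColumns"]` is a dict carrying the output column "name" and, for transformed columns, the "transformSql" that produced it. The loop collects the names of the ML.* columns into `_output_names`, and `_merge` now forwards them to the single inner transformer when that transformer covers every feature column. A small illustrative sketch of the shape being parsed (the payload below is made up, not taken from a real model):

# Hypothetical transformColumns payload; real values come from the BigQuery API.
transform_columns = [
    # Passed-through column: no "transformSql" key, so it is skipped.
    {"name": "species"},
    # Transformed column: its "name" ends up in _output_names.
    {
        "name": "standard_scaled_culmen_length_mm",
        "transformSql": "ML.STANDARD_SCALER(culmen_length_mm) OVER()",
    },
]

output_names = [
    col["name"]
    for col in transform_columns
    if "transformSql" in col and col["transformSql"].startswith("ML.")
]
assert output_names == ["standard_scaled_culmen_length_mm"]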

tests/system/large/ml/test_compose.py (+23)
@@ -142,3 +142,26 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id):
     ]
     assert reloaded_transformer.transformers_ == expected
     assert reloaded_transformer._bqml_model is not None
+
+    result = transformer.fit_transform(
+        new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]]
+    ).to_pandas()
+
+    expected = pandas.DataFrame(
+        {
+            "onehotencoded_species": [
+                [{"index": 1, "value": 1.0}],
+                [{"index": 1, "value": 1.0}],
+                [{"index": 2, "value": 1.0}],
+            ],
+            "standard_scaled_culmen_length_mm": [
+                1.313249,
+                -0.20198,
+                -1.111118,
+            ],
+            "standard_scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338],
+        },
+        index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"),
+    )
+
+    pandas.testing.assert_frame_equal(result, expected, rtol=0.1, check_dtype=False)

tests/system/small/ml/test_preprocessing.py (+116 −13)
@@ -58,7 +58,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):

@@ -82,7 +82,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):

@@ -110,7 +110,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
    )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_standard_scaler_save_load(new_penguins_df, dataset_id):
@@ -125,6 +125,22 @@ def test_standard_scaler_save_load(new_penguins_df, dataset_id):
     assert isinstance(reloaded_transformer, preprocessing.StandardScaler)
     assert reloaded_transformer._bqml_model is not None
 
+    result = reloaded_transformer.transform(
+        new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "standard_scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118],
+            "standard_scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848],
+            "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
+
 
 def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
     # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod.
@@ -157,7 +173,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):

@@ -176,7 +192,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):

@@ -199,7 +215,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
@@ -214,6 +230,22 @@ def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
     assert isinstance(reloaded_transformer, preprocessing.MaxAbsScaler)
     assert reloaded_transformer._bqml_model is not None
 
+    result = reloaded_transformer.transform(
+        new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "max_abs_scaled_culmen_length_mm": [1.0, 0.974684, 0.959494],
+            "max_abs_scaled_culmen_depth_mm": [1.0, 0.914894, 0.962766],
+            "max_abs_scaled_flipper_length_mm": [1.0, 0.923469, 0.959184],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
+
 
 def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
     scaler = preprocessing.MinMaxScaler()
@@ -231,7 +263,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):

@@ -255,7 +287,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):

@@ -290,7 +322,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
@@ -305,6 +337,22 @@ def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
     assert isinstance(reloaded_transformer, preprocessing.MinMaxScaler)
     assert reloaded_transformer._bqml_model is not None
 
+    result = reloaded_transformer.fit_transform(
+        new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "min_max_scaled_culmen_length_mm": [1.0, 0.375, 0.0],
+            "min_max_scaled_culmen_depth_mm": [1.0, 0.0, 0.5625],
+            "min_max_scaled_flipper_length_mm": [1.0, 0.0, 0.466667],
+        },
+        dtype="Float64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
+
 
 def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df):
     discretizer = preprocessing.KBinsDiscretizer(strategy="uniform")
@@ -322,7 +370,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_k_bins_discretizer_series_normalizes(

@@ -344,7 +392,7 @@ def test_k_bins_discretizer_series_normalizes(
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df):

@@ -374,7 +422,7 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_k_bins_discretizer_normalizes_different_params(

@@ -406,7 +454,7 @@ def test_k_bins_discretizer_normalizes_different_params(
         index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
     )
 
-    pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
 
 
 def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
@@ -423,6 +471,22 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
     assert reloaded_transformer.strategy == transformer.strategy
     assert reloaded_transformer._bqml_model is not None
 
+    result = reloaded_transformer.fit_transform(
+        new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_4", "bin_2"],
+            "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_5"],
+            "kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"],
+        },
+        dtype="string[pyarrow]",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected, rtol=0.1)
+
 
 def test_one_hot_encoder_default_params(new_penguins_df):
     encoder = preprocessing.OneHotEncoder()
@@ -560,6 +624,29 @@ def test_one_hot_encoder_save_load(new_penguins_df, dataset_id):
     assert reloaded_transformer.max_categories == transformer.max_categories
     assert reloaded_transformer._bqml_model is not None
 
+    result = reloaded_transformer.fit_transform(
+        new_penguins_df[["species", "sex"]]
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "onehotencoded_species": [
+                [{"index": 1, "value": 1.0}],
+                [{"index": 1, "value": 1.0}],
+                [{"index": 2, "value": 1.0}],
+            ],
+            "onehotencoded_sex": [
+                [{"index": 2, "value": 1.0}],
+                [{"index": 1, "value": 1.0}],
+                [{"index": 1, "value": 1.0}],
+            ],
+        },
+        dtype=ONE_HOT_ENCODED_DTYPE,
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
 
 def test_label_encoder_default_params(new_penguins_df):
     encoder = preprocessing.LabelEncoder()
@@ -677,5 +764,21 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):
     assert reloaded_transformer.max_categories == transformer.max_categories
     assert reloaded_transformer._bqml_model is not None
 
+    result = reloaded_transformer.transform(new_penguins_df).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "labelencoded_species": [
+                1,
+                1,
+                2,
+            ],
+        },
+        dtype="Int64",
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(result, expected)
+
 
 # TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.
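
`ONE_HOT_ENCODED_DTYPE` in the one-hot encoder hunk above comes from the shared test helpers rather than from this diff. A plausible definition, assuming it mirrors BQML's ML.ONE_HOT_ENCODER output of per-row arrays of {index, value} structs:

# Assumed definition of the helper dtype; the real one lives in the test utilities.
import pandas as pd
import pyarrow as pa

ONE_HOT_ENCODED_DTYPE = pd.ArrowDtype(
    pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())]))
)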
