@@ -58,7 +58,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
58
58
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
59
59
)
60
60
61
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
61
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
62
62
63
63
64
64
def test_standard_scaler_normalizeds_fit_transform (new_penguins_df ):
@@ -82,7 +82,7 @@ def test_standard_scaler_normalizeds_fit_transform(new_penguins_df):
82
82
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
83
83
)
84
84
85
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
85
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
86
86
87
87
88
88
def test_standard_scaler_series_normalizes (penguins_df_default_index , new_penguins_df ):
@@ -110,7 +110,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
110
110
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
111
111
)
112
112
113
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
113
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
114
114
115
115
116
116
def test_standard_scaler_save_load (new_penguins_df , dataset_id ):
@@ -125,6 +125,22 @@ def test_standard_scaler_save_load(new_penguins_df, dataset_id):
125
125
assert isinstance (reloaded_transformer , preprocessing .StandardScaler )
126
126
assert reloaded_transformer ._bqml_model is not None
127
127
128
+ result = reloaded_transformer .transform (
129
+ new_penguins_df [["culmen_length_mm" , "culmen_depth_mm" , "flipper_length_mm" ]]
130
+ ).to_pandas ()
131
+
132
+ expected = pd .DataFrame (
133
+ {
134
+ "standard_scaled_culmen_length_mm" : [1.313249 , - 0.20198 , - 1.111118 ],
135
+ "standard_scaled_culmen_depth_mm" : [1.17072 , - 1.272416 , 0.101848 ],
136
+ "standard_scaled_flipper_length_mm" : [1.251089 , - 1.196588 , - 0.054338 ],
137
+ },
138
+ dtype = "Float64" ,
139
+ index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
140
+ )
141
+
142
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
143
+
128
144
129
145
def test_max_abs_scaler_normalizes (penguins_df_default_index , new_penguins_df ):
130
146
# TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod.
@@ -157,7 +173,7 @@ def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df):
157
173
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
158
174
)
159
175
160
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
176
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
161
177
162
178
163
179
def test_max_abs_scaler_normalizeds_fit_transform (new_penguins_df ):
@@ -176,7 +192,7 @@ def test_max_abs_scaler_normalizeds_fit_transform(new_penguins_df):
176
192
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
177
193
)
178
194
179
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
195
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
180
196
181
197
182
198
def test_max_abs_scaler_series_normalizes (penguins_df_default_index , new_penguins_df ):
@@ -199,7 +215,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin
199
215
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
200
216
)
201
217
202
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
218
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
203
219
204
220
205
221
def test_max_abs_scaler_save_load (new_penguins_df , dataset_id ):
@@ -214,6 +230,22 @@ def test_max_abs_scaler_save_load(new_penguins_df, dataset_id):
214
230
assert isinstance (reloaded_transformer , preprocessing .MaxAbsScaler )
215
231
assert reloaded_transformer ._bqml_model is not None
216
232
233
+ result = reloaded_transformer .transform (
234
+ new_penguins_df [["culmen_length_mm" , "culmen_depth_mm" , "flipper_length_mm" ]]
235
+ ).to_pandas ()
236
+
237
+ expected = pd .DataFrame (
238
+ {
239
+ "max_abs_scaled_culmen_length_mm" : [1.0 , 0.974684 , 0.959494 ],
240
+ "max_abs_scaled_culmen_depth_mm" : [1.0 , 0.914894 , 0.962766 ],
241
+ "max_abs_scaled_flipper_length_mm" : [1.0 , 0.923469 , 0.959184 ],
242
+ },
243
+ dtype = "Float64" ,
244
+ index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
245
+ )
246
+
247
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
248
+
217
249
218
250
def test_min_max_scaler_normalized_fit_transform (new_penguins_df ):
219
251
scaler = preprocessing .MinMaxScaler ()
@@ -231,7 +263,7 @@ def test_min_max_scaler_normalized_fit_transform(new_penguins_df):
231
263
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
232
264
)
233
265
234
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
266
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
235
267
236
268
237
269
def test_min_max_scaler_series_normalizes (penguins_df_default_index , new_penguins_df ):
@@ -255,7 +287,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin
255
287
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
256
288
)
257
289
258
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
290
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
259
291
260
292
261
293
def test_min_max_scaler_normalizes (penguins_df_default_index , new_penguins_df ):
@@ -290,7 +322,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df):
290
322
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
291
323
)
292
324
293
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
325
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
294
326
295
327
296
328
def test_min_max_scaler_save_load (new_penguins_df , dataset_id ):
@@ -305,6 +337,22 @@ def test_min_max_scaler_save_load(new_penguins_df, dataset_id):
305
337
assert isinstance (reloaded_transformer , preprocessing .MinMaxScaler )
306
338
assert reloaded_transformer ._bqml_model is not None
307
339
340
+ result = reloaded_transformer .fit_transform (
341
+ new_penguins_df [["culmen_length_mm" , "culmen_depth_mm" , "flipper_length_mm" ]]
342
+ ).to_pandas ()
343
+
344
+ expected = pd .DataFrame (
345
+ {
346
+ "min_max_scaled_culmen_length_mm" : [1.0 , 0.375 , 0.0 ],
347
+ "min_max_scaled_culmen_depth_mm" : [1.0 , 0.0 , 0.5625 ],
348
+ "min_max_scaled_flipper_length_mm" : [1.0 , 0.0 , 0.466667 ],
349
+ },
350
+ dtype = "Float64" ,
351
+ index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
352
+ )
353
+
354
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
355
+
308
356
309
357
def test_k_bins_discretizer_normalized_fit_transform_default_params (new_penguins_df ):
310
358
discretizer = preprocessing .KBinsDiscretizer (strategy = "uniform" )
@@ -322,7 +370,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins
322
370
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
323
371
)
324
372
325
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
373
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
326
374
327
375
328
376
def test_k_bins_discretizer_series_normalizes (
@@ -344,7 +392,7 @@ def test_k_bins_discretizer_series_normalizes(
344
392
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
345
393
)
346
394
347
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
395
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
348
396
349
397
350
398
def test_k_bins_discretizer_normalizes (penguins_df_default_index , new_penguins_df ):
@@ -374,7 +422,7 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d
374
422
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
375
423
)
376
424
377
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
425
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
378
426
379
427
380
428
def test_k_bins_discretizer_normalizes_different_params (
@@ -406,7 +454,7 @@ def test_k_bins_discretizer_normalizes_different_params(
406
454
index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
407
455
)
408
456
409
- pd .testing .assert_frame_equal (result , expected , rtol = 1e-3 )
457
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
410
458
411
459
412
460
def test_k_bins_discretizer_save_load (new_penguins_df , dataset_id ):
@@ -423,6 +471,22 @@ def test_k_bins_discretizer_save_load(new_penguins_df, dataset_id):
423
471
assert reloaded_transformer .strategy == transformer .strategy
424
472
assert reloaded_transformer ._bqml_model is not None
425
473
474
+ result = reloaded_transformer .fit_transform (
475
+ new_penguins_df [["culmen_length_mm" , "culmen_depth_mm" , "flipper_length_mm" ]]
476
+ ).to_pandas ()
477
+
478
+ expected = pd .DataFrame (
479
+ {
480
+ "kbinsdiscretizer_culmen_length_mm" : ["bin_6" , "bin_4" , "bin_2" ],
481
+ "kbinsdiscretizer_culmen_depth_mm" : ["bin_6" , "bin_2" , "bin_5" ],
482
+ "kbinsdiscretizer_flipper_length_mm" : ["bin_6" , "bin_2" , "bin_4" ],
483
+ },
484
+ dtype = "string[pyarrow]" ,
485
+ index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
486
+ )
487
+
488
+ pd .testing .assert_frame_equal (result , expected , rtol = 0.1 )
489
+
426
490
427
491
def test_one_hot_encoder_default_params (new_penguins_df ):
428
492
encoder = preprocessing .OneHotEncoder ()
@@ -560,6 +624,29 @@ def test_one_hot_encoder_save_load(new_penguins_df, dataset_id):
560
624
assert reloaded_transformer .max_categories == transformer .max_categories
561
625
assert reloaded_transformer ._bqml_model is not None
562
626
627
+ result = reloaded_transformer .fit_transform (
628
+ new_penguins_df [["species" , "sex" ]]
629
+ ).to_pandas ()
630
+
631
+ expected = pd .DataFrame (
632
+ {
633
+ "onehotencoded_species" : [
634
+ [{"index" : 1 , "value" : 1.0 }],
635
+ [{"index" : 1 , "value" : 1.0 }],
636
+ [{"index" : 2 , "value" : 1.0 }],
637
+ ],
638
+ "onehotencoded_sex" : [
639
+ [{"index" : 2 , "value" : 1.0 }],
640
+ [{"index" : 1 , "value" : 1.0 }],
641
+ [{"index" : 1 , "value" : 1.0 }],
642
+ ],
643
+ },
644
+ dtype = ONE_HOT_ENCODED_DTYPE ,
645
+ index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
646
+ )
647
+
648
+ pd .testing .assert_frame_equal (result , expected )
649
+
563
650
564
651
def test_label_encoder_default_params (new_penguins_df ):
565
652
encoder = preprocessing .LabelEncoder ()
@@ -677,5 +764,21 @@ def test_label_encoder_save_load(new_penguins_df, dataset_id):
677
764
assert reloaded_transformer .max_categories == transformer .max_categories
678
765
assert reloaded_transformer ._bqml_model is not None
679
766
767
+ result = reloaded_transformer .transform (new_penguins_df ).to_pandas ()
768
+
769
+ expected = pd .DataFrame (
770
+ {
771
+ "labelencoded_species" : [
772
+ 1 ,
773
+ 1 ,
774
+ 2 ,
775
+ ],
776
+ },
777
+ dtype = "Int64" ,
778
+ index = pd .Index ([1633 , 1672 , 1690 ], name = "tag_number" , dtype = "Int64" ),
779
+ )
780
+
781
+ pd .testing .assert_frame_equal (result , expected )
782
+
680
783
681
784
# TODO(garrettwu): add OneHotEncoder tests to compare with sklearn.
0 commit comments