Skip to content

Commit 8f1bea2

Browse files
Merge remote-tracking branch 'github/main' into fix_read_pandas
2 parents b634f6c + 8d39187 commit 8f1bea2

File tree

8 files changed

+161
-233
lines changed

8 files changed

+161
-233
lines changed

bigframes/ml/metrics/__init__.py

+2
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
auc,
1919
confusion_matrix,
2020
f1_score,
21+
mean_squared_error,
2122
precision_score,
2223
r2_score,
2324
recall_score,
@@ -35,5 +36,6 @@
3536
"confusion_matrix",
3637
"precision_score",
3738
"f1_score",
39+
"mean_squared_error",
3840
"pairwise",
3941
]

bigframes/ml/metrics/_metrics.py

+17-7
Original file line number | Diff line number | Diff line change
@@ -161,14 +161,10 @@ def roc_auc_score(
161161

162162
fpr, tpr, _ = roc_curve(y_true_series, y_score_series, drop_intermediate=False)
163163

164-
# TODO(bmil): remove this once bigframes supports the necessary operations
165-
pd_fpr = fpr.to_pandas()
166-
pd_tpr = tpr.to_pandas()
167-
168164
# Use the trapezoid rule to compute the area under the ROC curve
169-
width_diff = pd_fpr.diff().iloc[1:].reset_index(drop=True)
170-
height_avg = (pd_tpr.iloc[:-1] + pd_tpr.iloc[1:].reset_index(drop=True)) / 2
171-
return (width_diff * height_avg).sum()
165+
width_diff = fpr.diff().iloc[1:].reset_index(drop=True)
166+
height_avg = (tpr.iloc[:-1] + tpr.iloc[1:].reset_index(drop=True)) / 2
167+
return typing.cast(float, (width_diff * height_avg).sum())
172168

173169

174170
roc_auc_score.__doc__ = inspect.getdoc(vendored_metrics_ranking.roc_auc_score)
@@ -335,3 +331,17 @@ def f1_score(
335331

336332

337333
f1_score.__doc__ = inspect.getdoc(vendored_metrics_classification.f1_score)
334+
335+
336+
def mean_squared_error(
337+
y_true: Union[bpd.DataFrame, bpd.Series],
338+
y_pred: Union[bpd.DataFrame, bpd.Series],
339+
) -> float:
340+
y_true_series, y_pred_series = utils.convert_to_series(y_true, y_pred)
341+
342+
return (y_pred_series - y_true_series).pow(2).sum() / len(y_true_series)
343+
344+
345+
mean_squared_error.__doc__ = inspect.getdoc(
346+
vendored_metrics_regression.mean_squared_error
347+
)

bigframes/operations/_matplotlib/core.py

+12-7
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414

1515
import abc
1616
import typing
17-
import uuid
1817

1918
import pandas as pd
2019

@@ -115,6 +114,18 @@ def _compute_plot_data(self):
115114
if self._is_column_name(c, sample) and sample[c].dtype == dtypes.STRING_DTYPE:
116115
sample[c] = sample[c].astype("object")
117116

117+
# To avoid Matplotlib's automatic conversion of `Float64` or `Int64` columns
118+
# to `object` types (which breaks float-like behavior), this code proactively
119+
# converts the column to a compatible format.
120+
s = self.kwargs.get("s", None)
121+
if pd.core.dtypes.common.is_integer(s):
122+
s = self.data.columns[s]
123+
if self._is_column_name(s, sample):
124+
if sample[s].dtype == dtypes.INT_DTYPE:
125+
sample[s] = sample[s].astype("int64")
126+
elif sample[s].dtype == dtypes.FLOAT_DTYPE:
127+
sample[s] = sample[s].astype("float64")
128+
118129
return sample
119130

120131
def _is_sequence_arg(self, arg):
@@ -130,9 +141,3 @@ def _is_column_name(self, arg, data):
130141
and pd.core.dtypes.common.is_hashable(arg)
131142
and arg in data.columns
132143
)
133-
134-
def _generate_new_column_name(self, data):
135-
col_name = None
136-
while col_name is None or col_name in data.columns:
137-
col_name = f"plot_temp_{str(uuid.uuid4())[:8]}"
138-
return col_name

tests/system/large/ml/test_compose.py

+1-11
Original file line number | Diff line number | Diff line change
@@ -45,14 +45,8 @@ def test_columntransformer_standalone_fit_and_transform(
4545
)
4646
result = transformer.transform(new_penguins_df).to_pandas()
4747

48-
# TODO: bug? feature columns seem to be in nondeterministic random order
49-
# workaround: sort columns by name. Can't repro it in pantheon, so could
50-
# be a bigframes issue...
51-
result = result.reindex(sorted(result.columns), axis=1)
52-
5348
expected = pandas.DataFrame(
5449
{
55-
"min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210],
5650
"onehotencoded_species": [
5751
[{"index": 1, "value": 1.0}],
5852
[{"index": 1, "value": 1.0}],
@@ -63,6 +57,7 @@ def test_columntransformer_standalone_fit_and_transform(
6357
-0.9945520581113803,
6458
-1.104611490204711,
6559
],
60+
"min_max_scaled_culmen_length_mm": [0.269, 0.232, 0.210],
6661
"standard_scaled_flipper_length_mm": [-0.350044, -1.418336, -0.9198],
6762
},
6863
index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"),
@@ -91,11 +86,6 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df):
9186
new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]]
9287
).to_pandas()
9388

94-
# TODO: bug? feature columns seem to be in nondeterministic random order
95-
# workaround: sort columns by name. Can't repro it in pantheon, so could
96-
# be a bigframes issue...
97-
result = result.reindex(sorted(result.columns), axis=1)
98-
9989
expected = pandas.DataFrame(
10090
{
10191
"onehotencoded_species": [

0 commit comments

Comments
 (0)