
Commit 6df28ed

feat: add detect_anomalies to ml ARIMAPlus and KMeans models (#426)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
1 parent 8d82945 commit 6df28ed

6 files changed: +185, -13 lines

bigframes/ml/cluster.py (+28)

@@ -96,6 -96,34 @@ def predict(

         return self._bqml_model.predict(X)

+    def detect_anomalies(
+        self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1
+    ) -> bpd.DataFrame:
+        """Detect the anomaly data points of the input.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Series or a DataFrame to detect anomalies.
+            contamination (float, default 0.1):
+                Identifies the proportion of anomalies in the training dataset that are used to create the model.
+                The value must be in the range [0, 0.5].
+
+        Returns:
+            bigframes.dataframe.DataFrame: detected DataFrame."""
+        if contamination < 0.0 or contamination > 0.5:
+            raise ValueError(
+                f"contamination must be [0.0, 0.5], but is {contamination}."
+            )
+
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before detect_anomalies")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        return self._bqml_model.detect_anomalies(
+            X, options={"contamination": contamination}
+        )
+
     def to_gbq(self, model_name: str, replace: bool = False) -> KMeans:
         """Save the model to BigQuery.

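For illustration, a minimal usage sketch of the new `KMeans.detect_anomalies` (not part of this commit). The table name and feature columns below are assumptions; the method requires a model that has already been fitted:

```python
# Illustrative sketch only: dataset path and feature columns are assumptions.
import bigframes.pandas as bpd
from bigframes.ml import cluster

# Load a hypothetical feature table with numeric columns.
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
features = df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]].dropna()

# detect_anomalies can only be called on a fitted model.
model = cluster.KMeans(n_clusters=3)
model.fit(features)

# Flag roughly the 10% of rows farthest from their centroids
# (contamination must be in the range [0, 0.5]).
anomalies = model.detect_anomalies(features, contamination=0.1)
print(anomalies[["is_anomaly", "normalized_distance"]].to_pandas())
```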
bigframes/ml/decomposition.py (+1, -1)

@@ -111,7 +111,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
         return self._bqml_model.predict(X)

     def detect_anomalies(
-        self, X: Union[bpd.DataFrame, bpd.Series], *, contamination=0.1
+        self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1
     ) -> bpd.DataFrame:
         """Detect the anomaly data points of the input.

bigframes/ml/forecasting.py (+30)

@@ -119,6 +119,36 @@ def predict(
             options={"horizon": horizon, "confidence_level": confidence_level}
         )

+    def detect_anomalies(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        *,
+        anomaly_prob_threshold: float = 0.95,
+    ) -> bpd.DataFrame:
+        """Detect the anomaly data points of the input.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Series or a DataFrame to detect anomalies.
+            anomaly_prob_threshold (float, default 0.95):
+                Identifies the custom threshold to use for anomaly detection. The value must be in the range [0, 1), with a default value of 0.95.
+
+        Returns:
+            bigframes.dataframe.DataFrame: detected DataFrame."""
+        if anomaly_prob_threshold < 0.0 or anomaly_prob_threshold >= 1.0:
+            raise ValueError(
+                f"anomaly_prob_threshold must be [0.0, 1.0), but is {anomaly_prob_threshold}."
+            )
+
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before detect_anomalies")
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        return self._bqml_model.detect_anomalies(
+            X, options={"anomaly_prob_threshold": anomaly_prob_threshold}
+        )
+
     def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],

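Similarly, a hedged sketch of how the new `ARIMAPlus.detect_anomalies` might be called (not part of this commit). The table and column names below are placeholders, and the model must be fitted before anomalies can be detected:

```python
# Illustrative sketch only: table and column names are placeholders.
import bigframes.pandas as bpd
from bigframes.ml import forecasting

# Hypothetical single-series time series table: one timestamp column, one numeric column.
df = bpd.read_gbq("my_project.my_dataset.daily_visits")

model = forecasting.ARIMAPlus()
# fit() takes the time column as X and the numeric series as y.
model.fit(df[["visit_date"]], df[["num_visits"]])

# Rows whose anomaly probability exceeds the threshold are flagged
# (anomaly_prob_threshold defaults to 0.95 and must be in [0, 1)).
new_data = bpd.read_gbq("my_project.my_dataset.daily_visits_recent")
anomalies = model.detect_anomalies(new_data, anomaly_prob_threshold=0.9)
print(
    anomalies[
        ["is_anomaly", "anomaly_probability", "lower_bound", "upper_bound"]
    ].to_pandas()
)
```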
tests/system/small/ml/test_cluster.py (+45)

@@ -15,6 +15,7 @@
 import pandas as pd

 from bigframes.ml import cluster
+import bigframes.pandas as bpd
 from tests.system.utils import assert_pandas_df_equal

 _PD_NEW_PENGUINS = pd.DataFrame.from_dict(
@@ -73,6 +74,50 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans):
     assert_pandas_df_equal(result, expected, ignore_order=True)


+def test_kmeans_detect_anomalies(
+    penguins_kmeans_model: cluster.KMeans, new_penguins_df: bpd.DataFrame
+):
+    anomalies = penguins_kmeans_model.detect_anomalies(new_penguins_df).to_pandas()
+    expected = pd.DataFrame(
+        {
+            "is_anomaly": [False, False, False],
+            "normalized_distance": [1.082937, 0.77139, 0.478304],
+        },
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(
+        anomalies[["is_anomaly", "normalized_distance"]].sort_index(),
+        expected,
+        check_exact=False,
+        check_dtype=False,
+        rtol=0.1,
+    )
+
+
+def test_kmeans_detect_anomalies_params(
+    penguins_kmeans_model: cluster.KMeans, new_penguins_df: bpd.DataFrame
+):
+    anomalies = penguins_kmeans_model.detect_anomalies(
+        new_penguins_df, contamination=0.4
+    ).to_pandas()
+    expected = pd.DataFrame(
+        {
+            "is_anomaly": [True, False, False],
+            "normalized_distance": [1.082937, 0.77139, 0.478304],
+        },
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(
+        anomalies[["is_anomaly", "normalized_distance"]].sort_index(),
+        expected,
+        check_exact=False,
+        check_dtype=False,
+        rtol=0.1,
+    )
+
+
 def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans):
     new_penguins = session.read_pandas(_PD_NEW_PENGUINS)
     result = penguins_kmeans_model.score(new_penguins).to_pandas()

tests/system/small/ml/test_decomposition.py (+23)

@@ -59,6 +59,29 @@ def test_pca_detect_anomalies(
     )


+def test_pca_detect_anomalies_params(
+    penguins_pca_model: decomposition.PCA, new_penguins_df: bpd.DataFrame
+):
+    anomalies = penguins_pca_model.detect_anomalies(
+        new_penguins_df, contamination=0.2
+    ).to_pandas()
+    expected = pd.DataFrame(
+        {
+            "is_anomaly": [False, True, True],
+            "mean_squared_error": [0.254188, 0.731243, 0.298889],
+        },
+        index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+    )
+
+    pd.testing.assert_frame_equal(
+        anomalies[["is_anomaly", "mean_squared_error"]].sort_index(),
+        expected,
+        check_exact=False,
+        check_dtype=False,
+        rtol=0.1,
+    )
+
+
 def test_pca_score(penguins_pca_model: decomposition.PCA):
     result = penguins_pca_model.score().to_pandas()
     expected = pd.DataFrame(

tests/system/small/ml/test_forecasting.py (+58, -12)

@@ -35,7 +35,9 @@
 ]


-def test_model_predict_default(time_series_arima_plus_model: forecasting.ARIMAPlus):
+def test_arima_plus_predict_default(
+    time_series_arima_plus_model: forecasting.ARIMAPlus,
+):
     utc = pytz.utc
     predictions = time_series_arima_plus_model.predict().to_pandas()
     assert predictions.shape == (3, 8)
@@ -63,7 +65,7 @@ def test_model_predict_default(time_series_arima_plus_model: forecasting.ARIMAPl
     )


-def test_model_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlus):
+def test_arima_plus_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlus):
     utc = pytz.utc
     predictions = time_series_arima_plus_model.predict(
         horizon=4, confidence_level=0.9
@@ -94,7 +96,55 @@ def test_model_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlu
     )


-def test_model_score(
+def test_arima_plus_detect_anomalies(
+    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+):
+    anomalies = time_series_arima_plus_model.detect_anomalies(
+        new_time_series_df
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "is_anomaly": [False, False, False],
+            "lower_bound": [2349.301736, 2153.614829, 1849.040192],
+            "upper_bound": [3099.642833, 3033.12195, 2858.185876],
+            "anomaly_probability": [0.757824, 0.322559, 0.43011],
+        },
+    )
+    pd.testing.assert_frame_equal(
+        anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]],
+        expected,
+        rtol=0.1,
+        check_index_type=False,
+        check_dtype=False,
+    )
+
+
+def test_arima_plus_detect_anomalies_params(
+    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+):
+    anomalies = time_series_arima_plus_model.detect_anomalies(
+        new_time_series_df, anomaly_prob_threshold=0.7
+    ).to_pandas()
+
+    expected = pd.DataFrame(
+        {
+            "is_anomaly": [True, False, False],
+            "lower_bound": [2525.5363, 2360.1870, 2086.0609],
+            "upper_bound": [2923.408256, 2826.54981, 2621.165188],
+            "anomaly_probability": [0.757824, 0.322559, 0.43011],
+        },
+    )
+    pd.testing.assert_frame_equal(
+        anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]],
+        expected,
+        rtol=0.1,
+        check_index_type=False,
+        check_dtype=False,
+    )
+
+
+def test_arima_plus_score(
     time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
 ):
     result = time_series_arima_plus_model.score(
@@ -118,16 +168,14 @@ def test_model_score(
     )


-def test_model_summary(
-    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
-):
+def test_arima_plus_summary(time_series_arima_plus_model: forecasting.ARIMAPlus):
     result = time_series_arima_plus_model.summary()
     assert result.shape == (1, 12)
     assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)


-def test_model_summary_show_all_candidates(
-    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+def test_arima_plus_summary_show_all_candidates(
+    time_series_arima_plus_model: forecasting.ARIMAPlus,
 ):
     result = time_series_arima_plus_model.summary(
         show_all_candidate_models=True,
@@ -136,7 +184,7 @@ def test_model_summary_show_all_candidates(
     assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)


-def test_model_score_series(
+def test_arima_plus_score_series(
     time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
 ):
     result = time_series_arima_plus_model.score(
@@ -160,9 +208,7 @@ def test_model_score_series(
     )


-def test_model_summary_series(
-    time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
-):
+def test_arima_plus_summary_series(time_series_arima_plus_model: forecasting.ARIMAPlus):
     result = time_series_arima_plus_model.summary()
     assert result.shape == (1, 12)
     assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
