test: add code snippets for using bigframes.ml (#159)

ashleyxuu · web-flow · commit 3d7a0d6f7172 · 2023-11-01T15:45:08.000-07:00
* test: add code snippets for using bigframes.ml
diff --git a/samples/snippets/clustering_model_test.py b/samples/snippets/clustering_model_test.py
@@ -0,0 +1,35 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_clustering_model():
+    """Run the KMeans clustering sample against the public penguins table.
+
+    The code between the START/END region tags is the sample itself. The
+    assertions after the END tag only check that predicting and scoring
+    each produced a result object.
+    """
+    # [START bigquery_dataframes_clustering_model]
+    from bigframes.ml.cluster import KMeans
+    import bigframes.pandas as bpd
+
+    # Load data from BigQuery
+    query_or_table = "bigquery-public-data.ml_datasets.penguins"
+    bq_df = bpd.read_gbq(query_or_table)
+
+    # Create the KMeans model and fit it on a single feature column.
+    # K-means is unsupervised, so no label column is passed to fit().
+    cluster_model = KMeans(n_clusters=10)
+    cluster_model.fit(bq_df["culmen_length_mm"])
+
+    # Predict using the model
+    result = cluster_model.predict(bq_df)
+    # Score the model
+    score = cluster_model.score(bq_df)
+    # [END bigquery_dataframes_clustering_model]
+    assert result is not None
+    assert score is not None
diff --git a/samples/snippets/gen_ai_model_test.py b/samples/snippets/gen_ai_model_test.py
@@ -0,0 +1,39 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_llm_model():
+    """Run the PaLM2 text-generation sample and check it yields a result.
+
+    The connection settings are assigned before the START region tag, so
+    they sit outside the tagged sample code.
+    """
+    PROJECT_ID = "bigframes-dev"
+    REGION = "us"
+    CONN_NAME = "bigframes-ml"
+    # [START bigquery_dataframes_gen_ai_model]
+    from bigframes.ml.llm import PaLM2TextGenerator
+    import bigframes.pandas as bpd
+
+    # Build the text generator on the global session, using a connection
+    # identified as "<project>.<region>.<connection name>"
+    connection = f"{PROJECT_ID}.{REGION}.{CONN_NAME}"
+    text_generator = PaLM2TextGenerator(
+        session=bpd.get_global_session(), connection_name=connection
+    )
+
+    # Read the CSV whose "API" column supplies the end of each prompt
+    api_df = bpd.read_csv("gs://cloud-samples-data/vertex-ai/bigframe/df.csv")
+
+    # Prefix every "API" value with the instruction text to form the prompts
+    prefix = "Generate Pandas sample code for DataFrame."
+    prompts = prefix + api_df["API"]
+
+    # Send the prompts to the model for prediction
+    predictions = text_generator.predict(
+        prompts.to_frame(), max_output_tokens=1024
+    )
+    # [END bigquery_dataframes_gen_ai_model]
+    generated_text = predictions["ml_generate_text_llm_result"]
+    assert generated_text is not None
+    assert generated_text.iloc[0] is not None
diff --git a/samples/snippets/regression_model_test.py b/samples/snippets/regression_model_test.py
@@ -0,0 +1,57 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_regression_model():
+    """Run the linear regression sample on Adelie penguin rows.
+
+    The sample trains on rows without nulls, scores on those same rows, and
+    predicts for the rows whose body_mass_g is null.
+    """
+    # [START bigquery_dataframes_regression_model]
+    from bigframes.ml.linear_model import LinearRegression
+    import bigframes.pandas as bpd
+
+    # Load data from BigQuery
+    penguins = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
+
+    # Keep only the Adelie Penguin rows, then drop the species column
+    is_adelie = penguins["species"] == "Adelie Penguin (Pygoscelis adeliae)"
+    adelie = penguins[is_adelie].drop(columns=["species"])
+
+    # Rows without any nulls form the training data
+    complete_rows = adelie.dropna()
+
+    # Feature (input) columns and the label (output) column
+    feature_names = [
+        "island",
+        "culmen_length_mm",
+        "culmen_depth_mm",
+        "flipper_length_mm",
+        "sex",
+    ]
+    features = complete_rows[feature_names]
+    labels = complete_rows[["body_mass_g"]]
+
+    # Rows with a missing body mass are the ones to predict for
+    unlabeled_rows = adelie[adelie["body_mass_g"].isnull()]
+
+    # Create and fit the linear model
+    model = LinearRegression()
+    model.fit(features, labels)
+
+    # Score the model on the training data
+    score = model.score(features, labels)
+
+    # Predict the missing body mass values
+    result = model.predict(unlabeled_rows)
+    # [END bigquery_dataframes_regression_model]
+    assert unlabeled_rows is not None
+    assert features is not None
+    assert labels is not None
+    assert model is not None
+    assert score is not None
+    assert result is not None