feat: support the score method for PaLM2TextGenerator (#634)

ashleyxuu · web-flow · commit 3ffc1d275ae1 · 2024-04-25T09:39:18.000-07:00
* feat: support the score method for PaLM2TextGenerator

* address comments

* address additional comments

* address minor comments
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
@@ -187,6 +187,17 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
 
         return self._session.read_gbq(sql)
 
+    def llm_evaluate(
+        self,
+        input_data: bpd.DataFrame,
+        task_type: Optional[str] = None,
+    ):
+        sql = self._model_manipulation_sql_generator.ml_llm_evaluate(
+            input_data, task_type
+        )
+
+        return self._session.read_gbq(sql)
+
     def arima_evaluate(self, show_all_candidate_models: bool = False):
         sql = self._model_manipulation_sql_generator.ml_arima_evaluate(
             show_all_candidate_models
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
@@ -220,7 +220,7 @@ def predict(
 
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                Input DataFrame or Series, which needs to contain a column with name "prompt". Only the column will be used as input.
+                Input DataFrame or Series, which contains only one column of prompts.
                 Prompts can include preamble, questions, suggestions, instructions, or examples.
 
             temperature (float, default 0.0):
@@ -310,6 +310,63 @@ def predict(
 
         return df
 
+    def score(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y: Union[bpd.DataFrame, bpd.Series],
+        task_type: Literal[
+            "text_generation", "classification", "summarization", "question_answering"
+        ] = "text_generation",
+    ) -> bpd.DataFrame:
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+            Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+            and might have limited support. For more information, see the launch stage descriptions
+            (https://cloud.google.com/products#product-launch-stages).
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
+            for the outputs relevant to this model type.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation data, which contains only one column of input_text
+                that contains the prompt text to use when evaluating the model.
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation labels, which contains only one column of output_text
+                that you would expect to be returned by the model.
+            task_type (str):
+                The type of the task for LLM model. Default to "text_generation".
+                Possible values: "text_generation", "classification", "summarization", and "question_answering".
+
+        Returns:
+            bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        X, y = utils.convert_to_dataframe(X, y)
+
+        if len(X.columns) != 1 or len(y.columns) != 1:
+            raise ValueError(
+                f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
+            )
+
+        # BQML identifies the columns by name
+        X_col_label = cast(blocks.Label, X.columns[0])
+        y_col_label = cast(blocks.Label, y.columns[0])
+        X = X.rename(columns={X_col_label: "input_text"})
+        y = y.rename(columns={y_col_label: "output_text"})
+
+        input_data = X.join(y, how="outer")
+
+        return self._bqml_model.llm_evaluate(input_data, task_type)
+
     def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator:
         """Save the model to BigQuery.
 
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
@@ -318,6 +318,16 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str:
             return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`,
   ({source_sql}))"""
 
+    # ML evaluation TVFs
+    def ml_llm_evaluate(
+        self, source_df: bpd.DataFrame, task_type: Optional[str] = None
+    ) -> str:
+        """Encode ML.EVALUATE with a task_type option for BQML"""
+        # Note: don't need index as evaluate returns a new table
+        source_sql, _, _ = source_df._to_sql_query(include_index=False)
+        return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`,
+            ({source_sql}), STRUCT("{task_type}" AS task_type))"""
+
     # ML evaluation TVFs
     def ml_arima_evaluate(self, show_all_candidate_models: bool = False) -> str:
         """Encode ML.ARMIA_EVALUATE for BQML"""
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
@@ -22,13 +22,12 @@
 def llm_fine_tune_df_default_index(
     session: bigframes.Session,
 ) -> bigframes.dataframe.DataFrame:
-    sql = """
-SELECT
-  CONCAT("Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: ", text) as prompt,
-  CAST(label AS STRING) as label
-FROM `llm_tuning.emotion_classification_train`
-"""
-    return session.read_gbq(sql)
+    training_table_name = "llm_tuning.emotion_classification_train"
+    df = session.read_gbq(training_table_name)
+    prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
+    df["prompt"] = prefix + df["text"]
+    df["label"] = df["label"].astype("string")
+    return df
 
 
 @pytest.fixture(scope="session")
@@ -69,3 +68,46 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_
     assert all(series.str.len() == 1)
 
     # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept
+
+
+def test_llm_palm_score(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.PaLM2TextGenerator(model_name="text-bison")
+
+    # Check that score returns the expected text-generation metric columns
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index[["prompt"]],
+        y=llm_fine_tune_df_default_index[["label"]],
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "bleu4_score",
+        "rouge-l_precision",
+        "rouge-l_recall",
+        "rouge-l_f1_score",
+        "evaluation_status",
+    ]
+    assert all(col in score_result_col for col in expected_col)
+
+
+def test_llm_palm_score_params(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.PaLM2TextGenerator(
+        model_name="text-bison", max_iterations=1
+    )
+
+    # Check that score returns the expected classification metric columns
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index["prompt"],
+        y=llm_fine_tune_df_default_index["label"],
+        task_type="classification",
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "trial_id",
+        "precision",
+        "recall",
+        "accuracy",
+        "f1_score",
+        "log_loss",
+        "roc_auc",
+    ]
+    assert all(col in score_result_col for col in expected_col)
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
@@ -319,6 +319,20 @@ def test_ml_predict_correct(
     )
 
 
+def test_ml_llm_evaluate_correct(
+    model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
+    mock_df: bpd.DataFrame,
+):
+    sql = model_manipulation_sql_generator.ml_llm_evaluate(
+        source_df=mock_df, task_type="CLASSIFICATION"
+    )
+    assert (
+        sql
+        == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`,
+            (input_X_sql), STRUCT("CLASSIFICATION" AS task_type))"""
+    )
+
+
 def test_ml_evaluate_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
@@ -95,7 +95,7 @@ class RandomForestRegressor(ForestRegressor):
             Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2.
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.
@@ -160,7 +160,7 @@ class RandomForestClassifier(ForestClassifier):
             Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2.
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.
diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py
@@ -63,7 +63,7 @@ class XGBRegressor(XGBModel, XGBRegressorBase):
             Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE".
         tree_method (Optional[str]):
             Specify which tree method to use.  Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.
@@ -110,7 +110,7 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
             Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE".
         tree_method (Optional[str]):
             Specify which tree method to use.  Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.