Skip to content

fix: Fix invalid SQL generated by ML distance functions #865

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1380,6 +1380,30 @@ def minimum_impl(
return ibis.case().when(upper.isnull() | (value > upper), upper).else_(value).end()


@scalar_op_compiler.register_binary_op(ops.cosine_distance_op)
def cosine_distance_impl(vector1: ibis_types.Value, vector2: ibis_types.Value):
    """Compile the cosine-distance op to a BigQuery ``ML.DISTANCE`` call.

    Args:
        vector1: left-hand vector expression.
        vector2: right-hand vector expression.

    Returns:
        An ibis Float64 expression for ``ML.DISTANCE(vector1, vector2, "COSINE")``.
    """
    distance_expr = vector_distance(vector1, vector2, "COSINE")
    return distance_expr


@scalar_op_compiler.register_binary_op(ops.euclidean_distance_op)
def euclidean_distance_impl(vector1: ibis_types.Value, vector2: ibis_types.Value):
    """Compile the euclidean-distance op to a BigQuery ``ML.DISTANCE`` call.

    Args:
        vector1: left-hand vector expression.
        vector2: right-hand vector expression.

    Returns:
        An ibis Float64 expression for ``ML.DISTANCE(vector1, vector2, "EUCLIDEAN")``.
    """
    distance_expr = vector_distance(vector1, vector2, "EUCLIDEAN")
    return distance_expr


@scalar_op_compiler.register_binary_op(ops.manhattan_distance_op)
def manhattan_distance_impl(vector1: ibis_types.Value, vector2: ibis_types.Value):
    """Compile the manhattan-distance op to a BigQuery ``ML.DISTANCE`` call.

    Args:
        vector1: left-hand vector expression.
        vector2: right-hand vector expression.

    Returns:
        An ibis Float64 expression for ``ML.DISTANCE(vector1, vector2, "MANHATTAN")``.
    """
    distance_expr = vector_distance(vector1, vector2, "MANHATTAN")
    return distance_expr


@scalar_op_compiler.register_binary_op(ops.BinaryRemoteFunctionOp, pass_op=True)
def binary_remote_function_op_impl(
x: ibis_types.Value, y: ibis_types.Value, op: ops.BinaryRemoteFunctionOp
Expand Down Expand Up @@ -1501,3 +1525,8 @@ def json_set(
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str, json_value
) -> ibis_dtypes.JSON:
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


# Ibis "builtin" scalar UDF: the body is intentionally empty — ibis compiles
# calls to this function directly into ML.DISTANCE(...) in the generated SQL.
@ibis.udf.scalar.builtin(name="ML.DISTANCE")
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
    """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE").

    Args:
        vector1: first vector expression (presumably ARRAY<FLOAT64> — confirm against callers).
        vector2: second vector expression of the same element type.
        type: distance metric; callers in this module pass "EUCLIDEAN",
            "MANHATTAN", or "COSINE".

    Returns:
        A Float64 expression for the computed distance.
    """
131 changes: 55 additions & 76 deletions bigframes/ml/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from __future__ import annotations

import datetime
from typing import Callable, cast, Iterable, Literal, Mapping, Optional, Union
from typing import Callable, cast, Iterable, Mapping, Optional, Union
import uuid

from google.cloud import bigquery
Expand All @@ -35,11 +35,27 @@ def __init__(self, session: bigframes.Session):
self._session = session
self._base_sql_generator = ml_sql.BaseSqlGenerator()

def _apply_sql(

class BqmlModel(BaseBqml):
"""Represents an existing BQML model in BigQuery.

Wraps the BQML API and SQL interface to expose the functionality needed for
BigQuery DataFrames ML.
"""

def __init__(self, session: bigframes.Session, model: bigquery.Model):
self._session = session
self._model = model
self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator(
self.model_name
)

def _apply_ml_tvf(
self,
input_data: bpd.DataFrame,
func: Callable[[bpd.DataFrame], str],
apply_sql_tvf: Callable[[str], str],
) -> bpd.DataFrame:
# Used for predict, transform, distance
"""Helper to wrap a dataframe in a SQL query, keeping the index intact.

Args:
Expand All @@ -50,67 +66,28 @@ def _apply_sql(
the dataframe to be wrapped

func (function):
a function that will accept a SQL string and produce a new SQL
string from which to construct the output dataframe. It must
include the index columns of the input SQL.
Takes an input sql table value and applies a prediction tvf. The
resulting table value must include all input columns, with new
columns appended to the end.
"""
_, index_col_ids, index_labels = input_data._to_sql_query(include_index=True)

sql = func(input_data)
df = self._session.read_gbq(sql, index_col=index_col_ids)
df.index.names = index_labels

return df

def distance(
self,
x: bpd.DataFrame,
y: bpd.DataFrame,
type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"],
name: str,
) -> bpd.DataFrame:
"""Calculate ML.DISTANCE from DataFrame inputs.

Args:
x:
input DataFrame
y:
input DataFrame
type:
Distance types, accept values are "EUCLIDEAN", "MANHATTAN", "COSINE".
name:
name of the output result column
"""
assert len(x.columns) == 1 and len(y.columns) == 1

input_data = x.join(y, how="outer").cache()
x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0]

return self._apply_sql(
input_data,
lambda source_df: self._base_sql_generator.ml_distance(
x_column_id,
y_column_id,
type=type,
source_df=source_df,
name=name,
),
# TODO: Preserve ordering information?
input_sql, index_col_ids, index_labels = input_data._to_sql_query(
include_index=True
)


class BqmlModel(BaseBqml):
"""Represents an existing BQML model in BigQuery.

Wraps the BQML API and SQL interface to expose the functionality needed for
BigQuery DataFrames ML.
"""

def __init__(self, session: bigframes.Session, model: bigquery.Model):
self._session = session
self._model = model
self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator(
self.model_name
result_sql = apply_sql_tvf(input_sql)
df = self._session.read_gbq(result_sql, index_col=index_col_ids)
df.index.names = index_labels
# Restore column labels
df.rename(
columns={
label: original_label
for label, original_label in zip(
df.columns.values, input_data.columns.values
)
}
)
return df

def _keys(self):
return (self._session, self._model)
Expand All @@ -137,13 +114,13 @@ def model(self) -> bigquery.Model:
return self._model

def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
return self._apply_sql(
return self._apply_ml_tvf(
input_data,
self._model_manipulation_sql_generator.ml_predict,
)

def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
return self._apply_sql(
return self._apply_ml_tvf(
input_data,
self._model_manipulation_sql_generator.ml_transform,
)
Expand All @@ -153,10 +130,10 @@ def generate_text(
input_data: bpd.DataFrame,
options: Mapping[str, int | float],
) -> bpd.DataFrame:
return self._apply_sql(
return self._apply_ml_tvf(
input_data,
lambda source_df: self._model_manipulation_sql_generator.ml_generate_text(
source_df=source_df,
lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text(
source_sql=source_sql,
struct_options=options,
),
)
Expand All @@ -166,10 +143,10 @@ def generate_embedding(
input_data: bpd.DataFrame,
options: Mapping[str, int | float],
) -> bpd.DataFrame:
return self._apply_sql(
return self._apply_ml_tvf(
input_data,
lambda source_df: self._model_manipulation_sql_generator.ml_generate_embedding(
source_df=source_df,
lambda source_sql: self._model_manipulation_sql_generator.ml_generate_embedding(
source_sql=source_sql,
struct_options=options,
),
)
Expand All @@ -179,10 +156,10 @@ def detect_anomalies(
) -> bpd.DataFrame:
assert self._model.model_type in ("PCA", "KMEANS", "ARIMA_PLUS")

return self._apply_sql(
return self._apply_ml_tvf(
input_data,
lambda source_df: self._model_manipulation_sql_generator.ml_detect_anomalies(
source_df=source_df,
lambda source_sql: self._model_manipulation_sql_generator.ml_detect_anomalies(
source_sql=source_sql,
struct_options=options,
),
)
Expand All @@ -192,7 +169,9 @@ def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame:
return self._session.read_gbq(sql, index_col="forecast_timestamp").reset_index()

def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
sql = self._model_manipulation_sql_generator.ml_evaluate(input_data)
sql = self._model_manipulation_sql_generator.ml_evaluate(
input_data.sql if (input_data is not None) else None
)

return self._session.read_gbq(sql)

Expand All @@ -202,7 +181,7 @@ def llm_evaluate(
task_type: Optional[str] = None,
):
sql = self._model_manipulation_sql_generator.ml_llm_evaluate(
input_data, task_type
input_data.sql, task_type
)

return self._session.read_gbq(sql)
Expand Down Expand Up @@ -336,7 +315,7 @@ def create_model(
model_ref = self._create_model_ref(session._anonymous_dataset)

sql = self._model_creation_sql_generator.create_model(
source_df=input_data,
source_sql=input_data.sql,
model_ref=model_ref,
transforms=transforms,
options=options,
Expand Down Expand Up @@ -374,7 +353,7 @@ def create_llm_remote_model(
model_ref = self._create_model_ref(session._anonymous_dataset)

sql = self._model_creation_sql_generator.create_llm_remote_model(
source_df=input_data,
source_sql=input_data.sql,
model_ref=model_ref,
options=options,
connection_name=connection_name,
Expand Down Expand Up @@ -407,7 +386,7 @@ def create_time_series_model(
model_ref = self._create_model_ref(session._anonymous_dataset)

sql = self._model_creation_sql_generator.create_model(
source_df=input_data,
source_sql=input_data.sql,
model_ref=model_ref,
transforms=transforms,
options=options,
Expand Down
47 changes: 30 additions & 17 deletions bigframes/ml/metrics/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,24 @@

import bigframes_vendored.sklearn.metrics.pairwise as vendored_metrics_pairwise

from bigframes.ml import core, utils
from bigframes.ml import utils
import bigframes.operations as ops
import bigframes.pandas as bpd


def paired_cosine_distances(
X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series]
) -> bpd.DataFrame:
X, Y = utils.convert_to_dataframe(X, Y)
if len(X.columns) != 1 or len(Y.columns) != 1:
raise ValueError("Inputs X and Y can only contain 1 column.")
X, Y = utils.convert_to_series(X, Y)
joined_block, _ = X._block.join(Y._block, how="outer")

base_bqml = core.BaseBqml(session=X._session)
return base_bqml.distance(X, Y, type="COSINE", name="cosine_distance")
result_block, _ = joined_block.project_expr(
ops.cosine_distance_op.as_expr(
joined_block.value_columns[0], joined_block.value_columns[1]
),
label="cosine_distance",
)
return bpd.DataFrame(result_block)


paired_cosine_distances.__doc__ = inspect.getdoc(
Expand All @@ -40,12 +45,16 @@ def paired_cosine_distances(
def paired_manhattan_distance(
X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series]
) -> bpd.DataFrame:
X, Y = utils.convert_to_dataframe(X, Y)
if len(X.columns) != 1 or len(Y.columns) != 1:
raise ValueError("Inputs X and Y can only contain 1 column.")
X, Y = utils.convert_to_series(X, Y)
joined_block, _ = X._block.join(Y._block, how="outer")

base_bqml = core.BaseBqml(session=X._session)
return base_bqml.distance(X, Y, type="MANHATTAN", name="manhattan_distance")
result_block, _ = joined_block.project_expr(
ops.manhattan_distance_op.as_expr(
joined_block.value_columns[0], joined_block.value_columns[1]
),
label="manhattan_distance",
)
return bpd.DataFrame(result_block)


paired_manhattan_distance.__doc__ = inspect.getdoc(
Expand All @@ -56,12 +65,16 @@ def paired_manhattan_distance(
def paired_euclidean_distances(
X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series]
) -> bpd.DataFrame:
X, Y = utils.convert_to_dataframe(X, Y)
if len(X.columns) != 1 or len(Y.columns) != 1:
raise ValueError("Inputs X and Y can only contain 1 column.")

base_bqml = core.BaseBqml(session=X._session)
return base_bqml.distance(X, Y, type="EUCLIDEAN", name="euclidean_distance")
X, Y = utils.convert_to_series(X, Y)
joined_block, _ = X._block.join(Y._block, how="outer")

result_block, _ = joined_block.project_expr(
ops.euclidean_distance_op.as_expr(
joined_block.value_columns[0], joined_block.value_columns[1]
),
label="euclidean_distance",
)
return bpd.DataFrame(result_block)


paired_euclidean_distances.__doc__ = inspect.getdoc(
Expand Down
Loading