From 350499bccb62e22169ab2f2e1400175b2179ef85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=2C=20formerly=29?= Date: Wed, 28 Feb 2024 10:34:16 -0600 Subject: [PATCH 1/8] deps: update ibis to version 8.0.0 and refactor `remote_function` to use ibis UDF method (#277) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Follow-up to https://togithub.com/googleapis/python-bigquery-dataframes/pull/53#discussion_r1427224630 🦕 --- bigframes/core/compile/aggregate_compiler.py | 2 +- bigframes/core/compile/compiled.py | 15 +- bigframes/dtypes.py | 4 +- bigframes/functions/remote_function.py | 71 +++---- noxfile.py | 6 +- setup.py | 5 +- testing/constraints-3.9.txt | 4 +- tests/system/small/test_dataframe.py | 10 +- tests/unit/test_core.py | 2 +- tests/unit/test_remote_function.py | 4 +- .../ibis/backends/bigquery/datatypes.py | 176 ++++++++++++++++++ 11 files changed, 230 insertions(+), 69 deletions(-) create mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 1dad128599..86ba16e347 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -331,7 +331,7 @@ def _( op: agg_ops.RankOp, column: ibis_types.Column, window=None ) -> ibis_types.IntegerValue: # Ibis produces 0-based ranks, while pandas creates 1-based ranks - return _apply_window_if_present(column.rank(), window) + 1 + return _apply_window_if_present(ibis.rank(), window) + 1 @compile_unary_agg.register diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 969437939f..7245689aae 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -1099,17 +1099,14 @@ def _to_ibis_expr( if not columns: return ibis.memtable([]) + # Make sure we don't have any unbound (deferred) columns. + table = self._table.select(columns) + # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. - table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type( - column.resolve(self._table) - # TODO(https://github.com/ibis-project/ibis/issues/7613): use - # public API to refer to Deferred type. 
- if isinstance(column, ibis.common.deferred.Deferred) - else column - ) - for column in columns + table = table.select( + bigframes.dtypes.ibis_value_to_canonical_type(table[column]) + for column in table.columns ) base_table = table if self._reduced_predicate is not None: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 6e3bc25c47..8a2055ef7f 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -23,7 +23,6 @@ import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery import ibis -from ibis.backends.bigquery.datatypes import BigQueryType import ibis.expr.datatypes as ibis_dtypes from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type import ibis.expr.types as ibis_types @@ -33,6 +32,7 @@ import bigframes.constants as constants import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame @@ -643,4 +643,4 @@ def ibis_type_from_python_type(t: type) -> ibis_dtypes.DataType: def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: if tk not in SUPPORTED_IO_BIGQUERY_TYPEKINDS: raise UnsupportedTypeError(tk, SUPPORTED_IO_BIGQUERY_TYPEKINDS) - return BigQueryType.to_ibis(tk) + return third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index c7bb5d92c6..af4c4b138a 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -14,7 +14,6 @@ from __future__ import annotations -import functools import hashlib import inspect import logging @@ -28,6 +27,7 @@ import textwrap from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING +import ibis import requests if TYPE_CHECKING: @@ -43,15 +43,12 @@ resourcemanager_v3, ) import google.iam.v1 -from ibis.backends.bigquery.compiler import compiles -from ibis.backends.bigquery.datatypes import BigQueryType from ibis.expr.datatypes.core import DataType as IbisDataType -import ibis.expr.operations as ops -import ibis.expr.rules as rlz from bigframes import clients import bigframes.constants as constants import bigframes.dtypes +import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes logger = logging.getLogger(__name__) @@ -173,12 +170,14 @@ def create_bq_remote_function( # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 bq_function_args = [] - bq_function_return_type = BigQueryType.from_ibis(output_type) + bq_function_return_type = third_party_ibis_bqtypes.BigQueryType.from_ibis( + output_type + ) # We are expecting the input type annotations to be 1:1 with the input args for idx, name in enumerate(input_args): bq_function_args.append( - f"{name} {BigQueryType.from_ibis(input_types[idx])}" + f"{name} {third_party_ibis_bqtypes.BigQueryType.from_ibis(input_types[idx])}" ) create_function_ddl = f""" CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) @@ -515,33 +514,10 @@ def get_remote_function_specs(self, remote_function_name): return (http_endpoint, bq_connection) -def remote_function_node( - routine_ref: bigquery.RoutineReference, 
ibis_signature: IbisSignature -): - """Creates an Ibis node representing a remote function call.""" - - fields = { - name: rlz.ValueOf(None if type_ == "ANY TYPE" else type_) - for name, type_ in zip( - ibis_signature.parameter_names, ibis_signature.input_types - ) - } - - fields["dtype"] = ibis_signature.output_type # type: ignore - fields["shape"] = rlz.shape_like("args") - - node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore - - @compiles(node) - def compile_node(t, op): - return "{}({})".format(node.__name__, ", ".join(map(t.translate, op.args))) - - def f(*args, **kwargs): - return node(*args, **kwargs).to_expr() - - f.bigframes_remote_function = str(routine_ref) # type: ignore - - return f +class UnsupportedTypeError(ValueError): + def __init__(self, type_, supported_types): + self.type = type_ + self.supported_types = supported_types def ibis_signature_from_python_signature( @@ -831,14 +807,16 @@ def wrapper(f): packages, ) - node = remote_function_node(dataset_ref.routine(rf_name), ibis_signature) - - node = functools.wraps(f)(node) - node.__signature__ = signature + node = ibis.udf.scalar.builtin( + f, + name=rf_name, + schema=f"{dataset_ref.project}.{dataset_ref.dataset_id}", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) node.bigframes_cloud_function = ( remote_function_client.get_cloud_function_fully_qualified_name(cf_name) ) - + node.bigframes_remote_function = str(dataset_ref.routine(rf_name)) # type: ignore return node return wrapper @@ -888,4 +866,17 @@ def read_gbq_function( f"{constants.FEEDBACK_LINK}" ) - return remote_function_node(routine_ref, ibis_signature) + # The name "args" conflicts with the Ibis operator, so we use + # non-standard names for the arguments here. + def node(*ignored_args, **ignored_kwargs): + f"""Remote function {str(routine_ref)}.""" + + node.__name__ = routine_ref.routine_id + node = ibis.udf.scalar.builtin( + node, + name=routine_ref.routine_id, + schema=f"{routine_ref.project}.{routine_ref.dataset_id}", + signature=(ibis_signature.input_types, ibis_signature.output_type), + ) + node.bigframes_remote_function = str(routine_ref) # type: ignore + return node diff --git a/noxfile.py b/noxfile.py index 259943aaa4..91d26cf695 100644 --- a/noxfile.py +++ b/noxfile.py @@ -565,12 +565,12 @@ def prerelease(session: nox.sessions.Session, tests_path): # session.install( # "--upgrade", # "-e", # Use -e so that py.typed file is included. - # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", + # "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", # ) session.install( "--upgrade", - # "--pre", - "ibis-framework>=7.1.0,<7.2.0dev", + "--pre", + "ibis-framework>=8.0.0,<9.0.0dev", ) already_installed.add("ibis-framework") diff --git a/setup.py b/setup.py index 4aa07904f7..516d5b8a19 100644 --- a/setup.py +++ b/setup.py @@ -44,8 +44,7 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - # TODO: Relax upper bound once we have fixed unit tests with 7.2.0. - "ibis-framework[bigquery] >=7.1.0,<7.2.0dev", + "ibis-framework[bigquery] >=8.0.0,<9.0.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. "pandas >=1.5.0,<2.1.4", "pydata-google-auth >=1.8.2", @@ -55,7 +54,7 @@ # Keep sqlglot versions in sync with ibis-framework. 
This avoids problems # where the incorrect version of sqlglot is installed, such as # https://github.com/googleapis/python-bigquery-dataframes/issues/315 - "sqlglot >=19.9.0,<20", + "sqlglot >=20.8.0,<=20.11", "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 42cc68eb04..c4fed64fbd 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -10,13 +10,13 @@ google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 google-cloud-resource-manager==1.10.3 google-cloud-storage==2.0.0 -ibis-framework==7.1.0 +ibis-framework==8.0.0 pandas==1.5.0 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 sqlalchemy==1.4 -sqlglot==19.9.0 +sqlglot==20.8.0 tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8f75534fc6..9f4e138b73 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -157,15 +157,13 @@ def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_inde ], ) def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nlargest( - 3, ["bool_col", "int64_too"], keep=keep - ).to_pandas() + bf_result = scalars_df_index.nlargest(3, ["bool_col", "int64_too"], keep=keep) pd_result = scalars_pandas_df_index.nlargest( 3, ["bool_col", "int64_too"], keep=keep ) pd.testing.assert_frame_equal( - bf_result, + bf_result.to_pandas(), pd_result, ) @@ -179,11 +177,11 @@ def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): ], ) def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep).to_pandas() + bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep) pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) pd.testing.assert_frame_equal( - bf_result, + bf_result.to_pandas(), pd_result, ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 42cbcbbc9f..5f940fd7a5 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -118,7 +118,7 @@ def test_arrayvalues_to_ibis_expr_with_concat(): total_ordering_columns=["col1"], ) expr = value.concat([value]) - actual = expr._compile_ordered()._to_ibis_expr(ordering_mode="unordered") + actual = expr._compile_unordered()._to_ibis_expr() assert len(actual.columns) == 3 # TODO(ashleyxu, b/299631930): test out the union expression assert actual.columns[0] == "column_0" diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py index 392872a7be..629bc5326a 100644 --- a/tests/unit/test_remote_function.py +++ b/tests/unit/test_remote_function.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ibis.backends.bigquery import datatypes as bq_types from ibis.expr import datatypes as ibis_types import bigframes.dtypes +import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes def test_supported_types_correspond(): @@ -24,7 +24,7 @@ def test_supported_types_correspond(): ibis_types.dtype(t) for t in bigframes.dtypes.SUPPORTED_IO_PYTHON_TYPES } ibis_types_from_bigquery = { - bq_types.BigQueryType.to_ibis(tk) + third_party_ibis_bqtypes.BigQueryType.to_ibis(tk) for tk in bigframes.dtypes.SUPPORTED_IO_BIGQUERY_TYPEKINDS } diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py new file mode 100644 index 0000000000..e7200cbf2a --- /dev/null +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py @@ -0,0 +1,176 @@ +# Contains code from +# https://github.com/ibis-project/ibis/blob/697d325f13bdf2746a50e86204eb8834b1710bd6/ibis/backends/bigquery/datatypes.py + +from __future__ import annotations + +import google.cloud.bigquery as bq +import ibis +import ibis.expr.datatypes as dt +import ibis.expr.schema as sch +from ibis.formats import SchemaMapper, TypeMapper +import sqlglot as sg + +_from_bigquery_types = { + "INT64": dt.Int64, + "INTEGER": dt.Int64, + "FLOAT": dt.Float64, + "FLOAT64": dt.Float64, + "BOOL": dt.Boolean, + "BOOLEAN": dt.Boolean, + "STRING": dt.String, + "DATE": dt.Date, + "TIME": dt.Time, + "BYTES": dt.Binary, + "JSON": dt.JSON, +} + + +class BigQueryType(TypeMapper): + @classmethod + def to_ibis(cls, typ: str, nullable: bool = True) -> dt.DataType: + if typ == "DATETIME": + return dt.Timestamp(timezone=None, nullable=nullable) + elif typ == "TIMESTAMP": + return dt.Timestamp(timezone="UTC", nullable=nullable) + elif typ == "NUMERIC": + return dt.Decimal(38, 9, nullable=nullable) + elif typ == "BIGNUMERIC": + return dt.Decimal(76, 38, nullable=nullable) + elif typ == "GEOGRAPHY": + return dt.GeoSpatial(geotype="geography", srid=4326, nullable=nullable) + else: + try: + return _from_bigquery_types[typ](nullable=nullable) + except KeyError: + raise TypeError(f"Unable to convert BigQuery type to ibis: {typ}") + + @classmethod + def from_ibis(cls, dtype: dt.DataType) -> str: + if dtype.is_floating(): + return "FLOAT64" + elif dtype.is_uint64(): + raise TypeError( + "Conversion from uint64 to BigQuery integer type (int64) is lossy" + ) + elif dtype.is_integer(): + return "INT64" + elif dtype.is_binary(): + return "BYTES" + elif dtype.is_date(): + return "DATE" + elif dtype.is_timestamp(): + if dtype.timezone is None: + return "DATETIME" + elif dtype.timezone == "UTC": + return "TIMESTAMP" + else: + raise TypeError( + "BigQuery does not support timestamps with timezones other than 'UTC'" + ) + elif dtype.is_decimal(): + if (dtype.precision, dtype.scale) == (76, 38): + return "BIGNUMERIC" + if (dtype.precision, dtype.scale) in [(38, 9), (None, None)]: + return "NUMERIC" + raise TypeError( + "BigQuery only supports decimal types with precision of 38 and " + f"scale of 9 (NUMERIC) or precision of 76 and scale of 38 (BIGNUMERIC). " + f"Current precision: {dtype.precision}. 
Current scale: {dtype.scale}" + ) + elif dtype.is_array(): + return f"ARRAY<{cls.from_ibis(dtype.value_type)}>" + elif dtype.is_struct(): + fields = ( + f"{sg.to_identifier(k).sql('bigquery')} {cls.from_ibis(v)}" + for k, v in dtype.fields.items() + ) + return "STRUCT<{}>".format(", ".join(fields)) + elif dtype.is_json(): + return "JSON" + elif dtype.is_geospatial(): + if (dtype.geotype, dtype.srid) == ("geography", 4326): + return "GEOGRAPHY" + raise TypeError( + "BigQuery geography uses points on WGS84 reference ellipsoid." + f"Current geotype: {dtype.geotype}, Current srid: {dtype.srid}" + ) + elif dtype.is_map(): + raise NotImplementedError("Maps are not supported in BigQuery") + else: + return str(dtype).upper() + + +class BigQuerySchema(SchemaMapper): + @classmethod + def from_ibis(cls, schema: sch.Schema) -> list[bq.SchemaField]: + schema_fields = [] + + for name, typ in ibis.schema(schema).items(): + if typ.is_array(): + value_type = typ.value_type + if value_type.is_array(): + raise TypeError("Nested arrays are not supported in BigQuery") + + is_struct = value_type.is_struct() + + field_type = ( + "RECORD" if is_struct else BigQueryType.from_ibis(typ.value_type) + ) + mode = "REPEATED" + fields = cls.from_ibis(ibis.schema(getattr(value_type, "fields", {}))) + elif typ.is_struct(): + field_type = "RECORD" + mode = "NULLABLE" if typ.nullable else "REQUIRED" + fields = cls.from_ibis(ibis.schema(typ.fields)) + else: + field_type = BigQueryType.from_ibis(typ) + mode = "NULLABLE" if typ.nullable else "REQUIRED" + fields = [] + + schema_fields.append( + bq.SchemaField(name, field_type=field_type, mode=mode, fields=fields) + ) + return schema_fields + + @classmethod + def _dtype_from_bigquery_field(cls, field: bq.SchemaField) -> dt.DataType: + typ = field.field_type + if typ == "RECORD": + assert field.fields, "RECORD fields are empty" + fields = {f.name: cls._dtype_from_bigquery_field(f) for f in field.fields} + dtype = dt.Struct(fields) + else: + dtype = BigQueryType.to_ibis(typ) + + mode = field.mode + if mode == "NULLABLE": + return dtype.copy(nullable=True) + elif mode == "REQUIRED": + return dtype.copy(nullable=False) + elif mode == "REPEATED": + # arrays with NULL elements aren't supported + return dt.Array(dtype.copy(nullable=False)) + else: + raise TypeError(f"Unknown BigQuery field.mode: {mode}") + + @classmethod + def to_ibis(cls, fields: list[bq.SchemaField]) -> sch.Schema: + return sch.Schema({f.name: cls._dtype_from_bigquery_field(f) for f in fields}) + + +# TODO(kszucs): we can eliminate this function by making dt.DataType traversible +# using ibis.common.graph.Node, similarly to how we traverse ops.Node instances: +# node.find(types) +def spread_type(dt: dt.DataType): + """Returns a generator that contains all the types in the given type. + + For complex types like set and array, it returns the types of the elements. 
+ """ + if dt.is_array(): + yield from spread_type(dt.value_type) + elif dt.is_struct(): + for type_ in dt.types: + yield from spread_type(type_) + elif dt.is_map(): + raise NotImplementedError("Maps are not supported in BigQuery") + yield dt From 1726588beb8894bc08c272d718ca8e3a9451d0c2 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 29 Feb 2024 07:18:52 -0800 Subject: [PATCH 2/8] feat: Add ml.metrics.pairwise.euclidean_distance (#397) --- bigframes/ml/metrics/pairwise.py | 16 ++++++++++++++++ tests/system/small/ml/test_metrics_pairwise.py | 16 ++++++++++++++++ .../sklearn/metrics/pairwise.py | 15 +++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/bigframes/ml/metrics/pairwise.py b/bigframes/ml/metrics/pairwise.py index 9ebea4ef42..ef2c08d471 100644 --- a/bigframes/ml/metrics/pairwise.py +++ b/bigframes/ml/metrics/pairwise.py @@ -50,3 +50,19 @@ def paired_manhattan_distance( paired_manhattan_distance.__doc__ = inspect.getdoc( vendored_metrics_pairwise.paired_manhattan_distance ) + + +def paired_euclidean_distances( + X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series] +) -> bpd.DataFrame: + X, Y = utils.convert_to_dataframe(X, Y) + if len(X.columns) != 1 or len(Y.columns) != 1: + raise ValueError("Inputs X and Y can only contain 1 column.") + + base_bqml = core.BaseBqml(session=X._session) + return base_bqml.distance(X, Y, type="EUCLIDEAN", name="euclidean_distance") + + +paired_euclidean_distances.__doc__ = inspect.getdoc( + vendored_metrics_pairwise.paired_euclidean_distances +) diff --git a/tests/system/small/ml/test_metrics_pairwise.py b/tests/system/small/ml/test_metrics_pairwise.py index e2aee971ee..717f32667f 100644 --- a/tests/system/small/ml/test_metrics_pairwise.py +++ b/tests/system/small/ml/test_metrics_pairwise.py @@ -47,3 +47,19 @@ def test_paired_manhattan_distance(): pd.testing.assert_frame_equal( result.to_pandas(), expected_pd_df, check_dtype=False, check_index_type=False ) + + +def test_paired_euclidean_distances(): + x_col = [np.array([4.1, 0.5, 1.0])] + y_col = [np.array([3.0, 0.0, 2.5])] + X = bpd.read_pandas(pd.DataFrame({"X": x_col})) + Y = bpd.read_pandas(pd.DataFrame({"Y": y_col})) + + result = metrics.pairwise.paired_euclidean_distances(X, Y) + expected_pd_df = pd.DataFrame( + {"X": x_col, "Y": y_col, "euclidean_distance": [1.926136]} + ) + + pd.testing.assert_frame_equal( + result.to_pandas(), expected_pd_df, check_dtype=False, check_index_type=False + ) diff --git a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py index 5791d850ff..be3d6753a7 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py +++ b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py @@ -39,3 +39,18 @@ def paired_manhattan_distance(X, Y) -> bpd.DataFrame: bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and manhattan_distance """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + +def paired_euclidean_distances(X, Y) -> bpd.DataFrame: + """Compute the paired euclidean distances between X and Y. + + Args: + X (Series or single column DataFrame of array of numeric type): + Input data. + Y (Series or single column DataFrame of array of numeric type): + Input data. X and Y are mapped by indexes, must have the same index. 
+ + Returns: + bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and euclidean_distance + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From e0f1ab07cbc81034e24767baff54560561950e67 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 29 Feb 2024 10:28:15 -0800 Subject: [PATCH 3/8] feat: add TextEmbedding model version support (#394) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/llm.py | 18 ++++++++++++++++-- bigframes/ml/loader.py | 7 +++++-- bigframes/ml/utils.py | 15 ++++++++++++++- tests/system/small/ml/conftest.py | 9 +++++++++ tests/system/small/ml/test_llm.py | 17 +++++++++++++++++ 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index dfe0af2f25..79f6b90bfd 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -266,6 +266,9 @@ class PaLM2TextEmbeddingGenerator(base.Predictor): The model for text embedding. “textembedding-gecko” returns model embeddings for text inputs. "textembedding-gecko-multilingual" returns model embeddings for text inputs which support over 100 languages Default to "textembedding-gecko". + version (str or None): + Model version. Accepted values are "001", "002", "003", "latest" etc. Will use the default version if unset. + See https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning for details. session (bigframes.Session or None): BQ session to create the model. If None, use the global default session. connection_name (str or None): @@ -279,10 +282,12 @@ def __init__( model_name: Literal[ "textembedding-gecko", "textembedding-gecko-multilingual" ] = "textembedding-gecko", + version: Optional[str] = None, session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, ): self.model_name = model_name + self.version = version self.session = session or bpd.get_global_session() self._bq_connection_manager = clients.BqConnectionManager( self.session.bqconnectionclient, self.session.resourcemanagerclient @@ -321,8 +326,11 @@ def _create_bqml_model(self): f"Model name {self.model_name} is not supported. We only support {', '.join(_EMBEDDING_GENERATOR_ENDPOINTS)}." 
) + endpoint = ( + self.model_name + "@" + self.version if self.version else self.model_name + ) options = { - "endpoint": self.model_name, + "endpoint": endpoint, } return self._bqml_model_factory.create_remote_model( session=self.session, connection_name=self.connection_name, options=options @@ -342,8 +350,14 @@ def _from_bq( model_connection = model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] + model_name, version = utils.parse_model_endpoint(model_endpoint) + embedding_generator_model = cls( - session=session, model_name=model_endpoint, connection_name=model_connection + session=session, + # str to literals + model_name=model_name, # type: ignore + version=version, + connection_name=model_connection, ) embedding_generator_model._bqml_model = core.BqmlModel(session, model) return embedding_generator_model diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index dafda43e9d..31912a0129 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -30,6 +30,7 @@ linear_model, llm, pipeline, + utils, ) _BQML_MODEL_TYPE_MAPPING = MappingProxyType( @@ -106,8 +107,10 @@ def _model_from_bq(session: bigframes.Session, bq_model: bigquery.Model): ): # Parse the remote model endpoint bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] - endpoint_model = bqml_endpoint.split("/")[-1] - return _BQML_ENDPOINT_TYPE_MAPPING[endpoint_model]._from_bq( # type: ignore + model_endpoint = bqml_endpoint.split("/")[-1] + model_name, _ = utils.parse_model_endpoint(model_endpoint) + + return _BQML_ENDPOINT_TYPE_MAPPING[model_name]._from_bq( # type: ignore session=session, model=bq_model ) diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 299282d333..364fb5e88d 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,7 +13,7 @@ # limitations under the License. import typing -from typing import Iterable, Union +from typing import Iterable, Optional, Union import bigframes.constants as constants from bigframes.core import blocks @@ -56,3 +56,16 @@ def _convert_to_series(frame: ArrayType) -> bpd.Series: raise ValueError( f"Unsupported type {type(frame)} to convert to Series. 
{constants.FEEDBACK_LINK}" ) + + +def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: + """Parse model endpoint string to model_name and version.""" + model_name = model_endpoint + version = None + + at_idx = model_endpoint.find("@") + if at_idx != -1: + version = model_endpoint[at_idx + 1 :] + model_name = model_endpoint[:at_idx] + + return model_name, version diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 8bf08906f9..c9100f36f3 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -256,6 +256,15 @@ def palm2_embedding_generator_model( ) +@pytest.fixture(scope="session") +def palm2_embedding_generator_model_002( + session, bq_connection +) -> llm.PaLM2TextEmbeddingGenerator: + return llm.PaLM2TextEmbeddingGenerator( + version="002", session=session, connection_name=bq_connection + ) + + @pytest.fixture(scope="session") def palm2_embedding_generator_multilingual_model( session, bq_connection diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index db959b854e..4d2ddfe513 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -194,6 +194,23 @@ def test_create_embedding_generator_model( assert reloaded_model.connection_name == bq_connection +def test_create_embedding_generator_model_002( + palm2_embedding_generator_model_002, dataset_id, bq_connection +): + # Model creation doesn't return error + assert palm2_embedding_generator_model_002 is not None + assert palm2_embedding_generator_model_002._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = palm2_embedding_generator_model_002.to_gbq( + f"{dataset_id}.temp_embedding_model", replace=True + ) + assert f"{dataset_id}.temp_embedding_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.model_name == "textembedding-gecko" + assert reloaded_model.version == "002" + assert reloaded_model.connection_name == bq_connection + + def test_create_embedding_generator_multilingual_model( palm2_embedding_generator_multilingual_model, dataset_id, From bfe2b23e2dea0cdf1e1b6ff5b17f6759d73c3e24 Mon Sep 17 00:00:00 2001 From: Dan Lee <71398022+dandhlee@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:06:09 -0500 Subject: [PATCH 4/8] docs: update README to point to new summary pages (#402) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [x] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) Fixes b/263399076 🦕 Updates the link to the client library reference docs page. The page doesn't exist yet but will be added in a future CL, and mentioned in the TOC from #378. 
--- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 1323a065d8..f6d6f93e36 100644 --- a/README.rst +++ b/README.rst @@ -15,7 +15,7 @@ Documentation * `BigQuery DataFrames source code (GitHub) `_ * `BigQuery DataFrames sample notebooks `_ -* `BigQuery DataFrames API reference `_ +* `BigQuery DataFrames API reference `_ * `BigQuery documentation `_ From dd3643d3733ca1c2a18352bafac7d32fbdfa2a25 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 29 Feb 2024 23:10:16 +0000 Subject: [PATCH 5/8] fix: exceptions raised in `apply` from a `remote_function` now surface in the client (#387) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 309699263 🦕 --- bigframes/functions/remote_function.py | 35 ++++++++-------------- tests/system/large/test_remote_function.py | 24 ++++++++++++++- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index af4c4b138a..c31105a021 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -21,7 +21,6 @@ import random import shutil import string -import subprocess import sys import tempfile import textwrap @@ -87,19 +86,6 @@ def _get_hash(def_, package_requirements=None): return hashlib.md5(def_repr).hexdigest() -def _run_system_command(command): - program = subprocess.Popen( - [command], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True - ) - stdout, stderr = program.communicate() - exit_code = program.wait() - if exit_code: - raise RuntimeError( - f"Command: {command}\nOutput: {stdout.decode()}\nError: {stderr.decode()}" - f"{constants.FEEDBACK_LINK}" - ) - - def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> str: return f"`{routine_ref.project}.{routine_ref.dataset_id}`.{routine_ref.routine_id}" @@ -281,6 +267,8 @@ def generate_cloud_function_main_code(self, def_, dir): code_template = textwrap.dedent( """\ import cloudpickle + import functions_framework + from flask import jsonify import json # original udf code is in {udf_code_file} @@ -289,14 +277,17 @@ def generate_cloud_function_main_code(self, def_, dir): udf = cloudpickle.load(f) def {handler_func_name}(request): - request_json = request.get_json(silent=True) - calls = request_json["calls"] - replies = [] - for call in calls: - reply = udf(*call) - replies.append(reply) - return_json = json.dumps({{"replies" : replies}}) - return return_json + try: + request_json = request.get_json(silent=True) + calls = request_json["calls"] + replies = [] + for call in calls: + reply = udf(*call) + replies.append(reply) + return_json = json.dumps({{"replies" : replies}}) + return return_json + except Exception as e: + return jsonify( {{ "errorMessage": str(e) }} ), 400 """ ) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 
773de48adf..b33298ae01 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -21,7 +21,7 @@ import tempfile import textwrap -from google.api_core.exceptions import NotFound, ResourceExhausted +from google.api_core.exceptions import BadRequest, NotFound, ResourceExhausted from google.cloud import bigquery, functions_v2 import pandas import pytest @@ -1214,6 +1214,28 @@ def square(x): ) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_runtime_error(session, scalars_dfs, dataset_id): + try: + + @session.remote_function([int], int, dataset=dataset_id) + def square(x): + return x * x + + scalars_df, _ = scalars_dfs + + with pytest.raises( + BadRequest, match="400.*errorMessage.*unsupported operand type" + ): + # int64_col has nulls which should cause error in square + scalars_df["int64_col"].apply(square).to_pandas() + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_anonymous_dataset(session, scalars_dfs): try: From 67c2bc949c9809e020c27fa9f8207294bfd5dbad Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 4 Mar 2024 11:16:16 -0800 Subject: [PATCH 6/8] chore: add the toc template file for entry page (#378) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal #325083413 🦕 --- docs/templates/toc.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 90dbc504b0..0d6bec5534 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -3,6 +3,16 @@ name: Overview - href: changelog.md name: Changelog + - items: + - href: summary_overview.yml + name: Overview + - href: summary_class.yml + name: Classes + - href: summary_method.yml + name: Methods + - href: summary_property.yml + name: Properties and Attributes + name: BigQuery DataFrames API - items: - items: - name: Options From a60aba712576e2e4e14cfcfffe9349d6972716a5 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 4 Mar 2024 16:00:27 -0800 Subject: [PATCH 7/8] fix: docs link for metrics.pairwise (#400) --- docs/reference/bigframes.ml/index.rst | 2 ++ docs/reference/bigframes.ml/metrics.pairwise.rst | 7 +++++++ 2 files changed, 9 insertions(+) create mode 100644 docs/reference/bigframes.ml/metrics.pairwise.rst diff --git a/docs/reference/bigframes.ml/index.rst b/docs/reference/bigframes.ml/index.rst index 1975d62e6d..37504b0830 100644 --- a/docs/reference/bigframes.ml/index.rst +++ b/docs/reference/bigframes.ml/index.rst @@ -25,6 +25,8 @@ API Reference metrics + metrics.pairwise + model_selection pipeline diff --git a/docs/reference/bigframes.ml/metrics.pairwise.rst b/docs/reference/bigframes.ml/metrics.pairwise.rst new file mode 100644 index 0000000000..c20772ef07 --- /dev/null +++ b/docs/reference/bigframes.ml/metrics.pairwise.rst @@ -0,0 +1,7 @@ +bigframes.ml.metrics.pairwise +============================= + +.. 
automodule:: bigframes.ml.metrics.pairwise + :members: + :inherited-members: + :undoc-members: From 45e6229da392b8a9cea4dc86b83632fed55843ee Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 5 Mar 2024 11:21:49 -0800 Subject: [PATCH 8/8] chore(main): release 0.23.0 (#399) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 24 ++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8e11d47e3..35eaa3688d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,30 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.23.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.22.0...v0.23.0) (2024-03-05) + + +### Features + +* Add ml.metrics.pairwise.euclidean_distance ([#397](https://github.com/googleapis/python-bigquery-dataframes/issues/397)) ([1726588](https://github.com/googleapis/python-bigquery-dataframes/commit/1726588beb8894bc08c272d718ca8e3a9451d0c2)) +* Add TextEmbedding model version support ([#394](https://github.com/googleapis/python-bigquery-dataframes/issues/394)) ([e0f1ab0](https://github.com/googleapis/python-bigquery-dataframes/commit/e0f1ab07cbc81034e24767baff54560561950e67)) + + +### Bug Fixes + +* Code exception in `remote_function` now prevents retry and surfaces in the client ([#387](https://github.com/googleapis/python-bigquery-dataframes/issues/387)) ([dd3643d](https://github.com/googleapis/python-bigquery-dataframes/commit/dd3643d3733ca1c2a18352bafac7d32fbdfa2a25)) +* Docs link for metrics.pairwise ([#400](https://github.com/googleapis/python-bigquery-dataframes/issues/400)) ([a60aba7](https://github.com/googleapis/python-bigquery-dataframes/commit/a60aba712576e2e4e14cfcfffe9349d6972716a5)) + + +### Dependencies + +* Update ibis to version 8.0.0 and refactor `remote_function` to use ibis UDF method ([#277](https://github.com/googleapis/python-bigquery-dataframes/issues/277)) ([350499b](https://github.com/googleapis/python-bigquery-dataframes/commit/350499bccb62e22169ab2f2e1400175b2179ef85)) + + +### Documentation + +* Update README to point to new summary pages ([#402](https://github.com/googleapis/python-bigquery-dataframes/issues/402)) ([bfe2b23](https://github.com/googleapis/python-bigquery-dataframes/commit/bfe2b23e2dea0cdf1e1b6ff5b17f6759d73c3e24)) + ## [0.22.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.21.0...v0.22.0) (2024-02-27) diff --git a/bigframes/version.py b/bigframes/version.py index 387b7663f2..a50b0b86fd 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.22.0" +__version__ = "0.23.0"
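
The two user-facing additions released above can be exercised with a short script. The sketch below is a minimal example adapted from the system tests introduced in patches 2/8 (`tests/system/small/ml/test_metrics_pairwise.py`) and 3/8 (`tests/system/small/ml/conftest.py`); it assumes an authenticated BigQuery session, the import paths used in those tests, and a placeholder BigQuery connection ID — it is not an official snippet from the library docs.

```python
import numpy as np
import pandas as pd

import bigframes.pandas as bpd
from bigframes.ml import llm, metrics

# --- ml.metrics.pairwise.paired_euclidean_distances (patch 2/8) ---
# Paired distances are computed row-wise: X and Y must each be a Series or a
# single-column DataFrame of numeric arrays, mapped to each other by index.
X = bpd.read_pandas(pd.DataFrame({"X": [np.array([4.1, 0.5, 1.0])]}))
Y = bpd.read_pandas(pd.DataFrame({"Y": [np.array([3.0, 0.0, 2.5])]}))

# The result is a DataFrame with the X and Y columns plus a
# "euclidean_distance" column (about 1.926 for this pair), computed
# server-side through the BQML distance function.
distances = metrics.pairwise.paired_euclidean_distances(X, Y)
print(distances.to_pandas())

# --- PaLM2TextEmbeddingGenerator version pinning (patch 3/8) ---
# `version` pins the Vertex AI model version (for example "002"); leaving it
# unset keeps the service default. The connection ID below is a hypothetical
# placeholder in the standard "project.location.connection_id" form.
embedder = llm.PaLM2TextEmbeddingGenerator(
    model_name="textembedding-gecko",
    version="002",
    connection_name="my-project.us.my-connection",  # hypothetical connection
)
```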