diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml
index 9bee240971..d8a1bbca71 100644
--- a/.github/.OwlBot.lock.yaml
+++ b/.github/.OwlBot.lock.yaml
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,5 +13,5 @@
 # limitations under the License.
 docker:
   image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
-  digest: sha256:346ab2efb51649c5dde7756cbbdc60dd394852ba83b9bbffc292a63549f33c17
-# created: 2023-12-14T22:17:57.611773021Z
+  digest: sha256:5ea6d0ab82c956b50962f91d94e206d3921537ae5fe1549ec5326381d8905cfa
+# created: 2024-01-15T16:32:08.142785673Z
diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt
index e5c1ffca94..bb3d6ca38b 100644
--- a/.kokoro/requirements.txt
+++ b/.kokoro/requirements.txt
@@ -263,9 +263,9 @@ jeepney==0.8.0 \
     # via
     #   keyring
     #   secretstorage
-jinja2==3.1.2 \
-    --hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 \
-    --hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61
+jinja2==3.1.3 \
+    --hash=sha256:7d6d50dd97d52cbc355597bd845fabfbac3f551e1f99619e39a35ce8c370b5fa \
+    --hash=sha256:ac8bd6544d4bb2c9792bf3a159e80bba8fda7f07e81bc3aed565432d5925ba90
     # via gcp-releasetool
 keyring==24.2.0 \
     --hash=sha256:4901caaf597bfd3bbd78c9a0c7c4c29fcd8310dab2cffefe749e916b6527acd6 \
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c1691e1f64..14c8050d80 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,19 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [0.19.1](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.19.0...v0.19.1) (2024-01-17)
+
+
+### Bug Fixes
+
+* Handle multi-level columns for df aggregates properly ([#305](https://github.com/googleapis/python-bigquery-dataframes/issues/305)) ([5bb45ba](https://github.com/googleapis/python-bigquery-dataframes/commit/5bb45ba5560f178438d490a62520ccd36fd2f284))
+* Update max_output_token limitation. ([#308](https://github.com/googleapis/python-bigquery-dataframes/issues/308)) ([5cccd36](https://github.com/googleapis/python-bigquery-dataframes/commit/5cccd36fd2081becd741541c4ac8d5cf53c076f2))
+
+
+### Documentation
+
+* Add code samples for Series.corr ([#316](https://github.com/googleapis/python-bigquery-dataframes/issues/316)) ([9150c16](https://github.com/googleapis/python-bigquery-dataframes/commit/9150c16e951fb757547721e0003910c7c49e3d27))
+
 ## [0.19.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.18.0...v0.19.0) (2024-01-09)
diff --git a/bigframes/_config/sampling_options.py b/bigframes/_config/sampling_options.py
index 1742dabe17..a80b9601ca 100644
--- a/bigframes/_config/sampling_options.py
+++ b/bigframes/_config/sampling_options.py
@@ -14,6 +14,8 @@
 
 """Options for downsampling."""
 
+from __future__ import annotations
+
 import dataclasses
 from typing import Literal, Optional
 
@@ -25,6 +27,28 @@ class SamplingOptions:
     __doc__ = vendored_pandas_config.sampling_options_doc
 
     max_download_size: Optional[int] = 500
+    # Enable downsampling
     enable_downsampling: bool = False
     sampling_method: Literal["head", "uniform"] = "uniform"
     random_state: Optional[int] = None
+
+    def with_max_download_size(self, max_rows: Optional[int]) -> SamplingOptions:
+        return SamplingOptions(
+            max_rows, self.enable_downsampling, self.sampling_method, self.random_state
+        )
+
+    def with_method(self, method: Literal["head", "uniform"]) -> SamplingOptions:
+        return SamplingOptions(self.max_download_size, True, method, self.random_state)
+
+    def with_random_state(self, state: Optional[int]) -> SamplingOptions:
+        return SamplingOptions(
+            self.max_download_size,
+            self.enable_downsampling,
+            self.sampling_method,
+            state,
+        )
+
+    def with_disabled(self) -> SamplingOptions:
+        return SamplingOptions(
+            self.max_download_size, False, self.sampling_method, self.random_state
+        )
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
index 7ff23efad3..8c08698b93 100644
--- a/bigframes/core/__init__.py
+++ b/bigframes/core/__init__.py
@@ -21,8 +21,8 @@
 import ibis.expr.types as ibis_types
 import pandas
 
-import bigframes.core.compile.compiled as compiled
-import bigframes.core.compile.compiler as compiler
+import bigframes.core.compile as compiling
+import bigframes.core.expression as ex
 import bigframes.core.guid
 import bigframes.core.nodes as nodes
 from bigframes.core.ordering import OrderingColumnReference
@@ -30,7 +30,6 @@
 import bigframes.core.utils
 from bigframes.core.window_spec import WindowSpec
 import bigframes.dtypes
-import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 import bigframes.session._io.bigquery
 
@@ -104,23 +103,17 @@ def _try_evaluate_local(self):
     def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
         return self._compile_ordered().get_column_type(key)
 
-    def _compile_ordered(self) -> compiled.OrderedIR:
-        return compiler.compile_ordered(self.node)
+    def _compile_ordered(self) -> compiling.OrderedIR:
+        return compiling.compile_ordered(self.node)
 
-    def _compile_unordered(self) -> compiled.UnorderedIR:
-        return compiler.compile_unordered(self.node)
+    def _compile_unordered(self) -> compiling.UnorderedIR:
+        return compiling.compile_unordered(self.node)
 
     def row_count(self) -> ArrayValue:
         """Get number of rows in ArrayValue as a single-entry ArrayValue."""
         return ArrayValue(nodes.RowCountNode(child=self.node))
 
     # Operations
-
-    def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
-        return ArrayValue(
-            nodes.DropColumnsNode(child=self.node, columns=tuple(columns))
-        )
-
     def filter(self, predicate_id: str, keep_null: bool = False) -> ArrayValue:
         """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
         return ArrayValue(
@@ -141,59 +134,104 @@ def promote_offsets(self, col_id: str) -> ArrayValue:
         """
         return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id))
 
-    def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
-        return ArrayValue(
-            nodes.SelectNode(child=self.node, column_ids=tuple(column_ids))
-        )
-
     def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue:
         """Append together multiple ArrayValue objects."""
         return ArrayValue(
             nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]]))
         )
 
-    def project_unary_op(
-        self, column_name: str, op: ops.UnaryOp, output_name=None
-    ) -> ArrayValue:
-        """Creates a new expression based on this expression with unary operation applied to one column."""
+    def project_to_id(self, expression: ex.Expression, output_id: str):
+        if output_id in self.column_ids:  # Mutate case
+            exprs = [
+                ((expression if (col_id == output_id) else ex.free_var(col_id)), col_id)
+                for col_id in self.column_ids
+            ]
+        else:  # append case
+            self_projection = (
+                (ex.free_var(col_id), col_id) for col_id in self.column_ids
+            )
+            exprs = [*self_projection, (expression, output_id)]
         return ArrayValue(
-            nodes.ProjectRowOpNode(
-                child=self.node, input_ids=(column_name,), op=op, output_id=output_name
+            nodes.ProjectionNode(
+                child=self.node,
+                assignments=tuple(exprs),
             )
         )
 
-    def project_binary_op(
-        self,
-        left_column_id: str,
-        right_column_id: str,
-        op: ops.BinaryOp,
-        output_column_id: str,
-    ) -> ArrayValue:
-        """Creates a new expression based on this expression with binary operation applied to two columns."""
+    def assign(self, source_id: str, destination_id: str) -> ArrayValue:
+        if destination_id in self.column_ids:  # Mutate case
+            exprs = [
+                (
+                    (
+                        ex.free_var(source_id)
+                        if (col_id == destination_id)
+                        else ex.free_var(col_id)
+                    ),
+                    col_id,
+                )
+                for col_id in self.column_ids
+            ]
+        else:  # append case
+            self_projection = (
+                (ex.free_var(col_id), col_id) for col_id in self.column_ids
+            )
+            exprs = [*self_projection, (ex.free_var(source_id), destination_id)]
         return ArrayValue(
-            nodes.ProjectRowOpNode(
+            nodes.ProjectionNode(
                 child=self.node,
-                input_ids=(left_column_id, right_column_id),
-                op=op,
-                output_id=output_column_id,
+                assignments=tuple(exprs),
             )
         )
 
-    def project_ternary_op(
+    def assign_constant(
         self,
-        col_id_1: str,
-        col_id_2: str,
-        col_id_3: str,
-        op: ops.TernaryOp,
-        output_column_id: str,
+        destination_id: str,
+        value: typing.Any,
+        dtype: typing.Optional[bigframes.dtypes.Dtype],
     ) -> ArrayValue:
-        """Creates a new expression based on this expression with ternary operation applied to three columns."""
+        if destination_id in self.column_ids:  # Mutate case
+            exprs = [
+                (
+                    (
+                        ex.const(value, dtype)
+                        if (col_id == destination_id)
+                        else ex.free_var(col_id)
+                    ),
+                    col_id,
+                )
+                for col_id in self.column_ids
+            ]
+        else:  # append case
+            self_projection = (
+                (ex.free_var(col_id), col_id) for col_id in self.column_ids
+            )
+            exprs = [*self_projection, (ex.const(value, dtype), destination_id)]
         return ArrayValue(
-            nodes.ProjectRowOpNode(
+            nodes.ProjectionNode(
                 child=self.node,
-                input_ids=(col_id_1, col_id_2, col_id_3),
-                op=op,
-                output_id=output_column_id,
+                assignments=tuple(exprs),
+            )
+        )
+
+    def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
+        selections = ((ex.free_var(col_id), col_id) for col_id in column_ids)
+        return ArrayValue(
+            nodes.ProjectionNode(
+                child=self.node,
+                assignments=tuple(selections),
+            )
+        )
+
+    def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
+        new_projection = (
+            (ex.free_var(col_id), col_id)
+            for col_id in self.column_ids
+            if col_id not in columns
+        )
+        return ArrayValue(
+            nodes.ProjectionNode(
+                child=self.node,
+                assignments=tuple(new_projection),
             )
         )
 
@@ -316,25 +354,6 @@ def unpivot(
             )
         )
 
-    def assign(self, source_id: str, destination_id: str) -> ArrayValue:
-        return ArrayValue(
-            nodes.AssignNode(
-                child=self.node, source_id=source_id, destination_id=destination_id
-            )
-        )
-
-    def assign_constant(
-        self,
-        destination_id: str,
-        value: typing.Any,
-        dtype: typing.Optional[bigframes.dtypes.Dtype],
-    ) -> ArrayValue:
-        return ArrayValue(
-            nodes.AssignConstantNode(
-                child=self.node, destination_id=destination_id, value=value, dtype=dtype
-            )
-        )
-
     def join(
         self,
         self_column_ids: typing.Sequence[str],
diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py
index 0b6886562e..345adb6be3 100644
--- a/bigframes/core/block_transforms.py
+++ b/bigframes/core/block_transforms.py
@@ -20,6 +20,7 @@
 import bigframes.constants as constants
 import bigframes.core as core
 import bigframes.core.blocks as blocks
+import bigframes.core.expression as ex
 import bigframes.core.ordering as ordering
 import bigframes.core.window_spec as windows
 import bigframes.dtypes as dtypes
@@ -44,11 +45,10 @@ def equals(block1: blocks.Block, block2: blocks.Block) -> bool:
     for lcol, rcol in zip(block1.value_columns, block2.value_columns):
         lcolmapped = lmap[lcol]
         rcolmapped = rmap[rcol]
-        joined_block, result_id = joined_block.apply_binary_op(
-            lcolmapped, rcolmapped, ops.eq_null_match_op
-        )
-        joined_block, result_id = joined_block.apply_unary_op(
-            result_id, ops.partial_right(ops.fillna_op, False)
+        joined_block, result_id = joined_block.project_expr(
+            ops.fillna_op.as_expr(
+                ops.eq_null_match_op.as_expr(lcolmapped, rcolmapped), ex.const(False)
+            )
         )
         equality_ids.append(result_id)
 
@@ -91,9 +91,8 @@ def indicate_duplicates(
         agg_ops.count_op,
         window_spec=window_spec,
     )
-    block, duplicate_indicator = block.apply_unary_op(
-        val_count_col_id,
-        ops.partial_right(ops.gt_op, 1),
+    block, duplicate_indicator = block.project_expr(
+        ops.gt_op.as_expr(val_count_col_id, ex.const(1))
     )
     return (
         block.drop_columns(
@@ -183,8 +182,8 @@ def _interpolate_column(
     # Note, this method may
     block, notnull = block.apply_unary_op(column, ops.notnull_op)
-    block, masked_offsets = block.apply_binary_op(
-        x_values, notnull, ops.partial_arg3(ops.where_op, None)
+    block, masked_offsets = block.project_expr(
+        ops.where_op.as_expr(x_values, notnull, ex.const(None))
     )
 
     block, previous_value = block.apply_window_op(
@@ -271,25 +270,22 @@ def _interpolate_points_nearest(
     xpredict_id: str,
 ) -> typing.Tuple[blocks.Block, str]:
     """Interpolate by taking the y value of the nearest x value"""
-    block, left_diff = block.apply_binary_op(xpredict_id, x0_id, ops.sub_op)
-    block, right_diff = block.apply_binary_op(x1_id, xpredict_id, ops.sub_op)
+    left_diff = ops.sub_op.as_expr(xpredict_id, x0_id)
+    right_diff = ops.sub_op.as_expr(x1_id, xpredict_id)
     # If diffs equal, choose left
-    block, choose_left = block.apply_binary_op(left_diff, right_diff, ops.le_op)
-    block, choose_left = block.apply_unary_op(
-        choose_left, ops.partial_right(ops.fillna_op, False)
+    choose_left = ops.fillna_op.as_expr(
+        ops.le_op.as_expr(left_diff, right_diff), ex.const(False)
     )
-    block, nearest = block.apply_ternary_op(y0_id, choose_left, y1_id, ops.where_op)
-
-    block, y0_exists = block.apply_unary_op(y0_id, ops.notnull_op)
-    block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op)
-    block, is_interpolation = block.apply_binary_op(y0_exists, y1_exists, ops.and_op)
+    nearest = ops.where_op.as_expr(y0_id, choose_left, y1_id)
 
-    block, prediction_id = block.apply_binary_op(
-        nearest, is_interpolation, ops.partial_arg3(ops.where_op, None)
+    is_interpolation = ops.and_op.as_expr(
+        ops.notnull_op.as_expr(y0_id), ops.notnull_op.as_expr(y1_id)
     )
-    return block, prediction_id
+
+    return block.project_expr(
+        ops.where_op.as_expr(nearest, is_interpolation, ex.const(None))
+    )
 
 
 def _interpolate_points_ffill(
@@ -302,11 +298,9 @@ def _interpolate_points_ffill(
 ) -> typing.Tuple[blocks.Block, str]:
     """Interpolates by using the preceding values"""
     # check for existance of y1, otherwise we are extrapolating instead of interpolating
-    block, y1_exists = block.apply_unary_op(y1_id, ops.notnull_op)
-    block, prediction_id = block.apply_binary_op(
-        y0_id, y1_exists, ops.partial_arg3(ops.where_op, None)
+    return block.project_expr(
+        ops.where_op.as_expr(y0_id, ops.notnull_op.as_expr(y1_id), ex.const(None))
     )
-    return block, prediction_id
 
 
 def drop_duplicates(
@@ -519,9 +513,7 @@ def nsmallest(
             agg_ops.rank_op,
             window_spec=windows.WindowSpec(ordering=tuple(order_refs)),
         )
-        block, condition = block.apply_unary_op(
-            counter, ops.partial_right(ops.le_op, n)
-        )
+        block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n)))
         block = block.filter(condition)
         return block.drop_columns([counter, condition])
 
@@ -551,9 +543,7 @@ def nlargest(
             agg_ops.rank_op,
             window_spec=windows.WindowSpec(ordering=tuple(order_refs)),
         )
-        block, condition = block.apply_unary_op(
-            counter, ops.partial_right(ops.le_op, n)
-        )
+        block, condition = block.project_expr(ops.le_op.as_expr(counter, ex.const(n)))
        block = block.filter(condition)
         return block.drop_columns([counter, condition])
 
@@ -641,7 +631,7 @@ def kurt(
 
 def _mean_delta_to_power(
     block: blocks.Block,
-    n_power,
+    n_power: int,
     column_ids: typing.Sequence[str],
     grouping_column_ids: typing.Sequence[str],
 ) -> typing.Tuple[blocks.Block, typing.Sequence[str]]:
@@ -649,11 +639,10 @@ def _mean_delta_to_power(
     window = windows.WindowSpec(grouping_keys=tuple(grouping_column_ids))
     block, mean_ids = block.multi_apply_window_op(column_ids, agg_ops.mean_op, window)
     delta_ids = []
-    cube_op = ops.partial_right(ops.pow_op, n_power)
     for val_id, mean_val_id in zip(column_ids, mean_ids):
-        block, delta_id = block.apply_binary_op(val_id, mean_val_id, ops.sub_op)
-        block, delta_power_id = block.apply_unary_op(delta_id, cube_op)
-        block = block.drop_columns([delta_id])
+        delta = ops.sub_op.as_expr(val_id, mean_val_id)
+        delta_power = ops.pow_op.as_expr(delta, ex.const(n_power))
+        block, delta_power_id = block.project_expr(delta_power)
         delta_ids.append(delta_power_id)
     return block, delta_ids
 
@@ -664,31 +653,26 @@ def _skew_from_moments_and_count(
     # Calculate skew using count, third moment and population variance
     # See G1 estimator:
     # https://en.wikipedia.org/wiki/Skewness#Sample_skewness
-    block, denominator_id = block.apply_unary_op(
-        moment2_id, ops.partial_right(ops.unsafe_pow_op, 3 / 2)
-    )
-    block, base_id = block.apply_binary_op(moment3_id, denominator_id, ops.div_op)
-    block, countminus1_id = block.apply_unary_op(
-        count_id, ops.partial_right(ops.sub_op, 1)
-    )
-    block, countminus2_id = block.apply_unary_op(
-        count_id, ops.partial_right(ops.sub_op, 2)
-    )
-    block, adjustment_id = block.apply_binary_op(count_id, countminus1_id, ops.mul_op)
-    block, adjustment_id = block.apply_unary_op(
-        adjustment_id, ops.partial_right(ops.unsafe_pow_op, 1 / 2)
+    moments_estimator = ops.div_op.as_expr(
+        moment3_id, ops.pow_op.as_expr(moment2_id, ex.const(3 / 2))
     )
-    block, adjustment_id = block.apply_binary_op(
-        adjustment_id, countminus2_id, ops.div_op
+
+    countminus1 = ops.sub_op.as_expr(count_id, ex.const(1))
+    countminus2 = ops.sub_op.as_expr(count_id, ex.const(2))
+    adjustment = ops.div_op.as_expr(
+        ops.unsafe_pow_op.as_expr(
+            ops.mul_op.as_expr(count_id, countminus1), ex.const(1 / 2)
+        ),
+        countminus2,
     )
-    block, skew_id = block.apply_binary_op(base_id, adjustment_id, ops.mul_op)
+
+    skew = ops.mul_op.as_expr(moments_estimator, adjustment)
 
     # Need to produce NA if have less than 3 data points
-    block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 3))
-    block, skew_id = block.apply_binary_op(
-        skew_id, na_cond_id, ops.partial_arg3(ops.where_op, None)
+    cleaned_skew = ops.where_op.as_expr(
+        skew, ops.ge_op.as_expr(count_id, ex.const(3)), ex.const(None)
    )
-    return block, skew_id
+    return block.project_expr(cleaned_skew)
 
 
 def _kurt_from_moments_and_count(
@@ -701,49 +685,42 @@ def _kurt_from_moments_and_count(
     # adjustment = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
     # kurtosis = (numerator / denominator) - adjustment
 
-    # Numerator
-    block, countminus1_id = block.apply_unary_op(
-        count_id, ops.partial_right(ops.sub_op, 1)
-    )
-    block, countplus1_id = block.apply_unary_op(
-        count_id, ops.partial_right(ops.add_op, 1)
+    numerator = ops.mul_op.as_expr(
+        moment4_id,
+        ops.mul_op.as_expr(
+            ops.sub_op.as_expr(count_id, ex.const(1)),
+            ops.add_op.as_expr(count_id, ex.const(1)),
+        ),
     )
-    block, num_adj = block.apply_binary_op(countplus1_id, countminus1_id, ops.mul_op)
-    block, numerator_id = block.apply_binary_op(moment4_id, num_adj, ops.mul_op)
 
     # Denominator
-    block, countminus2_id = block.apply_unary_op(
-        count_id, ops.partial_right(ops.sub_op, 2)
-    )
-    block, countminus3_id = block.apply_unary_op(
-        count_id, ops.partial_right(ops.sub_op, 3)
-    )
-    block, denom_adj = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op)
-    block, popvar_squared = block.apply_unary_op(
-        moment2_id, ops.partial_right(ops.unsafe_pow_op, 2)
+    countminus2 = ops.sub_op.as_expr(count_id, ex.const(2))
+    countminus3 = ops.sub_op.as_expr(count_id, ex.const(3))
+
+    # Denominator
+    denominator = ops.mul_op.as_expr(
+        ops.unsafe_pow_op.as_expr(moment2_id, ex.const(2)),
+        ops.mul_op.as_expr(countminus2, countminus3),
     )
-    block, denominator_id = block.apply_binary_op(popvar_squared, denom_adj, ops.mul_op)
 
     # Adjustment
-    block, countminus1_square = block.apply_unary_op(
-        countminus1_id, ops.partial_right(ops.unsafe_pow_op, 2)
-    )
-    block, adj_num = block.apply_unary_op(
-        countminus1_square, ops.partial_right(ops.mul_op, 3)
+    adj_num = ops.mul_op.as_expr(
+        ops.unsafe_pow_op.as_expr(
+            ops.sub_op.as_expr(count_id, ex.const(1)), ex.const(2)
+        ),
+        ex.const(3),
     )
-    block, adj_denom = block.apply_binary_op(countminus2_id, countminus3_id, ops.mul_op)
-    block, adjustment_id = block.apply_binary_op(adj_num, adj_denom, ops.div_op)
+    adj_denom = ops.mul_op.as_expr(countminus2, countminus3)
+    adjustment = ops.div_op.as_expr(adj_num, adj_denom)
 
     # Combine
-    block, base_id = block.apply_binary_op(numerator_id, denominator_id, ops.div_op)
-    block, kurt_id = block.apply_binary_op(base_id, adjustment_id, ops.sub_op)
+    kurt = ops.sub_op.as_expr(ops.div_op.as_expr(numerator, denominator), adjustment)
 
     # Need to produce NA if have less than 4 data points
-    block, na_cond_id = block.apply_unary_op(count_id, ops.partial_right(ops.ge_op, 4))
-    block, kurt_id = block.apply_binary_op(
-        kurt_id, na_cond_id, ops.partial_arg3(ops.where_op, None)
+    cleaned_kurt = ops.where_op.as_expr(
+        kurt, ops.ge_op.as_expr(count_id, ex.const(4)), ex.const(None)
     )
-    return block, kurt_id
+    return block.project_expr(cleaned_kurt)
 
 
 def align(
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 9688f439b1..8c59f8106b 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -21,6 +21,7 @@
 
 from __future__ import annotations
 
+import dataclasses
 import functools
 import itertools
 import random
@@ -31,8 +32,10 @@
 import google.cloud.bigquery as bigquery
 import pandas as pd
 
+import bigframes._config.sampling_options as sampling_options
 import bigframes.constants as constants
 import bigframes.core as core
+import bigframes.core.expression as ex
 import bigframes.core.guid as guid
 import bigframes.core.indexes as indexes
 import bigframes.core.joins.name_resolution as join_names
@@ -80,6 +83,14 @@ def _get_block(self) -> Block:
         """Get the underlying block value of the object"""
 
 
+@dataclasses.dataclass()
+class MaterializationOptions:
+    downsampling: sampling_options.SamplingOptions = dataclasses.field(
+        default_factory=sampling_options.SamplingOptions
+    )
+    ordered: bool = True
+
+
 class Block:
     """A immutable 2D data structure."""
 
@@ -395,8 +406,6 @@ def _to_dataframe(self, result) -> pd.DataFrame:
 
     def to_pandas(
         self,
-        value_keys: Optional[Iterable[str]] = None,
-        max_results: Optional[int] = None,
         max_download_size: Optional[int] = None,
         sampling_method: Optional[str] = None,
         random_state: Optional[int] = None,
@@ -404,14 +413,24 @@ def to_pandas(
         ordered: bool = True,
     ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame."""
+        if (sampling_method is not None) and (sampling_method not in _SAMPLING_METHODS):
+            raise NotImplementedError(
+                f"The downsampling method {sampling_method} is not implemented, "
+                f"please choose from {','.join(_SAMPLING_METHODS)}."
+            )
 
-        df, _, query_job = self._compute_and_count(
-            value_keys=value_keys,
-            max_results=max_results,
-            max_download_size=max_download_size,
-            sampling_method=sampling_method,
-            random_state=random_state,
-            ordered=ordered,
+        sampling = bigframes.options.sampling.with_max_download_size(max_download_size)
+        if sampling_method is not None:
+            sampling = sampling.with_method(sampling_method).with_random_state(  # type: ignore
+                random_state
+            )
+        else:
+            sampling = sampling.with_disabled()
+
+        df, query_job = self._materialize_local(
+            materialize_options=MaterializationOptions(
+                downsampling=sampling, ordered=ordered
+            )
         )
         return df, query_job
 
@@ -439,57 +458,29 @@ def _copy_index_to_pandas(self, df: pd.DataFrame):
         # See: https://github.com/pandas-dev/pandas-stubs/issues/804
         df.index.names = self.index.names  # type: ignore
 
-    def _compute_and_count(
-        self,
-        value_keys: Optional[Iterable[str]] = None,
-        max_results: Optional[int] = None,
-        max_download_size: Optional[int] = None,
-        sampling_method: Optional[str] = None,
-        random_state: Optional[int] = None,
-        *,
-        ordered: bool = True,
-    ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
+    def _materialize_local(
+        self, materialize_options: MaterializationOptions = MaterializationOptions()
+    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
-        enable_downsampling = (
-            True
-            if sampling_method is not None
-            else bigframes.options.sampling.enable_downsampling
-        )
-
-        max_download_size = (
-            max_download_size or bigframes.options.sampling.max_download_size
-        )
-
-        random_state = random_state or bigframes.options.sampling.random_state
-
-        if sampling_method is None:
-            sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM
-        sampling_method = sampling_method.lower()
-
-        if sampling_method not in _SAMPLING_METHODS:
-            raise NotImplementedError(
-                f"The downsampling method {sampling_method} is not implemented, "
-                f"please choose from {','.join(_SAMPLING_METHODS)}."
-            )
-
-        expr = self._apply_value_keys_to_expr(value_keys=value_keys)
-
         results_iterator, query_job = self.session._execute(
-            expr, max_results=max_results, sorted=ordered
+            self.expr, sorted=materialize_options.ordered
         )
-
         table_size = (
             self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES
         )
+        sample_config = materialize_options.downsampling
+        max_download_size = sample_config.max_download_size
         fraction = (
             max_download_size / table_size
             if (max_download_size is not None) and (table_size != 0)
             else 2
         )
 
+        # TODO: Maybe materialize before downsampling
+        # Some downsampling methods
         if fraction < 1:
-            if not enable_downsampling:
+            if not sample_config.enable_downsampling:
                 raise RuntimeError(
                     f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
                     f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"
@@ -507,42 +498,53 @@ def _compute_and_count(
                 "\nPlease refer to the documentation for configuring the downloading limit.",
                 UserWarning,
             )
-            if sampling_method == _HEAD:
-                total_rows = int(results_iterator.total_rows * fraction)
-                results_iterator.max_results = total_rows
-                df = self._to_dataframe(results_iterator)
-
-                if self.index_columns:
-                    df.set_index(list(self.index_columns), inplace=True)
-                    df.index.names = self.index.names  # type: ignore
-            elif (sampling_method == _UNIFORM) and (random_state is None):
-                filtered_expr = self.expr._uniform_sampling(fraction)
-                block = Block(
-                    filtered_expr,
-                    index_columns=self.index_columns,
-                    column_labels=self.column_labels,
-                    index_labels=self.index.names,
-                )
-                df, total_rows, _ = block._compute_and_count(max_download_size=None)
-            elif sampling_method == _UNIFORM:
-                block = self._split(
-                    fracs=(max_download_size / table_size,),
-                    random_state=random_state,
-                    preserve_order=True,
-                )[0]
-                df, total_rows, _ = block._compute_and_count(max_download_size=None)
-            else:
-                # This part should never be called, just in case.
-                raise NotImplementedError(
-                    f"The downsampling method {sampling_method} is not implemented, "
-                    f"please choose from {','.join(_SAMPLING_METHODS)}."
-                )
+            total_rows = results_iterator.total_rows
+            # Remove downsampling config from subsequent invocations, as otherwise could result in many
+            # iterations if downsampling undershoots
+            return self._downsample(
+                total_rows=total_rows,
+                sampling_method=sample_config.sampling_method,
+                fraction=fraction,
+                random_state=sample_config.random_state,
+            )._materialize_local(
+                MaterializationOptions(ordered=materialize_options.ordered)
+            )
         else:
             total_rows = results_iterator.total_rows
             df = self._to_dataframe(results_iterator)
             self._copy_index_to_pandas(df)
 
-        return df, total_rows, query_job
+        return df, query_job
+
+    def _downsample(
+        self, total_rows: int, sampling_method: str, fraction: float, random_state
+    ) -> Block:
+        # either selecting fraction or number of rows
+        if sampling_method == _HEAD:
+            filtered_block = self.slice(stop=int(total_rows * fraction))
+            return filtered_block
+        elif (sampling_method == _UNIFORM) and (random_state is None):
+            filtered_expr = self.expr._uniform_sampling(fraction)
+            block = Block(
+                filtered_expr,
+                index_columns=self.index_columns,
+                column_labels=self.column_labels,
+                index_labels=self.index.names,
+            )
+            return block
+        elif sampling_method == _UNIFORM:
+            block = self._split(
+                fracs=(fraction,),
+                random_state=random_state,
+                preserve_order=True,
+            )[0]
+            return block
+        else:
+            # This part should never be called, just in case.
+            raise NotImplementedError(
+                f"The downsampling method {sampling_method} is not implemented, "
+                f"please choose from {','.join(_SAMPLING_METHODS)}."
+            )
 
     def _split(
         self,
@@ -661,23 +663,32 @@ def with_index_labels(self, value: typing.Sequence[Label]) -> Block:
             index_labels=tuple(value),
         )
 
-    def apply_unary_op(
-        self, column: str, op: ops.UnaryOp, result_label: Label = None
+    def project_expr(
+        self, expr: ex.Expression, label: Label = None
     ) -> typing.Tuple[Block, str]:
         """
-        Apply a unary op to the block. Creates a new column to store the result.
+        Apply a scalar expression to the block. Creates a new column to store the result.
         """
         # TODO(tbergeron): handle labels safely so callers don't need to
         result_id = guid.generate_guid()
-        expr = self._expr.project_unary_op(column, op, result_id)
+        array_val = self._expr.project_to_id(expr, result_id)
         block = Block(
-            expr,
+            array_val,
            index_columns=self.index_columns,
-            column_labels=[*self.column_labels, result_label],
+            column_labels=[*self.column_labels, label],
             index_labels=self.index.names,
         )
         return (block, result_id)
 
+    def apply_unary_op(
+        self, column: str, op: ops.UnaryOp, result_label: Label = None
+    ) -> typing.Tuple[Block, str]:
+        """
+        Apply a unary op to the block. Creates a new column to store the result.
+        """
+        expr = op.as_expr(column)
+        return self.project_expr(expr, result_label)
+
     def apply_binary_op(
         self,
         left_column_id: str,
@@ -685,17 +696,8 @@ def apply_binary_op(
         op: ops.BinaryOp,
         result_label: Label = None,
     ) -> typing.Tuple[Block, str]:
-        result_id = guid.generate_guid()
-        expr = self._expr.project_binary_op(
-            left_column_id, right_column_id, op, result_id
-        )
-        block = Block(
-            expr,
-            index_columns=self.index_columns,
-            column_labels=[*self.column_labels, result_label],
-            index_labels=self.index.names,
-        )
-        return (block, result_id)
+        expr = op.as_expr(left_column_id, right_column_id)
+        return self.project_expr(expr, result_label)
 
     def apply_ternary_op(
         self,
@@ -705,17 +707,8 @@ def apply_ternary_op(
         op: ops.TernaryOp,
         result_label: Label = None,
     ) -> typing.Tuple[Block, str]:
-        result_id = guid.generate_guid()
-        expr = self._expr.project_ternary_op(
-            col_id_1, col_id_2, col_id_3, op, result_id
-        )
-        block = Block(
-            expr,
-            index_columns=self.index_columns,
-            column_labels=[*self.column_labels, result_label],
-            index_labels=self.index.names,
-        )
-        return (block, result_id)
+        expr = op.as_expr(col_id_1, col_id_2, col_id_3)
+        return self.project_expr(expr, result_label)
 
     def multi_apply_window_op(
         self,
@@ -855,13 +848,21 @@ def aggregate_all_and_stack(
             aggregations = [
                 (col_id, operation, col_id) for col_id in self.value_columns
             ]
+            index_col_ids = [
+                guid.generate_guid() for i in range(self.column_labels.nlevels)
+            ]
             result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot(
                 row_labels=self.column_labels.to_list(),
-                index_col_ids=["index"],
+                index_col_ids=index_col_ids,
                 unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]),
                 dtype=dtype,
             )
-            return Block(result_expr, index_columns=["index"], column_labels=[None])
+            return Block(
+                result_expr,
+                index_columns=index_col_ids,
+                column_labels=[None],
+                index_labels=self.column_labels.names,
+            )
         else:  # axis_n == 1
             # using offsets as identity to group on.
             # TODO: Allow to promote identity/total_order columns instead for better perf
@@ -1145,43 +1146,37 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1):
         conditions = []
         if start != 0:
             if start > 0:
-                op = ops.partial_right(ops.ge_op, start)
                 assert positive_offsets
-                block, start_cond = block.apply_unary_op(positive_offsets, op)
+                conditions.append(ops.ge_op.as_expr(positive_offsets, ex.const(start)))
             else:
-                op = ops.partial_right(ops.le_op, -start - 1)
                 assert negative_offsets
-                block, start_cond = block.apply_unary_op(negative_offsets, op)
-            conditions.append(start_cond)
+                conditions.append(
+                    ops.le_op.as_expr(negative_offsets, ex.const(-start - 1))
+                )
         if stop is not None:
             if stop >= 0:
-                op = ops.partial_right(ops.lt_op, stop)
                 assert positive_offsets
-                block, stop_cond = block.apply_unary_op(positive_offsets, op)
+                conditions.append(ops.lt_op.as_expr(positive_offsets, ex.const(stop)))
             else:
-                op = ops.partial_right(ops.gt_op, -stop - 1)
                 assert negative_offsets
-                block, stop_cond = block.apply_unary_op(negative_offsets, op)
-            conditions.append(stop_cond)
-
+                conditions.append(
+                    ops.gt_op.as_expr(negative_offsets, ex.const(-stop - 1))
+                )
         if step > 1:
-            op = ops.partial_right(ops.mod_op, step)
             if start >= 0:
-                op = ops.partial_right(ops.sub_op, start)
                 assert positive_offsets
-                block, start_diff = block.apply_unary_op(positive_offsets, op)
+                start_diff = ops.sub_op.as_expr(positive_offsets, ex.const(start))
            else:
-                op = ops.partial_right(ops.sub_op, -start + 1)
                 assert negative_offsets
-                block, start_diff = block.apply_unary_op(negative_offsets, op)
-            modulo_op = ops.partial_right(ops.mod_op, step)
-            block, mod = block.apply_unary_op(start_diff, modulo_op)
-            is_zero_op = ops.partial_right(ops.eq_op, 0)
-            block, step_cond = block.apply_unary_op(mod, is_zero_op)
+                start_diff = ops.sub_op.as_expr(negative_offsets, ex.const(-start + 1))
+            step_cond = ops.eq_op.as_expr(
+                ops.mod_op.as_expr(start_diff, ex.const(step)), ex.const(0)
+            )
             conditions.append(step_cond)
 
         for cond in conditions:
-            block = block.filter(cond)
+            block, cond_id = block.project_expr(cond)
+            block = block.filter(cond_id)
 
         return block.select_columns(self.value_columns)
 
@@ -1203,10 +1198,9 @@ def retrieve_repr_request_results(
         count = self.shape[0]
         if count > max_results:
             head_block = self.slice(0, max_results)
-            computed_df, query_job = head_block.to_pandas(max_results=max_results)
         else:
             head_block = self
-            computed_df, query_job = head_block.to_pandas()
+        computed_df, query_job = head_block.to_pandas()
         formatted_df = computed_df.set_axis(self.column_labels, axis=1)
         # we reset the axis and substitute the bf index name for the default
         formatted_df.index.name = self.index.name
@@ -1232,9 +1226,13 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
         if axis_number == 0:
             expr = self._expr
             for index_col in self._index_columns:
-                expr = expr.project_unary_op(index_col, ops.AsTypeOp(to_type="string"))
-                prefix_op = ops.ApplyLeft(base_op=ops.add_op, left_scalar=prefix)
-                expr = expr.project_unary_op(index_col, prefix_op)
+                expr = expr.project_to_id(
+                    expression=ops.add_op.as_expr(
+                        ex.const(prefix),
+                        ops.AsTypeOp(to_type="string").as_expr(index_col),
+                    ),
+                    output_id=index_col,
+                )
             return Block(
                 expr,
                 index_columns=self.index_columns,
@@ -1251,9 +1249,13 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
         if axis_number == 0:
             expr = self._expr
             for index_col in self._index_columns:
-                expr = expr.project_unary_op(index_col, ops.AsTypeOp(to_type="string"))
-                prefix_op = ops.ApplyRight(base_op=ops.add_op, right_scalar=suffix)
-                expr = expr.project_unary_op(index_col, prefix_op)
+                expr = expr.project_to_id(
+                    expression=ops.add_op.as_expr(
+                        ops.AsTypeOp(to_type="string").as_expr(index_col),
+                        ex.const(suffix),
+                    ),
+                    output_id=index_col,
+                )
             return Block(
                 expr,
                 index_columns=self.index_columns,
@@ -1452,28 +1454,23 @@ def _create_pivot_column_index(
     def _create_pivot_col(
         block: Block, columns: typing.Sequence[str], value_col: str, value
     ) -> typing.Tuple[Block, str]:
-        cond_id = ""
+        condition: typing.Optional[ex.Expression] = None
         nlevels = len(columns)
         for i in range(len(columns)):
            uvalue_level = value[i] if nlevels > 1 else value
            if pd.isna(uvalue_level):
-                block, eq_id = block.apply_unary_op(
-                    columns[i],
-                    ops.isnull_op,
-                )
+                equality = ops.isnull_op.as_expr(columns[i])
             else:
-                block, eq_id = block.apply_unary_op(
-                    columns[i], ops.partial_right(ops.eq_op, uvalue_level)
-                )
-            if cond_id:
-                block, cond_id = block.apply_binary_op(eq_id, cond_id, ops.and_op)
+                equality = ops.eq_op.as_expr(columns[i], ex.const(uvalue_level))
+            if condition is not None:
+                condition = ops.and_op.as_expr(equality, condition)
             else:
-                cond_id = eq_id
-        block, masked_id = block.apply_binary_op(
-            value_col, cond_id, ops.partial_arg3(ops.where_op, None)
-        )
+                condition = equality
 
-        return block, masked_id
+        assert condition is not None
+        return block.project_expr(
+            ops.where_op.as_expr(value_col, condition, ex.const(None))
+        )
 
     def _get_unique_values(
         self, columns: Sequence[str], max_unique_values: int
@@ -1560,10 +1557,10 @@ def merge(
         coalesced_ids = []
         for left_id, right_id in zip(left_join_ids, right_join_ids):
             coalesced_id = guid.generate_guid()
-            joined_expr = joined_expr.project_binary_op(
-                get_column_left[left_id],
-                get_column_right[right_id],
-                ops.coalesce_op,
+            joined_expr = joined_expr.project_to_id(
+                ops.coalesce_op.as_expr(
+                    get_column_left[left_id], get_column_right[right_id]
+                ),
                 coalesced_id,
             )
             coalesced_ids.append(coalesced_id)
diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
index c1e8f1ea48..2cab6fb95d 100644
--- a/bigframes/core/compile/compiled.py
+++ b/bigframes/core/compile/compiled.py
@@ -26,8 +26,8 @@
 import ibis.expr.types as ibis_types
 import pandas
 
-import bigframes.constants as constants
 import bigframes.core.compile.scalar_op_compiler as op_compilers
+import bigframes.core.expression as ex
 import bigframes.core.guid
 from bigframes.core.ordering import (
     encode_order_string,
@@ -95,16 +95,6 @@ def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]:
             else None
         )
 
-    @abc.abstractmethod
-    def select_columns(self: T, column_ids: typing.Sequence[str]) -> T:
-        """Creates a new expression based on this expression with new columns."""
-        ...
-
-    def drop_columns(self: T, columns: Iterable[str]) -> T:
-        return self.select_columns(
-            [col for col in self.column_ids if col not in columns]
-        )
-
     @abc.abstractmethod
     def filter(self: T, predicate_id: str, keep_null: bool = False) -> T:
         """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
@@ -151,39 +141,26 @@ def _reproject_to_table(self: T) -> T:
         """
         ...
 
-    def project_row_op(
+    def projection(
         self: T,
-        input_column_ids: typing.Sequence[str],
-        op: ops.RowOp,
-        output_column_id: typing.Optional[str] = None,
+        expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...],
     ) -> T:
-        """Creates a new expression based on this expression with unary operation applied to one column."""
-        result_id = (
-            output_column_id or input_column_ids[0]
-        )  # overwrite input if not output id provided
-        inputs = tuple(self._get_ibis_column(col) for col in input_column_ids)
-        value = op_compiler.compile_row_op(op, inputs).name(result_id)
-        return self._set_or_replace_by_id(result_id, value)
-
-    def assign(self: T, source_id: str, destination_id: str) -> T:
-        return self._set_or_replace_by_id(
-            destination_id, self._get_ibis_column(source_id)
-        )
+        """Apply an expression to the ArrayValue and assign the output to a column."""
+        bindings = {col: self._get_ibis_column(col) for col in self.column_ids}
+        values = [
+            op_compiler.compile_expression(expression, bindings).name(id)
+            for expression, id in expression_id_pairs
+        ]
+        result = self._select(tuple(values))  # type: ignore
 
-    def assign_constant(
-        self: T,
-        destination_id: str,
-        value: typing.Any,
-        dtype: typing.Optional[bigframes.dtypes.Dtype],
-    ) -> T:
-        # TODO(b/281587571): Solve scalar constant aggregation problem w/Ibis.
-        ibis_value = bigframes.dtypes.literal_to_ibis_scalar(value, dtype)
-        if ibis_value is None:
-            raise NotImplementedError(
-                f"Type not supported as scalar value {type(value)}. {constants.FEEDBACK_LINK}"
-            )
-        expr = self._set_or_replace_by_id(destination_id, ibis_value)
-        return expr._reproject_to_table()
+        # Need to reproject to convert ibis Scalar to ibis Column object
+        if any(exp_id[0].is_const for exp_id in expression_id_pairs):
+            result = result._reproject_to_table()
+        return result
+
+    @abc.abstractmethod
+    def _select(self: T, values: typing.Tuple[ibis_types.Value]) -> T:
+        ...
 
     @abc.abstractmethod
     def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T:
@@ -328,14 +305,6 @@ def _to_ibis_expr(
             table = table.filter(ibis.random() < ibis.literal(fraction))
         return table
 
-    def select_columns(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
-        """Creates a new expression based on this expression with new columns."""
-        columns = [self._get_ibis_column(col_id) for col_id in column_ids]
-        builder = self.builder()
-        builder.columns = list(columns)
-        new_expr = builder.build()
-        return new_expr
-
     def filter(self, predicate_id: str, keep_null: bool = False) -> UnorderedIR:
         condition = typing.cast(
             ibis_types.BooleanValue, self._get_ibis_column(predicate_id)
@@ -575,6 +544,11 @@ def _set_or_replace_by_id(
         builder.columns = [*self.columns, new_value.name(id)]
         return builder.build()
 
+    def _select(self, values: typing.Tuple[ibis_types.Value]) -> UnorderedIR:
+        builder = self.builder()
+        builder.columns = values
+        return builder.build()
+
     def _reproject_to_table(self) -> UnorderedIR:
         """
         Internal operators that projects the internal representation into a
@@ -814,20 +788,6 @@ def promote_offsets(self, col_id: str) -> OrderedIR:
         ]
         return expr_builder.build()
 
-    def select_columns(self, column_ids: typing.Sequence[str]) -> OrderedIR:
-        """Creates a new expression based on this expression with new columns."""
-        columns = [self._get_ibis_column(col_id) for col_id in column_ids]
-        expr = self
-        for ordering_column in set(self.column_ids).intersection(
-            [col_ref.column_id for col_ref in self._ordering.ordering_value_columns]
-        ):
-            # Need to hide ordering columns that are being dropped. Alternatively, could project offsets
-            expr = expr._hide_column(ordering_column)
-        builder = expr.builder()
-        builder.columns = list(columns)
-        new_expr = builder.build()
-        return new_expr
-
     ## Methods that only work with ordering
     def project_window_op(
         self,
@@ -1219,6 +1179,29 @@ def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> Ordered
         builder.columns = [*self.columns, new_value.name(id)]
         return builder.build()
 
+    def _select(self, values: typing.Tuple[ibis_types.Value]) -> OrderedIR:
+        """Safely assign by id while maintaining ordering integrity."""
+        # TODO: Split into explicit set and replace methods
+        ordering_col_ids = [
+            col_ref.column_id for col_ref in self._ordering.ordering_value_columns
+        ]
+        ir = self
+        mappings = {value.name: value for value in values}
+        for ordering_id in ordering_col_ids:
+            # Drop case
+            if (ordering_id not in mappings) and (ordering_id in ir.column_ids):
+                # id is being dropped, hide it first
+                ir = ir._hide_column(ordering_id)
+            # Mutate case
+            elif (ordering_id in mappings) and not mappings[ordering_id].equals(
+                ir._get_any_column(ordering_id)
+            ):
+                ir = ir._hide_column(ordering_id)
+
+        builder = ir.builder()
+        builder.columns = list(values)
+        return builder.build()
+
     ## Ordering specific helpers
     def _get_any_column(self, key: str) -> ibis_types.Value:
         """Gets the Ibis expression for a given column. Will also get hidden columns."""
diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py
index c28958a861..18fcd73d19 100644
--- a/bigframes/core/compile/compiler.py
+++ b/bigframes/core/compile/compiler.py
@@ -79,16 +79,6 @@ def compile_join(node: nodes.JoinNode, ordered: bool = True):
     )
 
 
-@_compile_node.register
-def compile_select(node: nodes.SelectNode, ordered: bool = True):
-    return compile_node(node.child, ordered).select_columns(node.column_ids)
-
-
-@_compile_node.register
-def compile_drop(node: nodes.DropColumnsNode, ordered: bool = True):
-    return compile_node(node.child, ordered).drop_columns(node.columns)
-
-
 @_compile_node.register
 def compile_readlocal(node: nodes.ReadLocalNode, ordered: bool = True):
     array_as_pd = pd.read_feather(io.BytesIO(node.feather_bytes))
@@ -143,10 +133,9 @@ def compile_reversed(node: nodes.ReversedNode, ordered: bool = True):
 
 
 @_compile_node.register
-def compile_project(node: nodes.ProjectRowOpNode, ordered: bool = True):
-    return compile_node(node.child, ordered).project_row_op(
-        node.input_ids, node.op, node.output_id
-    )
+def compile_projection(node: nodes.ProjectionNode, ordered: bool = True):
+    result = compile_node(node.child, ordered)
+    return result.projection(node.assignments)
 
 
 @_compile_node.register
@@ -209,18 +198,6 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True):
     )
 
 
-@_compile_node.register
-def compile_assign(node: nodes.AssignNode, ordered: bool = True):
-    return compile_node(node.child, ordered).assign(node.source_id, node.destination_id)
-
-
-@_compile_node.register
-def compile_assign_constant(node: nodes.AssignConstantNode, ordered: bool = True):
-    return compile_node(node.child, ordered).assign_constant(
-        node.destination_id, node.value, node.dtype
-    )
-
-
 @_compile_node.register
 def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True):
     return compile_node(node.child, ordered)._uniform_sampling(node.fraction)
diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py
index 71d53f90dc..7a87a435fe 100644
--- a/bigframes/core/compile/row_identity.py
+++ b/bigframes/core/compile/row_identity.py
@@ -24,7 +24,7 @@
 
 import bigframes.constants as constants
 import bigframes.core.compile.compiled as compiled
-import bigframes.core.joins.name_resolution as naming
+import bigframes.core.joins as joining
 import bigframes.core.ordering as orderings
 
 SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"}
@@ -68,7 +68,7 @@ def join_by_row_identity_unordered(
     right_mask = right_relative_predicates if how in ["left", "outer"] else None
 
     # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result
-    map_left_id, map_right_id = naming.JOIN_NAME_REMAPPER(
+    map_left_id, map_right_id = joining.JOIN_NAME_REMAPPER(
         left.column_ids, right.column_ids
     )
     joined_columns = [
@@ -125,10 +125,10 @@ def join_by_row_identity_ordered(
     right_mask = right_relative_predicates if how in ["left", "outer"] else None
 
     # Public mapping must use JOIN_NAME_REMAPPER to stay in sync with consumers of join result
-    lpublicmapping, rpublicmapping = naming.JOIN_NAME_REMAPPER(
+    lpublicmapping, rpublicmapping = joining.JOIN_NAME_REMAPPER(
         left.column_ids, right.column_ids
     )
-    lhiddenmapping, rhiddenmapping = naming.JoinNameRemapper(namespace="hidden")(
+    lhiddenmapping, rhiddenmapping = joining.JoinNameRemapper(namespace="hidden")(
         left._hidden_column_ids, right._hidden_column_ids
     )
     map_left_id = {**lpublicmapping, **lhiddenmapping}
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index d711dbf456..bf0755acc7 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -26,8 +26,8 @@
 import pandas as pd
 
 import bigframes.constants as constants
+import bigframes.core.expression as ex
 import bigframes.dtypes
-import bigframes.dtypes as dtypes
 import bigframes.operations as ops
 
 _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0))
@@ -50,6 +50,47 @@ class ScalarOpCompiler:
         ],
     ] = {}
 
+    @functools.singledispatchmethod
+    def compile_expression(
+        self,
+        expression: ex.Expression,
+        bindings: typing.Dict[str, ibis_types.Value],
+    ) -> ibis_types.Value:
+        raise NotImplementedError(f"Unrecognized expression: {expression}")
+
+    @compile_expression.register
+    def _(
+        self,
+        expression: ex.ScalarConstantExpression,
+        bindings: typing.Dict[str, ibis_types.Value],
+    ) -> ibis_types.Value:
+        return bigframes.dtypes.literal_to_ibis_scalar(
+            expression.value, expression.dtype
+        )
+
+    @compile_expression.register
+    def _(
+        self,
+        expression: ex.UnboundVariableExpression,
+        bindings: typing.Dict[str, ibis_types.Value],
+    ) -> ibis_types.Value:
+        if expression.id not in bindings:
+            raise ValueError(f"Could not resolve unbound variable {expression.id}")
+        else:
+            return bindings[expression.id]
+
+    @compile_expression.register
+    def _(
+        self,
+        expression: ex.OpExpression,
+        bindings: typing.Dict[str, ibis_types.Value],
+    ) -> ibis_types.Value:
+        inputs = [
+            self.compile_expression(sub_expr, bindings)
+            for sub_expr in expression.inputs
+        ]
+        return self.compile_row_op(expression.op, inputs)
+
     def compile_row_op(
         self, op: ops.RowOp, inputs: typing.Sequence[ibis_types.Value]
     ) -> ibis_types.Value:
@@ -1092,38 +1133,6 @@ def clip_op(
     )
 
 
-# Composition Ops
-@scalar_op_compiler.register_unary_op(ops.ApplyRight, pass_op=True)
-def apply_right(input: ibis_types.Value, op: ops.ApplyRight):
-    right = dtypes.literal_to_ibis_scalar(op.right_scalar, validate=False)
-    return scalar_op_compiler.compile_row_op(op.base_op, (input, right))
-
-
-@scalar_op_compiler.register_unary_op(ops.ApplyLeft, pass_op=True)
-def apply_left(input: ibis_types.Value, op: ops.ApplyLeft):
-    left = dtypes.literal_to_ibis_scalar(op.left_scalar, validate=False)
-    return scalar_op_compiler.compile_row_op(op.base_op, (left, input))
-
-
-@scalar_op_compiler.register_binary_op(ops.ReverseArgsOp, pass_op=True)
-def apply_reversed(
-    input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ReverseArgsOp
-):
-    return scalar_op_compiler.compile_row_op(op.base_op, (input2, input1))
-
-
-@scalar_op_compiler.register_binary_op(ops.ApplyArg1, pass_op=True)
-def apply_arg1(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg1):
-    arg1 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False)
-    return scalar_op_compiler.compile_row_op(op.base_op, (arg1, input1, input2))
-
-
-@scalar_op_compiler.register_binary_op(ops.ApplyArg3, pass_op=True)
-def apply_arg3(input1: ibis_types.Value, input2: ibis_types.Value, op: ops.ApplyArg3):
-    arg3 = dtypes.literal_to_ibis_scalar(op.scalar, validate=False)
-    return scalar_op_compiler.compile_row_op(op.base_op, (input1, input2, arg3))
-
-
 # Helpers
 def is_null(value) -> bool:
     # float NaN/inf should be treated as distinct from 'true' null values
diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py
new file mode 100644
index 0000000000..540f9b6e5a
--- /dev/null
+++ b/bigframes/core/expression.py
@@ -0,0 +1,112 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import abc
+import dataclasses
+import itertools
+import typing
+from typing import Optional
+
+import bigframes.dtypes
+import bigframes.operations
+
+
+def const(
+    value: typing.Hashable, dtype: Optional[bigframes.dtypes.Dtype] = None
+) -> Expression:
+    return ScalarConstantExpression(value, dtype)
+
+
+def free_var(id: str) -> Expression:
+    return UnboundVariableExpression(id)
+
+
+@dataclasses.dataclass(frozen=True)
+class Expression(abc.ABC):
+    """An expression represents a computation taking N scalar inputs and producing a single output scalar."""
+
+    @property
+    def unbound_variables(self) -> typing.Tuple[str, ...]:
+        return ()
+
+    def rename(self, name_mapping: dict[str, str]) -> Expression:
+        return self
+
+    @abc.abstractproperty
+    def is_const(self) -> bool:
+        return False
+
+
+@dataclasses.dataclass(frozen=True)
+class ScalarConstantExpression(Expression):
+    """An expression representing a scalar constant."""
+
+    # TODO: Further constrain?
+    value: typing.Hashable
+    dtype: Optional[bigframes.dtypes.Dtype] = None
+
+    @property
+    def is_const(self) -> bool:
+        return True
+
+
+@dataclasses.dataclass(frozen=True)
+class UnboundVariableExpression(Expression):
+    """A variable expression representing an unbound variable."""
+
+    id: str
+
+    @property
+    def unbound_variables(self) -> typing.Tuple[str, ...]:
+        return (self.id,)
+
+    def rename(self, name_mapping: dict[str, str]) -> Expression:
+        if self.id in name_mapping:
+            return UnboundVariableExpression(name_mapping[self.id])
+        else:
+            return self
+
+    @property
+    def is_const(self) -> bool:
+        return False
+
+
+@dataclasses.dataclass(frozen=True)
+class OpExpression(Expression):
+    """An expression representing a scalar operation applied to 1 or more argument sub-expressions."""
+
+    op: bigframes.operations.RowOp
+    inputs: typing.Tuple[Expression, ...]
+
+    def __post_init__(self):
+        assert self.op.arguments == len(self.inputs)
+
+    @property
+    def unbound_variables(self) -> typing.Tuple[str, ...]:
+        return tuple(
+            itertools.chain.from_iterable(
+                map(lambda x: x.unbound_variables, self.inputs)
+            )
+        )
+
+    def rename(self, name_mapping: dict[str, str]) -> Expression:
+        return OpExpression(
+            self.op, tuple(input.rename(name_mapping) for input in self.inputs)
+        )
+
+    @property
+    def is_const(self) -> bool:
+        return all(child.is_const for child in self.inputs)
diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py
index 66ba901649..ab6b15e7b9 100644
--- a/bigframes/core/groupby/__init__.py
+++ b/bigframes/core/groupby/__init__.py
@@ -28,7 +28,6 @@
 import bigframes.core.window as windows
 import bigframes.dataframe as df
 import bigframes.dtypes as dtypes
-import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 import bigframes.series as series
 import third_party.bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -540,10 +539,13 @@ def cummin(self, *args, **kwargs) -> series.Series:
         )
 
     def cumcount(self, *args, **kwargs) -> series.Series:
-        return self._apply_window_op(
-            agg_ops.rank_op,
-            discard_name=True,
-        )._apply_unary_op(ops.partial_right(ops.sub_op, 1))
+        return (
+            self._apply_window_op(
+                agg_ops.rank_op,
+                discard_name=True,
+            )
+            - 1
+        )
 
     def shift(self, periods=1) -> series.Series:
         """Shift index by desired number of periods."""
diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py
index 12a1303d29..6998d0e974 100644
--- a/bigframes/core/indexers.py
+++ b/bigframes/core/indexers.py
@@ -22,6 +22,7 @@
 
 import bigframes.constants as constants
 import bigframes.core.blocks
+import bigframes.core.expression as ex
 import bigframes.core.guid as guid
 import bigframes.core.indexes as indexes
 import bigframes.core.scalar
@@ -63,17 +64,14 @@ def __setitem__(self, key, value) -> None:
         index_column = block.index_columns[0]
 
         # if index == key return value else value_colum
-        block, insert_cond = block.apply_unary_op(
-            index_column, ops.partial_right(ops.eq_op, key)
-        )
-        block, result_id = block.apply_binary_op(
-            insert_cond,
-            self._series._value_column,
-            ops.partial_arg1(ops.where_op, value),
-        )
-        block = block.copy_values(result_id, value_column).drop_columns(
-            [insert_cond, result_id]
+        block, result_id = block.project_expr(
+            ops.where_op.as_expr(
+                ex.const(value),
+                ops.eq_op.as_expr(index_column, ex.const(key)),
+                self._series._value_column,
+            )
         )
+        block = block.copy_values(result_id, value_column).drop_columns([result_id])
         self._series._set_block(block)
diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py
index f148759f61..4ec11cb163 100644
--- a/bigframes/core/indexes/index.py
+++ b/bigframes/core/indexes/index.py
@@ -26,6 +26,7 @@
 import bigframes.core as core
 import bigframes.core.block_transforms as block_ops
 import bigframes.core.blocks as blocks
+import bigframes.core.expression as ex
 import bigframes.core.joins as joining
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
@@ -186,7 +187,7 @@ def astype(
     ) -> Index:
         if self.nlevels > 1:
             raise TypeError("Multiindex does not support 'astype'")
-        return self._apply_unary_op(ops.AsTypeOp(to_type=dtype))
+        return self._apply_unary_expr(ops.AsTypeOp(to_type=dtype).as_expr("arg"))
 
     def all(self) -> bool:
         if self.nlevels > 1:
@@ -261,7 +262,7 @@ def value_counts(
     def fillna(self, value=None) -> Index:
         if self.nlevels > 1:
             raise TypeError("Multiindex does not support 'fillna'")
-        return self._apply_unary_op(ops.partial_right(ops.fillna_op, value))
+        return self._apply_unary_expr(ops.fillna_op.as_expr("arg", ex.const(value)))
 
     def rename(self, name: Union[str, Sequence[str]]) -> Index:
         names = [name] if isinstance(name, str) else list(name)
@@ -284,8 +285,8 @@ def drop(
                 inverse_condition_id, ops.invert_op
             )
         else:
-            block, condition_id = block.apply_unary_op(
-                level_id, ops.partial_right(ops.ne_op, labels)
+            block, condition_id = block.project_expr(
+                ops.ne_op.as_expr(level_id, ex.const(labels))
             )
         block = block.filter(condition_id, keep_null=True)
         block = block.drop_columns([condition_id])
@@ -308,19 +309,23 @@ def isin(self, values) -> Index:
                 f"isin(), you passed a [{type(values).__name__}]"
             )
 
-        return self._apply_unary_op(
-            ops.IsInOp(values=tuple(values), match_nulls=True)
+        return self._apply_unary_expr(
+            ops.IsInOp(values=tuple(values), match_nulls=True).as_expr("arg")
         ).fillna(value=False)
 
-    def _apply_unary_op(
+    def _apply_unary_expr(
         self,
-        op: ops.UnaryOp,
+        op: ex.Expression,
     ) -> Index:
         """Applies a unary operator to the index."""
+        if len(op.unbound_variables) != 1:
+            raise ValueError("Expression must have exactly 1 unbound variable.")
+        unbound_variable = op.unbound_variables[0]
+
         block = self._block
         result_ids = []
         for col in self._block.index_columns:
-            block, result_id = block.apply_unary_op(col, op)
+            block, result_id = block.project_expr(op.rename({unbound_variable: col}))
             result_ids.append(result_id)
 
         block = block.set_index(result_ids, index_labels=self._block.index_labels)
@@ -604,8 +609,8 @@ def coalesce_columns(
             expr = expr.drop_columns([left_id])
         elif how == "outer":
             coalesced_id = bigframes.core.guid.generate_guid()
-            expr = expr.project_binary_op(
-                left_id, right_id, ops.coalesce_op, coalesced_id
+            expr = expr.project_to_id(
+                ops.coalesce_op.as_expr(left_id, right_id), coalesced_id
             )
             expr = expr.drop_columns([left_id, right_id])
         result_ids.append(coalesced_id)
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
index 5385852432..d30db9a7f7 100644
--- a/bigframes/core/nodes.py
+++ b/bigframes/core/nodes.py
@@ -17,15 +17,15 @@
 from dataclasses import dataclass, field, fields
 import functools
 import typing
-from typing import Optional, Tuple
+from typing import Tuple
 
 import pandas
 
+import bigframes.core.expression as ex
 import bigframes.core.guid
 from bigframes.core.ordering import OrderingColumnReference
 import bigframes.core.window_spec as window
 import bigframes.dtypes
-import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 
 if typing.TYPE_CHECKING:
@@ -145,14 +145,6 @@ def __hash__(self):
 
 # Unary nodes
-@dataclass(frozen=True)
-class DropColumnsNode(UnaryNode):
-    columns: Tuple[str, ...]
-
-    def __hash__(self):
-        return self._node_hash
-
-
 @dataclass(frozen=True)
 class PromoteOffsetsNode(UnaryNode):
     col_id: str
@@ -188,18 +180,8 @@ def __hash__(self):
 
 
 @dataclass(frozen=True)
-class SelectNode(UnaryNode):
-    column_ids: typing.Tuple[str, ...]
-
-    def __hash__(self):
-        return self._node_hash
-
-
-@dataclass(frozen=True)
-class ProjectRowOpNode(UnaryNode):
-    input_ids: typing.Tuple[str, ...]
-    op: ops.RowOp
-    output_id: Optional[str] = None
+class ProjectionNode(UnaryNode):
+    assignments: typing.Tuple[typing.Tuple[ex.Expression, str], ...]
 
     def __hash__(self):
         return self._node_hash
@@ -266,25 +248,6 @@ def __hash__(self):
         return self._node_hash
 
 
-@dataclass(frozen=True)
-class AssignNode(UnaryNode):
-    source_id: str
-    destination_id: str
-
-    def __hash__(self):
-        return self._node_hash
-
-
-@dataclass(frozen=True)
-class AssignConstantNode(UnaryNode):
-    destination_id: str
-    value: typing.Hashable
-    dtype: typing.Optional[bigframes.dtypes.Dtype]
-
-    def __hash__(self):
-        return self._node_hash
-
-
 @dataclass(frozen=True)
 class RandomSampleNode(UnaryNode):
     fraction: float
diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py
index d9cc99a036..cadd8e5145 100644
--- a/bigframes/core/reshape/__init__.py
+++ b/bigframes/core/reshape/__init__.py
@@ -20,6 +20,7 @@
 
 import bigframes.constants as constants
 import bigframes.core as core
+import bigframes.core.expression as ex
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
 import bigframes.dataframe
@@ -165,7 +166,7 @@ def qcut(
             ordering=(order.OrderingColumnReference(x._value_column),),
         ),
     )
-    block, result = block.apply_binary_op(
-        result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label
+    block, result = block.project_expr(
+        ops.where_op.as_expr(result, nullity_id, ex.const(None)), label=label
     )
     return bigframes.series.Series(block.select_column(result))
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 1f039904f0..1288117395 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -47,6 +47,7 @@
 from bigframes.core import log_adapter
 import bigframes.core.block_transforms as block_ops
 import bigframes.core.blocks as blocks
+import bigframes.core.expression as ex
 import bigframes.core.groupby as groupby
 import bigframes.core.guid
 import bigframes.core.indexers as indexers
@@ -656,25 +657,34 @@ def _apply_binop(
         op,
         axis: str | int = "columns",
         how: str = "outer",
+        reverse: bool = False,
     ):
         if isinstance(other, (float, int)):
-            return self._apply_scalar_binop(other, op)
+            return self._apply_scalar_binop(other, op, reverse=reverse)
         elif isinstance(other, bigframes.series.Series):
-            return self._apply_series_binop(other, op, axis=axis, how=how)
+            return self._apply_series_binop(
+                other, op, axis=axis, how=how, reverse=reverse
+            )
         elif isinstance(other, DataFrame):
-            return self._apply_dataframe_binop(other, op, how=how)
+            return self._apply_dataframe_binop(other, op, how=how, reverse=reverse)
         raise NotImplementedError(
             f"binary operation is not implemented on the second operand of type {type(other).__name__}."
             f"{constants.FEEDBACK_LINK}"
         )
 
-    def _apply_scalar_binop(self, other: float | int, op: ops.BinaryOp) -> DataFrame:
+    def _apply_scalar_binop(
+        self, other: float | int, op: ops.BinaryOp, reverse: bool = False
+    ) -> DataFrame:
         block = self._block
-        partial_op = ops.ApplyRight(base_op=op, right_scalar=other)
         for column_id, label in zip(
             self._block.value_columns, self._block.column_labels
         ):
-            block, _ = block.apply_unary_op(column_id, partial_op, result_label=label)
+            expr = (
+                op.as_expr(ex.const(other), column_id)
+                if reverse
+                else op.as_expr(column_id, ex.const(other))
+            )
+            block, _ = block.project_expr(expr, label)
             block = block.drop_columns([column_id])
         return DataFrame(block)
 
@@ -684,6 +694,7 @@ def _apply_series_binop(
         op: ops.BinaryOp,
         axis: str | int = "columns",
         how: str = "outer",
+        reverse: bool = False,
     ) -> DataFrame:
         if axis not in ("columns", "index", 0, 1):
             raise ValueError(f"Invalid input: axis {axis}.")
@@ -703,12 +714,13 @@ def _apply_series_binop(
         for column_id, label in zip(
             self._block.value_columns, self._block.column_labels
         ):
-            block, _ = block.apply_binary_op(
-                get_column_left[column_id],
-                series_col,
-                op,
-                result_label=label,
+            self_col = get_column_left[column_id]
+            expr = (
+                op.as_expr(series_col, self_col)
+                if reverse
+                else op.as_expr(self_col, series_col)
             )
+            block, _ = block.project_expr(expr, label)
             block = block.drop_columns([get_column_left[column_id]])
 
         block = block.drop_columns([series_col])
@@ -716,7 +728,11 @@ def _apply_series_binop(
         return DataFrame(block)
 
     def _apply_dataframe_binop(
-        self, other: DataFrame, op: ops.BinaryOp, how: str = "outer"
+        self,
+        other: DataFrame,
+        op: ops.BinaryOp,
+        how: str = "outer",
+        reverse: bool = False,
     ) -> DataFrame:
         # Join rows
         joined_index, (get_column_left, get_column_right) = self._block.index.join(
@@ -738,31 +754,32 @@ def _apply_dataframe_binop(
         for left_index, right_index in column_indices:
             if left_index >= 0 and right_index >= 0:  # -1 indices indicate missing
-                left_col_id = self._block.value_columns[left_index]
-                right_col_id = other._block.value_columns[right_index]
-                block, result_col_id = block.apply_binary_op(
-                    get_column_left[left_col_id],
-                    get_column_right[right_col_id],
-                    op,
+                self_col_id = get_column_left[self._block.value_columns[left_index]]
+                other_col_id = get_column_right[other._block.value_columns[right_index]]
+                expr = (
+                    op.as_expr(other_col_id, self_col_id)
+                    if reverse
+                    else op.as_expr(self_col_id, other_col_id)
                 )
-                binop_result_ids.append(result_col_id)
             elif left_index >= 0:
-                left_col_id = self._block.value_columns[left_index]
-                block, result_col_id = block.apply_unary_op(
-                    get_column_left[left_col_id],
-                    ops.partial_right(op, None),
+                self_col_id = get_column_left[self._block.value_columns[left_index]]
+                expr = (
+                    op.as_expr(ex.const(None), self_col_id)
+                    if reverse
+                    else op.as_expr(self_col_id, ex.const(None))
                )
-                binop_result_ids.append(result_col_id)
             elif right_index >= 0:
-                right_col_id = other._block.value_columns[right_index]
-                block, result_col_id = block.apply_unary_op(
-                    get_column_right[right_col_id],
-                    ops.partial_left(op, None),
+                other_col_id = get_column_right[other._block.value_columns[right_index]]
+                expr = (
+                    op.as_expr(other_col_id, ex.const(None))
+                    if reverse
+                    else op.as_expr(ex.const(None), other_col_id)
                 )
-                binop_result_ids.append(result_col_id)
             else:
                 # Should not be possible
                 raise ValueError("No right or left index.")
+            block, result_col_id = block.project_expr(expr)
+            binop_result_ids.append(result_col_id)
 
        block = 
block.select_columns(binop_result_ids).with_column_labels(columns) return DataFrame(block) @@ -822,7 +839,7 @@ def rsub( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.sub_op), axis=axis) + return self._apply_binop(other, ops.sub_op, axis=axis, reverse=True) __rsub__ = rsub @@ -849,7 +866,7 @@ def rtruediv( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.div_op), axis=axis) + return self._apply_binop(other, ops.div_op, axis=axis, reverse=True) __rtruediv__ = rdiv = rtruediv @@ -867,7 +884,7 @@ def rfloordiv( other: float | int | bigframes.series.Series | DataFrame, axis: str | int = "columns", ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.floordiv_op), axis=axis) + return self._apply_binop(other, ops.floordiv_op, axis=axis, reverse=True) __rfloordiv__ = rfloordiv @@ -875,7 +892,7 @@ def mod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int return self._apply_binop(other, ops.mod_op, axis=axis) def rmod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore - return self._apply_binop(other, ops.reverse(ops.mod_op), axis=axis) + return self._apply_binop(other, ops.mod_op, axis=axis, reverse=True) __mod__ = mod @@ -889,7 +906,7 @@ def pow( def rpow( self, other: int | bigframes.series.Series, axis: str | int = "columns" ) -> DataFrame: - return self._apply_binop(other, ops.reverse(ops.pow_op), axis=axis) + return self._apply_binop(other, ops.pow_op, axis=axis, reverse=True) __pow__ = pow @@ -1101,8 +1118,8 @@ def drop( condition_id = None for i, idx in enumerate(index): level_id = self._resolve_levels(i)[0] - block, condition_id_cur = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, idx) + block, condition_id_cur = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(idx)) ) if condition_id: block, condition_id = block.apply_binary_op( @@ -1122,8 +1139,8 @@ def drop( elif isinstance(index, indexes.Index): return self._drop_by_index(index) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, index) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(index)) ) block = block.filter(condition_id, keep_null=True).select_columns( self._block.value_columns @@ -3031,7 +3048,7 @@ def __array_ufunc__( if inputs[0] is self: return self._apply_binop(inputs[1], binop) else: - return self._apply_binop(inputs[0], ops.reverse(binop)) + return self._apply_binop(inputs[0], binop, reverse=True) return NotImplemented diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 8c01159113..3607000323 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -160,7 +160,8 @@ def predict( max_output_tokens (int, default 128): Maximum number of tokens that can be generated in the response. Specify a lower value for shorter responses and a higher value for longer responses. A token may be smaller than a word. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. - Default 128. Possible values [1, 1024]. + Default 128. For the 'text-bison' model, possible values are in the range [1, 1024]. For the 'text-bison-32k' model, possible values are in the range [1, 8196]. + Please ensure that the specified value for max_output_tokens is within the appropriate range for the model being used. 
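+                For instance, a request with max_output_tokens=2048 would be rejected for
+                'text-bison' (limit 1024) but accepted for 'text-bison-32k' (limit 8196).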
            top_k (int, default 40):
                Top-k changes how the model selects tokens for output. A top-k of 1 means the
                selected token is the most probable among all tokens
@@ -184,12 +185,26 @@ def predict(
         # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models
         if temperature < 0.0 or temperature > 1.0:
             raise ValueError(f"temperature must be [0.0, 1.0], but is {temperature}.")
-        if max_output_tokens not in range(1, 1025):
+
+        if (
+            self.model_name == _TEXT_GENERATOR_BISON_ENDPOINT
+            and max_output_tokens not in range(1, 1025)
+        ):
+            raise ValueError(
+                f"max_output_tokens must be in [1, 1024] for the TextBison model, but is {max_output_tokens}."
+            )
+
+        if (
+            self.model_name == _TEXT_GENERATOR_BISON_32K_ENDPOINT
+            and max_output_tokens not in range(1, 8197)
+        ):
             raise ValueError(
-                f"max_output_token must be [1, 1024], but is {max_output_tokens}."
+                f"max_output_tokens must be in [1, 8196] for the TextBison 32k model, but is {max_output_tokens}."
             )
+
         if top_k not in range(1, 41):
             raise ValueError(f"top_k must be [1, 40], but is {top_k}.")
+
         if top_p < 0.0 or top_p > 1.0:
             raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.")
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index 3ef551e453..9737df94f9 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -21,6 +21,10 @@
 
 import bigframes.dtypes as dtypes
 
+if typing.TYPE_CHECKING:
+    # Avoids circular dependency
+    import bigframes.core.expression
+
 
 class RowOp(typing.Protocol):
     @property
@@ -45,6 +49,15 @@ def name(self) -> str:
     def arguments(self) -> int:
         return 1
 
+    def as_expr(
+        self, input_id: typing.Union[str, bigframes.core.expression.Expression] = "arg"
+    ) -> bigframes.core.expression.Expression:
+        import bigframes.core.expression
+
+        return bigframes.core.expression.OpExpression(
+            self, (_convert_expr_input(input_id),)
+        )
+
 
 @dataclasses.dataclass(frozen=True)
 class BinaryOp:
@@ -56,6 +69,21 @@ def name(self) -> str:
     def arguments(self) -> int:
         return 2
 
+    def as_expr(
+        self,
+        left_input: typing.Union[str, bigframes.core.expression.Expression] = "arg1",
+        right_input: typing.Union[str, bigframes.core.expression.Expression] = "arg2",
+    ) -> bigframes.core.expression.Expression:
+        import bigframes.core.expression
+
+        return bigframes.core.expression.OpExpression(
+            self,
+            (
+                _convert_expr_input(left_input),
+                _convert_expr_input(right_input),
+            ),
+        )
+
 
 @dataclasses.dataclass(frozen=True)
 class TernaryOp:
@@ -67,6 +95,35 @@ def name(self) -> str:
     def arguments(self) -> int:
         return 3
 
+    def as_expr(
+        self,
+        input1: typing.Union[str, bigframes.core.expression.Expression] = "arg1",
+        input2: typing.Union[str, bigframes.core.expression.Expression] = "arg2",
+        input3: typing.Union[str, bigframes.core.expression.Expression] = "arg3",
+    ) -> bigframes.core.expression.Expression:
+        import bigframes.core.expression
+
+        return bigframes.core.expression.OpExpression(
+            self,
+            (
+                _convert_expr_input(input1),
+                _convert_expr_input(input2),
+                _convert_expr_input(input3),
+            ),
+        )
+
+
+def _convert_expr_input(
+    input: typing.Union[str, bigframes.core.expression.Expression]
+) -> bigframes.core.expression.Expression:
+    """Allows creating free variables with just a string."""
+    import bigframes.core.expression
+
+    if isinstance(input, str):
+        return bigframes.core.expression.UnboundVariableExpression(input)
+    else:
+        return input
+
 
 # Operation Factories
 def create_unary_op(name: str) -> UnaryOp:
@@ -271,63 +328,6 @@ class MapOp(UnaryOp):
mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...] -# Operation Composition -# Meta-ops that do partial application or parameter remapping -# Subject to change, may convert to explicit tree -@dataclasses.dataclass(frozen=True) -class ApplyRight(UnaryOp): - name: typing.ClassVar[str] = "apply_right" - base_op: BinaryOp - right_scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyLeft(UnaryOp): - name: typing.ClassVar[str] = "apply_left" - base_op: BinaryOp - left_scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyArg1(BinaryOp): - name: typing.ClassVar[str] = "apply_arg1" - base_op: TernaryOp - scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ApplyArg3(BinaryOp): - name: typing.ClassVar[str] = "apply_arg3" - base_op: TernaryOp - scalar: typing.Any - - -@dataclasses.dataclass(frozen=True) -class ReverseArgsOp(BinaryOp): - name: typing.ClassVar[str] = "apply_reverse" - base_op: BinaryOp - - -def partial_left(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return ApplyLeft(base_op=op, left_scalar=scalar) - - -def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp: - return ApplyRight(base_op=op, right_scalar=scalar) - - -def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return ApplyArg1(base_op=op, scalar=scalar) - - -def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: - return ApplyArg3(base_op=op, scalar=scalar) - - -def reverse(op: BinaryOp) -> BinaryOp: - return ReverseArgsOp(base_op=op) - - # Binary Ops fillna_op = create_binary_op(name="fillna") cliplower_op = create_binary_op(name="clip_lower") diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 8989255f7e..077815a9d6 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -20,6 +20,7 @@ import bigframes.constants as constants import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.scalar as scalars import bigframes.dtypes import bigframes.operations as ops @@ -136,6 +137,7 @@ def _apply_binary_op( other: typing.Any, op: ops.BinaryOp, alignment: typing.Literal["outer", "left"] = "outer", + reverse: bool = False, ) -> series.Series: """Applies a binary operator to the series and other.""" if isinstance(other, pd.Series): @@ -144,11 +146,7 @@ def _apply_binary_op( f"Pandas series not supported as operand. 
{constants.FEEDBACK_LINK}" ) if isinstance(other, series.Series): - (left, right, block) = self._align(other, how=alignment) - - block, result_id = block.apply_binary_op( - left, right, op, self._value_column - ) + (self_col, other_col, block) = self._align(other, how=alignment) name = self._name if ( @@ -157,13 +155,20 @@ def _apply_binary_op( and alignment == "outer" ): name = None - - return series.Series( - block.select_column(result_id).assign_label(result_id, name) + expr = op.as_expr( + other_col if reverse else self_col, self_col if reverse else other_col ) + block, result_id = block.project_expr(expr, name) + return series.Series(block.select_column(result_id)) + else: - partial_op = ops.ApplyRight(base_op=op, right_scalar=other) - return self._apply_unary_op(partial_op) + name = self._name + expr = op.as_expr( + ex.const(other) if reverse else self._value_column, + self._value_column if reverse else ex.const(other), + ) + block, result_id = self._block.project_expr(expr, name) + return series.Series(block.select_column(result_id)) def _apply_corr_aggregation(self, other: series.Series) -> float: (left, right, block) = self._align(other, how="outer") diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 0f060a23e8..554acda202 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -48,6 +48,7 @@ import bigframes._config as config import bigframes.constants as constants import bigframes.core.blocks +import bigframes.core.expression as ex import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape @@ -294,14 +295,13 @@ def _perform_get_dummies_block_operations( new_column_label = f"{column_label}{value}" if column_label == "": new_column_label = value - new_block, new_id = block.apply_unary_op( - column_id, ops.ApplyLeft(ops.eq_op, value) + new_block, new_id = block.project_expr( + ops.eq_op.as_expr(column_id, ex.const(value)) ) intermediate_col_ids.append(new_id) - block, _ = new_block.apply_unary_op( - new_id, - ops.ApplyRight(ops.fillna_op, False), - result_label=new_column_label, + block, _ = new_block.project_expr( + ops.fillna_op.as_expr(new_id, ex.const(False)), + label=new_column_label, ) if dummy_na: # dummy column name for na depends on the dtype diff --git a/bigframes/series.py b/bigframes/series.py index 8f564423fc..2371aad780 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -34,6 +34,7 @@ from bigframes.core import log_adapter import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks +import bigframes.core.expression as ex import bigframes.core.groupby as groupby import bigframes.core.indexers import bigframes.core.indexes as indexes @@ -188,8 +189,8 @@ def rename( # Will throw if value type isn't compatible with index type. block, const_id = block.create_constant(v, dtype=idx_dtype) - block, cond_id = block.apply_unary_op( - idx_id, ops.ApplyRight(base_op=ops.ne_op, right_scalar=k) + block, cond_id = block.project_expr( + ops.ne_op.as_expr(idx_id, ex.const(k)) ) block, new_idx_id = block.apply_ternary_op( idx_id, cond_id, const_id, ops.where_op @@ -305,14 +306,13 @@ def to_pandas( is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. 
""" df, query_job = self._block.to_pandas( - (self._value_column,), max_download_size=max_download_size, sampling_method=sampling_method, random_state=random_state, ordered=ordered, ) self._set_internal_query_job(query_job) - series = df[self._value_column] + series = df.squeeze(axis=1) series.name = self._name return series @@ -343,8 +343,8 @@ def drop( inverse_condition_id, ops.invert_op ) else: - block, condition_id = block.apply_unary_op( - level_id, ops.partial_right(ops.ne_op, index) + block, condition_id = block.project_expr( + ops.ne_op.as_expr(level_id, ex.const(index)) ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) @@ -489,11 +489,8 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value): block, cond = self._block.apply_unary_op( self._value_column, ops.IsInOp(tuple(to_replace_list)) ) - block, result_col = block.apply_binary_op( - cond, - self._value_column, - ops.partial_arg1(ops.where_op, value), - result_label=self.name, + block, result_col = block.project_expr( + ops.where_op.as_expr(ex.const(value), cond, self._value_column), self.name ) return Series(block.select_column(result_col)) @@ -606,7 +603,7 @@ def add(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.add_op) def radd(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.add_op)) + return self._apply_binary_op(other, ops.add_op, reverse=True) def __sub__(self, other: float | int | Series) -> Series: return self.sub(other) @@ -618,7 +615,7 @@ def sub(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.sub_op) def rsub(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.sub_op)) + return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub @@ -632,7 +629,7 @@ def mul(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.mul_op) def rmul(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.mul_op)) + return self._apply_binary_op(other, ops.mul_op, reverse=True) multiply = mul @@ -646,7 +643,7 @@ def truediv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.div_op) def rtruediv(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.div_op)) + return self._apply_binary_op(other, ops.div_op, reverse=True) div = truediv @@ -664,7 +661,7 @@ def floordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.floordiv_op) def rfloordiv(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.floordiv_op)) + return self._apply_binary_op(other, ops.floordiv_op, reverse=True) def __pow__(self, other: float | int | Series) -> Series: return self.pow(other) @@ -676,7 +673,7 @@ def pow(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.pow_op) def rpow(self, other: float | int | Series) -> Series: - return self._apply_binary_op(other, ops.reverse(ops.pow_op)) + return self._apply_binary_op(other, ops.pow_op, reverse=True) def __lt__(self, other: float | int | Series) -> Series: # type: ignore return self.lt(other) @@ -712,7 +709,7 @@ def mod(self, other) -> Series: # type: ignore return self._apply_binary_op(other, ops.mod_op) def rmod(self, other) -> Series: # type: ignore - return self._apply_binary_op(other, 
ops.reverse(ops.mod_op)) + return self._apply_binary_op(other, ops.mod_op, reverse=True) def divmod(self, other) -> Tuple[Series, Series]: # type: ignore # TODO(huanc): when self and other both has dtype int and other contains zeros, @@ -736,26 +733,6 @@ def round(self, decimals=0) -> "Series": return self._apply_binary_op(decimals, ops.round_op) def corr(self, other: Series, method="pearson", min_periods=None) -> float: - """ - Compute the correlation with the other Series. Non-number values are ignored in the - computation. - - Uses the "Pearson" method of correlation. Numbers are converted to float before - calculation, so the result may be unstable. - - Args: - other (Series): - The series with which this is to be correlated. - method (string, default "pearson"): - Correlation method to use - currently only "pearson" is supported. - min_periods (int, default None): - The minimum number of observations needed to return a result. Non-default values - are not yet supported, so a result will be returned for at least two observations. - - Returns: - float; Will return NaN if there are fewer than two numeric pairs, either series has a - variance or covariance of zero, or any input value is infinite. - """ # TODO(kemppeterson): Validate early that both are numeric # TODO(kemppeterson): Handle partially-numeric columns if method != "pearson": @@ -1524,7 +1501,7 @@ def __array_ufunc__( if inputs[0] is self: return self._apply_binary_op(inputs[1], binop) else: - return self._apply_binary_op(inputs[0], ops.reverse(binop)) + return self._apply_binary_op(inputs[0], binop, reverse=True) return NotImplemented diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 15c262afa7..d503b844aa 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1497,7 +1497,6 @@ def _execute( self, array_value: core.ArrayValue, job_config: Optional[bigquery.job.QueryJobConfig] = None, - max_results: Optional[int] = None, *, sorted: bool = True, dry_run=False, @@ -1507,7 +1506,6 @@ def _execute( return self._start_query( sql=sql, job_config=job_config, - max_results=max_results, ) def _to_sql( diff --git a/bigframes/version.py b/bigframes/version.py index aeefff7a4b..043d22fd11 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.19.0" +__version__ = "0.19.1" diff --git a/notebooks/getting_started/ml_fundamentals.ipynb b/notebooks/getting_started/ml_fundamentals.ipynb deleted file mode 100644 index 165bd90f31..0000000000 --- a/notebooks/getting_started/ml_fundamentals.ipynb +++ /dev/null @@ -1,3908 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using ML - ML fundamentals\n", - "\n", - "The `bigframes.ml` module implements Scikit-Learn's machine learning API in\n", - "BigQuery DataFrames. It exposes BigQuery's ML capabilities in a simple, popular\n", - "API that works seamlessly with the rest of the BigQuery DataFrames API." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 7ddb1bda-402a-4e8e-8476-7904010fb4ef is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e8aba858-7660-4274-8d90-8d2b0382f8f6 is DONE. 28.9 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
penguin_id
0Adelie Penguin (Pygoscelis adeliae)Biscoe40.118.9188.04300.0MALE
1Adelie Penguin (Pygoscelis adeliae)Torgersen39.118.7181.03750.0MALE
2Gentoo penguin (Pygoscelis papua)Biscoe47.414.6212.04725.0FEMALE
3Chinstrap penguin (Pygoscelis antarctica)Dream42.516.7187.03350.0FEMALE
4Adelie Penguin (Pygoscelis adeliae)Biscoe43.219.0197.04775.0MALE
5Gentoo penguin (Pygoscelis papua)Biscoe46.715.3219.05200.0MALE
6Adelie Penguin (Pygoscelis adeliae)Biscoe41.321.1195.04400.0MALE
7Gentoo penguin (Pygoscelis papua)Biscoe45.213.8215.04750.0FEMALE
8Gentoo penguin (Pygoscelis papua)Biscoe46.513.5210.04550.0FEMALE
9Gentoo penguin (Pygoscelis papua)Biscoe50.515.2216.05000.0FEMALE
10Gentoo penguin (Pygoscelis papua)Biscoe48.215.6221.05100.0MALE
11Adelie Penguin (Pygoscelis adeliae)Dream38.118.6190.03700.0FEMALE
12Gentoo penguin (Pygoscelis papua)Biscoe50.715.0223.05550.0MALE
13Adelie Penguin (Pygoscelis adeliae)Biscoe37.820.0190.04250.0MALE
14Adelie Penguin (Pygoscelis adeliae)Biscoe35.017.9190.03450.0FEMALE
15Gentoo penguin (Pygoscelis papua)Biscoe48.715.7208.05350.0MALE
16Adelie Penguin (Pygoscelis adeliae)Torgersen34.621.1198.04400.0MALE
17Gentoo penguin (Pygoscelis papua)Biscoe46.815.4215.05150.0MALE
18Chinstrap penguin (Pygoscelis antarctica)Dream50.320.0197.03300.0MALE
19Adelie Penguin (Pygoscelis adeliae)Dream37.218.1178.03900.0MALE
20Chinstrap penguin (Pygoscelis antarctica)Dream51.018.8203.04100.0MALE
21Adelie Penguin (Pygoscelis adeliae)Biscoe40.517.9187.03200.0FEMALE
22Gentoo penguin (Pygoscelis papua)Biscoe45.513.9210.04200.0FEMALE
23Adelie Penguin (Pygoscelis adeliae)Dream42.218.5180.03550.0FEMALE
24Chinstrap penguin (Pygoscelis antarctica)Dream51.720.3194.03775.0MALE
\n", - "

25 rows × 7 columns

\n", - "
[334 rows x 7 columns in total]" - ], - "text/plain": [ - " species island \\\n", - "penguin_id \n", - "0 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", - "1 Adelie Penguin (Pygoscelis adeliae) Torgersen \n", - "2 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "4 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", - "5 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "6 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", - "7 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "8 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "9 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "10 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "12 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "13 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", - "14 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", - "15 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "16 Adelie Penguin (Pygoscelis adeliae) Torgersen \n", - "17 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "18 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "21 Adelie Penguin (Pygoscelis adeliae) Biscoe \n", - "22 Gentoo penguin (Pygoscelis papua) Biscoe \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream \n", - "\n", - " culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", - "penguin_id \n", - "0 40.1 18.9 188.0 4300.0 \n", - "1 39.1 18.7 181.0 3750.0 \n", - "2 47.4 14.6 212.0 4725.0 \n", - "3 42.5 16.7 187.0 3350.0 \n", - "4 43.2 19.0 197.0 4775.0 \n", - "5 46.7 15.3 219.0 5200.0 \n", - "6 41.3 21.1 195.0 4400.0 \n", - "7 45.2 13.8 215.0 4750.0 \n", - "8 46.5 13.5 210.0 4550.0 \n", - "9 50.5 15.2 216.0 5000.0 \n", - "10 48.2 15.6 221.0 5100.0 \n", - "11 38.1 18.6 190.0 3700.0 \n", - "12 50.7 15.0 223.0 5550.0 \n", - "13 37.8 20.0 190.0 4250.0 \n", - "14 35.0 17.9 190.0 3450.0 \n", - "15 48.7 15.7 208.0 5350.0 \n", - "16 34.6 21.1 198.0 4400.0 \n", - "17 46.8 15.4 215.0 5150.0 \n", - "18 50.3 20.0 197.0 3300.0 \n", - "19 37.2 18.1 178.0 3900.0 \n", - "20 51.0 18.8 203.0 4100.0 \n", - "21 40.5 17.9 187.0 3200.0 \n", - "22 45.5 13.9 210.0 4200.0 \n", - "23 42.2 18.5 180.0 3550.0 \n", - "24 51.7 20.3 194.0 3775.0 \n", - "\n", - " sex \n", - "penguin_id \n", - "0 MALE \n", - "1 MALE \n", - "2 FEMALE \n", - "3 FEMALE \n", - "4 MALE \n", - "5 MALE \n", - "6 MALE \n", - "7 FEMALE \n", - "8 FEMALE \n", - "9 FEMALE \n", - "10 MALE \n", - "11 FEMALE \n", - "12 MALE \n", - "13 MALE \n", - "14 FEMALE \n", - "15 MALE \n", - "16 MALE \n", - "17 MALE \n", - "18 MALE \n", - "19 MALE \n", - "20 MALE \n", - "21 FEMALE \n", - "22 FEMALE \n", - "23 FEMALE \n", - "24 MALE \n", - "...\n", - "\n", - "[334 rows x 7 columns]" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Lets load some test data to use in this tutorial\n", - "import bigframes.pandas\n", - "\n", - "df = bigframes.pandas.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")\n", - "df = df.dropna()\n", - "\n", - "# Temporary workaround: lets name our index so it isn't lost BigQuery DataFrame\n", - "# currently drops unnamed indexes when round-tripping through pandas, which\n", - "# some ML APIs do to route around missing functionality\n", - "df.index.name = \"penguin_id\"\n", - "\n", - "df" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "## Data split\n", - "\n", - "Part of preparing data for a machine learning task is splitting it into subsets for training and testing, to ensure that the solution is not overfitting. Most commonly this is done with `bigframes.ml.model_selection.train_test_split` like so:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job deda90a8-6ec7-419c-8067-e85777bd916f is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job efe8fa0a-d450-475a-99d5-36beeb985247 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5022c56d-e605-4cab-be1b-1ecf189588a1 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 175bd293-d448-4510-b926-1d8cfb4eb5e7 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job a3a2e68c-f5f3-4237-99ad-44974f29d090 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "X_train shape: (267, 6)\n", - "X_test shape: (67, 6)\n", - "y_train shape: (267, 1)\n", - "y_test shape: (67, 1)\n" - ] - } - ], - "source": [ - "# In this example, we're doing supervised learning, where we will learn to predict\n", - "# output variable `y` from input features `X`\n", - "X = df[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex', 'species']]\n", - "y = df[['body_mass_g']] \n", - "\n", - "from bigframes.ml.model_selection import train_test_split\n", - "\n", - "# This will split X and y into test and training sets, with 20% of the rows in the test set,\n", - "# and the rest in the training set\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X, y, test_size=0.2)\n", - "\n", - "# Show the shape of the data after the split\n", - "print(f\"\"\"X_train shape: {X_train.shape}\n", - "X_test shape: {X_test.shape}\n", - "y_train shape: {y_train.shape}\n", - "y_test shape: {y_test.shape}\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job db3365fb-67ca-44cc-a117-88a80dc63cca is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ab78f7ab-a115-448b-92d0-19c091a831ca is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
islandculmen_length_mmculmen_depth_mmflipper_length_mmsexspecies
penguin_id
249Torgersen41.118.6189.0MALEAdelie Penguin (Pygoscelis adeliae)
36Biscoe43.414.4218.0FEMALEGentoo penguin (Pygoscelis papua)
74Biscoe42.814.2209.0FEMALEGentoo penguin (Pygoscelis papua)
235Dream34.017.1185.0FEMALEAdelie Penguin (Pygoscelis adeliae)
117Dream37.818.1193.0MALEAdelie Penguin (Pygoscelis adeliae)
\n", - "

5 rows × 6 columns

\n", - "
[5 rows x 6 columns in total]" - ], - "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm \\\n", - "penguin_id \n", - "249 Torgersen 41.1 18.6 189.0 \n", - "36 Biscoe 43.4 14.4 218.0 \n", - "74 Biscoe 42.8 14.2 209.0 \n", - "235 Dream 34.0 17.1 185.0 \n", - "117 Dream 37.8 18.1 193.0 \n", - "\n", - " sex species \n", - "penguin_id \n", - "249 MALE Adelie Penguin (Pygoscelis adeliae) \n", - "36 FEMALE Gentoo penguin (Pygoscelis papua) \n", - "74 FEMALE Gentoo penguin (Pygoscelis papua) \n", - "235 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "117 MALE Adelie Penguin (Pygoscelis adeliae) \n", - "\n", - "[5 rows x 6 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# If we look at the data, we can see that random rows were selected for\n", - "# each side of the split\n", - "X_test.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 22a72cad-11a6-4f8e-b16d-f92853b8112e is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job bc952727-8806-4fe2-abf2-c3a8a2bd9b6d is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
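Because the split keeps features and labels row-aligned, the two sides can be stitched back
together for inspection. A small sketch, assuming `DataFrame.join` aligns on the shared
`penguin_id` index as it does in pandas:

    # Re-attach the label column to the test features; rows match on penguin_id.
    X_test.join(y_test).head(5)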
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
body_mass_g
penguin_id
2493325.0
364600.0
744700.0
2353400.0
1173750.0
\n", - "

5 rows × 1 columns

\n", - "
[5 rows x 1 columns in total]" - ], - "text/plain": [ - " body_mass_g\n", - "penguin_id \n", - "249 3325.0\n", - "36 4600.0\n", - "74 4700.0\n", - "235 3400.0\n", - "117 3750.0\n", - "\n", - "[5 rows x 1 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Note that this matches the rows in X_test\n", - "y_test.head(5)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Estimators\n", - "\n", - "Following Scikit-Learn, all learning components are \"estimators\"; objects that can learn from training data and then apply themselves to new data. Estimators share the following patterns:\n", - "\n", - "- a constructor that takes a list of parameters\n", - "- a standard string representation that shows the class name and all non-default parameters, e.g. `LinearRegression(fit_intercept=False)`\n", - "- a `.fit(..)` method to fit the estimator to training data\n", - "\n", - "There estimators can be further broken down into two main subtypes:\n", - "\n", - "### Transformers\n", - "\n", - "Transformers are estimators that are used to prepare data for consumption by other estimators ('preprocessing'). In addition to `.fit(...)`, the transformer implements a `.transform(...)` method, which will apply a transformation based on what was computed during `.fit(..)`. With this pattern dynamic preprocessing steps can be applied to both training and test/production data consistently.\n", - "\n", - "An example of a transformer is `bigframes.ml.preprocessing.StandardScaler`, which rescales a dataset to have a mean of zero and a standard deviation of one:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job f239341e-785f-43e1-bfe0-683132d6f15f is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 2d5bbbb9-efc4-4f4e-a8dc-2c7b66b0e5e0 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 66120e1c-2471-4a0c-8b82-aeb189c8866a is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 62825fc4-5b77-43e5-a3e4-525ebfd1285b is DONE. 2.1 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 656d1d69-b4ff-4db6-9f2d-28dcf91e2fd7 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 466507c8-1474-4725-93e5-baf8ee292e39 is DONE. 8.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
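These patterns hold for predictors as well as transformers. A short illustrative sketch, assuming
`bigframes.ml.linear_model.LinearRegression` and its `fit_intercept` parameter (names taken from
the repr example above; treat the exact signature as an assumption):

    from bigframes.ml.linear_model import LinearRegression

    model = LinearRegression(fit_intercept=False)  # constructor takes parameters
    print(model)                 # repr shows class name and non-default parameters:
                                 # LinearRegression(fit_intercept=False)
    model.fit(X_train, y_train)  # fit the estimator to training data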
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
standard_scaled_culmen_length_mmstandard_scaled_culmen_depth_mmstandard_scaled_flipper_length_mm
penguin_id
0-0.7505050.84903-0.937262
20.622496-1.3224020.804051
3-0.299107-0.261935-1.009817
50.490839-0.9689131.311935
6-0.5248061.959995-0.429379
70.208715-1.7263891.021716
91.205551-1.0194121.09427
100.772962-0.8174181.457044
121.243168-1.1204081.602153
14-1.7097250.344046-0.792152
170.509647-0.9184151.021716
181.1679351.404513-0.284269
19-1.2959440.445043-1.662809
201.2995930.7985320.151059
21-0.6752720.344046-1.009817
220.26514-1.6758910.658942
241.431251.556008-0.501934
250.3027560.041055-0.574488
260.302756-1.6758910.949161
270.227523-1.7768880.658942
281.318401-0.3629321.747263
292.2023881.3035160.441278
30-0.9197791.959995-0.356824
311.036277-0.6154241.747263
32-0.2238740.19255-0.356824
\n", - "

25 rows × 3 columns

\n", - "
[267 rows x 3 columns in total]" - ], - "text/plain": [ - " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", - "penguin_id \n", - "0 -0.750505 0.84903 \n", - "2 0.622496 -1.322402 \n", - "3 -0.299107 -0.261935 \n", - "5 0.490839 -0.968913 \n", - "6 -0.524806 1.959995 \n", - "7 0.208715 -1.726389 \n", - "9 1.205551 -1.019412 \n", - "10 0.772962 -0.817418 \n", - "12 1.243168 -1.120408 \n", - "14 -1.709725 0.344046 \n", - "17 0.509647 -0.918415 \n", - "18 1.167935 1.404513 \n", - "19 -1.295944 0.445043 \n", - "20 1.299593 0.798532 \n", - "21 -0.675272 0.344046 \n", - "22 0.26514 -1.675891 \n", - "24 1.43125 1.556008 \n", - "25 0.302756 0.041055 \n", - "26 0.302756 -1.675891 \n", - "27 0.227523 -1.776888 \n", - "28 1.318401 -0.362932 \n", - "29 2.202388 1.303516 \n", - "30 -0.919779 1.959995 \n", - "31 1.036277 -0.615424 \n", - "32 -0.223874 0.19255 \n", - "\n", - " standard_scaled_flipper_length_mm \n", - "penguin_id \n", - "0 -0.937262 \n", - "2 0.804051 \n", - "3 -1.009817 \n", - "5 1.311935 \n", - "6 -0.429379 \n", - "7 1.021716 \n", - "9 1.09427 \n", - "10 1.457044 \n", - "12 1.602153 \n", - "14 -0.792152 \n", - "17 1.021716 \n", - "18 -0.284269 \n", - "19 -1.662809 \n", - "20 0.151059 \n", - "21 -1.009817 \n", - "22 0.658942 \n", - "24 -0.501934 \n", - "25 -0.574488 \n", - "26 0.949161 \n", - "27 0.658942 \n", - "28 1.747263 \n", - "29 0.441278 \n", - "30 -0.356824 \n", - "31 1.747263 \n", - "32 -0.356824 \n", - "...\n", - "\n", - "[267 rows x 3 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml.preprocessing import StandardScaler\n", - "\n", - "# StandardScaler will only work on numeric columns\n", - "numeric_columns = [\"culmen_length_mm\", \"culmen_depth_mm\", \"flipper_length_mm\"]\n", - "\n", - "scaler = StandardScaler()\n", - "scaler.fit(X_train[numeric_columns])\n", - "\n", - "# Now, standardscaler should transform the numbers to have mean of zero\n", - "# and standard deviation of one:\n", - "scaler.transform(X_train[numeric_columns])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 845c6cff-ac6c-46c1-8e9b-061519f1fa1a is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 1e17f5f7-2956-4bdd-baa9-c07591481341 is DONE. 536 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e2fde7a6-67b4-45a4-91d4-1cb9eff66ae5 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e0683619-23c5-44fd-8930-9d3c9d02729a is DONE. 2.1 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
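As a quick sanity check of the "mean zero, standard deviation one" claim, the transformed training
frame can be aggregated directly; a sketch (the `standard_scaled_*` column names follow the output
above, and the aggregates will only be approximately 0 and 1):

    scaled_train = scaler.transform(X_train[numeric_columns])
    print(scaled_train.mean())  # each standard_scaled_* column ≈ 0
    print(scaled_train.std())   # each standard_scaled_* column ≈ 1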
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
standard_scaled_culmen_length_mmstandard_scaled_culmen_depth_mmstandard_scaled_flipper_length_mm
penguin_id
1-0.9385870.748033-1.445145
4-0.167450.899528-0.284269
80.453222-1.8778850.658942
11-1.126670.697535-0.792152
13-1.1830941.404513-0.792152
150.867003-0.7669190.513833
16-1.7849581.959995-0.211715
23-0.3555320.647036-1.5177
34-0.600039-1.7768880.949161
36-0.129833-1.4233991.23938
42-1.615684-0.514427-0.429379
480.415606-0.7164211.021716
610.396797-1.1709071.457044
640.434414-1.1204081.09427
65-1.2207111.051024-1.445145
68-1.484026-0.009443-1.009817
701.6381411.4045130.296168
720.8293870.142052-0.719598
74-0.242683-1.5243960.586387
77-1.277136-0.211437-0.647043
810.208715-1.2214050.804051
911.2619760.6470360.005949
960.246331-1.3224020.731497
105-1.8037660.445043-1.009817
111-1.1642860.697535-2.098138
\n", - "

25 rows × 3 columns

\n", - "
[67 rows x 3 columns in total]" - ], - "text/plain": [ - " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", - "penguin_id \n", - "1 -0.938587 0.748033 \n", - "4 -0.16745 0.899528 \n", - "8 0.453222 -1.877885 \n", - "11 -1.12667 0.697535 \n", - "13 -1.183094 1.404513 \n", - "15 0.867003 -0.766919 \n", - "16 -1.784958 1.959995 \n", - "23 -0.355532 0.647036 \n", - "34 -0.600039 -1.776888 \n", - "36 -0.129833 -1.423399 \n", - "42 -1.615684 -0.514427 \n", - "48 0.415606 -0.716421 \n", - "61 0.396797 -1.170907 \n", - "64 0.434414 -1.120408 \n", - "65 -1.220711 1.051024 \n", - "68 -1.484026 -0.009443 \n", - "70 1.638141 1.404513 \n", - "72 0.829387 0.142052 \n", - "74 -0.242683 -1.524396 \n", - "77 -1.277136 -0.211437 \n", - "81 0.208715 -1.221405 \n", - "91 1.261976 0.647036 \n", - "96 0.246331 -1.322402 \n", - "105 -1.803766 0.445043 \n", - "111 -1.164286 0.697535 \n", - "\n", - " standard_scaled_flipper_length_mm \n", - "penguin_id \n", - "1 -1.445145 \n", - "4 -0.284269 \n", - "8 0.658942 \n", - "11 -0.792152 \n", - "13 -0.792152 \n", - "15 0.513833 \n", - "16 -0.211715 \n", - "23 -1.5177 \n", - "34 0.949161 \n", - "36 1.23938 \n", - "42 -0.429379 \n", - "48 1.021716 \n", - "61 1.457044 \n", - "64 1.09427 \n", - "65 -1.445145 \n", - "68 -1.009817 \n", - "70 0.296168 \n", - "72 -0.719598 \n", - "74 0.586387 \n", - "77 -0.647043 \n", - "81 0.804051 \n", - "91 0.005949 \n", - "96 0.731497 \n", - "105 -1.009817 \n", - "111 -2.098138 \n", - "...\n", - "\n", - "[67 rows x 3 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# We can then repeat this transformation on new data\n", - "scaler.transform(X_test[numeric_columns])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Composing transformers\n", - "\n", - "To process data where different columns need different preprocessors, `bigframes.composition.ColumnTransformer` can be employed:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 75c1ce67-e5d7-4f4c-947e-381fc5298236 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 41962e2e-4d14-4053-9297-3ce61699551a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5d3c22c9-c972-4213-8557-726c9e0aca37 is DONE. 22.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 9cb7b33f-ea05-4cf4-9f92-bb3aa4ea8d10 is DONE. 2.1 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job fe1f35d6-d82c-4aab-a284-637b72554f5b is DONE. 29.2 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 37bc90ff-59cb-4b0c-8f9d-73bcda43524a is DONE. 536 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e23f4724-fdd8-45a9-8c87-defd8d471035 is DONE. 0 Bytes processed. 
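A minimal sketch of how such a composite preprocessor can be wired up (hypothetical wiring: it
assumes `bigframes.ml.compose.ColumnTransformer` takes `(name, transformer, columns)` tuples, plus
the `OneHotEncoder` and `StandardScaler` transformers that produce the `onehotencoded_*` and
`standard_scaled_*` columns shown below):

    from bigframes.ml.compose import ColumnTransformer
    from bigframes.ml.preprocessing import OneHotEncoder, StandardScaler

    preprocessor = ColumnTransformer([
        ("onehot", OneHotEncoder(), ["island", "sex", "species"]),
        ("scale", StandardScaler(), ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]),
    ])
    preprocessor.fit(X_train)
    result = preprocessor.transform(X_train)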
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 257378db-0569-42d7-965a-7757154c710b is DONE. 21.4 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
onehotencoded_islandstandard_scaled_culmen_length_mmstandard_scaled_culmen_depth_mmstandard_scaled_flipper_length_mmonehotencoded_sexonehotencoded_species
penguin_id
0[{'index': 1, 'value': 1.0}]-0.7505050.84903-0.937262[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
2[{'index': 1, 'value': 1.0}]0.622496-1.3224020.804051[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
3[{'index': 2, 'value': 1.0}]-0.299107-0.261935-1.009817[{'index': 1, 'value': 1.0}][{'index': 2, 'value': 1.0}]
5[{'index': 1, 'value': 1.0}]0.490839-0.9689131.311935[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
6[{'index': 1, 'value': 1.0}]-0.5248061.959995-0.429379[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
7[{'index': 1, 'value': 1.0}]0.208715-1.7263891.021716[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
9[{'index': 1, 'value': 1.0}]1.205551-1.0194121.09427[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
10[{'index': 1, 'value': 1.0}]0.772962-0.8174181.457044[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
12[{'index': 1, 'value': 1.0}]1.243168-1.1204081.602153[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
14[{'index': 1, 'value': 1.0}]-1.7097250.344046-0.792152[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
17[{'index': 1, 'value': 1.0}]0.509647-0.9184151.021716[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
18[{'index': 2, 'value': 1.0}]1.1679351.404513-0.284269[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
19[{'index': 2, 'value': 1.0}]-1.2959440.445043-1.662809[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
20[{'index': 2, 'value': 1.0}]1.2995930.7985320.151059[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
21[{'index': 1, 'value': 1.0}]-0.6752720.344046-1.009817[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
22[{'index': 1, 'value': 1.0}]0.26514-1.6758910.658942[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
24[{'index': 2, 'value': 1.0}]1.431251.556008-0.501934[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
25[{'index': 2, 'value': 1.0}]0.3027560.041055-0.574488[{'index': 1, 'value': 1.0}][{'index': 2, 'value': 1.0}]
26[{'index': 1, 'value': 1.0}]0.302756-1.6758910.949161[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
27[{'index': 1, 'value': 1.0}]0.227523-1.7768880.658942[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
28[{'index': 1, 'value': 1.0}]1.318401-0.3629321.747263[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
29[{'index': 2, 'value': 1.0}]2.2023881.3035160.441278[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
30[{'index': 2, 'value': 1.0}]-0.9197791.959995-0.356824[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
31[{'index': 1, 'value': 1.0}]1.036277-0.6154241.747263[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
32[{'index': 3, 'value': 1.0}]-0.2238740.19255-0.356824[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
\n", - "

25 rows × 6 columns

\n", - "
[267 rows x 6 columns in total]" - ], - "text/plain": [ - " onehotencoded_island standard_scaled_culmen_length_mm \\\n", - "penguin_id \n", - "0 [{'index': 1, 'value': 1.0}] -0.750505 \n", - "2 [{'index': 1, 'value': 1.0}] 0.622496 \n", - "3 [{'index': 2, 'value': 1.0}] -0.299107 \n", - "5 [{'index': 1, 'value': 1.0}] 0.490839 \n", - "6 [{'index': 1, 'value': 1.0}] -0.524806 \n", - "7 [{'index': 1, 'value': 1.0}] 0.208715 \n", - "9 [{'index': 1, 'value': 1.0}] 1.205551 \n", - "10 [{'index': 1, 'value': 1.0}] 0.772962 \n", - "12 [{'index': 1, 'value': 1.0}] 1.243168 \n", - "14 [{'index': 1, 'value': 1.0}] -1.709725 \n", - "17 [{'index': 1, 'value': 1.0}] 0.509647 \n", - "18 [{'index': 2, 'value': 1.0}] 1.167935 \n", - "19 [{'index': 2, 'value': 1.0}] -1.295944 \n", - "20 [{'index': 2, 'value': 1.0}] 1.299593 \n", - "21 [{'index': 1, 'value': 1.0}] -0.675272 \n", - "22 [{'index': 1, 'value': 1.0}] 0.26514 \n", - "24 [{'index': 2, 'value': 1.0}] 1.43125 \n", - "25 [{'index': 2, 'value': 1.0}] 0.302756 \n", - "26 [{'index': 1, 'value': 1.0}] 0.302756 \n", - "27 [{'index': 1, 'value': 1.0}] 0.227523 \n", - "28 [{'index': 1, 'value': 1.0}] 1.318401 \n", - "29 [{'index': 2, 'value': 1.0}] 2.202388 \n", - "30 [{'index': 2, 'value': 1.0}] -0.919779 \n", - "31 [{'index': 1, 'value': 1.0}] 1.036277 \n", - "32 [{'index': 3, 'value': 1.0}] -0.223874 \n", - "\n", - " standard_scaled_culmen_depth_mm \\\n", - "penguin_id \n", - "0 0.84903 \n", - "2 -1.322402 \n", - "3 -0.261935 \n", - "5 -0.968913 \n", - "6 1.959995 \n", - "7 -1.726389 \n", - "9 -1.019412 \n", - "10 -0.817418 \n", - "12 -1.120408 \n", - "14 0.344046 \n", - "17 -0.918415 \n", - "18 1.404513 \n", - "19 0.445043 \n", - "20 0.798532 \n", - "21 0.344046 \n", - "22 -1.675891 \n", - "24 1.556008 \n", - "25 0.041055 \n", - "26 -1.675891 \n", - "27 -1.776888 \n", - "28 -0.362932 \n", - "29 1.303516 \n", - "30 1.959995 \n", - "31 -0.615424 \n", - "32 0.19255 \n", - "\n", - " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", - "penguin_id \n", - "0 -0.937262 [{'index': 2, 'value': 1.0}] \n", - "2 0.804051 [{'index': 1, 'value': 1.0}] \n", - "3 -1.009817 [{'index': 1, 'value': 1.0}] \n", - "5 1.311935 [{'index': 2, 'value': 1.0}] \n", - "6 -0.429379 [{'index': 2, 'value': 1.0}] \n", - "7 1.021716 [{'index': 1, 'value': 1.0}] \n", - "9 1.09427 [{'index': 1, 'value': 1.0}] \n", - "10 1.457044 [{'index': 2, 'value': 1.0}] \n", - "12 1.602153 [{'index': 2, 'value': 1.0}] \n", - "14 -0.792152 [{'index': 1, 'value': 1.0}] \n", - "17 1.021716 [{'index': 2, 'value': 1.0}] \n", - "18 -0.284269 [{'index': 2, 'value': 1.0}] \n", - "19 -1.662809 [{'index': 2, 'value': 1.0}] \n", - "20 0.151059 [{'index': 2, 'value': 1.0}] \n", - "21 -1.009817 [{'index': 1, 'value': 1.0}] \n", - "22 0.658942 [{'index': 1, 'value': 1.0}] \n", - "24 -0.501934 [{'index': 2, 'value': 1.0}] \n", - "25 -0.574488 [{'index': 1, 'value': 1.0}] \n", - "26 0.949161 [{'index': 1, 'value': 1.0}] \n", - "27 0.658942 [{'index': 1, 'value': 1.0}] \n", - "28 1.747263 [{'index': 2, 'value': 1.0}] \n", - "29 0.441278 [{'index': 2, 'value': 1.0}] \n", - "30 -0.356824 [{'index': 2, 'value': 1.0}] \n", - "31 1.747263 [{'index': 2, 'value': 1.0}] \n", - "32 -0.356824 [{'index': 2, 'value': 1.0}] \n", - "\n", - " onehotencoded_species \n", - "penguin_id \n", - "0 [{'index': 1, 'value': 1.0}] \n", - "2 [{'index': 3, 'value': 1.0}] \n", - "3 [{'index': 2, 'value': 1.0}] \n", - "5 [{'index': 3, 'value': 1.0}] \n", - "6 [{'index': 1, 'value': 1.0}] \n", - "7 [{'index': 3, 'value': 1.0}] \n", - "9 
[{'index': 3, 'value': 1.0}] \n", - "10 [{'index': 3, 'value': 1.0}] \n", - "12 [{'index': 3, 'value': 1.0}] \n", - "14 [{'index': 1, 'value': 1.0}] \n", - "17 [{'index': 3, 'value': 1.0}] \n", - "18 [{'index': 2, 'value': 1.0}] \n", - "19 [{'index': 1, 'value': 1.0}] \n", - "20 [{'index': 2, 'value': 1.0}] \n", - "21 [{'index': 1, 'value': 1.0}] \n", - "22 [{'index': 3, 'value': 1.0}] \n", - "24 [{'index': 2, 'value': 1.0}] \n", - "25 [{'index': 2, 'value': 1.0}] \n", - "26 [{'index': 3, 'value': 1.0}] \n", - "27 [{'index': 3, 'value': 1.0}] \n", - "28 [{'index': 3, 'value': 1.0}] \n", - "29 [{'index': 2, 'value': 1.0}] \n", - "30 [{'index': 1, 'value': 1.0}] \n", - "31 [{'index': 3, 'value': 1.0}] \n", - "32 [{'index': 1, 'value': 1.0}] \n", - "...\n", - "\n", - "[267 rows x 6 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml.compose import ColumnTransformer\n", - "from bigframes.ml.preprocessing import OneHotEncoder\n", - "\n", - "# Create an aggregate transform that applies StandardScaler to the numeric columns,\n", - "# and OneHotEncoder to the string columns\n", - "preproc = ColumnTransformer([\n", - " (\"scale\", StandardScaler(), [\"culmen_length_mm\", \"culmen_depth_mm\", \"flipper_length_mm\"]),\n", - " (\"encode\", OneHotEncoder(), [\"species\", \"sex\", \"island\"])])\n", - "\n", - "# Now we can fit all columns of the training data\n", - "preproc.fit(X_train)\n", - "\n", - "processed_X_train = preproc.transform(X_train)\n", - "processed_X_test = preproc.transform(X_test)\n", - "\n", - "processed_X_train" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Predictors\n", - "\n", - "Predictors are estimators that learn and make predictions. In addition to `.fit(...)`, the predictor implements a `.predict(...)` method, which uses what was learned during `.fit(...)` to predict some output.\n", - "\n", - "Predictors can be further broken down into two categories:\n", - "\n", - "#### Supervised predictors\n", - "\n", - "Supervised learning is when we train a model on input-output pairs, and then ask it to predict the output for new inputs. An example of such a predictor is `bigframes.ml.linear_model.LinearRegression`." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 7d9c9f8b-6b4c-451f-ae3d-06fb7090d148 is DONE. 21.4 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job be87ccfa-72ab-4858-9d4a-b2f5f8b2a5e6 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 2d651fac-11bf-42da-8c18-bd33207379ca is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 58836ccc-242b-4574-bc48-4c269e74dbf1 is DONE. 5.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 1bf531f0-0fde-489b-ab36-6040a2a12377 is DONE. 536 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4245f4e6-4d5b-404f-81d7-50f0553e2456 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ed951699-c005-450e-a8b6-0916ec234e7f is DONE. 5.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_body_mass_gonehotencoded_islandstandard_scaled_culmen_length_mmstandard_scaled_culmen_depth_mmstandard_scaled_flipper_length_mmonehotencoded_sexonehotencoded_species
penguin_id
13781.402407[{'index': 3, 'value': 1.0}]-0.9385870.748033-1.445145[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
44124.107944[{'index': 1, 'value': 1.0}]-0.167450.899528-0.284269[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
84670.344196[{'index': 1, 'value': 1.0}]0.453222-1.8778850.658942[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
113529.417214[{'index': 2, 'value': 1.0}]-1.126670.697535-0.792152[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
134014.101714[{'index': 1, 'value': 1.0}]-1.1830941.404513-0.792152[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
155212.41288[{'index': 1, 'value': 1.0}]0.867003-0.7669190.513833[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
164163.595615[{'index': 3, 'value': 1.0}]-1.7849581.959995-0.211715[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
233392.453069[{'index': 2, 'value': 1.0}]-0.3555320.647036-1.5177[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
344698.305397[{'index': 1, 'value': 1.0}]-0.600039-1.7768880.949161[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
364828.226949[{'index': 1, 'value': 1.0}]-0.129833-1.4233991.23938[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
423430.58866[{'index': 1, 'value': 1.0}]-1.615684-0.514427-0.429379[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
485314.260221[{'index': 1, 'value': 1.0}]0.415606-0.7164211.021716[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
615363.205372[{'index': 1, 'value': 1.0}]0.396797-1.1709071.457044[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
644855.908314[{'index': 1, 'value': 1.0}]0.434414-1.1204081.09427[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
653413.100524[{'index': 2, 'value': 1.0}]-1.2207111.051024-1.445145[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
683340.219002[{'index': 3, 'value': 1.0}]-1.484026-0.009443-1.009817[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
704228.73157[{'index': 2, 'value': 1.0}]1.6381411.4045130.296168[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
723811.538478[{'index': 2, 'value': 1.0}]0.8293870.142052-0.719598[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
744659.770763[{'index': 1, 'value': 1.0}]-0.242683-1.5243960.586387[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
773453.388804[{'index': 2, 'value': 1.0}]-1.277136-0.211437-0.647043[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
814766.245033[{'index': 1, 'value': 1.0}]0.208715-1.2214050.804051[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
914057.807281[{'index': 2, 'value': 1.0}]1.2619760.6470360.005949[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
964739.827445[{'index': 1, 'value': 1.0}]0.246331-1.3224020.731497[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
1053394.891976[{'index': 1, 'value': 1.0}]-1.8037660.445043-1.009817[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
1113201.493683[{'index': 1, 'value': 1.0}]-1.1642860.697535-2.098138[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
\n", - "

25 rows × 7 columns

\n", - "
[67 rows x 7 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g onehotencoded_island \\\n", - "penguin_id \n", - "1 3781.402407 [{'index': 3, 'value': 1.0}] \n", - "4 4124.107944 [{'index': 1, 'value': 1.0}] \n", - "8 4670.344196 [{'index': 1, 'value': 1.0}] \n", - "11 3529.417214 [{'index': 2, 'value': 1.0}] \n", - "13 4014.101714 [{'index': 1, 'value': 1.0}] \n", - "15 5212.41288 [{'index': 1, 'value': 1.0}] \n", - "16 4163.595615 [{'index': 3, 'value': 1.0}] \n", - "23 3392.453069 [{'index': 2, 'value': 1.0}] \n", - "34 4698.305397 [{'index': 1, 'value': 1.0}] \n", - "36 4828.226949 [{'index': 1, 'value': 1.0}] \n", - "42 3430.58866 [{'index': 1, 'value': 1.0}] \n", - "48 5314.260221 [{'index': 1, 'value': 1.0}] \n", - "61 5363.205372 [{'index': 1, 'value': 1.0}] \n", - "64 4855.908314 [{'index': 1, 'value': 1.0}] \n", - "65 3413.100524 [{'index': 2, 'value': 1.0}] \n", - "68 3340.219002 [{'index': 3, 'value': 1.0}] \n", - "70 4228.73157 [{'index': 2, 'value': 1.0}] \n", - "72 3811.538478 [{'index': 2, 'value': 1.0}] \n", - "74 4659.770763 [{'index': 1, 'value': 1.0}] \n", - "77 3453.388804 [{'index': 2, 'value': 1.0}] \n", - "81 4766.245033 [{'index': 1, 'value': 1.0}] \n", - "91 4057.807281 [{'index': 2, 'value': 1.0}] \n", - "96 4739.827445 [{'index': 1, 'value': 1.0}] \n", - "105 3394.891976 [{'index': 1, 'value': 1.0}] \n", - "111 3201.493683 [{'index': 1, 'value': 1.0}] \n", - "\n", - " standard_scaled_culmen_length_mm standard_scaled_culmen_depth_mm \\\n", - "penguin_id \n", - "1 -0.938587 0.748033 \n", - "4 -0.16745 0.899528 \n", - "8 0.453222 -1.877885 \n", - "11 -1.12667 0.697535 \n", - "13 -1.183094 1.404513 \n", - "15 0.867003 -0.766919 \n", - "16 -1.784958 1.959995 \n", - "23 -0.355532 0.647036 \n", - "34 -0.600039 -1.776888 \n", - "36 -0.129833 -1.423399 \n", - "42 -1.615684 -0.514427 \n", - "48 0.415606 -0.716421 \n", - "61 0.396797 -1.170907 \n", - "64 0.434414 -1.120408 \n", - "65 -1.220711 1.051024 \n", - "68 -1.484026 -0.009443 \n", - "70 1.638141 1.404513 \n", - "72 0.829387 0.142052 \n", - "74 -0.242683 -1.524396 \n", - "77 -1.277136 -0.211437 \n", - "81 0.208715 -1.221405 \n", - "91 1.261976 0.647036 \n", - "96 0.246331 -1.322402 \n", - "105 -1.803766 0.445043 \n", - "111 -1.164286 0.697535 \n", - "\n", - " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", - "penguin_id \n", - "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", - "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", - "8 0.658942 [{'index': 1, 'value': 1.0}] \n", - "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", - "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", - "15 0.513833 [{'index': 2, 'value': 1.0}] \n", - "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", - "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", - "34 0.949161 [{'index': 1, 'value': 1.0}] \n", - "36 1.23938 [{'index': 1, 'value': 1.0}] \n", - "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", - "48 1.021716 [{'index': 2, 'value': 1.0}] \n", - "61 1.457044 [{'index': 2, 'value': 1.0}] \n", - "64 1.09427 [{'index': 1, 'value': 1.0}] \n", - "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", - "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", - "70 0.296168 [{'index': 2, 'value': 1.0}] \n", - "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", - "74 0.586387 [{'index': 1, 'value': 1.0}] \n", - "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", - "81 0.804051 [{'index': 1, 'value': 1.0}] \n", - "91 0.005949 [{'index': 2, 'value': 1.0}] \n", - "96 0.731497 [{'index': 1, 'value': 1.0}] \n", - "105 -1.009817 [{'index': 1, 'value': 1.0}] \n", - 
"111 -2.098138 [{'index': 1, 'value': 1.0}] \n", - "\n", - " onehotencoded_species \n", - "penguin_id \n", - "1 [{'index': 1, 'value': 1.0}] \n", - "4 [{'index': 1, 'value': 1.0}] \n", - "8 [{'index': 3, 'value': 1.0}] \n", - "11 [{'index': 1, 'value': 1.0}] \n", - "13 [{'index': 1, 'value': 1.0}] \n", - "15 [{'index': 3, 'value': 1.0}] \n", - "16 [{'index': 1, 'value': 1.0}] \n", - "23 [{'index': 1, 'value': 1.0}] \n", - "34 [{'index': 3, 'value': 1.0}] \n", - "36 [{'index': 3, 'value': 1.0}] \n", - "42 [{'index': 1, 'value': 1.0}] \n", - "48 [{'index': 3, 'value': 1.0}] \n", - "61 [{'index': 3, 'value': 1.0}] \n", - "64 [{'index': 3, 'value': 1.0}] \n", - "65 [{'index': 1, 'value': 1.0}] \n", - "68 [{'index': 1, 'value': 1.0}] \n", - "70 [{'index': 2, 'value': 1.0}] \n", - "72 [{'index': 2, 'value': 1.0}] \n", - "74 [{'index': 3, 'value': 1.0}] \n", - "77 [{'index': 1, 'value': 1.0}] \n", - "81 [{'index': 3, 'value': 1.0}] \n", - "91 [{'index': 2, 'value': 1.0}] \n", - "96 [{'index': 3, 'value': 1.0}] \n", - "105 [{'index': 1, 'value': 1.0}] \n", - "111 [{'index': 1, 'value': 1.0}] \n", - "\n", - "[67 rows x 7 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml.linear_model import LinearRegression\n", - "\n", - "linreg = LinearRegression()\n", - "\n", - "# Learn from the training data how to predict output y\n", - "linreg.fit(processed_X_train, y_train)\n", - "\n", - "# Predict y for the test data\n", - "predicted_y_test = linreg.predict(processed_X_test)\n", - "\n", - "predicted_y_test" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Unsupervised predictors\n", - "\n", - "In unsupervised learning, there are no known outputs in the training data, instead the model learns on input data alone and predicts something else. An example of an unsupervised predictor is `bigframes.ml.cluster.KMeans`, which learns how to fit input data to a target number of clusters." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 027042f1-9a18-43d8-a378-ab9410e395b1 is DONE. 23.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 6c8484a0-a504-4e50-93d6-3d247c9ff558 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e81ca2de-df2e-41ec-af86-14f8dcec1b44 is DONE. 6.2 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 3e6d413c-f8c4-4390-95eb-3a1f5bc59aed is DONE. 536 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e448220d-0c50-45b7-bcbe-d1159b3d18ce is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e167a234-828d-4f05-8654-63cf97e50ba3 is DONE. 10.2 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CENTROID_IDNEAREST_CENTROIDS_DISTANCEonehotencoded_islandstandard_scaled_culmen_length_mmstandard_scaled_culmen_depth_mmstandard_scaled_flipper_length_mmonehotencoded_sexonehotencoded_species
penguin_id
13[{'CENTROID_ID': 3, 'DISTANCE': 1.236380597035...[{'index': 3, 'value': 1.0}]-0.9385870.748033-1.445145[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
43[{'CENTROID_ID': 3, 'DISTANCE': 1.039497631856...[{'index': 1, 'value': 1.0}]-0.167450.899528-0.284269[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
81[{'CENTROID_ID': 1, 'DISTANCE': 1.171040485975...[{'index': 1, 'value': 1.0}]0.453222-1.8778850.658942[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
112[{'CENTROID_ID': 2, 'DISTANCE': 0.969102754012...[{'index': 2, 'value': 1.0}]-1.126670.697535-0.792152[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
133[{'CENTROID_ID': 3, 'DISTANCE': 1.113138945949...[{'index': 1, 'value': 1.0}]-1.1830941.404513-0.792152[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
151[{'CENTROID_ID': 1, 'DISTANCE': 1.070996026772...[{'index': 1, 'value': 1.0}]0.867003-0.7669190.513833[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
163[{'CENTROID_ID': 3, 'DISTANCE': 1.780136190720...[{'index': 3, 'value': 1.0}]-1.7849581.959995-0.211715[{'index': 2, 'value': 1.0}][{'index': 1, 'value': 1.0}]
232[{'CENTROID_ID': 2, 'DISTANCE': 1.382540667483...[{'index': 2, 'value': 1.0}]-0.3555320.647036-1.5177[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
341[{'CENTROID_ID': 1, 'DISTANCE': 1.598627908302...[{'index': 1, 'value': 1.0}]-0.600039-1.7768880.949161[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
361[{'CENTROID_ID': 1, 'DISTANCE': 1.095162305190...[{'index': 1, 'value': 1.0}]-0.129833-1.4233991.23938[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
422[{'CENTROID_ID': 2, 'DISTANCE': 1.275841743930...[{'index': 1, 'value': 1.0}]-1.615684-0.514427-0.429379[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
481[{'CENTROID_ID': 1, 'DISTANCE': 0.882209023196...[{'index': 1, 'value': 1.0}]0.415606-0.7164211.021716[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
611[{'CENTROID_ID': 1, 'DISTANCE': 0.816202832282...[{'index': 1, 'value': 1.0}]0.396797-1.1709071.457044[{'index': 2, 'value': 1.0}][{'index': 3, 'value': 1.0}]
641[{'CENTROID_ID': 1, 'DISTANCE': 0.735435721625...[{'index': 1, 'value': 1.0}]0.434414-1.1204081.09427[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
652[{'CENTROID_ID': 2, 'DISTANCE': 1.292559869148...[{'index': 2, 'value': 1.0}]-1.2207111.051024-1.445145[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
682[{'CENTROID_ID': 2, 'DISTANCE': 0.876430138449...[{'index': 3, 'value': 1.0}]-1.484026-0.009443-1.009817[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
704[{'CENTROID_ID': 4, 'DISTANCE': 1.314229913955...[{'index': 2, 'value': 1.0}]1.6381411.4045130.296168[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
724[{'CENTROID_ID': 4, 'DISTANCE': 0.938569518009...[{'index': 2, 'value': 1.0}]0.8293870.142052-0.719598[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
741[{'CENTROID_ID': 1, 'DISTANCE': 1.350320088546...[{'index': 1, 'value': 1.0}]-0.242683-1.5243960.586387[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
772[{'CENTROID_ID': 2, 'DISTANCE': 0.904806634663...[{'index': 2, 'value': 1.0}]-1.277136-0.211437-0.647043[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
811[{'CENTROID_ID': 1, 'DISTANCE': 0.919082578073...[{'index': 1, 'value': 1.0}]0.208715-1.2214050.804051[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
914[{'CENTROID_ID': 4, 'DISTANCE': 0.760360038086...[{'index': 2, 'value': 1.0}]1.2619760.6470360.005949[{'index': 2, 'value': 1.0}][{'index': 2, 'value': 1.0}]
961[{'CENTROID_ID': 1, 'DISTANCE': 0.950188657227...[{'index': 1, 'value': 1.0}]0.246331-1.3224020.731497[{'index': 1, 'value': 1.0}][{'index': 3, 'value': 1.0}]
1052[{'CENTROID_ID': 2, 'DISTANCE': 1.101316467029...[{'index': 1, 'value': 1.0}]-1.8037660.445043-1.009817[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
1112[{'CENTROID_ID': 2, 'DISTANCE': 1.549061068385...[{'index': 1, 'value': 1.0}]-1.1642860.697535-2.098138[{'index': 1, 'value': 1.0}][{'index': 1, 'value': 1.0}]
\n", - "

25 rows × 8 columns

\n", - "
[67 rows x 8 columns in total]" - ], - "text/plain": [ - " CENTROID_ID NEAREST_CENTROIDS_DISTANCE \\\n", - "penguin_id \n", - "1 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.236380597035... \n", - "4 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.039497631856... \n", - "8 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.171040485975... \n", - "11 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.969102754012... \n", - "13 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.113138945949... \n", - "15 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.070996026772... \n", - "16 3 [{'CENTROID_ID': 3, 'DISTANCE': 1.780136190720... \n", - "23 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.382540667483... \n", - "34 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.598627908302... \n", - "36 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.095162305190... \n", - "42 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.275841743930... \n", - "48 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.882209023196... \n", - "61 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.816202832282... \n", - "64 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.735435721625... \n", - "65 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.292559869148... \n", - "68 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.876430138449... \n", - "70 4 [{'CENTROID_ID': 4, 'DISTANCE': 1.314229913955... \n", - "72 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.938569518009... \n", - "74 1 [{'CENTROID_ID': 1, 'DISTANCE': 1.350320088546... \n", - "77 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.904806634663... \n", - "81 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.919082578073... \n", - "91 4 [{'CENTROID_ID': 4, 'DISTANCE': 0.760360038086... \n", - "96 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.950188657227... \n", - "105 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.101316467029... \n", - "111 2 [{'CENTROID_ID': 2, 'DISTANCE': 1.549061068385... \n", - "\n", - " onehotencoded_island standard_scaled_culmen_length_mm \\\n", - "penguin_id \n", - "1 [{'index': 3, 'value': 1.0}] -0.938587 \n", - "4 [{'index': 1, 'value': 1.0}] -0.16745 \n", - "8 [{'index': 1, 'value': 1.0}] 0.453222 \n", - "11 [{'index': 2, 'value': 1.0}] -1.12667 \n", - "13 [{'index': 1, 'value': 1.0}] -1.183094 \n", - "15 [{'index': 1, 'value': 1.0}] 0.867003 \n", - "16 [{'index': 3, 'value': 1.0}] -1.784958 \n", - "23 [{'index': 2, 'value': 1.0}] -0.355532 \n", - "34 [{'index': 1, 'value': 1.0}] -0.600039 \n", - "36 [{'index': 1, 'value': 1.0}] -0.129833 \n", - "42 [{'index': 1, 'value': 1.0}] -1.615684 \n", - "48 [{'index': 1, 'value': 1.0}] 0.415606 \n", - "61 [{'index': 1, 'value': 1.0}] 0.396797 \n", - "64 [{'index': 1, 'value': 1.0}] 0.434414 \n", - "65 [{'index': 2, 'value': 1.0}] -1.220711 \n", - "68 [{'index': 3, 'value': 1.0}] -1.484026 \n", - "70 [{'index': 2, 'value': 1.0}] 1.638141 \n", - "72 [{'index': 2, 'value': 1.0}] 0.829387 \n", - "74 [{'index': 1, 'value': 1.0}] -0.242683 \n", - "77 [{'index': 2, 'value': 1.0}] -1.277136 \n", - "81 [{'index': 1, 'value': 1.0}] 0.208715 \n", - "91 [{'index': 2, 'value': 1.0}] 1.261976 \n", - "96 [{'index': 1, 'value': 1.0}] 0.246331 \n", - "105 [{'index': 1, 'value': 1.0}] -1.803766 \n", - "111 [{'index': 1, 'value': 1.0}] -1.164286 \n", - "\n", - " standard_scaled_culmen_depth_mm \\\n", - "penguin_id \n", - "1 0.748033 \n", - "4 0.899528 \n", - "8 -1.877885 \n", - "11 0.697535 \n", - "13 1.404513 \n", - "15 -0.766919 \n", - "16 1.959995 \n", - "23 0.647036 \n", - "34 -1.776888 \n", - "36 -1.423399 \n", - "42 -0.514427 \n", - "48 -0.716421 \n", - "61 -1.170907 \n", - "64 -1.120408 \n", - "65 1.051024 \n", - "68 -0.009443 \n", - "70 1.404513 \n", - "72 0.142052 \n", - "74 -1.524396 \n", - "77 -0.211437 \n", - "81 -1.221405 \n", - "91 0.647036 \n", - "96 
-1.322402 \n", - "105 0.445043 \n", - "111 0.697535 \n", - "\n", - " standard_scaled_flipper_length_mm onehotencoded_sex \\\n", - "penguin_id \n", - "1 -1.445145 [{'index': 2, 'value': 1.0}] \n", - "4 -0.284269 [{'index': 2, 'value': 1.0}] \n", - "8 0.658942 [{'index': 1, 'value': 1.0}] \n", - "11 -0.792152 [{'index': 1, 'value': 1.0}] \n", - "13 -0.792152 [{'index': 2, 'value': 1.0}] \n", - "15 0.513833 [{'index': 2, 'value': 1.0}] \n", - "16 -0.211715 [{'index': 2, 'value': 1.0}] \n", - "23 -1.5177 [{'index': 1, 'value': 1.0}] \n", - "34 0.949161 [{'index': 1, 'value': 1.0}] \n", - "36 1.23938 [{'index': 1, 'value': 1.0}] \n", - "42 -0.429379 [{'index': 1, 'value': 1.0}] \n", - "48 1.021716 [{'index': 2, 'value': 1.0}] \n", - "61 1.457044 [{'index': 2, 'value': 1.0}] \n", - "64 1.09427 [{'index': 1, 'value': 1.0}] \n", - "65 -1.445145 [{'index': 1, 'value': 1.0}] \n", - "68 -1.009817 [{'index': 1, 'value': 1.0}] \n", - "70 0.296168 [{'index': 2, 'value': 1.0}] \n", - "72 -0.719598 [{'index': 2, 'value': 1.0}] \n", - "74 0.586387 [{'index': 1, 'value': 1.0}] \n", - "77 -0.647043 [{'index': 1, 'value': 1.0}] \n", - "81 0.804051 [{'index': 1, 'value': 1.0}] \n", - "91 0.005949 [{'index': 2, 'value': 1.0}] \n", - "96 0.731497 [{'index': 1, 'value': 1.0}] \n", - "105 -1.009817 [{'index': 1, 'value': 1.0}] \n", - "111 -2.098138 [{'index': 1, 'value': 1.0}] \n", - "\n", - " onehotencoded_species \n", - "penguin_id \n", - "1 [{'index': 1, 'value': 1.0}] \n", - "4 [{'index': 1, 'value': 1.0}] \n", - "8 [{'index': 3, 'value': 1.0}] \n", - "11 [{'index': 1, 'value': 1.0}] \n", - "13 [{'index': 1, 'value': 1.0}] \n", - "15 [{'index': 3, 'value': 1.0}] \n", - "16 [{'index': 1, 'value': 1.0}] \n", - "23 [{'index': 1, 'value': 1.0}] \n", - "34 [{'index': 3, 'value': 1.0}] \n", - "36 [{'index': 3, 'value': 1.0}] \n", - "42 [{'index': 1, 'value': 1.0}] \n", - "48 [{'index': 3, 'value': 1.0}] \n", - "61 [{'index': 3, 'value': 1.0}] \n", - "64 [{'index': 3, 'value': 1.0}] \n", - "65 [{'index': 1, 'value': 1.0}] \n", - "68 [{'index': 1, 'value': 1.0}] \n", - "70 [{'index': 2, 'value': 1.0}] \n", - "72 [{'index': 2, 'value': 1.0}] \n", - "74 [{'index': 3, 'value': 1.0}] \n", - "77 [{'index': 1, 'value': 1.0}] \n", - "81 [{'index': 3, 'value': 1.0}] \n", - "91 [{'index': 2, 'value': 1.0}] \n", - "96 [{'index': 3, 'value': 1.0}] \n", - "105 [{'index': 1, 'value': 1.0}] \n", - "111 [{'index': 1, 'value': 1.0}] \n", - "\n", - "[67 rows x 8 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml.cluster import KMeans\n", - "\n", - "kmeans = KMeans(n_clusters=4)\n", - "\n", - "kmeans.fit(processed_X_train)\n", - "\n", - "kmeans.predict(processed_X_test)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pipelines\n", - "\n", - "Transformers and predictors can be chained into a single estimator component using `bigframes.ml.pipeline.Pipeline`:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Pipeline(steps=[('preproc',\n", - " ColumnTransformer(transformers=[('scale', StandardScaler(),\n", - " ['culmen_length_mm',\n", - " 'culmen_depth_mm',\n", - " 'flipper_length_mm']),\n", - " ('encode', OneHotEncoder(),\n", - " ['species', 'sex',\n", - " 'island'])])),\n", - " ('linreg', LinearRegression())])" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - 
"from bigframes.ml.pipeline import Pipeline\n", - "\n", - "pipeline = Pipeline([\n", - " ('preproc', preproc),\n", - " ('linreg', linreg)\n", - "])\n", - "\n", - "# Print our pipeline\n", - "pipeline" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The pipeline simplifies the workflow by applying each of its component steps automatically:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job b11be0d8-e6f1-41cb-8cb2-25a38e7ef311 is DONE. 24.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job f32ea25c-be39-4726-a8f5-604ae83849a6 is DONE. 8.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 86e29b78-76f5-4937-8bde-407b99af04a2 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ca819734-0d41-4d9e-b743-09edae8c7fee is DONE. 29.6 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 49bb5bed-cc84-47e0-9a90-08ab01e00548 is DONE. 536 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 1e40a085-2289-47dd-afd8-820413186b9f is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 60319296-a480-4f51-b7ad-190ac6de963a is DONE. 6.2 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_body_mass_gislandculmen_length_mmculmen_depth_mmflipper_length_mmsexspecies
penguin_id
13781.396682Torgersen39.118.7181.0MALEAdelie Penguin (Pygoscelis adeliae)
44124.102574Biscoe43.219.0197.0MALEAdelie Penguin (Pygoscelis adeliae)
84670.338389Biscoe46.513.5210.0FEMALEGentoo penguin (Pygoscelis papua)
113529.411644Dream38.118.6190.0FEMALEAdelie Penguin (Pygoscelis adeliae)
134014.09632Biscoe37.820.0190.0MALEAdelie Penguin (Pygoscelis adeliae)
155212.407319Biscoe48.715.7208.0MALEGentoo penguin (Pygoscelis papua)
164163.590502Torgersen34.621.1198.0MALEAdelie Penguin (Pygoscelis adeliae)
233392.44731Dream42.218.5180.0FEMALEAdelie Penguin (Pygoscelis adeliae)
344698.299674Biscoe40.913.7214.0FEMALEGentoo penguin (Pygoscelis papua)
364828.221398Biscoe43.414.4218.0FEMALEGentoo penguin (Pygoscelis papua)
423430.582874Biscoe35.516.2195.0FEMALEAdelie Penguin (Pygoscelis adeliae)
485314.254798Biscoe46.315.8215.0MALEGentoo penguin (Pygoscelis papua)
615363.19995Biscoe46.214.9221.0MALEGentoo penguin (Pygoscelis papua)
644855.90281Biscoe46.415.0216.0FEMALEGentoo penguin (Pygoscelis papua)
653413.094869Dream37.619.3181.0FEMALEAdelie Penguin (Pygoscelis adeliae)
683340.213193Torgersen36.217.2187.0FEMALEAdelie Penguin (Pygoscelis adeliae)
704228.726508Dream52.820.0205.0MALEChinstrap penguin (Pygoscelis antarctica)
723811.532821Dream48.517.5191.0MALEChinstrap penguin (Pygoscelis antarctica)
744659.765013Biscoe42.814.2209.0FEMALEGentoo penguin (Pygoscelis papua)
773453.383042Dream37.316.8192.0FEMALEAdelie Penguin (Pygoscelis adeliae)
814766.239424Biscoe45.214.8212.0FEMALEGentoo penguin (Pygoscelis papua)
914057.801947Dream50.818.5201.0MALEChinstrap penguin (Pygoscelis antarctica)
964739.821792Biscoe45.414.6211.0FEMALEGentoo penguin (Pygoscelis papua)
1053394.886275Biscoe34.518.1187.0FEMALEAdelie Penguin (Pygoscelis adeliae)
1113201.48777Biscoe37.918.6172.0FEMALEAdelie Penguin (Pygoscelis adeliae)
\n", - "

25 rows × 7 columns

\n", - "
[67 rows x 7 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g island culmen_length_mm \\\n", - "penguin_id \n", - "1 3781.396682 Torgersen 39.1 \n", - "4 4124.102574 Biscoe 43.2 \n", - "8 4670.338389 Biscoe 46.5 \n", - "11 3529.411644 Dream 38.1 \n", - "13 4014.09632 Biscoe 37.8 \n", - "15 5212.407319 Biscoe 48.7 \n", - "16 4163.590502 Torgersen 34.6 \n", - "23 3392.44731 Dream 42.2 \n", - "34 4698.299674 Biscoe 40.9 \n", - "36 4828.221398 Biscoe 43.4 \n", - "42 3430.582874 Biscoe 35.5 \n", - "48 5314.254798 Biscoe 46.3 \n", - "61 5363.19995 Biscoe 46.2 \n", - "64 4855.90281 Biscoe 46.4 \n", - "65 3413.094869 Dream 37.6 \n", - "68 3340.213193 Torgersen 36.2 \n", - "70 4228.726508 Dream 52.8 \n", - "72 3811.532821 Dream 48.5 \n", - "74 4659.765013 Biscoe 42.8 \n", - "77 3453.383042 Dream 37.3 \n", - "81 4766.239424 Biscoe 45.2 \n", - "91 4057.801947 Dream 50.8 \n", - "96 4739.821792 Biscoe 45.4 \n", - "105 3394.886275 Biscoe 34.5 \n", - "111 3201.48777 Biscoe 37.9 \n", - "\n", - " culmen_depth_mm flipper_length_mm sex \\\n", - "penguin_id \n", - "1 18.7 181.0 MALE \n", - "4 19.0 197.0 MALE \n", - "8 13.5 210.0 FEMALE \n", - "11 18.6 190.0 FEMALE \n", - "13 20.0 190.0 MALE \n", - "15 15.7 208.0 MALE \n", - "16 21.1 198.0 MALE \n", - "23 18.5 180.0 FEMALE \n", - "34 13.7 214.0 FEMALE \n", - "36 14.4 218.0 FEMALE \n", - "42 16.2 195.0 FEMALE \n", - "48 15.8 215.0 MALE \n", - "61 14.9 221.0 MALE \n", - "64 15.0 216.0 FEMALE \n", - "65 19.3 181.0 FEMALE \n", - "68 17.2 187.0 FEMALE \n", - "70 20.0 205.0 MALE \n", - "72 17.5 191.0 MALE \n", - "74 14.2 209.0 FEMALE \n", - "77 16.8 192.0 FEMALE \n", - "81 14.8 212.0 FEMALE \n", - "91 18.5 201.0 MALE \n", - "96 14.6 211.0 FEMALE \n", - "105 18.1 187.0 FEMALE \n", - "111 18.6 172.0 FEMALE \n", - "\n", - " species \n", - "penguin_id \n", - "1 Adelie Penguin (Pygoscelis adeliae) \n", - "4 Adelie Penguin (Pygoscelis adeliae) \n", - "8 Gentoo penguin (Pygoscelis papua) \n", - "11 Adelie Penguin (Pygoscelis adeliae) \n", - "13 Adelie Penguin (Pygoscelis adeliae) \n", - "15 Gentoo penguin (Pygoscelis papua) \n", - "16 Adelie Penguin (Pygoscelis adeliae) \n", - "23 Adelie Penguin (Pygoscelis adeliae) \n", - "34 Gentoo penguin (Pygoscelis papua) \n", - "36 Gentoo penguin (Pygoscelis papua) \n", - "42 Adelie Penguin (Pygoscelis adeliae) \n", - "48 Gentoo penguin (Pygoscelis papua) \n", - "61 Gentoo penguin (Pygoscelis papua) \n", - "64 Gentoo penguin (Pygoscelis papua) \n", - "65 Adelie Penguin (Pygoscelis adeliae) \n", - "68 Adelie Penguin (Pygoscelis adeliae) \n", - "70 Chinstrap penguin (Pygoscelis antarctica) \n", - "72 Chinstrap penguin (Pygoscelis antarctica) \n", - "74 Gentoo penguin (Pygoscelis papua) \n", - "77 Adelie Penguin (Pygoscelis adeliae) \n", - "81 Gentoo penguin (Pygoscelis papua) \n", - "91 Chinstrap penguin (Pygoscelis antarctica) \n", - "96 Gentoo penguin (Pygoscelis papua) \n", - "105 Adelie Penguin (Pygoscelis adeliae) \n", - "111 Adelie Penguin (Pygoscelis adeliae) \n", - "\n", - "[67 rows x 7 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.fit(X_train, y_train)\n", - "\n", - "predicted_y_test = pipeline.predict(X_test)\n", - "predicted_y_test" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the backend, a pipeline will actually be compiled into a single model with an embedded TRANSFORM step." 
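- "\n", - "For example, the whole fitted pipeline can be saved as one combined model. A minimal sketch, assuming the pipeline exposes the same `to_gbq` method as the individual estimators (the model path here is hypothetical):\n", - "\n", - "```python\n", - "# Hypothetical path; substitute a dataset you have write access to.\n", - "# The saved artifact is a single BQML model whose TRANSFORM clause replays\n", - "# the ColumnTransformer preprocessing automatically at prediction time.\n", - "# pipeline.to_gbq(\"your-project.your_dataset.penguins_pipeline\", replace=True)\n", - "```"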
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Evaluating results\n", - "\n", - "Some models include a convenient `.score(X, y)` method for evaluation with a preset accuracy metric:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job c02fb597-8d5a-42ca-9185-03b59c5ef2f9 is DONE. 29.6 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7f1f565b-0f73-4a4e-b33f-8484fa260838 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d4b9d4a6-d75e-46e1-b092-ab58e8aef890 is DONE. 48 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0216.44435772639.6987070.00463170.5883560.8963960.900547
\n", - "

1 rows × 6 columns

\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 216.444357 72639.698707 0.00463 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 170.588356 0.896396 0.900547 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# In the case of a pipeline, this will be equivalent to calling .score on the contained LinearRegression\n", - "pipeline.score(X_test, y_test)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For a more general approach, the `bigframes.ml.metrics` module provides standalone metric functions:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 73448ee8-698b-435f-b11e-6fe2de3bcd8d is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e002f59d-a03c-4ec9-a85a-93adbfd7bd17 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 4ab1febc-fb55-473a-b295-69e4329cc5f0 is DONE. 30.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0.8963962044533755" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml.metrics import r2_score\n", - "\n", - "r2_score(y_test, predicted_y_test[\"predicted_body_mass_g\"])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save/Load to BigQuery\n", - "\n", - "Estimators can be saved to BigQuery as BQML models, and loaded again in the future.\n", - "\n", - "Saving requires `bigquery.tables.create` permission, and loading requires `bigquery.models.getMetadata` permission.\n", - "These permissions can be granted at the project level or the dataset level.\n", - "\n", - "If you have those permissions, uncomment and run the code in the following cells.\n",
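- "\n", - "A loaded model behaves like the original estimator. A minimal sketch of the round trip (assuming a model was previously saved under `model_name`, as in the cells below):\n", - "\n", - "```python\n", - "# loaded = bigframes.pandas.read_gbq_model(model_name)\n", - "# loaded.predict(processed_X_test)  # same behavior as the original linreg\n", - "```"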
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# # Replace with a path where you have permission to save a model\n", - "# model_name = \"bigframes-dev.bqml_tutorial.penguins_model\"\n", - "\n", - "# linreg.to_gbq(model_name, replace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# # WARNING - until b/281709360 is fixed & pipeline is updated, pipelines will load as models,\n", - "# # and details of their transform steps will be lost (the loaded model will behave the same)\n", - "# bigframes.pandas.read_gbq_model(model_name)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "a850322d07d9bdc9ec5f301d307e048bcab2390ae395e1cbce9335f4e081e5e2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb new file mode 100644 index 0000000000..089c167d39 --- /dev/null +++ b/notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb @@ -0,0 +1,970 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Machine Learning Fundamentals with BigQuery DataFrames\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "24743cf4a1e1" + }, + "source": [ + "**_NOTE_**: This notebook has been tested in the following environment:\n", + "\n", + "* Python version = 3.10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "The `bigframes.ml` module implements Scikit-Learn's machine learning API in\n", + "BigQuery DataFrames. It exposes BigQuery's ML capabilities in a simple, popular\n", + "API that works seamlessly with the rest of the BigQuery DataFrames API.\n", + "\n", + "Learn more about [BigQuery DataFrames](https://cloud.google.com/python/docs/reference/bigframes/latest)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you will walk through an end-to-end machine learning workflow using BigQuery DataFrames. You will load data, manipulate and prepare it for model training, build supervised and unsupervised models, and evaluate and save a model for future use; all using built-in BigQuery DataFrames functionality." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the [```penguins``` table](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=ml_datasets&t=penguins) (a BigQuery public dataset), which contains data on a set of penguins including species, island of residence, weight, culmen length and depth, flipper length, and sex." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery (storage and compute)\n", + "* BigQuery ML\n", + "\n", + "Learn about [BigQuery storage pricing](https://cloud.google.com/bigquery/pricing#storage),\n", + "[BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models),\n", + "and [BigQuery ML pricing](https://cloud.google.com/bigquery/pricing#bqml),\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Depending on your Jupyter environment, you might have to install packages." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NRTcBQPZpKWd" + }, + "source": [ + "**Vertex AI Workbench or Colab**\n", + "\n", + "Do nothing, BigQuery DataFrames package is already installed." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bdOJtFo1pRnc" + }, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mfPoOwPLGpSr" + }, + "outputs": [], + "source": [ + "# !pip install bigframes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "Complete the tasks in this section to set up your environment." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Yq7zKYWelRQP" + }, + "source": [ + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\"  # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Set the region\n", + "\n", + "You can change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eF-Twtc4XGem" + }, + "outputs": [], + "source": [ + "REGION = \"US\"  # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XcW9adriUQRc" + }, + "source": [ + "#### Set the dataset ID\n", + "\n", + "As part of this notebook, you will save BigQuery ML models to your Google Cloud project, which requires a dataset. Create the dataset, if needed, and provide the ID here as the `DATASET` variable used by BigQuery. Learn how to create a [BigQuery dataset](https://cloud.google.com/bigquery/docs/datasets)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BbMh9JHvUHAn" + }, + "outputs": [], + "source": [ + "DATASET = \"\"  # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you might have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**Vertex AI Workbench**\n", + "\n", + "Do nothing; you are already authenticated."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "\n", + "### Set BigQuery DataFrames options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NPPMuw2PXGeo" + }, + "outputs": [], + "source": [ + "bf.options.bigquery.project = PROJECT_ID\n", + "bf.options.bigquery.location = REGION" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pDfrKwMKE_dK" + }, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.reset_session()`. After that, you can set `bf.options.bigquery.location` again to specify another location." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LjfRpSruzg5j" + }, + "source": [ + "## Import data into BigQuery DataFrames\n", + "\n", + "You can create a DataFrame by reading data from a BigQuery table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d86W4hNqzZJb" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")\n", + "df = df.dropna()\n", + "\n", + "# BigQuery DataFrames creates a default numbered index, which we can give a name\n", + "df.index.name = \"penguin_id\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pDfCJ6-LkRB1" + }, + "source": [ + "Take a look at a few rows of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "arGaUZVWkSwT" + }, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WkUIcMXPkahu" + }, + "source": [ + "## Clean and prepare data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DScncEoDkiTG" + }, + "source": [ + "We are going to start with supervised learning, where a linear regression model will learn to predict the body mass (output variable `y`) using input features such as flipper length, sex, species, and more (features `X`)."
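+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an optional first step, you can sanity-check the regression target with a few standard aggregations before isolating the model inputs. The next cell is illustrative only and is not required for the rest of the workflow:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: a quick look at the range of the regression target\n", + "print(df[\"body_mass_g\"].mean())\n", + "print(df[\"body_mass_g\"].min(), df[\"body_mass_g\"].max())"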
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "B9mW93o9z_-L" + }, + "outputs": [], + "source": [ + "# Isolate input features and output variable into DataFrames\n", + "X = df[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex', 'species']]\n", + "y = df[['body_mass_g']]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wkw0Cs62k_cl" + }, + "source": [ + "Part of preparing data for a machine learning task is splitting it into subsets for training and testing, to ensure that the model is not overfitting. By default, BigQuery ML (BQML) automatically manages splitting the data for you. However, BQML also supports manually splitting out your training data.\n", + "\n", + "You can perform a manual data split with `bigframes.ml.model_selection.train_test_split` like so:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NysWAWmvlAxB" + }, + "outputs": [], + "source": [ + "from bigframes.ml.model_selection import train_test_split\n", + "\n", + "# This will split X and y into test and training sets, with 20% of the rows in the test set,\n", + "# and the rest in the training set\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + "    X, y, test_size=0.2)\n", + "\n", + "# Show the shape of the data after the split\n", + "print(f\"\"\"X_train shape: {X_train.shape}\n", + "X_test shape: {X_test.shape}\n", + "y_train shape: {y_train.shape}\n", + "y_test shape: {y_test.shape}\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "faFnVnNolydu" + }, + "source": [ + "If we look at the data, we can see that random rows were selected for\n", + "each side of the split:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f8bz1HwLlyLP" + }, + "outputs": [], + "source": [ + "X_test.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v4ic7GQEl67Y" + }, + "source": [ + "Note that `y_test` contains the same rows as `X_test`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PflbhKGkl8v2" + }, + "outputs": [], + "source": [ + "y_test.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Dkf52IdvmSaj" + }, + "source": [ + "## Estimators\n", + "\n", + "Following Scikit-Learn, all learning components are \"estimators\": objects that can learn from training data and then apply themselves to new data. Estimators share the following patterns:\n", + "\n", + "- a constructor that takes a list of parameters\n", + "- a standard string representation that shows the class name and all non-default parameters, e.g. `LinearRegression(fit_intercept=False)`\n", + "- a `.fit(...)` method to fit the estimator to training data\n", + "\n", + "These estimators can be further broken down into two main subtypes:\n", + " 1. Transformers\n", + " 2. Predictors\n", + "\n", + "Let's walk through each of these with our example model." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "55oNSWQ2Q5te" + }, + "source": [ + "### Transformers\n", + "\n", + "Transformers are estimators that are used to prepare data for consumption by other estimators ('preprocessing'). In addition to `.fit(...)`, the transformer implements a `.transform(...)` method, which will apply a transformation based on what was computed during `.fit(...)`.
With this pattern, dynamic preprocessing steps can be applied consistently to both training and test/production data.\n", + "\n", + "An example of a transformer is `bigframes.ml.preprocessing.StandardScaler`, which rescales a dataset to have a mean of zero and a standard deviation of one:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yhATDMR-mkdF" + }, + "outputs": [], + "source": [ + "from bigframes.ml.preprocessing import StandardScaler\n", + "\n", + "# StandardScaler will only work on numeric columns\n", + "numeric_columns = [\"culmen_length_mm\", \"culmen_depth_mm\", \"flipper_length_mm\"]\n", + "\n", + "scaler = StandardScaler()\n", + "scaler.fit(X_train[numeric_columns])\n", + "\n", + "# Now, StandardScaler should transform the numbers to have a mean of zero\n", + "# and a standard deviation of one:\n", + "scaler.transform(X_train[numeric_columns])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vhywHzH-ml-W" + }, + "source": [ + "We can then repeat this transformation on the test data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TfwSLOTXmspI" + }, + "outputs": [], + "source": [ + "scaler.transform(X_test[numeric_columns])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9enAdjzPmwmv" + }, + "source": [ + "#### Composing transformers\n", + "\n", + "To process data where different columns need different preprocessors, `bigframes.ml.compose.ColumnTransformer` can be employed.\n", + "\n", + "Let's create an aggregate transform that applies `StandardScaler` to the numeric columns and `OneHotEncoder` to the string columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I8Wwx3emmz2J" + }, + "outputs": [], + "source": [ + "from bigframes.ml.compose import ColumnTransformer\n", + "from bigframes.ml.preprocessing import OneHotEncoder\n", + "\n", + "# Create an aggregate transform that applies StandardScaler to the numeric columns,\n", + "# and OneHotEncoder to the string columns\n", + "preproc = ColumnTransformer([\n", + "    (\"scale\", StandardScaler(), [\"culmen_length_mm\", \"culmen_depth_mm\", \"flipper_length_mm\"]),\n", + "    (\"encode\", OneHotEncoder(), [\"species\", \"sex\", \"island\"])])\n", + "\n", + "# Now we can fit all columns of the training data\n", + "preproc.fit(X_train)\n", + "\n", + "processed_X_train = preproc.transform(X_train)\n", + "processed_X_test = preproc.transform(X_test)\n", + "\n", + "# View the processed training data\n", + "processed_X_train" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JhoO4fctm4Q5" + }, + "source": [ + "### Predictors\n", + "\n", + "Predictors are estimators that learn and make predictions. In addition to `.fit(...)`, the predictor implements a `.predict(...)` method, which will use what was learned during `.fit(...)` to predict some output.\n", + "\n", + "Predictors can be further broken down into two categories:\n", + "* Supervised predictors\n", + "* Unsupervised predictors" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TqLItVyjslP8" + }, + "source": [ + "#### Supervised predictors\n", + "\n", + "Supervised learning is when we train a model on input-output pairs, and then ask it to predict the output for new inputs. An example of such a predictor is `bigframes.ml.linear_model.LinearRegression`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZeloMmopm8KI" + }, + "outputs": [], + "source": [ + "from bigframes.ml.linear_model import LinearRegression\n", + "\n", + "linreg = LinearRegression()\n", + "\n", + "# Learn from the training data how to predict output y\n", + "linreg.fit(processed_X_train, y_train)\n", + "\n", + "# Predict y for the test data\n", + "predicted_y_test = linreg.predict(processed_X_test)\n", + "\n", + "# View predictions\n", + "predicted_y_test" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z42qesW_nAIf" + }, + "source": [ + "#### Unsupervised predictors\n", + "\n", + "In unsupervised learning, there are no known outputs in the training data; instead, the model learns from the input data alone and predicts something else. An example of an unsupervised predictor is `bigframes.ml.cluster.KMeans`, which learns how to fit input data to a target number of clusters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "M13zd02znCIg" + }, + "outputs": [], + "source": [ + "from bigframes.ml.cluster import KMeans\n", + "\n", + "# Specify KMeans with four clusters\n", + "kmeans = KMeans(n_clusters=4)\n", + "\n", + "# Fit data\n", + "kmeans.fit(processed_X_train)\n", + "\n", + "# View predictions\n", + "kmeans.predict(processed_X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DFwsIbscnEvh" + }, + "source": [ + "## Pipelines\n", + "\n", + "Transformers and predictors can be chained into a single estimator component using `bigframes.ml.pipeline.Pipeline`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ku2OXqgJnEeR" + }, + "outputs": [], + "source": [ + "from bigframes.ml.pipeline import Pipeline\n", + "\n", + "pipeline = Pipeline([\n", + "    ('preproc', preproc),\n", + "    ('linreg', linreg)\n", + "])\n", + "\n", + "# Print our pipeline\n", + "pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cCQCY_6wnKz_" + }, + "source": [ + "The pipeline simplifies the workflow by applying each of its component steps automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hsF7FYagnMko" + }, + "outputs": [], + "source": [ + "pipeline.fit(X_train, y_train)\n", + "\n", + "predicted_y_test = pipeline.predict(X_test)\n", + "predicted_y_test" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SiLzpsg8nRXn" + }, + "source": [ + "Behind the scenes, a pipeline is compiled into a single BQML model with an embedded TRANSFORM step."
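+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because the preprocessing is embedded in that single model, a fitted pipeline can score brand-new raw rows directly. As an illustrative sketch (the feature values below are invented, though the categories do appear in the training data), you can predict the body mass of a single new penguin:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Illustrative only: predict on one raw row; the pipeline applies its\n", + "# embedded preprocessing (scaling and one-hot encoding) automatically\n", + "new_penguin = bf.DataFrame({\n", + "    \"island\": [\"Torgersen\"],\n", + "    \"culmen_length_mm\": [39.5],\n", + "    \"culmen_depth_mm\": [18.0],\n", + "    \"flipper_length_mm\": [190.0],\n", + "    \"sex\": [\"MALE\"],\n", + "    \"species\": [\"Adelie Penguin (Pygoscelis adeliae)\"],\n", + "})\n", + "pipeline.predict(new_penguin)"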
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sTzAxTv1nUKZ" + }, + "source": [ + "## Evaluating results\n", + "\n", + "Some models include a convenient `.score(X, y)` method for evaluation with a preset accuracy metric:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q8nR1ZqznU-B" + }, + "outputs": [], + "source": [ + "# In the case of a pipeline, this will be equivalent to calling .score on the contained LinearRegression\n", + "pipeline.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UHM7jls6nY8A" + }, + "source": [ + "For a more general approach, the `bigframes.ml.metrics` module is provided:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vdEN4Ob9nan4" + }, + "outputs": [], + "source": [ + "from bigframes.ml.metrics import r2_score\n", + "\n", + "r2_score(y_test, predicted_y_test[\"predicted_body_mass_g\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "opn4ycPyneVh" + }, + "source": [ + "## Save to BigQuery\n", + "\n", + "Estimators can be saved to BigQuery as BQML models, and loaded again in the future.\n", + "\n", + "Saving requires `bigquery.tables.create` permission, and loading requires `bigquery.models.getMetadata` permission.\n", + "These permissions can be granted at the project level or the dataset level.\n", + "\n", + "If you have those permissions, go ahead and run the following cells." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fb0HpkdpnigJ" + }, + "outputs": [], + "source": [ + "linreg.to_gbq(f\"{DATASET}.penguins_model\", replace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_zNOBlHdnkII" + }, + "outputs": [], + "source": [ + "bf.read_gbq_model(f\"{DATASET}.penguins_model\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RfV-du5uTcBB" + }, + "source": [ + "We can also save the pipeline to BigQuery. BigQuery will save this as a single model, with the preprocessing steps embedded in the TRANSFORM property:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "P76_TQ3IR6nB" + }, + "outputs": [], + "source": [ + "pipeline.to_gbq(f\"{DATASET}.penguins_pipeline\", replace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GKvlKFjAbToJ" + }, + "outputs": [], + "source": [ + "bf.read_gbq_model(f\"{DATASET}.penguins_pipeline\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wCsmt0IwFkDy" + }, + "source": [ + "## Summary and next steps\n", + "\n", + "You've completed an end-to-end machine learning workflow using the built-in capabilities of BigQuery DataFrames.\n", + "\n", + "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks)."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "### Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QwumLUKmVpuH" + }, + "outputs": [], + "source": [ + "# # Delete the BQML models\n", + "# MODEL_NAME = f\"{PROJECT_ID}:{DATASET}.penguins_model\"\n", + "# ! bq rm -f --model {MODEL_NAME}\n", + "# PIPELINE_NAME = f\"{PROJECT_ID}:{DATASET}.penguins_pipeline\"\n", + "# ! bq rm -f --model {PIPELINE_NAME}" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index 1d3624005a..f25cb9fa10 100644 --- a/noxfile.py +++ b/noxfile.py @@ -432,6 +432,15 @@ def docs(session): session.install("-e", ".") session.install( + # We need to pin to specific versions of the `sphinxcontrib-*` packages + # which still support sphinx 4.x. + # See https://github.com/googleapis/sphinx-docfx-yaml/issues/344 + # and https://github.com/googleapis/sphinx-docfx-yaml/issues/345. + "sphinxcontrib-applehelp==1.0.4", + "sphinxcontrib-devhelp==1.0.2", + "sphinxcontrib-htmlhelp==2.0.1", + "sphinxcontrib-qthelp==1.0.3", + "sphinxcontrib-serializinghtml==1.1.5", SPHINX_VERSION, "alabaster", "recommonmark", @@ -458,6 +467,15 @@ def docfx(session): session.install("-e", ".") session.install( + # We need to pin to specific versions of the `sphinxcontrib-*` packages + # which still support sphinx 4.x. + # See https://github.com/googleapis/sphinx-docfx-yaml/issues/344 + # and https://github.com/googleapis/sphinx-docfx-yaml/issues/345. 
+ "sphinxcontrib-applehelp==1.0.4", + "sphinxcontrib-devhelp==1.0.2", + "sphinxcontrib-htmlhelp==2.0.1", + "sphinxcontrib-qthelp==1.0.3", + "sphinxcontrib-serializinghtml==1.1.5", SPHINX_VERSION, "alabaster", "recommonmark", diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index d767b30bd6..c5e8b45b8e 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -713,6 +713,26 @@ def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index) pandas.testing.assert_series_equal(bf_result, pd_result) +@skip_legacy_pandas +def test_column_multi_index_any(): + columns = pandas.MultiIndex.from_tuples( + [("col0", "col00"), ("col0", "col00"), ("col1", "col11")] + ) + pd_df = pandas.DataFrame( + [[0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2]], columns=columns + ) + bf_df = bpd.DataFrame(pd_df) + + pd_result = pd_df.isna().any() + bf_result = bf_df.isna().any().to_pandas() + + pandas.testing.assert_frame_equal( + bf_result.reset_index(drop=False), + pd_result.reset_index(drop=False), + check_dtype=False, + ) + + def test_column_multi_index_agg(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "float64_col"] multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 623448b3aa..9f415f3bc4 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -136,7 +136,9 @@ def test_arrayvalues_to_ibis_expr_with_project_unary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_unary_op("col1", ops.AsTypeOp("string"))._compile_ordered() + expr = value.project_to_id( + ops.AsTypeOp("string").as_expr("col1"), output_id="col1" + )._compile_ordered() assert value._compile_ordered().columns[0].type().is_int64() assert expr.columns[0].type().is_string() @@ -152,8 +154,8 @@ def test_arrayvalues_to_ibis_expr_with_project_binary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_binary_op( - "col2", "col3", ops.add_op, "col4" + expr = value.project_to_id( + ops.add_op.as_expr("col2", "col3"), "col4" )._compile_ordered() assert expr.columns[3].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered") @@ -173,8 +175,8 @@ def test_arrayvalues_to_ibis_expr_with_project_ternary_op(): ), total_ordering_columns=["col1"], ) - expr = value.project_ternary_op( - "col2", "col3", "col4", ops.where_op, "col5" + expr = value.project_to_id( + ops.where_op.as_expr("col2", "col3", "col4"), "col5" )._compile_ordered() assert expr.columns[4].type().is_float64() actual = expr._to_ibis_expr(ordering_mode="unordered") diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 98c4fcdd44..10a4c195ab 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -809,6 +809,21 @@ def corr(self, other, method="pearson", min_periods=None) -> float: Uses the "Pearson" method of correlation. Numbers are converted to float before calculation, so the result may be unstable. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s1 = bpd.Series([.2, .0, .6, .2]) + >>> s2 = bpd.Series([.3, .6, .0, .1]) + >>> s1.corr(s2) + -0.8510644963469901 + + >>> s1 = bpd.Series([1, 2, 3], index=[0, 1, 2]) + >>> s2 = bpd.Series([1, 2, 3], index=[2, 1, 0]) + >>> s1.corr(s2) + -1.0 + Args: other (Series): The series with which this is to be correlated.