diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 4bdeef3904..81f87c5691 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:a8a80fc6456e433df53fc2a0d72ca0345db0ddefb409f1b75b118dfd1babd952 -# created: 2024-03-15T16:25:47.905264637Z + digest: sha256:5a4c19d17e597b92d786e569be101e636c9c2817731f80a5adec56b2aa8fe070 +# created: 2024-04-12T11:35:58.922854369Z diff --git a/.github/auto-label.yaml b/.github/auto-label.yaml index b2016d119b..8b37ee8971 100644 --- a/.github/auto-label.yaml +++ b/.github/auto-label.yaml @@ -13,3 +13,8 @@ # limitations under the License. requestsize: enabled: true + +path: + pullrequest: true + paths: + samples: "samples" diff --git a/.github/blunderbuss.yml b/.github/blunderbuss.yml new file mode 100644 index 0000000000..8d9cb1008e --- /dev/null +++ b/.github/blunderbuss.yml @@ -0,0 +1,17 @@ +# Blunderbuss config +# +# This file controls who is assigned for pull requests and issues. +# Note: This file is autogenerated. To make changes to the assignee +# team, please update `codeowner_team` in `.repo-metadata.json`. +assign_issues: + - googleapis/api-bigquery-dataframe + +assign_issues_by: + - labels: + - "samples" + to: + - googleapis/python-samples-reviewers + - googleapis/api-bigquery-dataframe + +assign_prs: + - googleapis/api-bigquery-dataframe diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index dd61f5f320..51f92b8e12 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -252,9 +252,9 @@ googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via google-api-core -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via requests importlib-metadata==6.8.0 \ --hash=sha256:3ebb78df84a805d7698245025b975d9d67053cd94c79245ba4b3eb694abe68bb \ diff --git a/CHANGELOG.md b/CHANGELOG.md index a3314c976e..a96c902835 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,31 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.3.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.2.0...v1.3.0) (2024-04-22) + + +### Features + +* Add `Series.struct.dtypes` property ([#599](https://github.com/googleapis/python-bigquery-dataframes/issues/599)) ([d924ec2](https://github.com/googleapis/python-bigquery-dataframes/commit/d924ec2937c158644b5d1bbae4f82476de2c1655)) +* Add fine tuning `fit()` for Palm2TextGenerator ([#616](https://github.com/googleapis/python-bigquery-dataframes/issues/616)) ([9c106bd](https://github.com/googleapis/python-bigquery-dataframes/commit/9c106bd24482620ef5ff3c85f94be9da76c49716)) +* Add quantile statistic ([#613](https://github.com/googleapis/python-bigquery-dataframes/issues/613)) 
([bc82804](https://github.com/googleapis/python-bigquery-dataframes/commit/bc82804da43c03c2311cd56f47a2316d3aae93d2)) +* Expose `max_batching_rows` in `remote_function` ([#622](https://github.com/googleapis/python-bigquery-dataframes/issues/622)) ([240a1ac](https://github.com/googleapis/python-bigquery-dataframes/commit/240a1ac6fa914550bb6216cd5d179a36009f2657)) +* Support primary key(s) in `read_gbq` by using as the `index_col` by default ([#625](https://github.com/googleapis/python-bigquery-dataframes/issues/625)) ([75bb240](https://github.com/googleapis/python-bigquery-dataframes/commit/75bb2409532e80de742030d05ffcbacacf5ffba2)) +* Warn if location is set to unknown location ([#609](https://github.com/googleapis/python-bigquery-dataframes/issues/609)) ([3706b4f](https://github.com/googleapis/python-bigquery-dataframes/commit/3706b4f9dde65788b5e6343a6428fb1866499461)) + + +### Bug Fixes + +* Address technical writers fb ([#611](https://github.com/googleapis/python-bigquery-dataframes/issues/611)) ([9f8f181](https://github.com/googleapis/python-bigquery-dataframes/commit/9f8f181279133abdb7da3aa045df6fa278587013)) +* Infer narrowest numeric type when combining numeric columns ([#602](https://github.com/googleapis/python-bigquery-dataframes/issues/602)) ([8f9ece6](https://github.com/googleapis/python-bigquery-dataframes/commit/8f9ece6d13f57f02d677bf0e3fea97dea94ae240)) +* Use exact median implementation by default ([#619](https://github.com/googleapis/python-bigquery-dataframes/issues/619)) ([9d205ae](https://github.com/googleapis/python-bigquery-dataframes/commit/9d205aecb77f35baeec82a8f6e1b72c2d852ca46)) + + +### Documentation + +* Fix rendering of examples for multiple apis ([#620](https://github.com/googleapis/python-bigquery-dataframes/issues/620)) ([9665e39](https://github.com/googleapis/python-bigquery-dataframes/commit/9665e39ef288841f03a9d823bd2210ef58394ad3)) +* Set `index_cols` in `read_gbq` as a best practice ([#624](https://github.com/googleapis/python-bigquery-dataframes/issues/624)) ([70015b7](https://github.com/googleapis/python-bigquery-dataframes/commit/70015b79e8cff16ff1b36c5e3f019fe099750a9d)) + ## [1.2.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.1.0...v1.2.0) (2024-04-15) diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 50e14eaf28..74561e6f24 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -22,11 +22,33 @@ import google.api_core.exceptions import google.auth.credentials +import bigframes.constants +import bigframes.exceptions + SESSION_STARTED_MESSAGE = ( "Cannot change '{attribute}' once a session has started. " "Call bigframes.pandas.close_session() first, if you are using the bigframes.pandas API." ) +UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value." 
+ + +def _validate_location(value: Optional[str]): + + if value is None: + return + + if value not in bigframes.constants.ALL_BIGQUERY_LOCATIONS: + warnings.warn( + UNKNOWN_LOCATION_MESSAGE.format(location=value), + # There are many layers before we get to (possibly) the user's code: + # -> bpd.options.bigquery.location = "us-central-1" + # -> location.setter + # -> _validate_location + stacklevel=3, + category=bigframes.exceptions.UnknownLocationWarning, + ) + class BigQueryOptions: """Encapsulates configuration for working with a session.""" @@ -93,6 +115,7 @@ def location(self) -> Optional[str]: def location(self, value: Optional[str]): if self._session_started and self._location != value: raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="location")) + _validate_location(value) self._location = value @property diff --git a/bigframes/constants.py b/bigframes/constants.py index 0751501085..c6d8f3acc2 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -92,3 +92,6 @@ LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset( ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS ) + +# BigQuery default is 10000, leave 100 for overhead +MAX_COLUMNS = 9900 diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 3fa690ef37..9e6b86fc30 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -354,10 +354,7 @@ def unpivot( *, passthrough_columns: typing.Sequence[str] = (), index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] - ] = pandas.Float64Dtype(), - how: typing.Literal["left", "right"] = "left", + join_side: typing.Literal["left", "right"] = "left", ) -> ArrayValue: """ Unpivot ArrayValue columns. @@ -367,23 +364,88 @@ def unpivot( unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. index_col_id (str): The column id to be used for the row labels. - dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
Returns:
ArrayValue: The unpivoted ArrayValue
"""
+ # There will be N labels, used to disambiguate which of N source columns produced each output row
+ explode_offsets_id = bigframes.core.guid.generate_guid("unpivot_offsets_")
+ labels_array = self._create_unpivot_labels_array(row_labels, index_col_ids)
+ labels_array = labels_array.promote_offsets(explode_offsets_id)
+
+ # Unpivot creates N output rows for each input row, labels disambiguate these N rows
+ joined_array = self._cross_join_w_labels(labels_array, join_side)
+
+ # Build the output rows as a case statement that selects between the N input columns
+ unpivot_exprs = []
+ # Supports producing multiple stacked output columns for stacking only part of hierarchical index
+ for col_id, input_ids in unpivot_columns:
+ # row explode offset used to choose the input column
+ # we use offset instead of label as labels are not necessarily unique
+ cases = tuple(
+ (
+ ops.eq_op.as_expr(explode_offsets_id, ex.const(i)),
+ ex.free_var(id_or_null)
+ if (id_or_null is not None)
+ else ex.const(None),
+ )
+ for i, id_or_null in enumerate(input_ids)
+ )
+ col_expr = ops.case_when_op.as_expr(*cases)
+ unpivot_exprs.append((col_expr, col_id))
+
+ label_exprs = ((ex.free_var(id), id) for id in index_col_ids)
+ # passthrough columns are unchanged, just repeated N times each
+ passthrough_exprs = ((ex.free_var(id), id) for id in passthrough_columns)
return ArrayValue(
- nodes.UnpivotNode(
- child=self.node,
- row_labels=tuple(row_labels),
- unpivot_columns=tuple(unpivot_columns),
- passthrough_columns=tuple(passthrough_columns),
- index_col_ids=tuple(index_col_ids),
- dtype=dtype,
- how=how,
+ nodes.ProjectionNode(
+ child=joined_array.node,
+ assignments=(*label_exprs, *unpivot_exprs, *passthrough_exprs),
)
)
+ def _cross_join_w_labels(
+ self, labels_array: ArrayValue, join_side: typing.Literal["left", "right"]
+ ) -> ArrayValue:
+ """
+ Convert each row in self to N rows, one for each label in labels array.
+ """ + table_join_side = ( + join_def.JoinSide.LEFT if join_side == "left" else join_def.JoinSide.RIGHT + ) + labels_join_side = table_join_side.inverse() + labels_mappings = tuple( + join_def.JoinColumnMapping(labels_join_side, id, id) + for id in labels_array.schema.names + ) + table_mappings = tuple( + join_def.JoinColumnMapping(table_join_side, id, id) + for id in self.schema.names + ) + join = join_def.JoinDefinition( + conditions=(), mappings=(*labels_mappings, *table_mappings), type="cross" + ) + if join_side == "left": + joined_array = self.join(labels_array, join_def=join) + else: + joined_array = labels_array.join(self, join_def=join) + return joined_array + + def _create_unpivot_labels_array( + self, + former_column_labels: typing.Sequence[typing.Hashable], + col_ids: typing.Sequence[str], + ) -> ArrayValue: + """Create an ArrayValue from a list of label tuples.""" + rows = [] + for row_offset in range(len(former_column_labels)): + row_label = former_column_labels[row_offset] + row_label = (row_label,) if not isinstance(row_label, tuple) else row_label + row = {col_ids[i]: row_label[i] for i in range(len(col_ids))} + rows.append(row) + + return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) + def join( self, other: ArrayValue, diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index c789b2a69c..a221b343a5 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,6 +15,7 @@ import functools import typing +from typing import Sequence import pandas as pd @@ -105,6 +106,40 @@ def indicate_duplicates( ) +def quantile( + block: blocks.Block, + columns: Sequence[str], + qs: Sequence[float], + grouping_column_ids: Sequence[str] = (), + dropna: bool = False, +) -> blocks.Block: + # TODO: handle windowing and more interpolation methods + window = core.WindowSpec( + grouping_keys=tuple(grouping_column_ids), + ) + quantile_cols = [] + labels = [] + if len(columns) * len(qs) > constants.MAX_COLUMNS: + raise NotImplementedError("Too many aggregates requested.") + for col in columns: + for q in qs: + label = block.col_id_to_label[col] + new_label = (*label, q) if isinstance(label, tuple) else (label, q) + labels.append(new_label) + block, quantile_col = block.apply_window_op( + col, + agg_ops.QuantileOp(q), + window_spec=window, + ) + quantile_cols.append(quantile_col) + block, results = block.aggregate( + grouping_column_ids, + tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), + dropna=dropna, + ) + return block.select_columns(results).with_column_labels(labels) + + def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: supported_methods = [ "linear", @@ -823,5 +858,5 @@ def _idx_extrema( # Stack the entire column axis to produce single-column result # Assumption: uniform dtype for stackability return block.aggregate_all_and_stack( - agg_ops.AnyValueOp(), dtype=block.dtypes[0] + agg_ops.AnyValueOp(), ).with_column_labels([original_block.index.name]) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 5b411e5416..0f9cacd83d 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -914,9 +914,6 @@ def aggregate_all_and_stack( axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
- ] = pd.Float64Dtype(), ) -> Block: axis_n = utils.get_axis_number(axis) if axis_n == 0: @@ -931,7 +928,6 @@ def aggregate_all_and_stack( row_labels=self.column_labels.to_list(), index_col_ids=index_col_ids, unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]), - dtype=dtype, ) return Block( result_expr, @@ -949,7 +945,6 @@ def aggregate_all_and_stack( index_col_ids=[guid.generate_guid()], unpivot_columns=[(value_col_id, tuple(self.value_columns))], passthrough_columns=[*self.index_columns, offset_col], - dtype=dtype, ) index_aggregations = [ (ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.free_var(col_id)), col_id) @@ -1498,22 +1493,24 @@ def stack(self, how="left", levels: int = 1): row_label_tuples = utils.index_as_tuples(row_labels) - if col_labels is not None: + if col_labels is None: + result_index: pd.Index = pd.Index([None]) + result_col_labels: Sequence[Tuple] = list([()]) + elif (col_labels.nlevels == 1) and all( + col_labels.isna() + ): # isna not implemented for MultiIndex for newer pandas versions + result_index = pd.Index([None]) + result_col_labels = utils.index_as_tuples(col_labels.drop_duplicates()) + else: result_index = col_labels.drop_duplicates().dropna(how="all") result_col_labels = utils.index_as_tuples(result_index) - else: - result_index = pd.Index([None]) - result_col_labels = list([()]) # Get matching columns unpivot_columns: List[Tuple[str, List[str]]] = [] - dtypes = [] for val in result_col_labels: col_id = guid.generate_guid("unpivot_") input_columns, dtype = self._create_stack_column(val, row_label_tuples) unpivot_columns.append((col_id, input_columns)) - if dtype: - dtypes.append(dtype or pd.Float64Dtype()) added_index_columns = [guid.generate_guid() for _ in range(row_labels.nlevels)] unpivot_expr = self._expr.unpivot( @@ -1521,8 +1518,7 @@ def stack(self, how="left", levels: int = 1): passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, index_col_ids=added_index_columns, - dtype=tuple(dtypes), - how=how, + join_side=how, ) new_index_level_names = self.column_labels.names[-levels:] if how == "left": @@ -1554,15 +1550,12 @@ def melt( value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] - dtype = self._expr.get_column_type(value_vars[0]) - unpivot_expr = self._expr.unpivot( row_labels=value_labels, passthrough_columns=id_vars, unpivot_columns=(unpivot_col,), index_col_ids=var_col_ids, - dtype=dtype, - how="right", + join_side="right", ) index_id = guid.generate_guid() unpivot_expr = unpivot_expr.promote_offsets(index_id) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index ae21243506..98d296c779 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -148,6 +148,14 @@ def _( return cast(ibis_types.NumericValue, value) +@compile_unary_agg.register +@numeric_op +def _( + op: agg_ops.QuantileOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + return _apply_window_if_present(column.quantile(op.q), window) + + @compile_unary_agg.register @numeric_op def _( diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index f1c5d62010..a59d599679 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -40,10 +40,8 @@ OrderingExpression, ) import bigframes.core.schema as schemata -import bigframes.core.utils as utils from bigframes.core.window_spec 
import WindowSpec import bigframes.dtypes -import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops ORDER_ID_COLUMN = "bigframes_ordering_id" @@ -109,36 +107,6 @@ def filter(self: T, predicate: ex.Expression) -> T: """Filter the table on a given expression, the predicate must be a boolean expression.""" ... - @abc.abstractmethod - def unpivot( - self: T, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> T: - """ - Unpivot ArrayValue columns. - - Args: - row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. - unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. - passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. - index_col_id (str): The column id to be used for the row labels. - dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. - - Returns: - ArrayValue: The unpivoted ArrayValue - """ - ... - @abc.abstractmethod def _reproject_to_table(self: T) -> T: """ @@ -332,115 +300,6 @@ def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: expr.predicates = [*self._predicates, predicate_value] return expr.build() - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> UnorderedIR: - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr() - row_n = len(row_labels) - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - 
labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - op_compiler.compile_row_op( - ops.AsTypeOp(col_dtype), (unpivot_table[col],) - ) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - unpivot_offset_id, - ) - - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - return UnorderedIR( - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - ) - def aggregate( self, aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], @@ -920,149 +779,6 @@ def project_window_op( # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. return result._reproject_to_table() if not skip_reproject_unsafe else result - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> OrderedIR: - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - 
bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - op_compiler.compile_row_op( - ops.AsTypeOp(col_dtype), (unpivot_table[col],) - ) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - *old_ordering.ordering_value_columns, - ascending_over(unpivot_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - ascending_over(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] - return OrderedIR( - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, - ) - def _reproject_to_table(self) -> OrderedIR: table = self._to_ibis_expr( ordering_mode="unordered", diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 638e3eacdd..a68023d13d 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -179,18 +179,6 @@ def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True): return compile_node(node.child, ordered)._reproject_to_table() -@_compile_node.register -def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): - return compile_node(node.child, ordered).unpivot( - node.row_labels, - node.unpivot_columns, - passthrough_columns=node.passthrough_columns, - index_col_ids=node.index_col_ids, - dtype=node.dtype, - how=node.how, - ) - - @_compile_node.register def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True): return compile_node(node.child, ordered).explode(node.column_ids) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 53a25d63ed..072d974b39 100644 --- 
a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -189,6 +189,25 @@ def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): return decorator + def register_nary_op(self, op_ref: typing.Union[ops.NaryOp, type[ops.NaryOp]]): + """ + Decorator to register a nary op implementation. + + Args: + op_ref (NaryOp or NaryOp type): + Class or instance of operator that is implemented by the decorated function. + """ + key = typing.cast(str, op_ref.name) + + def decorator(impl: typing.Callable[..., ibis_types.Value]): + def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): + return impl(*args) + + self._register(key, normalized_impl) + return impl + + return decorator + def _register( self, op_name: str, @@ -1346,6 +1365,25 @@ def clip_op( ) +@scalar_op_compiler.register_nary_op(ops.case_when_op) +def switch_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value: + # ibis can handle most type coercions, but we need to force bool -> int + # TODO: dispatch coercion depending on bigframes dtype schema + result_values = cases_and_outputs[1::2] + do_upcast_bool = any(t.type().is_numeric() for t in result_values) + if do_upcast_bool: + # Just need to upcast to int, ibis can handle further coercion + result_values = tuple( + val.cast(ibis_dtypes.int64) if val.type().is_boolean() else val + for val in result_values + ) + + case_val = ibis.case() + for predicate, output in zip(cases_and_outputs[::2], result_values): + case_val = case_val.when(predicate, output) + return case_val.end() + + # Helpers def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 4980f5369d..70eb519a1b 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -190,9 +190,6 @@ class OpExpression(Expression): op: bigframes.operations.RowOp inputs: typing.Tuple[Expression, ...] - def __post_init__(self): - assert self.op.arguments == len(self.inputs) - @property def unbound_variables(self) -> typing.Tuple[str, ...]: return tuple( diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index e2b28553c6..05b1cc7f41 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import Sequence, Union import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby import pandas as pd @@ -112,17 +113,37 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: self._raise_on_non_numeric("mean") return self._aggregate_all(agg_ops.mean_op, numeric_only=True) - def median( - self, numeric_only: bool = False, *, exact: bool = False - ) -> df.DataFrame: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" - ) + def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("median") + if exact: + return self.quantile(0.5) return self._aggregate_all(agg_ops.median_op, numeric_only=True) + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("quantile") + q_cols = tuple( + col + for col in self._selected_cols + if self._column_type(col) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + q_cols, + qs=tuple(q) if multi_q else (q,), # type: ignore + grouping_column_ids=self._by_col_ids, + dropna=self._dropna, + ) + result_df = df.DataFrame(result) + if multi_q: + return result_df.stack() + else: + return result_df.droplevel(-1, 1) + def min(self, numeric_only: bool = False, *args) -> df.DataFrame: return self._aggregate_all(agg_ops.min_op, numeric_only=numeric_only) @@ -466,8 +487,32 @@ def sum(self, *args) -> series.Series: def mean(self, *args) -> series.Series: return self._aggregate(agg_ops.mean_op) - def median(self, *args, **kwargs) -> series.Series: - return self._aggregate(agg_ops.mean_op) + def median( + self, + *args, + exact: bool = True, + **kwargs, + ) -> series.Series: + if exact: + return self.quantile(0.5) + else: + return self._aggregate(agg_ops.median_op) + + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> series.Series: + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + (self._value_column,), + qs=tuple(q) if multi_q else (q,), # type: ignore + grouping_column_ids=self._by_col_ids, + dropna=self._dropna, + ) + if multi_q: + return series.Series(result.stack()) + else: + return series.Series(result.stack()).droplevel(-1) def std(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.std_op) diff --git a/bigframes/core/join_def.py b/bigframes/core/join_def.py index 4646a0d6ae..632a1864da 100644 --- a/bigframes/core/join_def.py +++ b/bigframes/core/join_def.py @@ -22,6 +22,11 @@ class JoinSide(enum.Enum): LEFT = 0 RIGHT = 1 + def inverse(self) -> JoinSide: + if self == JoinSide.LEFT: + return JoinSide.RIGHT + return JoinSide.LEFT + JoinType = Literal["inner", "outer", "left", "right", "cross"] diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index a1072b0d68..688e165732 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -21,8 +21,6 @@ import typing from typing import Callable, Tuple -import pandas - import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.join_def import JoinColumnMapping, JoinDefinition, JoinSide @@ -579,88 +577,6 @@ def relation_ops_created(self) -> int: return 0 -@dataclass(frozen=True) -class UnpivotNode(UnaryNode): - # TODO: Refactor unpivot - row_labels: typing.Tuple[typing.Hashable, ...] - unpivot_columns: typing.Tuple[ - typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]], ... - ] - passthrough_columns: typing.Tuple[str, ...] = () - index_col_ids: typing.Tuple[str, ...] = ("index",) - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
- ] = (pandas.Float64Dtype(),) - how: typing.Literal["left", "right"] = "left" - - def __hash__(self): - return self._node_hash - - @property - def row_preserving(self) -> bool: - return False - - @property - def non_local(self) -> bool: - return True - - @property - def joins(self) -> bool: - return True - - @functools.cached_property - def schema(self) -> schemata.ArraySchema: - def infer_dtype( - values: typing.Iterable[typing.Hashable], - ) -> bigframes.dtypes.Dtype: - item_types = map(lambda x: bigframes.dtypes.infer_literal_type(x), values) - etype = functools.reduce( - lambda t1, t2: bigframes.dtypes.lcd_type(t1, t2) - if (t1 and t2) - else None, - item_types, - ) - return bigframes.dtypes.dtype_for_etype(etype) - - label_tuples = [ - label if isinstance(label, tuple) else (label,) for label in self.row_labels - ] - idx_dtypes = [ - infer_dtype(map(lambda x: typing.cast(tuple, x)[i], label_tuples)) - for i in range(len(self.index_col_ids)) - ] - - index_items = [ - schemata.SchemaItem(id, dtype) - for id, dtype in zip(self.index_col_ids, idx_dtypes) - ] - value_dtypes = ( - self.dtype - if isinstance(self.dtype, tuple) - else (self.dtype,) * len(self.unpivot_columns) - ) - value_items = [ - schemata.SchemaItem(col[0], dtype) - for col, dtype in zip(self.unpivot_columns, value_dtypes) - ] - passthrough_items = [ - schemata.SchemaItem(id, self.child.schema.get_type(id)) - for id in self.passthrough_columns - ] - return schemata.ArraySchema((*index_items, *value_items, *passthrough_items)) - - @property - def variables_introduced(self) -> int: - return ( - len(self.schema.items) - len(self.passthrough_columns) + OVERHEAD_VARIABLES - ) - - @property - def relation_ops_created(self) -> int: - # Unpivot is essentially a cross join and a projection. - return 2 - - @dataclass(frozen=True) class RandomSampleNode(UnaryNode): fraction: float diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2deef95277..ff8404761c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1961,9 +1961,7 @@ def any( frame = self._raise_on_non_boolean("any") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_stack( - agg_ops.any_op, dtype=pandas.BooleanDtype(), axis=axis - ) + block = frame._block.aggregate_all_and_stack(agg_ops.any_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def all( @@ -1973,9 +1971,7 @@ def all( frame = self._raise_on_non_boolean("all") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_stack( - agg_ops.all_op, dtype=pandas.BooleanDtype(), axis=axis - ) + block = frame._block.aggregate_all_and_stack(agg_ops.all_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def sum( @@ -1999,18 +1995,42 @@ def mean( return bigframes.series.Series(block.select_column("values")) def median( - self, *, numeric_only: bool = False, exact: bool = False + self, *, numeric_only: bool = False, exact: bool = True ) -> bigframes.series.Series: + if not numeric_only: + frame = self._raise_on_non_numeric("median") + else: + frame = self._drop_non_numeric() if exact: - raise NotImplementedError( - f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" - ) + result = frame.quantile() + result.name = None + return result + else: + block = frame._block.aggregate_all_and_stack(agg_ops.median_op) + return bigframes.series.Series(block.select_column("values")) + + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): if not numeric_only: frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_stack(agg_ops.median_op) - return bigframes.series.Series(block.select_column("values")) + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + frame._block, frame._block.value_columns, qs=tuple(q) if multi_q else (q,) # type: ignore + ) + if multi_q: + return DataFrame(result.stack()).droplevel(0) + else: + result_df = ( + DataFrame(result) + .stack(list(range(0, frame.columns.nlevels))) + .droplevel(0) + ) + result_series = bigframes.series.Series(result_df._block) + result_series.name = q + return result_series def std( self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py new file mode 100644 index 0000000000..62122e79d2 --- /dev/null +++ b/bigframes/exceptions.py @@ -0,0 +1,17 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class UnknownLocationWarning(Warning): + """The location is set to an unknown value.""" diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 178c911591..f866575a26 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -145,7 +145,13 @@ def __init__( self._cloud_function_docker_repository = cloud_function_docker_repository def create_bq_remote_function( - self, input_args, input_types, output_type, endpoint, bq_function_name + self, + input_args, + input_types, + output_type, + endpoint, + bq_function_name, + max_batching_rows, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" @@ -169,14 +175,25 @@ def create_bq_remote_function( bq_function_args.append( f"{name} {third_party_ibis_bqtypes.BigQueryType.from_ibis(input_types[idx])}" ) + + remote_function_options = { + "endpoint": endpoint, + "max_batching_rows": max_batching_rows, + } + + remote_function_options_str = ", ".join( + [ + f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}" + for key, val in remote_function_options.items() + if val is not None + ] + ) + create_function_ddl = f""" CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) RETURNS {bq_function_return_type} REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` - OPTIONS ( - endpoint = "{endpoint}", - max_batching_rows = 1000 - )""" + OPTIONS ({remote_function_options_str})""" logger.info(f"Creating BQ remote function: {create_function_ddl}") @@ -438,6 +455,7 @@ def provision_bq_remote_function( reuse, name, package_requirements, + max_batching_rows, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -485,7 +503,12 @@ def provision_bq_remote_function( "Exactly one type should be provided for every input arg." ) self.create_bq_remote_function( - input_args, input_types, output_type, cf_endpoint, remote_function_name + input_args, + input_types, + output_type, + cf_endpoint, + remote_function_name, + max_batching_rows, ) else: logger.info(f"Remote function {remote_function_name} already exists.") @@ -607,6 +630,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -723,6 +747,15 @@ def remote_function( projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. 
""" import bigframes.pandas as bpd @@ -846,6 +879,7 @@ def wrapper(f): reuse, name, packages, + max_batching_rows, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 04aaeec1bc..b94ae39687 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -321,6 +321,46 @@ def create_model( return self._create_model_with_sql(session=session, sql=sql) + def create_llm_remote_model( + self, + X_train: bpd.DataFrame, + y_train: bpd.DataFrame, + connection_name: str, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + """Create a session-temporary BQML model with the CREATE OR REPLACE MODEL statement + + Args: + X_train: features columns for training + y_train: labels columns for training + options: a dict of options to configure the model. Generates a BQML OPTIONS + clause + connection_name: + a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection + + Returns: a BqmlModel, wrapping a trained model in BigQuery + """ + options = dict(options) + # Cache dataframes to make sure base table is not a snapshot + # cached dataframe creates a full copy, never uses snapshot + input_data = X_train._cached(force=True).join( + y_train._cached(force=True), how="outer" + ) + options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) + + session = X_train._session + + model_ref = self._create_model_ref(session._anonymous_dataset) + + sql = self._model_creation_sql_generator.create_llm_remote_model( + source_df=input_data, + model_ref=model_ref, + options=options, + connection_name=connection_name, + ) + + return self._create_model_with_sql(session=session, sql=sql) + def create_time_series_model( self, X_train: bpd.DataFrame, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 31c691fd51..37a38cdd5c 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -27,6 +27,10 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +_BQML_PARAMS_MAPPING = { + "max_iterations": "maxIterations", +} + _TEXT_GENERATOR_BISON_ENDPOINT = "text-bison" _TEXT_GENERATOR_BISON_32K_ENDPOINT = "text-bison-32k" _TEXT_GENERATOR_ENDPOINTS = ( @@ -62,6 +66,8 @@ class PaLM2TextGenerator(base.BaseEstimator): Connection to connect with remote service. str of the format ... if None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach permission if the connection isn't fully setup. + max_iterations (Optional[int], Default to 300): + The number of steps to run when performing supervised tuning. 
""" def __init__( @@ -70,9 +76,11 @@ def __init__( model_name: Literal["text-bison", "text-bison-32k"] = "text-bison", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, + max_iterations: int = 300, ): self.model_name = model_name self.session = session or bpd.get_global_session() + self.max_iterations = max_iterations self._bq_connection_manager = self.session.bqconnectionmanager connection_name = connection_name or self.session._bq_connection @@ -132,12 +140,73 @@ def _from_bq( model_connection = model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] + # Get the optional params + kwargs: dict = {} + last_fitting = model.training_runs[-1]["trainingOptions"] + + dummy_text_generator = cls() + for bf_param, _ in dummy_text_generator.__dict__.items(): + bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) + if bqml_param in last_fitting: + # Convert types + if bf_param in ["max_iterations"]: + kwargs[bf_param] = int(last_fitting[bqml_param]) + text_generator_model = cls( - session=session, model_name=model_endpoint, connection_name=model_connection + **kwargs, + session=session, + model_name=model_endpoint, + connection_name=model_connection, ) text_generator_model._bqml_model = core.BqmlModel(session, model) return text_generator_model + @property + def _bqml_options(self) -> dict: + """The model options as they will be set for BQML""" + options = { + "max_iterations": self.max_iterations, + "data_split_method": "NO_SPLIT", + } + return options + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> PaLM2TextGenerator: + """Fine tune PaLM2TextGenerator model. + + .. note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples, n_features). Training data. + y (bigframes.dataframe.DataFrame or bigframes.series.Series: + Training labels. + + Returns: + PaLM2TextGenerator: Fitted Estimator. + """ + X, y = utils.convert_to_dataframe(X, y) + + options = self._bqml_options + options["endpoint"] = self.model_name + "@001" + options["prompt_col"] = X.columns.tolist()[0] + + self._bqml_model = self._bqml_model_factory.create_llm_remote_model( + X, + y, + options=options, + connection_name=self.connection_name, + ) + return self + def predict( self, X: Union[bpd.DataFrame, bpd.Series], diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 42c13fdb40..48eb5a93a7 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -35,7 +35,7 @@ def train_test_split( Args: *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series): A sequence of BigQuery DataFrames or Series that can be joined on - their indexes + their indexes. test_size (default None): The proportion of the dataset to include in the test split. If None, this will default to the complement of train_size. 
If both diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index fab358cce3..59c768ce81 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -177,6 +177,23 @@ def create_model( parts.append(f"AS {source_sql}") return "\n".join(parts) + def create_llm_remote_model( + self, + source_df: bpd.DataFrame, + connection_name: str, + model_ref: google.cloud.bigquery.ModelReference, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> str: + """Encode the CREATE OR REPLACE MODEL statement for BQML""" + source_sql = source_df.sql + + parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] + parts.append(self.connection(connection_name)) + if options: + parts.append(self.options(**options)) + parts.append(f"AS {source_sql}") + return "\n".join(parts) + def create_remote_model( self, connection_name: str, diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index d631ba8508..a7c385a2b8 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -15,7 +15,9 @@ from __future__ import annotations import dataclasses +import functools import typing +from typing import Tuple, Union import numpy as np import pandas as pd @@ -34,11 +36,6 @@ class RowOp(typing.Protocol): def name(self) -> str: ... - @property - def arguments(self) -> int: - """The number of column argument the operation takes""" - ... - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: ... @@ -48,21 +45,29 @@ def order_preserving(self) -> bool: ... -# These classes can be used to create simple ops that don't take local parameters -# All is needed is a unique name, and to register an implementation in ibis_mappings.py @dataclasses.dataclass(frozen=True) -class UnaryOp: +class NaryOp: @property def name(self) -> str: raise NotImplementedError("RowOp abstract base class has no implementation") + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + raise NotImplementedError("Abstract operation has no output type") + + @property + def order_preserving(self) -> bool: + """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" + return False + + +# These classes can be used to create simple ops that don't take local parameters +# All is needed is a unique name, and to register an implementation in ibis_mappings.py +@dataclasses.dataclass(frozen=True) +class UnaryOp(NaryOp): @property def arguments(self) -> int: return 1 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, input_id: typing.Union[str, bigframes.core.expression.Expression] = "arg" ) -> bigframes.core.expression.Expression: @@ -72,25 +77,13 @@ def as_expr( self, (_convert_expr_input(input_id),) ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. 
Can be pruned from ordering expressions.""" - return False - @dataclasses.dataclass(frozen=True) -class BinaryOp: - @property - def name(self) -> str: - raise NotImplementedError("RowOp abstract base class has no implementation") - +class BinaryOp(NaryOp): @property def arguments(self) -> int: return 2 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, left_input: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -106,25 +99,13 @@ def as_expr( ), ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - @dataclasses.dataclass(frozen=True) -class TernaryOp: - @property - def name(self) -> str: - raise NotImplementedError("RowOp abstract base class has no implementation") - +class TernaryOp(NaryOp): @property def arguments(self) -> int: return 3 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, input1: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -142,11 +123,6 @@ def as_expr( ), ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - def _convert_expr_input( input: typing.Union[str, bigframes.core.expression.Expression] @@ -664,6 +640,46 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT clip_op = ClipOp() + +class CaseWhenOp(NaryOp): + name: typing.ClassVar[str] = "switch" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + assert len(input_types) % 2 == 0 + # predicate1, output1, predicate2, output2... 
+ if not all(map(lambda x: x == dtypes.BOOL_DTYPE, input_types[::2])): + raise TypeError(f"Case inputs {input_types[::2]} must be boolean-valued") + output_expr_types = input_types[1::2] + return functools.reduce( + lambda t1, t2: dtypes.coerce_to_common(t1, t2), + output_expr_types, + ) + + def as_expr( + self, + *case_output_pairs: Tuple[ + Union[str | bigframes.core.expression.Expression], + Union[str | bigframes.core.expression.Expression], + ], + ) -> bigframes.core.expression.Expression: + import bigframes.core.expression + + # Keep this in sync with output_type and compilers + inputs: list[bigframes.core.expression.Expression] = [] + + for case, output in case_output_pairs: + inputs.append(_convert_expr_input(case)) + inputs.append(_convert_expr_input(output)) + + return bigframes.core.expression.OpExpression( + self, + tuple(inputs), + ) + + +case_when_op = CaseWhenOp() + + # Just parameterless unary ops for now # TODO: Parameter mappings NUMPY_TO_OP: typing.Final = { diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index f33dc16e30..0d27d1d75d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -109,6 +109,18 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return input_types[0] +@dataclasses.dataclass(frozen=True) +class QuantileOp(UnaryAggregateOp): + q: float + + @property + def name(self): + return f"{int(self.q*100)}%" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) + + @dataclasses.dataclass(frozen=True) class ApproxQuartilesOp(UnaryAggregateOp): quartile: int diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index e8a1af9602..d222f0993b 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -15,9 +15,11 @@ from __future__ import annotations import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors +import pandas as pd from bigframes.core import log_adapter import bigframes.dataframe +import bigframes.dtypes import bigframes.operations import bigframes.operations.base import bigframes.series @@ -45,3 +47,13 @@ def explode(self) -> bigframes.dataframe.DataFrame: return bigframes.pandas.concat( [self.field(i) for i in range(pa_type.num_fields)], axis="columns" ) + + def dtypes(self) -> pd.Series: + pa_type = self._dtype.pyarrow_dtype + return pd.Series( + data=[ + bigframes.dtypes.arrow_dtype_to_bigframes_dtype(pa_type.field(i).type) + for i in range(pa_type.num_fields) + ], + index=[pa_type.field(i).name for i in range(pa_type.num_fields)], + ) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 91c3eb603b..96af6ab1b3 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -643,6 +643,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -656,6 +657,7 @@ def remote_function( cloud_function_service_account=cloud_function_service_account, cloud_function_kms_key_name=cloud_function_kms_key_name, cloud_function_docker_repository=cloud_function_docker_repository, + max_batching_rows=max_batching_rows, ) diff --git a/bigframes/series.py b/bigframes/series.py index 
2f9123f9a3..47acfd0afb 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -23,7 +23,7 @@ import os import textwrap import typing -from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -966,12 +966,21 @@ def mode(self) -> Series: def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) - def median(self, *, exact: bool = False) -> float: + def median(self, *, exact: bool = True) -> float: if exact: - raise NotImplementedError( - f"Only approximate median is supported. {constants.FEEDBACK_LINK}" - ) - return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + return typing.cast(float, self.quantile(0.5)) + else: + return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + + def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> Union[Series, float]: + qs = tuple(q) if utils.is_list_like(q) else (q,) + result = block_ops.quantile(self._block, (self._value_column,), qs=qs) + if utils.is_list_like(q): + result = result.stack() + result = result.drop_levels([result.index_columns[0]]) + return Series(result) + else: + return cast(float, Series(result).to_pandas().squeeze()) def sum(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.sum_op)) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b6d56006be..f3f1ffce16 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -708,13 +708,15 @@ def _get_snapshot_sql_and_primary_key( f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}" ) - # TODO(b/305264153): Use public properties to fetch primary keys once - # added to google-cloud-bigquery. - primary_keys = ( - table._properties.get("tableConstraints", {}) - .get("primaryKey", {}) - .get("columns") - ) + primary_keys = None + if ( + (table_constraints := getattr(table, "table_constraints", None)) is not None + and (primary_key := table_constraints.primary_key) is not None + # This will be False for either None or empty list. + # We want primary_keys = None if no primary keys are set. + and (columns := primary_key.columns) + ): + primary_keys = columns job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name @@ -777,12 +779,13 @@ def _read_gbq_table( query, default_project=self.bqclient.project ) - ( - table_expression, - total_ordering_cols, - ) = self._get_snapshot_sql_and_primary_key( + (table_expression, primary_keys,) = self._get_snapshot_sql_and_primary_key( table_ref, api_name=api_name, use_cache=use_cache ) + total_ordering_cols = primary_keys + + if not index_col and primary_keys is not None: + index_col = primary_keys for key in columns: if key not in table_expression.columns: @@ -1541,6 +1544,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. 
@@ -1635,6 +1639,15 @@ def remote_function( projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. The cloud assets can be @@ -1656,6 +1669,7 @@ def remote_function( cloud_function_service_account=cloud_function_service_account, cloud_function_kms_key_name=cloud_function_kms_key_name, cloud_function_docker_repository=cloud_function_docker_repository, + max_batching_rows=max_batching_rows, ) def read_gbq_function( diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index 75283a060a..ac6ba4bae4 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -18,6 +18,7 @@ import datetime import itertools +import os import textwrap import types from typing import Dict, Iterable, Optional, Sequence, Tuple, Union @@ -34,6 +35,8 @@ MAX_LABELS_COUNT = 64 TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}" +LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" + def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], @@ -243,4 +246,32 @@ def start_query_with_client( ) else: results_iterator = query_job.result(max_results=max_results) + + if LOGGING_NAME_ENV_VAR in os.environ: + # when running notebooks via pytest nbmake + pytest_log_job(query_job) + return results_iterator, query_job + + +def pytest_log_job(query_job: bigquery.QueryJob): + """For pytest runs only, log information about the query job + to a file in order to create a performance report. + """ + if LOGGING_NAME_ENV_VAR not in os.environ: + raise EnvironmentError( + "Environment variable {env_var} is not set".format( + env_var=LOGGING_NAME_ENV_VAR + ) + ) + test_name = os.environ[LOGGING_NAME_ENV_VAR] + current_directory = os.getcwd() + bytes_processed = query_job.total_bytes_processed + if not isinstance(bytes_processed, int): + return # filter out mocks + if query_job.configuration.dry_run: + # dry runs don't process their total_bytes_processed + bytes_processed = 0 + bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") + with open(bytes_file, "a") as f: + f.write(str(bytes_processed) + "\n") diff --git a/bigframes/version.py b/bigframes/version.py index ec2105b648..1f103401e4 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.2.0" +__version__ = "1.3.0" diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb new file mode 100644 index 0000000000..a6e8444aac --- /dev/null +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -0,0 +1,1133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery DataFrames: Synthetic Data Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to BigQuery DataFrames (installing which also installs `pandas` as a dependency) we will use\n", + "`faker` library as a building block for synthetic data generation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "suoG7eWDZARj", + "outputId": "b5c620a9-8f5b-413f-dd38-93448f941846" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting faker\n", + " Downloading Faker-24.9.0-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.4 in /usr/local/lib/python3.10/dist-packages (from faker) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.4->faker) (1.16.0)\n", + "Installing collected packages: faker\n", + "Successfully installed faker-24.9.0\n" + ] + } + ], + "source": [ + "!pip install faker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "m3q1oeJALhsG" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "bpd.options.bigquery.project = PROJECT_ID" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use `GeminiTextGenerator` for our purpose, which is BigQuery DataFrame's state-of-the-art LLM integration at the time of writing this notebook (Apr 16 2024)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 69 + }, + "id": "lIYdn1woOS1n", + "outputId": "be474338-44c2-4ce0-955e-d525b8b9c84b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/session/__init__.py:1907: UserWarning: No explicit location is set, so using location US for the session.\n", + " return Session(context)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 3e8423da-737c-42e2-a3d2-d2180ca18579 is DONE. 
0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from bigframes.ml.llm import GeminiTextGenerator\n", + "\n", + "model = GeminiTextGenerator()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Craft a prompt for the LLM to indicate the schema of the desired data and hints for the code that could generate such data. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "id": "SSR-lLScLa95", + "outputId": "cbaec34e-6fa6-45b4-e54a-f11ca06b61e1" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d651d0bf-300c-4b1d-9e3c-03310b71287c is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c67b9bb9-2f3e-4b9e-b680-0b7b6e9d2279 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prompt
0Write python code to generate a pandas datafra...
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" ], "text/plain": [ " prompt\n", "0 Write python code to generate a pandas datafra...\n", "\n", "[1 rows x 1 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prompt = \"\"\"\\\n", "Write python code to generate a pandas dataframe based on the requirements:\n", " Column name: Name, type: string, Description: Latin American Names\n", " Column name: Age, type: int\n", " Column name: Gender, type: string, Description: Inclusive\n", "\n", "Note:\n", " - Return the code only, no additional texts or comments\n", " - Use faker library\n", " - Generate 100 rows\n", " - The final dataframe should be named 'result_df'.\n", "\"\"\"\n", "\n", "df_prompt = bpd.DataFrame({\"prompt\" : [prompt]})\n", "df_prompt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Be aware that the LLM may not produce runnable code on the first try and may need some nudging. We will retry by adding the failing code and the exception it throws as additional context in the prompt." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 277 }, "id": "miDe3K4GNvOo", "outputId": "f2039e80-5ad7-4551-f8b2-7ef714a89d63" }, "outputs": [ { "data": { "text/html": [ "Query job d5c0725d-9070-4712-adfd-8a9bd86eefc3 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Query job 4eb581a3-7f97-411a-bee1-91e8c150cef4 is DONE. 8 Bytes processed. Open Job" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Query job f3d5503d-a3e7-49ce-b985-5ffbdbd856e3 is DONE. 2 Bytes processed. Open Job" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Query job 8ef76041-f077-4a05-bc03-63e6983ef853 is DONE. 332 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "import pandas as pd\n", + "from faker import Faker\n", + "\n", + "fake = Faker('es_ES')\n", + "result_df = pd.DataFrame({\n", + " 'Name': [fake.name() for _ in range(100)],\n", + " 'Age': [fake.random_int(min=18, max=65) for _ in range(100)],\n", + " 'Gender': [fake.random_element(elements=['Male', 'Female', 'Non-binary']) for _ in range(100)]\n", + "})\n", + "\n" + ] + } + ], + "source": [ + "max_tries = 5\n", + "for i in range(max_tries):\n", + " # Get LLM generated code\n", + " df_result = model.predict(df_prompt)\n", + " llm_result = df_result['ml_generate_text_llm_result'].iloc[0]\n", + "\n", + " # Python code comes back as a markdown code block,\n", + " # remove the prefix \"```python\" and suffix \"```\"\n", + " code = llm_result[9:-3]\n", + " print(code)\n", + "\n", + " # Check if the generated code is runnable\n", + " try:\n", + " exec(code)\n", + " break\n", + " except Exception as ex:\n", + " print(ex)\n", + " error_context = f\"\"\"\n", + "Previous code:\n", + "{code}\n", + "\n", + "Had this exception:\n", + "{ex}\"\"\"\n", + "\n", + " # Update the prompt to help LLM correct error\n", + " df_prompt[\"prompt\"] += error_context\n", + "\n", + " # If we have exhausted max tries then stop trying\n", + " if i+1 == max_tries:\n", + " raise Exception(\"Failed to generate runnable code\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the generated code and verify that it produced the desired data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "GODcPwX2PBEu", + "outputId": "dec4c872-c464-49e4-cd7f-9442fc977d18" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"execution_context\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"Renata Pla Cases\",\n \"Guiomar Carnero-Paz\",\n \"Luciano Garmendia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 18,\n \"max\": 64,\n \"num_unique_values\": 39,\n \"samples\": [\n 56,\n 31,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Male\",\n \"Non-binary\",\n \"Female\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Pastora Acuña Company21Male
1León Reig-Salom39Non-binary
2Aura Tomás Llobet30Female
3Vicente Correa Palomar64Female
4Benito del Fuster34Female
............
95Eduardo Cabrera27Non-binary
96Nazaret de Izaguirre40Non-binary
97Manuela Agullo Bustamante27Female
98Eugenio Mateo Naranjo Blazquez36Non-binary
99Heriberto Vicens Baeza53Female
\n", + "

100 rows × 3 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " Name Age Gender\n", + "0 Pastora Acuña Company 21 Male\n", + "1 León Reig-Salom 39 Non-binary\n", + "2 Aura Tomás Llobet 30 Female\n", + "3 Vicente Correa Palomar 64 Female\n", + "4 Benito del Fuster 34 Female\n", + ".. ... ... ...\n", + "95 Eduardo Cabrera 27 Non-binary\n", + "96 Nazaret de Izaguirre 40 Non-binary\n", + "97 Manuela Agullo Bustamante 27 Female\n", + "98 Eugenio Mateo Naranjo Blazquez 36 Non-binary\n", + "99 Heriberto Vicens Baeza 53 Female\n", + "\n", + "[100 rows x 3 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "execution_context = {}\n", + "exec(code, execution_context)\n", + "execution_context.get(\"result_df\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to run this code at scale to generate since we want to generate large amount of data. Let's deploy a `remote_function` for this purpose." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 36 + }, + "id": "n-BsGciNqSwU", + "outputId": "996e5639-a49c-4542-a0dc-ede450e0eb6d" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'projects/bigframes-dev/locations/us-central1/functions/bigframes-19f2f35637098969770261a2974bef32'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@bpd.remote_function([int], str, packages=['faker', 'pandas'])\n", + "def data_generator(id):\n", + " context = {}\n", + " exec(code, context)\n", + " result_df = context.get(\"result_df\")\n", + " return result_df.to_json(orient=\"records\")\n", + "\n", + "data_generator.bigframes_cloud_function" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s say we want to generate 1 million rows of synthetic data. Since our generated code produces 100 rows in one run, we can initialize an indicator dataframe with 1M/100 = 10K indicator rows. Then we can apply the remote function to produce 100 synthetic data rows for each indicator row." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "id": "Odkmev9nsYqA", + "outputId": "4aa7a1fd-0c0d-4412-f326-a20e19f583b5" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Load job 40b9c3a8-27fc-40a8-9edf-4aa2e0fec332 is DONE. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "desired_num_rows = 1_000_000 # 1 million rows\n", + "batch_size = 100 # used in the prompt\n", + "num_batches = int(desired_num_rows/batch_size)\n", + "\n", + "df = bpd.DataFrame({\"row_id\": range(num_batches)})" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "id": "UyBhlJFVsmQC", + "outputId": "29748df5-673b-4320-bb1f-53abaace3b81" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 9dd49b50-2dbf-4351-b9ad-b17aeb627caf is DONE. 240.0 kB processed. 
Open Job" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df[\"json_data\"] = df[\"row_id\"].apply(data_generator)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "At this point each item in `df[\"json_data\"]` is a JSON-serialized array of 100 records. Let’s flatten that into 1 record per row using SQL directly." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 932 }, "id": "6p3eM21qvRvy", "outputId": "333f4e49-a555-4d2f-b527-02142782b3a7" }, "outputs": [ { "data": { "text/html": [ "Query job 3f8d2133-b01d-402d-a731-79592810ca1c is DONE. 63.7 MB processed. Open Job" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Query job 4a613aa3-6323-4914-8e34-93323885d458 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Query job 0deb03be-725b-40b4-a7a1-1023b0477f35 is DONE. 40.1 MB processed. Open Job" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Eloy Santiago-Aragón31Male
1Amanda Mata Abril20Non-binary
2Danilo Velázquez Salcedo58Male
3Leyre Alba España61Female
4Paulina Amores Pastor41Male
5Jorge Cuadrado Mena50Female
6Chucho Catalán36Non-binary
7Vidal Benavente Lerma38Male
8Clementina Álamo32Female
9Petrona Roselló-Valls61Male
10Luís Camilo Sastre Marin45Male
11Gil Baudelio Carbajo Ordóñez58Non-binary
12David del Donoso44Female
13Dolores Arnau Ros21Non-binary
14Febe de León46Non-binary
15Ariadna Almazán34Female
16Blas Serna Aguiló24Non-binary
17Paulino Barreda Almeida59Female
18Eligio Valcárcel Tormo35Non-binary
19Toño Amador Torres Portillo48Female
20Florencia del Bejarano65Non-binary
21Clímaco Andreu Gómez18Male
22Xiomara Dominguez Solana35Female
23Leire Castilla Borrego19Non-binary
24Angelita Garmendia Carpio21Non-binary
\n", + "

25 rows × 3 columns

\n", + "
[1000000 rows x 3 columns in total]" + ], + "text/plain": [ + " Name Age Gender\n", + "0 Eloy Santiago-Aragón 31 Male\n", + "1 Amanda Mata Abril 20 Non-binary\n", + "2 Danilo Velázquez Salcedo 58 Male\n", + "3 Leyre Alba España 61 Female\n", + "4 Paulina Amores Pastor 41 Male\n", + "5 Jorge Cuadrado Mena 50 Female\n", + "6 Chucho Catalán 36 Non-binary\n", + "7 Vidal Benavente Lerma 38 Male\n", + "8 Clementina Álamo 32 Female\n", + "9 Petrona Roselló-Valls 61 Male\n", + "10 Luís Camilo Sastre Marin 45 Male\n", + "11 Gil Baudelio Carbajo Ordóñez 58 Non-binary\n", + "12 David del Donoso 44 Female\n", + "13 Dolores Arnau Ros 21 Non-binary\n", + "14 Febe de León 46 Non-binary\n", + "15 Ariadna Almazán 34 Female\n", + "16 Blas Serna Aguiló 24 Non-binary\n", + "17 Paulino Barreda Almeida 59 Female\n", + "18 Eligio Valcárcel Tormo 35 Non-binary\n", + "19 Toño Amador Torres Portillo 48 Female\n", + "20 Florencia del Bejarano 65 Non-binary\n", + "21 Clímaco Andreu Gómez 18 Male\n", + "22 Xiomara Dominguez Solana 35 Female\n", + "23 Leire Castilla Borrego 19 Non-binary\n", + "24 Angelita Garmendia Carpio 21 Non-binary\n", + "...\n", + "\n", + "[1000000 rows x 3 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql = f\"\"\"\n", + "WITH T0 AS ({df.sql}),\n", + "T1 AS (\n", + " SELECT PARSE_JSON(json_row) AS json_row\n", + " FROM T0, UNNEST(JSON_EXTRACT_ARRAY(json_data)) AS json_row\n", + ")\n", + "SELECT STRING(json_row.Name) AS Name,\n", + " INT64(json_row.Age) AS Age,\n", + " STRING(json_row.Gender) AS Gender\n", + "FROM T1\n", + "\"\"\"\n", + "df_result = bpd.read_gbq(sql)\n", + "df_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There you have it, 1 million synthetic data rows ready to use, or save them in a BigQuery table for future use." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index fa9c0a57d8..91ad6bc0e6 100644 --- a/noxfile.py +++ b/noxfile.py @@ -723,6 +723,10 @@ def notebook(session: nox.Session): # The experimental notebooks imagine features that don't yet # exist or only exist as temporary prototypes. "notebooks/experimental/longer_ml_demo.ipynb", + # The notebooks that are added for more use cases, such as backing a + # blog post, which may take longer to execute and need not be + # continuously tested. + "notebooks/apps/synthetic_data_generation.ipynb", ] # Convert each Path notebook object to a string using a list comprehension. @@ -764,6 +768,8 @@ def notebook(session: nox.Session): "--nbmake-timeout=900", # 15 minutes ] + logging_name_env_var = "BIGFRAMES_PERFORMANCE_LOG_NAME" + try: # Populate notebook parameters and make a backup so that the notebooks # are runnable. 
@@ -773,13 +779,21 @@ def notebook(session: nox.Session): *notebooks, ) - # Run self-contained notebooks in single session.run - # achieve parallelization via -n - session.run( - *pytest_command, - "-nauto", - *notebooks, - ) + # Run notebooks in parallel session.run's, since each notebook + # takes an environment variable for performance logging + processes = [] + for notebook in notebooks: + session.env[logging_name_env_var] = os.path.basename(notebook) + process = Process( + target=session.run, + args=(*pytest_command, notebook), + ) + process.start() + processes.append(process) + + for process in processes: + process.join() + finally: # Prevent our notebook changes from getting checked in to git # accidentally. @@ -789,11 +803,12 @@ def notebook(session: nox.Session): *notebooks, ) - # Run regionalized notebooks in parallel session.run's, since each notebook - # takes a different region via env param. + # Additionally run regionalized notebooks in parallel session.run's. + # Each notebook takes a different region via env param. processes = [] for notebook, regions in notebooks_reg.items(): for region in regions: + session.env[logging_name_env_var] = os.path.basename(notebook) process = Process( target=session.run, args=(*pytest_command, notebook), @@ -805,6 +820,35 @@ def notebook(session: nox.Session): for process in processes: process.join() + # when run via pytest, notebooks output a .bytesprocessed report + # collect those reports and print a summary + _print_bytes_processed_report() + + +def _print_bytes_processed_report(): + """Add an informational report about http queries and bytes + processed to the testlog output for purposes of measuring + bigquery-related performance changes. + """ + print("---BIGQUERY USAGE REPORT---") + cumulative_queries = 0 + cumulative_bytes = 0 + for report in Path("notebooks/").glob("*/*.bytesprocessed"): + with open(report, "r") as f: + filename = report.stem + lines = f.read().splitlines() + query_count = len(lines) + total_bytes = sum([int(line) for line in lines]) + format_string = f"{filename} - query count: {query_count}, bytes processed sum: {total_bytes}" + print(format_string) + cumulative_bytes += total_bytes + cumulative_queries += query_count + print( + "---total queries: {total_queries}, total bytes: {total_bytes}---".format( + total_queries=cumulative_queries, total_bytes=cumulative_bytes + ) + ) + @nox.session(python="3.10") def release_dry_run(session): diff --git a/setup.py b/setup.py index 83049f9715..2ccf63259c 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "gcsfs >=2023.3.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0dev", - "google-cloud-bigquery[bqstorage,pandas] >=3.10.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.16.0", "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-iam >=2.12.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 1e1f3a3e66..f5007ed564 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -4,7 +4,7 @@ fsspec==2023.3.0 gcsfs==2023.3.0 geopandas==0.12.2 google-auth==2.15.0 -google-cloud-bigquery==3.10.0 +google-cloud-bigquery==3.16.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index cf6b2a01f8..ec9acc292e 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1300,3 +1300,39 @@ def 
square_num(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square_num ) + + +@pytest.mark.parametrize( + ("max_batching_rows"), + [ + 10_000, + None, + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_max_batching_rows(session, scalars_dfs, max_batching_rows): + try: + + def square(x): + return x * x + + square_remote = session.remote_function( + [int], int, reuse=False, max_batching_rows=max_batching_rows + )(square) + + bq_routine = session.bqclient.get_routine( + square_remote.bigframes_remote_function + ) + assert bq_routine.remote_function_options.max_batching_rows == max_batching_rows + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_too"].apply(square_remote).to_pandas() + pd_result = scalars_pandas_df["int64_too"].apply(square) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square_remote + ) diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py new file mode 100644 index 0000000000..62ef7d5c72 --- /dev/null +++ b/tests/system/load/test_llm.py @@ -0,0 +1,68 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +import bigframes.ml.llm + + +@pytest.fixture(scope="session") +def llm_fine_tune_df_default_index( + session: bigframes.Session, +) -> bigframes.dataframe.DataFrame: + sql = """ +SELECT + CONCAT("Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: ", text) as prompt, + CAST(label AS STRING) as label +FROM `llm_tuning.emotion_classification_train` +""" + return session.read_gbq(sql) + + +@pytest.fixture(scope="session") +def llm_remote_text_pandas_df(): + """Additional data matching the penguins dataset, with a new index""" + return pd.DataFrame( + { + "prompt": [ + "Please do sentiment analysis on the following text and only output a number from 0 to 5where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey", + "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: i was feeling a little vain when i did this one", + "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. 
Text: a father of children killed in an accident", + ], + } + ) + + +def test_llm_palm_configure_fit( + llm_fine_tune_df_default_index, llm_remote_text_pandas_df +): + model = bigframes.ml.llm.PaLM2TextGenerator( + model_name="text-bison", max_iterations=1 + ) + + df = llm_fine_tune_df_default_index.dropna() + X_train = df[["prompt"]] + y_train = df[["label"]] + model.fit(X_train, y_train) + + assert model is not None + + df = model.predict(llm_remote_text_pandas_df).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() == 1) + + # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b9e4889801..6f6b67597a 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e70764fcc0..4c598a682d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2504,7 +2504,10 @@ def test_df_melt_default(scalars_dfs): # Pandas produces int64 index, Bigframes produces Int64 (nullable) pd.testing.assert_frame_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, ) @@ -2984,10 +2987,14 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ord bf_result = bf_series.to_pandas(ordered=ordered) # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_series = pd_series.astype("Float64") # Pandas has object index type + pd_series.index = pd_series.index.astype(pd.StringDtype(storage="pyarrow")) assert_series_equal( - pd_series, bf_result, check_index_type=False, ignore_order=not ordered + pd_series, + bf_result, + check_index_type=False, + ignore_order=not ordered, + check_dtype=False, ) @@ -3029,6 +3036,31 @@ def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): ) +def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): + q = 0.45 + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): + q = [0, 0.33, 0.67, 1.0] + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pd_result.index = pd_result.index.astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("op"), [ @@ -3051,7 +3083,7 @@ def test_dataframe_bool_aggregates(scalars_df_index, 
scalars_pandas_df_index, op pd_series = op(scalars_pandas_df_index).astype("boolean") bf_result = bf_series.to_pandas() - # Pandas has object index type + pd_series.index = pd_series.index.astype(bf_result.index.dtype) pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index ba79ba1ab1..7b36a06f49 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -65,6 +65,24 @@ def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): assert ((pd_min <= bf_result_computed) & (bf_result_computed <= pd_max)).all().all() +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + col_names = ["int64_too", "float64_col", "int64_col", "string_col"] + bf_result = ( + scalars_df_index[col_names].groupby("string_col").quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("string_col").quantile(q) + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("operator"), [ @@ -389,3 +407,20 @@ def test_dataframe_groupby_nonnumeric_with_mean(): pd.testing.assert_frame_equal( pd_result, bf_result, check_index_type=False, check_dtype=False ) + + +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + bf_result = ( + scalars_df_index.groupby("string_col")["int64_col"].quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].quantile(q) + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d27cd0a236..9cb615fdcb 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1320,14 +1320,34 @@ def test_median(scalars_dfs): assert pd_min < bf_result < pd_max +def test_median_exact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].median(exact=True) + pd_result = scalars_pandas_df[col_name].median() + assert math.isclose(pd_result, bf_result) + + +def test_series_quantile(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name] + + pd_result = pd_series.quantile([0.0, 0.4, 0.6, 1.0]) + bf_result = bf_series.quantile([0.0, 0.4, 0.6, 1.0]) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) - bf_result = scalars_df[col_name] - scalars_df[col_name].median() + bf_result = scalars_df[col_name] + 42 assert bf_result.size == scalars_df[col_name].size - # TODO(b/323387826): The precision increased by 1 unexpectedly. 
- # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) def test_repr(scalars_dfs): @@ -1502,12 +1522,32 @@ def test_groupby_mean(scalars_dfs): ) -def test_groupby_median(scalars_dfs): +def test_groupby_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_series = ( + bf_result = ( scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() ) + pd_result = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .median() + ) + + assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_groupby_median_inexact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby(scalars_df["string_col"], dropna=False) + .median(exact=False) + ) pd_max = ( scalars_pandas_df[col_name] .groupby(scalars_pandas_df["string_col"], dropna=False) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index ce415f9324..1e76a8bd8b 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -236,14 +236,13 @@ def test_read_gbq_w_anonymous_query_results_table(session: bigframes.Session): def test_read_gbq_w_primary_keys_table( session: bigframes.Session, usa_names_grouped_table: bigquery.Table ): + # Validate that the table we're querying has a primary key. table = usa_names_grouped_table - # TODO(b/305264153): Use public properties to fetch primary keys once - # added to google-cloud-bigquery. - primary_keys = ( - table._properties.get("tableConstraints", {}) - .get("primaryKey", {}) - .get("columns") - ) + table_constraints = table.table_constraints + assert table_constraints is not None + primary_key = table_constraints.primary_key + assert primary_key is not None + primary_keys = primary_key.columns assert len(primary_keys) != 0 df = session.read_gbq(f"{table.project}.{table.dataset_id}.{table.table_id}") diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index cf13084610..7d9a452f42 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -13,10 +13,13 @@ # limitations under the License. import re +import warnings import pytest +import bigframes import bigframes._config.bigquery_options as bigquery_options +import bigframes.exceptions @pytest.mark.parametrize( @@ -78,3 +81,51 @@ def test_setter_if_session_started_but_setting_the_same_value(attribute): setattr(options, attribute, original_object) assert getattr(options, attribute) is original_object + + +@pytest.mark.parametrize( + [ + "valid_location", + ], + [ + (None,), + ("us-central1",), + ], +) +def test_location_set_to_valid_no_warning(valid_location): + options = bigquery_options.BigQueryOptions() + # Ensure that no warnings are emitted. + # https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html#additional-use-cases-of-warnings-in-tests + with warnings.catch_warnings(): + # Turn matching UnknownLocationWarning into exceptions. + # https://docs.python.org/3/library/warnings.html#warning-filter + warnings.simplefilter( + "error", category=bigframes.exceptions.UnknownLocationWarning + ) + options.location = valid_location + + +@pytest.mark.parametrize( + [ + "invalid_location", + ], + [ + # Test with common mistakes, see article. 
+ # https://en.wikipedia.org/wiki/Edit_distance#Formal_definition_and_properties + # Substitution + ("us-wist-3",), + # Insertion + ("us-central-1",), + # Deletion + ("asia-suth2",), + ], +) +def test_location_set_to_invalid_warning(invalid_location): + options = bigquery_options.BigQueryOptions() + with pytest.warns( + bigframes.exceptions.UnknownLocationWarning, + match=re.escape( + f"The location '{invalid_location}' is set to an unknown value." + ), + ): + options.location = invalid_location diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 5b1ff37775..3560f05cb6 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -181,6 +181,29 @@ def test_create_model_transform_correct( ) +def test_create_llm_remote_model_correct( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_creation_sql_generator.create_llm_remote_model( + source_df=mock_df, + connection_name="my_project.us.my_connection", + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_remote_model" + ), + options={"option_key1": "option_value1", "option_key2": 2}, + ) + assert ( + sql + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` +REMOTE WITH CONNECTION `my_project.us.my_connection` +OPTIONS( + option_key1="option_value1", + option_key2=2) +AS input_X_y_sql""" + ) + + def test_create_remote_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 6846659930..28b08e49dc 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -13,7 +13,7 @@ # limitations under the License. import datetime -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence import unittest.mock as mock import google.auth.credentials @@ -37,6 +37,7 @@ def create_bigquery_session( bqclient: Optional[mock.Mock] = None, session_id: str = "abcxyz", + table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA, anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None, ) -> bigframes.Session: credentials = mock.create_autospec( @@ -51,7 +52,7 @@ def create_bigquery_session( table = mock.create_autospec(google.cloud.bigquery.Table, instance=True) table._properties = {} type(table).location = mock.PropertyMock(return_value="test-region") - type(table).schema = mock.PropertyMock(return_value=TEST_SCHEMA) + type(table).schema = mock.PropertyMock(return_value=table_schema) bqclient.get_table.return_value = table if anonymous_dataset is None: @@ -72,7 +73,7 @@ def query_mock(query, *args, **kwargs): if query.startswith("SELECT CURRENT_TIMESTAMP()"): query_job.result = mock.MagicMock(return_value=[[datetime.datetime.now()]]) else: - type(query_job).schema = mock.PropertyMock(return_value=TEST_SCHEMA) + type(query_job).schema = mock.PropertyMock(return_value=table_schema) return query_job diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 3e2b28c200..543196066a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -19,9 +19,11 @@ import google.api_core.exceptions import google.cloud.bigquery +import google.cloud.bigquery.table import pytest import bigframes +import bigframes.exceptions from .. 
import resources @@ -50,6 +52,43 @@ def test_read_gbq_cached_table(): assert "1999-01-02T03:04:05.678901" in df.sql +def test_read_gbq_clustered_table_ok_default_index_with_primary_key(): + """If a primary key is set on the table, we use that as the index column + by default, no error should be raised in this case. + + See internal issue 335727141. + """ + table = google.cloud.bigquery.Table("my-project.my_dataset.my_table") + table.clustering_fields = ["col1", "col2"] + table.schema = ( + google.cloud.bigquery.SchemaField("pk_1", "INT64"), + google.cloud.bigquery.SchemaField("pk_2", "INT64"), + google.cloud.bigquery.SchemaField("col_1", "INT64"), + google.cloud.bigquery.SchemaField("col_2", "INT64"), + ) + + # TODO(b/305264153): use setter for table_constraints in client library + # when available. + table._properties["tableConstraints"] = { + "primaryKey": { + "columns": ["pk_1", "pk_2"], + }, + } + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.return_value = table + session = resources.create_bigquery_session( + bqclient=bqclient, table_schema=table.schema + ) + table._properties["location"] = session._location + + df = session.read_gbq("my-project.my_dataset.my_table") + + # There should be no analytic operators to prevent row filtering pushdown. + assert "OVER" not in df.sql + assert tuple(df.index.names) == ("pk_1", "pk_2") + + @pytest.mark.parametrize( "not_found_table_id", [("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")], diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index 88826b31ce..fddeab19a2 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -3,6 +3,7 @@ import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops from ibis.backends.bigquery.registry import OPERATION_REGISTRY +import ibis.expr.operations.reductions as ibis_reductions def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile): @@ -31,12 +32,19 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): return f"GENERATE_ARRAY(0, {arg})" +def _quantile(translator, op: ibis_reductions.Quantile): + arg = translator.translate(op.arg) + quantile = translator.translate(op.quantile) + return f"PERCENTILE_CONT({arg}, {quantile})" + + patched_ops = { vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore + ibis_reductions.Quantile: _quantile, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 8e3ea06a3d..bd6e50d096 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -92,3 +92,32 @@ def explode(self): The data corresponding to all child fields. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def dtypes(self): + """ + Return the dtype object of each child field of the struct. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.dtypes() + version Int64 + project string[pyarrow] + dtype: object + + Returns: + A *pandas* Series with the data type of all child fields. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6707dc1403..0515f690e3 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4327,16 +4327,16 @@ def min(self, axis=0, *, numeric_only: bool = False): Finding the minimum value in each column (the default behavior without an explicit axis parameter). >>> df.min() - A 1.0 - B 2.0 - dtype: Float64 + A 1 + B 2 + dtype: Int64 Finding the minimum value in each row. >>> df.min(axis=1) - 0 1.0 - 1 3.0 - dtype: Float64 + 0 1 + 1 3 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4372,16 +4372,16 @@ def max(self, axis=0, *, numeric_only: bool = False): Finding the maximum value in each column (the default behavior without an explicit axis parameter). >>> df.max() - A 3.0 - B 4.0 - dtype: Float64 + A 3 + B 4 + dtype: Int64 Finding the maximum value in each row. >>> df.max(axis=1) - 0 2.0 - 1 4.0 - dtype: Float64 + 0 2 + 1 4 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4416,16 +4416,16 @@ def sum(self, axis=0, *, numeric_only: bool = False): Calculating the sum of each column (the default behavior without an explicit axis parameter). >>> df.sum() - A 4.0 - B 6.0 - dtype: Float64 + A 4 + B 6 + dtype: Int64 Calculating the sum of each row. >>> df.sum(axis=1) - 0 3.0 - 1 7.0 - dtype: Float64 + 0 3 + 1 7 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, numeric_only: bool = False, exact: bool = False): + def median(self, *, numeric_only: bool = False, exact: bool = True): """Return the median of the values over colunms. **Examples:** @@ -4500,22 +4500,61 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Finding the median value of each column. >>> df.median() - A 1.0 - B 2.0 + A 2.0 + B 3.0 dtype: Float64 Args: numeric_only (bool. default False): Default False. Include only float, int, boolean columns. - exact (bool. default False): - Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + exact (bool. default True): + Default True. Get the exact median instead of an approximate + one. Returns: bigframes.series.Series: Series with the median of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): + """ + Return values at the given quantile over requested axis. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... 
columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: Float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + [2 rows x 2 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value between 0 <= q <= 1, the quantile(s) to compute. + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame: + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def var(self, axis=0, *, numeric_only: bool = False): """Return unbiased variance over requested axis. @@ -4710,10 +4749,10 @@ def count(self, *, numeric_only: bool = False): Counting non-NA values for each column: >>> df.count() - A 4.0 - B 5.0 - C 3.0 - dtype: Float64 + A 4 + B 5 + C 3 + dtype: Int64 Args: numeric_only (bool, default False): @@ -5013,17 +5052,17 @@ def melt(self, id_vars, value_vars, var_name, value_name): Using `melt` with `id_vars` and `value_vars`: >>> df.melt(id_vars='A', value_vars=['B', 'C']) - A variable value - 0 1.0 B 1 - 1 B 2 - 2 3.0 B 3 - 3 4.0 B 4 - 4 5.0 B 5 - 5 1.0 C - 6 C 3 - 7 3.0 C - 8 4.0 C 4 - 9 5.0 C 5 + A variable value + 0 1.0 B 1.0 + 1 B 2.0 + 2 3.0 B 3.0 + 3 4.0 B 4.0 + 4 5.0 B 5.0 + 5 1.0 C + 6 C 3.5 + 7 3.0 C + 8 4.0 C 4.5 + 9 5.0 C 5.0 [10 rows x 3 columns] @@ -5064,9 +5103,9 @@ def nunique(self): [3 rows x 2 columns] >>> df.nunique() - A 3.0 - B 2.0 - dtype: Float64 + A 3 + B 2 + dtype: Int64 Returns: bigframes.series.Series: Series with number of distinct elements. @@ -5275,9 +5314,9 @@ def agg(self, func): Using a single function: >>> df.agg('sum') - A 6.0 - B 6.0 - dtype: Float64 + A 6 + B 6 + dtype: Int64 Using a list of functions: diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 9c6120fd6c..54c876ef3c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -61,6 +61,7 @@ def __iter__(self) -> Iterator: iterator **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ed4ca66f38..f3f7748e34 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -68,7 +68,7 @@ def median( self, numeric_only: bool = False, *, - exact: bool = False, + exact: bool = True, ): """ Compute median of groups, excluding missing values. @@ -76,15 +76,45 @@ def median( Args: numeric_only (bool, default False): Include only float, int, boolean columns. - exact (bool, default False): - Calculate the exact median instead of an approximation. Note: - ``exact=True`` is not supported. + exact (bool, default True): + Calculate the exact median instead of an approximation. Returns: pandas.Series or pandas.DataFrame: Median of groups. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile(self, q=0.5, *, numeric_only: bool = False): + """ + Return group values at the given quantile, a la numpy.percentile. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + + [2 rows x 1 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value(s) between 0 and 1 providing the quantile(s) to compute. + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame: Return type determined by caller of GroupBy object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def std( self, *, diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 3f0175359a..f34612cb11 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -37,7 +37,7 @@ def dayofweek(self): """The day of the week with Monday=0, Sunday=6. Return the day of the week. It is assumed the week starts on - Monday, which is denoted by 0 and ends on Sunday which is denoted + Monday, which is denoted by 0 and ends on Sunday, which is denoted by 6. **Examples:** diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 46bc9714f8..0c5b8d4521 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3,7 +3,16 @@ """ from __future__ import annotations -from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING +from typing import ( + Hashable, + IO, + Literal, + Mapping, + Optional, + Sequence, + TYPE_CHECKING, + Union, +) from bigframes_vendored.pandas.core.generic import NDFrame import numpy @@ -584,9 +593,9 @@ def agg(self, func): 1 >>> s.agg(['min', 'max']) - min 1.0 - max 4.0 - dtype: Float64 + min 1 + max 4 + dtype: Int64 Args: func (function): @@ -853,6 +862,7 @@ def autocorr(self, lag: int = 1) -> float: the Series and its shifted self. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -2803,6 +2813,7 @@ def combine_first(self, other) -> Series: of the two indexes. **Examples:** + >>> import bigframes.pandas as bpd >>> import numpy as np >>> bpd.options.display.progress_bar = None @@ -2843,6 +2854,7 @@ def update(self, other) -> None: on index. **Examples:** + >>> import bigframes.pandas as bpd >>> import pandas as pd >>> import numpy as np @@ -3138,19 +3150,51 @@ def mean(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, exact: bool = False): + def median(self, *, exact: bool = True): """Return the median of the values over the requested axis. Args: - exact (bool. default False): - Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + exact (bool. default True): + Default True. Get the exact median instead of an approximate + one. Returns: scalar: Scalar. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, + q: Union[float, Sequence[float]] = 0.5, + ) -> Union[Series, float]: + """ + Return value at the given quantile. 
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> s = bpd.Series([1, 2, 3, 4])
+        >>> s.quantile(.5)
+        2.5
+        >>> s.quantile([.25, .5, .75])
+        0.25    1.75
+        0.5      2.5
+        0.75    3.25
+        dtype: Float64
+
+        Args:
+            q (float or array-like, default 0.5 (50% quantile)):
+                The quantile(s) to compute, which can lie in range: 0 <= q <= 1.
+
+        Returns:
+            float or Series:
+                If ``q`` is an array, a Series will be returned where the
+                index is ``q`` and the values are the quantiles, otherwise
+                a float will be returned.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def prod(self):
         """Return the product of the values over the requested axis.

diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py
index b5feeb13c5..93cee71289 100644
--- a/third_party/bigframes_vendored/pandas/io/gbq.py
+++ b/third_party/bigframes_vendored/pandas/io/gbq.py
@@ -27,13 +27,17 @@ def read_gbq(
 ):
     """Loads a DataFrame from BigQuery.

-    BigQuery tables are an unordered, unindexed data source. By default,
-    the DataFrame will have an arbitrary index and ordering.
-
-    Set the `index_col` argument to one or more columns to choose an
-    index. The resulting DataFrame is sorted by the index columns. For the
-    best performance, ensure the index columns don't contain duplicate
-    values.
+    BigQuery tables are an unordered, unindexed data source. To add support
+    for pandas compatibility, the following indexing options are supported:
+
+    * (Default behavior) Add an arbitrary sequential index and ordering
+      using an analytic windowed operation that prevents filtering
+      push down.
+    * (Recommended) Set the ``index_col`` argument to one or more columns.
+      Unique values for the row labels are recommended. Duplicate labels
+      are possible, but note that joins on a non-unique index can duplicate
+      rows and operations like ``cumsum()`` that window across a non-unique
+      index can have some non-determinism.

     .. note::
         By default, even SQL query inputs with an ORDER BY clause create a
@@ -105,6 +109,9 @@ def read_gbq(
             In tha case, will read all the matched table as one DataFrame.
         index_col (Iterable[str] or str):
             Name of result column(s) to use for index in results DataFrame.
+
+            **New in bigframes version 1.3.0**: If ``index_col`` is not
+            set, the primary key(s) of the table are used as the index.
         columns (Iterable[str]):
             List of BigQuery column names in the desired order for results
             DataFrame.
diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py
index 19f56965df..bf016357a6 100644
--- a/third_party/bigframes_vendored/pandas/plotting/_core.py
+++ b/third_party/bigframes_vendored/pandas/plotting/_core.py
@@ -11,6 +11,7 @@ class PlotAccessor:
     For Series:

         >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
         >>> ser = bpd.Series([1, 2, 3, 3])
         >>> plot = ser.plot(kind='hist', title="My plot")

@@ -57,6 +58,7 @@ def hist(

             >>> import bigframes.pandas as bpd
             >>> import numpy as np
+            >>> bpd.options.display.progress_bar = None
             >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one'])
             >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000)
             >>> ax = df.plot.hist(bins=12, alpha=0.5)
@@ -93,6 +95,7 @@ def line(
         **Examples:**

             >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
             >>> df = bpd.DataFrame(
             ...     {
             ...         'one': [1, 2, 3, 4],
@@ -160,6 +163,7 @@ def area(
         Draw an area plot based on basic business metrics:

             >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
             >>> df = bpd.DataFrame(
             ...     {
             ...         'sales': [3, 2, 3, 9, 10, 6],
diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py
index fd8db7a227..1a151a1119 100644
--- a/third_party/bigframes_vendored/sklearn/base.py
+++ b/third_party/bigframes_vendored/sklearn/base.py
@@ -153,7 +153,7 @@ def fit_transform(self, X, y=None):
                 Target values (None for unsupervised transformations).

         Returns:
-            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new)
+            bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new).
                 Transformed DataFrame.
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py
index 00bbf8cd60..8e8b2c1952 100644
--- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py
+++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py
@@ -122,7 +122,7 @@ def recall_score(
 ):
     """Compute the recall.

-    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
+    The recall is the ratio ``tp / (tp + fn)``, where ``tp`` is the number of
     true positives and ``fn`` the number of false negatives. The recall is
     intuitively the ability of the classifier to find all the positive
     samples.
@@ -170,7 +170,7 @@ def precision_score(
 ):
     """Compute the precision.

-    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
+    The precision is the ratio ``tp / (tp + fp)``, where ``tp`` is the number of
     true positives and ``fp`` the number of false positives. The precision is
     intuitively the ability of the classifier not to label as positive a sample
     that is negative.
@@ -244,9 +244,9 @@ def f1_score(
         dtype: float64

     Args:
-        y_true: Series or DataFrame of shape (n_samples,)
+        y_true: Series or DataFrame of shape (n_samples,).
             Ground truth (correct) target values.
-        y_pred: Series or DataFrame of shape (n_samples,)
+        y_pred: Series or DataFrame of shape (n_samples,).
             Estimated targets as returned by a classifier.
         average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \
                 default='binary'
diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py
index aed1565960..8a98ee4141 100644
--- a/third_party/bigframes_vendored/sklearn/pipeline.py
+++ b/third_party/bigframes_vendored/sklearn/pipeline.py
@@ -20,13 +20,14 @@ class Pipeline(BaseEstimator, metaclass=ABCMeta):
     """Pipeline of transforms with a final estimator.

     Sequentially apply a list of transforms and a final estimator.
-    Intermediate steps of the pipeline must be `transforms`, that is, they
+    Intermediate steps of the pipeline must be `transforms`. That is, they
     must implement `fit` and `transform` methods.
     The final estimator only needs to implement `fit`.

     The purpose of the pipeline is to assemble several steps that can be
-    cross-validated together while setting different parameters. This simplifies code, and allows deploying an estimator
-    and peprocessing together, e.g. with `Pipeline.to_gbq(...).`
+    cross-validated together while setting different parameters. This
+    simplifies code and allows for deploying an estimator and preprocessing
+    together, e.g.
with `Pipeline.to_gbq(...).` """ def fit( diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 5e5e8ac042..b883e82249 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -23,15 +23,21 @@ class OneHotEncoder(BaseEstimator): Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. - .. code-block:: - - from bigframes.ml.preprocessing import OneHotEncoder - import bigframes.pandas as bpd - - enc = OneHotEncoder() - X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) - enc.fit(X) - print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + >>> from bigframes.ml.preprocessing import OneHotEncoder + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> enc = OneHotEncoder() + >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) + >>> enc.fit(X) + OneHotEncoder() + + >>> print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + onehotencoded_a onehotencoded_b + 0 [{'index': 1, 'value': 1.0}] [{'index': 1, 'value': 1.0}] + 1 [{'index': 2, 'value': 1.0}] [{'index': 0, 'value': 1.0}] + + [2 rows x 2 columns] Args: drop (Optional[Literal["most_frequent"]], default None): @@ -52,7 +58,7 @@ class OneHotEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. - Default None, set limit to 1,000,000. + Default None. Set limit to 1,000,000. """ def fit(self, X, y=None): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index cc6b995c8c..61a44db92f 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -26,7 +26,7 @@ class LabelEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. - Default None, set limit to 1,000,000. + Default None. Set limit to 1,000,000. """ def fit(self, y):
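The quantile and exact-median behavior documented in the frame.py, series.py, and groupby docstrings above can be exercised roughly as in the following minimal Python sketch. It assumes a configured BigQuery session and bigframes >= 1.3.0; the data values are made up for illustration and are not taken from the patch.

    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None

    # DataFrame.quantile: a scalar q returns a Series (one value per column);
    # a list of quantiles returns a DataFrame indexed by q.
    df = bpd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 10, 100, 100]})
    print(df.quantile(0.1))
    print(df.quantile([0.1, 0.5]))

    # Series.quantile: a scalar q returns a float.
    s = bpd.Series([1, 2, 3, 4])
    print(s.quantile(0.5))  # 2.5

    # median() now computes the exact median by default (exact=True);
    # pass exact=False for the approximate variant.
    print(df.median())
    print(df.median(exact=False))

    # GroupBy.quantile, per the groupby docstring above.
    grouped = bpd.DataFrame(
        [["a", 1], ["a", 2], ["a", 3], ["b", 1], ["b", 3], ["b", 5]],
        columns=["key", "val"],
    ).groupby("key")
    print(grouped.quantile(0.5))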
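The read_gbq indexing options described in io/gbq.py translate to usage along these lines. This is a sketch only: the table and column names are placeholders, and running it requires access to a BigQuery project.

    import bigframes.pandas as bpd

    # Recommended: pick the index explicitly. Unique values in the index
    # columns avoid row duplication on joins and non-determinism in window
    # operations such as cumsum().
    df = bpd.read_gbq(
        "my-project.my_dataset.my_table",  # placeholder table name
        index_col="id",                    # placeholder column name
    )

    # Default behavior as of bigframes 1.3.0: with index_col unset, the
    # table's primary key(s), if defined, become the index; otherwise an
    # arbitrary sequential index and ordering are generated.
    df_default = bpd.read_gbq("my-project.my_dataset.my_table")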
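The Pipeline description in sklearn/pipeline.py corresponds to usage of this shape; the table, column, and model names below are placeholders, and the scaler/regressor choice is only an example, not something mandated by the patch.

    import bigframes.pandas as bpd
    from bigframes.ml.linear_model import LinearRegression
    from bigframes.ml.pipeline import Pipeline
    from bigframes.ml.preprocessing import StandardScaler

    df = bpd.read_gbq("my-project.my_dataset.training_table")  # placeholder
    X = df[["feature1", "feature2"]]  # placeholder feature columns
    y = df[["label"]]                 # placeholder target column

    # Intermediate steps are transforms (fit/transform); the final step is
    # the estimator (fit only).
    pipeline = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("model", LinearRegression()),
        ]
    )
    pipeline.fit(X, y)

    # Deploy the preprocessing steps and the estimator together as a single
    # BigQuery ML model.
    pipeline.to_gbq("my-project.my_dataset.my_pipeline_model", replace=True)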