diff --git a/CHANGELOG.md b/CHANGELOG.md index 29f99ecc43..771f04776e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,28 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.20.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.19.2...v0.20.0) (2024-01-30) + + +### Features + +* Add `DataFrame.peek()` as an efficient alternative to `head()` results preview ([#318](https://github.com/googleapis/python-bigquery-dataframes/issues/318)) ([9c34d83](https://github.com/googleapis/python-bigquery-dataframes/commit/9c34d834e83ca5514bee723ebb9a7ad1ad50e88d)) +* Add ARIMA_EVAULATE options in forecasting models ([#336](https://github.com/googleapis/python-bigquery-dataframes/issues/336)) ([73e997b](https://github.com/googleapis/python-bigquery-dataframes/commit/73e997b3e80f844a8120b52ed2ece8b046cf4ca9)) +* Add Index constructor, repr, copy, get_level_values, to_series ([#334](https://github.com/googleapis/python-bigquery-dataframes/issues/334)) ([e5d054e](https://github.com/googleapis/python-bigquery-dataframes/commit/e5d054e93a05f5c504e8db57b954c07d33e5f5b9)) +* Improve error message for drive based BQ table reads ([#344](https://github.com/googleapis/python-bigquery-dataframes/issues/344)) ([0794788](https://github.com/googleapis/python-bigquery-dataframes/commit/0794788a2d232d795d803cd0c5b3f7d51c562cf1)) +* Update cut to work without labels = False and show intervals as dict ([#335](https://github.com/googleapis/python-bigquery-dataframes/issues/335)) ([4ff53db](https://github.com/googleapis/python-bigquery-dataframes/commit/4ff53db48133b817bec5f123b634690244a610d3)) + + +### Bug Fixes + +* Chance default connection name in getting_started.ipnyb ([#347](https://github.com/googleapis/python-bigquery-dataframes/issues/347)) ([677f014](https://github.com/googleapis/python-bigquery-dataframes/commit/677f0146acf19def88fddbeb0527a078458948ae)) +* Series iteration correctly returns values instead of index ([#339](https://github.com/googleapis/python-bigquery-dataframes/issues/339)) ([2c6af9b](https://github.com/googleapis/python-bigquery-dataframes/commit/2c6af9ba8b362dae39a6e082cdc816c955c73517)) + + +### Documentation + +* Add code samples for `Series.{between, cumprod}` ([#353](https://github.com/googleapis/python-bigquery-dataframes/issues/353)) ([09a52fd](https://github.com/googleapis/python-bigquery-dataframes/commit/09a52fda19cde8efa6b20731d5b8e21f50b18a9a)) + ## [0.19.2](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.19.1...v0.19.2) (2024-01-22) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 8c08d073d7..8c399e34ab 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -106,10 +106,10 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: return self._compile_ordered().get_column_type(key) def _compile_ordered(self) -> compiling.OrderedIR: - return compiling.compile_ordered(self.node) + return compiling.compile_ordered_ir(self.node) def _compile_unordered(self) -> 
compiling.UnorderedIR: - return compiling.compile_unordered(self.node) + return compiling.compile_unordered_ir(self.node) def row_count(self) -> ArrayValue: """Get number of rows in ArrayValue as a single-entry ArrayValue.""" diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 3ec0419c6d..9e17dc2752 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -287,7 +287,6 @@ def reset_index(self, drop: bool = True) -> Block: A new Block because dropping index columns can break references from Index classes that point to this block. """ - block = self new_index_col_id = guid.generate_guid() expr = self._expr.promote_offsets(new_index_col_id) if drop: @@ -295,7 +294,7 @@ def reset_index(self, drop: bool = True) -> Block: # ordering expression as reset_index shouldn't change the row # order. expr = expr.drop_columns(self.index_columns) - block = Block( + return Block( expr, index_columns=[new_index_col_id], column_labels=self.column_labels, @@ -321,13 +320,12 @@ def reset_index(self, drop: bool = True) -> Block: # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html column_labels_modified = column_labels_modified.insert(level, label) - block = Block( + return Block( expr, index_columns=[new_index_col_id], column_labels=column_labels_modified, index_labels=[None], ) - return block def set_index( self, @@ -432,8 +430,18 @@ def to_pandas( downsampling=sampling, ordered=ordered ) ) + df.set_axis(self.column_labels, axis=1, copy=False) return df, query_job + def try_peek(self, n: int = 20) -> typing.Optional[pd.DataFrame]: + if self.expr.node.peekable: + iterator, _ = self.session._peek(self.expr, n) + df = self._to_dataframe(iterator) + self._copy_index_to_pandas(df) + return df + else: + return None + def to_pandas_batches(self): """Download results one message at a time.""" dtypes = dict(zip(self.index_columns, self.index_dtypes)) diff --git a/bigframes/core/compile/__init__.py b/bigframes/core/compile/__init__.py index 761fd9a465..c3e2bd832a 100644 --- a/bigframes/core/compile/__init__.py +++ b/bigframes/core/compile/__init__.py @@ -13,11 +13,11 @@ # limitations under the License. from bigframes.core.compile.compiled import OrderedIR, UnorderedIR -from bigframes.core.compile.compiler import compile_ordered, compile_unordered +from bigframes.core.compile.compiler import compile_ordered_ir, compile_unordered_ir __all__ = [ - "compile_ordered", - "compile_unordered", + "compile_ordered_ir", + "compile_unordered_ir", "OrderedIR", "UnorderedIR", ] diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py new file mode 100644 index 0000000000..044c33799e --- /dev/null +++ b/bigframes/core/compile/aggregate_compiler.py @@ -0,0 +1,413 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
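+
+# Aggregate compiler for BigQuery DataFrames: translates aggregation and window
+# ops from bigframes.operations.aggregations into ibis expressions. compile_agg
+# is a functools.singledispatch entry point with one registered handler per op
+# type; the numeric_op decorator guards numeric-only ops (casting BOOL inputs
+# to INT64 first), and _apply_window_if_present wraps a result in an OVER
+# clause when a window spec is supplied.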
+import functools +import typing +from typing import cast, Optional + +import ibis +import ibis.expr.datatypes as ibis_dtypes +import ibis.expr.types as ibis_types +import pandas as pd + +import bigframes.constants as constants +import bigframes.core.window_spec as window_spec +import bigframes.dtypes as dtypes +import bigframes.operations.aggregations as agg_ops +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops + + +def compile_unary_aggregate( + op: agg_ops.AggregateOp, input: ibis_types.Column +) -> ibis_types.Value: + return compile_agg(op, input) + + +def compile_unary_analytic( + op: agg_ops.WindowOp, input: ibis_types.Column, window: window_spec.WindowSpec +) -> ibis_types.Value: + return compile_agg(op, input, window) + + +@functools.singledispatch +def compile_agg( + op: agg_ops.WindowOp, + input: ibis_types.Column, + window: Optional[window_spec.WindowSpec] = None, +) -> ibis_types.Value: + """Defines transformation but isn't cached, always use compile_node instead""" + raise ValueError(f"Can't compile unrecognized operation: {op}") + + +def numeric_op(operation): + @functools.wraps(operation) + def constrained_op(op, column: ibis_types.Column, window=None): + if column.type().is_boolean(): + column = typing.cast( + ibis_types.NumericColumn, column.cast(ibis_dtypes.int64) + ) + if column.type().is_numeric(): + return operation(op, column, window) + else: + raise ValueError( + f"Numeric operation cannot be applied to type {column.type()}. {constants.FEEDBACK_LINK}" + ) + + return constrained_op + + +@compile_agg.register +@numeric_op +def _( + op: agg_ops.SumOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + # Will be null if all inputs are null. Pandas defaults to zero sum though. + bq_sum = _apply_window_if_present(column.sum(), window) + return ( + ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore + ) + + +@compile_agg.register +@numeric_op +def _( + op: agg_ops.MedianOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + # PERCENTILE_CONT has very few allowed windows. For example, "window + # framing clause is not allowed for analytic function percentile_cont". + if window is not None: + raise NotImplementedError( + f"Median with windowing is not supported. {constants.FEEDBACK_LINK}" + ) + + # TODO(swast): Allow switching between exact and approximate median. + # For now, the best we can do is an approximate median when we're doing + # an aggregation, as PERCENTILE_CONT is only an analytic function. + return cast(ibis_types.NumericValue, column.approx_median()) + + +@compile_agg.register +@numeric_op +def _( + op: agg_ops.ApproxQuartilesOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + # PERCENTILE_CONT has very few allowed windows. For example, "window + # framing clause is not allowed for analytic function percentile_cont". + if window is not None: + raise NotImplementedError( + f"Approx Quartiles with windowing is not supported. 
{constants.FEEDBACK_LINK}" + ) + value = vendored_ibis_ops.ApproximateMultiQuantile( + column, num_bins=4 # type: ignore + ).to_expr()[op._quartile] + return cast(ibis_types.NumericValue, value) + + +@compile_agg.register +@numeric_op +def _( + op: agg_ops.MeanOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + return _apply_window_if_present(column.mean(), window) + + +@compile_agg.register +@numeric_op +def _( + op: agg_ops.ProductOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + # Need to short-circuit as log with zeroes is illegal sql + is_zero = cast(ibis_types.BooleanColumn, (column == 0)) + + # There is no product sql aggregate function, so must implement as a sum of logs, and then + # apply power after. Note, log and power base must be equal! This impl uses base 2. + logs = cast( + ibis_types.NumericColumn, + ibis.case().when(is_zero, 0).else_(column.abs().log2()).end(), + ) + logs_sum = _apply_window_if_present(logs.sum(), window) + magnitude = cast(ibis_types.NumericValue, ibis_types.literal(2)).pow(logs_sum) + + # Can't determine sign from logs, so have to determine parity of count of negative inputs + is_negative = cast( + ibis_types.NumericColumn, + ibis.case().when(column.sign() == -1, 1).else_(0).end(), + ) + negative_count = _apply_window_if_present(is_negative.sum(), window) + negative_count_parity = negative_count % cast( + ibis_types.NumericValue, ibis.literal(2) + ) # 1 if result should be negative, otherwise 0 + + any_zeroes = _apply_window_if_present(is_zero.any(), window) + float_result = ( + ibis.case() + .when(any_zeroes, ibis_types.literal(0)) + .else_(magnitude * pow(-1, negative_count_parity)) + .end() + ) + return float_result.cast(column.type()) # type: ignore + + +@compile_agg.register +def _(op: agg_ops.MaxOp, column: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present(column.max(), window) + + +@compile_agg.register +def _(op: agg_ops.MinOp, column: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present(column.min(), window) + + +@compile_agg.register +@numeric_op +def _(op: agg_ops.StdOp, x: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present(cast(ibis_types.NumericColumn, x).std(), window) + + +@compile_agg.register +@numeric_op +def _(op: agg_ops.VarOp, x: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present(cast(ibis_types.NumericColumn, x).var(), window) + + +@compile_agg.register +@numeric_op +def _(op: agg_ops.PopVarOp, x: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present( + cast(ibis_types.NumericColumn, x).var(how="pop"), window + ) + + +@compile_agg.register +def _( + op: agg_ops.CountOp, column: ibis_types.Column, window=None +) -> ibis_types.IntegerValue: + return _apply_window_if_present(column.count(), window) + + +@compile_agg.register +def _(op: agg_ops.CutOp, x: ibis_types.Column, window=None): + out = ibis.case() + + if op._bins_int > 0: + col_min = _apply_window_if_present(x.min(), window) + col_max = _apply_window_if_present(x.max(), window) + bin_width = (col_max - col_min) / op._bins + + if op._labels is False: + for this_bin in range(op._bins_int - 1): + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + dtypes.literal_to_ibis_scalar( + this_bin, force_dtype=pd.Int64Dtype() + ), + ) + out = out.when(x.notnull(), op._bins - 1) + else: + interval_struct = None + adj = (col_max - col_min) * 
0.001 + for this_bin in range(op._bins_int): + left_edge = ( + col_min + this_bin * bin_width - (0 if this_bin > 0 else adj) + ) + right_edge = col_min + (this_bin + 1) * bin_width + interval_struct = ibis.struct( + { + "left_exclusive": left_edge, + "right_inclusive": right_edge, + } + ) + + if this_bin < op._bins_int - 1: + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + interval_struct, + ) + else: + out = out.when(x.notnull(), interval_struct) + else: + for interval in op._bins: + condition = (x > interval.left) & (x <= interval.right) + interval_struct = ibis.struct( + {"left_exclusive": interval.left, "right_inclusive": interval.right} + ) + out = out.when(condition, interval_struct) + return out.end() + + +@compile_agg.register +@numeric_op +def _( + self: agg_ops.QcutOp, column: ibis_types.Column, window=None +) -> ibis_types.IntegerValue: + if isinstance(self._quantiles, int): + quantiles_ibis = dtypes.literal_to_ibis_scalar(self._quantiles) + percent_ranks = cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + float_bucket = cast(ibis_types.FloatingColumn, (percent_ranks * quantiles_ibis)) + return float_bucket.ceil().clip(lower=_ibis_num(1)) - _ibis_num(1) + else: + percent_ranks = cast( + ibis_types.FloatingColumn, + _apply_window_if_present(column.percent_rank(), window), + ) + out = ibis.case() + first_ibis_quantile = dtypes.literal_to_ibis_scalar(self._quantiles[0]) + out = out.when(percent_ranks < first_ibis_quantile, None) + for bucket_n in range(len(self._quantiles) - 1): + ibis_quantile = dtypes.literal_to_ibis_scalar(self._quantiles[bucket_n + 1]) + out = out.when( + percent_ranks <= ibis_quantile, + dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=pd.Int64Dtype()), + ) + out = out.else_(None) + return out.end() # type: ignore + + +@compile_agg.register +def _( + op: agg_ops.NuniqueOp, column: ibis_types.Column, window=None +) -> ibis_types.IntegerValue: + return _apply_window_if_present(column.nunique(), window) + + +@compile_agg.register +def _( + op: agg_ops.AnyValueOp, column: ibis_types.Column, window=None +) -> ibis_types.IntegerValue: + return _apply_window_if_present(column.arbitrary(), window) + + +@compile_agg.register +def _( + op: agg_ops.RankOp, column: ibis_types.Column, window=None +) -> ibis_types.IntegerValue: + # Ibis produces 0-based ranks, while pandas creates 1-based ranks + return _apply_window_if_present(column.rank(), window) + 1 + + +@compile_agg.register +def _( + op: agg_ops.DenseRankOp, column: ibis_types.Column, window=None +) -> ibis_types.IntegerValue: + # Ibis produces 0-based ranks, while pandas creates 1-based ranks + return _apply_window_if_present(column.dense_rank(), window) + 1 + + +@compile_agg.register +def _(op: agg_ops.FirstOp, column: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present(column.first(), window) + + +@compile_agg.register +def _( + op: agg_ops.FirstNonNullOp, column: ibis_types.Column, window=None +) -> ibis_types.Value: + return _apply_window_if_present( + vendored_ibis_ops.FirstNonNullValue(column).to_expr(), window # type: ignore + ) + + +@compile_agg.register +def _(op: agg_ops.LastOp, column: ibis_types.Column, window=None) -> ibis_types.Value: + return _apply_window_if_present(column.last(), window) + + +@compile_agg.register +def _( + op: agg_ops.LastNonNullOp, column: ibis_types.Column, window=None +) -> ibis_types.Value: + return _apply_window_if_present( + 
vendored_ibis_ops.LastNonNullValue(column).to_expr(), window # type: ignore + ) + + +@compile_agg.register +def _(op: agg_ops.ShiftOp, column: ibis_types.Column, window=None) -> ibis_types.Value: + if op._periods == 0: # No-op + return column + if op._periods > 0: + return _apply_window_if_present(column.lag(op._periods), window) + return _apply_window_if_present(column.lead(-op._periods), window) + + +@compile_agg.register +def _(op: agg_ops.DiffOp, column: ibis_types.Column, window=None) -> ibis_types.Value: + shifted = compile_agg(agg_ops.ShiftOp(op._periods), column, window) + if column.type().is_boolean(): + return cast(ibis_types.BooleanColumn, column) != cast( + ibis_types.BooleanColumn, shifted + ) + elif column.type().is_numeric(): + return cast(ibis_types.NumericColumn, column) - cast( + ibis_types.NumericColumn, shifted + ) + else: + raise TypeError(f"Cannot perform diff on type{column.type()}") + + +@compile_agg.register +def _( + op: agg_ops.AllOp, column: ibis_types.Column, window=None +) -> ibis_types.BooleanValue: + # BQ will return null for empty column, result would be true in pandas. + result = _is_true(column).all() + return cast( + ibis_types.BooleanScalar, + _apply_window_if_present(result, window).fillna(ibis_types.literal(True)), + ) + + +@compile_agg.register +def _( + op: agg_ops.AnyOp, column: ibis_types.Column, window=None +) -> ibis_types.BooleanValue: + # BQ will return null for empty column, result would be false in pandas. + result = _is_true(column).any() + return cast( + ibis_types.BooleanScalar, + _apply_window_if_present(result, window).fillna(ibis_types.literal(True)), + ) + + +def _apply_window_if_present(value: ibis_types.Value, window): + return value.over(window) if (window is not None) else value + + +def _map_to_literal( + original: ibis_types.Value, literal: ibis_types.Scalar +) -> ibis_types.Column: + # Hack required to perform aggregations on literals in ibis, even though bigquery will let you directly aggregate literals (eg. 
'SELECT COUNT(1) from table1') + return ibis.ifelse(original.isnull(), literal, literal) # type: ignore + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) + + +def _is_true(column: ibis_types.Column) -> ibis_types.BooleanColumn: + if column.type().is_boolean(): + return cast(ibis_types.BooleanColumn, column) + elif column.type().is_numeric(): + result = cast(ibis_types.NumericColumn, column).__ne__(ibis_types.literal(0)) + return cast(ibis_types.BooleanColumn, result) + elif column.type().is_string(): + result = cast(ibis_types.StringValue, column).length() > ibis_types.literal(0) + return cast(ibis_types.BooleanColumn, result) + else: + # Time and geo values don't have a 'False' value + return cast( + ibis_types.BooleanColumn, _map_to_literal(column, ibis_types.literal(True)) + ) diff --git a/bigframes/core/compile/analytic_compiler.py b/bigframes/core/compile/analytic_compiler.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index eaaf692a17..c867eaf680 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -26,6 +26,7 @@ import ibis.expr.types as ibis_types import pandas +import bigframes.core.compile.aggregate_compiler as agg_compiler import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.expression as ex import bigframes.core.guid @@ -208,6 +209,13 @@ def builder(self): predicates=self._predicates, ) + def peek_sql(self, n: int): + # Peek currently implemented as top level LIMIT op. + # Execution engine handles limit pushdown. + # In future, may push down limit/filters in compilation. + sql = ibis_bigquery.Backend().compile(self._to_ibis_expr().limit(n)) + return typing.cast(str, sql) + def to_sql( self, offset_column: typing.Optional[str] = None, @@ -440,7 +448,7 @@ def aggregate( """ table = self._to_ibis_expr() stats = { - col_out: agg_op._as_ibis(table[col_in]) + col_out: agg_compiler.compile_agg(agg_op, table[col_in]) for col_in, agg_op, col_out in aggregations } if by_column_ids: @@ -803,7 +811,7 @@ def project_window_op( column = typing.cast(ibis_types.Column, self._get_ibis_column(column_name)) window = self._ibis_window_from_spec(window_spec, allow_ties=op.handles_ties) - window_op = op._as_ibis(column, window) + window_op = agg_compiler.compile_unary_analytic(op, column, window) clauses = [] if op.skips_nulls and not never_skip_nulls: @@ -811,12 +819,16 @@ def project_window_op( if window_spec.min_periods: if op.skips_nulls: # Most operations do not count NULL values towards min_periods - observation_count = agg_ops.count_op._as_ibis(column, window) + observation_count = agg_compiler.compile_unary_analytic( + agg_ops.count_op, column, window + ) else: # Operations like count treat even NULLs as valid observations for the sake of min_periods # notnull is just used to convert null values to non-null (FALSE) values to be counted denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) - observation_count = agg_ops.count_op._as_ibis(denulled_value, window) + observation_count = agg_compiler.compile_unary_analytic( + agg_ops.count_op, denulled_value, window + ) clauses.append( ( observation_count < ibis_types.literal(window_spec.min_periods), diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index c948f0bdef..2ec00f7073 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -29,14 
+29,18 @@ import bigframes.session -def compile_ordered(node: nodes.BigFrameNode) -> compiled.OrderedIR: +def compile_ordered_ir(node: nodes.BigFrameNode) -> compiled.OrderedIR: return typing.cast(compiled.OrderedIR, compile_node(node, True)) -def compile_unordered(node: nodes.BigFrameNode) -> compiled.UnorderedIR: +def compile_unordered_ir(node: nodes.BigFrameNode) -> compiled.UnorderedIR: return typing.cast(compiled.UnorderedIR, compile_node(node, False)) +def compile_peak_sql(node: nodes.BigFrameNode, n_rows: int) -> typing.Optional[str]: + return compile_unordered_ir(node).peek_sql(n_rows) + + @functools.cache def compile_node( node: nodes.BigFrameNode, ordered: bool = True @@ -56,8 +60,8 @@ def _compile_node( @_compile_node.register def compile_join(node: nodes.JoinNode, ordered: bool = True): if ordered: - left_ordered = compile_ordered(node.left_child) - right_ordered = compile_ordered(node.right_child) + left_ordered = compile_ordered_ir(node.left_child) + right_ordered = compile_ordered_ir(node.right_child) return bigframes.core.compile.single_column.join_by_column_ordered( left=left_ordered, right=right_ordered, @@ -65,8 +69,8 @@ def compile_join(node: nodes.JoinNode, ordered: bool = True): allow_row_identity_join=node.allow_row_identity_join, ) else: - left_unordered = compile_unordered(node.left_child) - right_unordered = compile_unordered(node.right_child) + left_unordered = compile_unordered_ir(node.left_child) + right_unordered = compile_unordered_ir(node.right_child) return bigframes.core.compile.single_column.join_by_column_unordered( left=left_unordered, right=right_unordered, @@ -103,7 +107,7 @@ def compile_readgbq(node: nodes.ReadGbqNode, ordered: bool = True): @_compile_node.register def compile_promote_offsets(node: nodes.PromoteOffsetsNode, ordered: bool = True): - result = compile_ordered(node.child).promote_offsets(node.col_id) + result = compile_ordered_ir(node.child).promote_offsets(node.col_id) return result if ordered else result.to_unordered() @@ -115,17 +119,17 @@ def compile_filter(node: nodes.FilterNode, ordered: bool = True): @_compile_node.register def compile_orderby(node: nodes.OrderByNode, ordered: bool = True): if ordered: - return compile_ordered(node.child).order_by(node.by) + return compile_ordered_ir(node.child).order_by(node.by) else: - return compile_unordered(node.child) + return compile_unordered_ir(node.child) @_compile_node.register def compile_reversed(node: nodes.ReversedNode, ordered: bool = True): if ordered: - return compile_ordered(node.child).reversed() + return compile_ordered_ir(node.child).reversed() else: - return compile_unordered(node.child) + return compile_unordered_ir(node.child) @_compile_node.register @@ -137,22 +141,22 @@ def compile_projection(node: nodes.ProjectionNode, ordered: bool = True): @_compile_node.register def compile_concat(node: nodes.ConcatNode, ordered: bool = True): if ordered: - compiled_ordered = [compile_ordered(node) for node in node.children] + compiled_ordered = [compile_ordered_ir(node) for node in node.children] return concat_impl.concat_ordered(compiled_ordered) else: - compiled_unordered = [compile_unordered(node) for node in node.children] + compiled_unordered = [compile_unordered_ir(node) for node in node.children] return concat_impl.concat_unordered(compiled_unordered) @_compile_node.register def compile_rowcount(node: nodes.RowCountNode, ordered: bool = True): - result = compile_unordered(node.child).row_count() + result = compile_unordered_ir(node.child).row_count() return result if ordered 
else result.to_unordered() @_compile_node.register def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): - result = compile_unordered(node.child).aggregate( + result = compile_unordered_ir(node.child).aggregate( node.aggregations, node.by_column_ids, node.dropna ) return result if ordered else result.to_unordered() @@ -160,13 +164,13 @@ def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): @_compile_node.register def compile_corr(node: nodes.CorrNode, ordered: bool = True): - result = compile_unordered(node.child).corr_aggregate(node.corr_aggregations) + result = compile_unordered_ir(node.child).corr_aggregate(node.corr_aggregations) return result if ordered else result.to_unordered() @_compile_node.register def compile_window(node: nodes.WindowOpNode, ordered: bool = True): - result = compile_ordered(node.child).project_window_op( + result = compile_ordered_ir(node.child).project_window_op( node.column_name, node.op, node.window_spec, diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 540f9b6e5a..d1be644439 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -18,16 +18,13 @@ import dataclasses import itertools import typing -from typing import Optional -import bigframes.dtypes +import bigframes.dtypes as dtypes import bigframes.operations -def const( - value: typing.Hashable, dtype: Optional[bigframes.dtypes.Dtype] = None -) -> Expression: - return ScalarConstantExpression(value, dtype) +def const(value: typing.Hashable, dtype: dtypes.ExpressionType = None) -> Expression: + return ScalarConstantExpression(value, dtype or dtypes.infer_literal_type(value)) def free_var(id: str) -> Expression: @@ -45,9 +42,16 @@ def unbound_variables(self) -> typing.Tuple[str, ...]: def rename(self, name_mapping: dict[str, str]) -> Expression: return self - @abc.abstractproperty + @property + @abc.abstractmethod def is_const(self) -> bool: - return False + ... + + @abc.abstractmethod + def output_type( + self, input_types: dict[str, dtypes.ExpressionType] + ) -> dtypes.ExpressionType: + ... @dataclasses.dataclass(frozen=True) @@ -56,12 +60,17 @@ class ScalarConstantExpression(Expression): # TODO: Further constrain? 
value: typing.Hashable - dtype: Optional[bigframes.dtypes.Dtype] = None + dtype: dtypes.ExpressionType = None @property def is_const(self) -> bool: return True + def output_type( + self, input_types: dict[str, bigframes.dtypes.Dtype] + ) -> dtypes.ExpressionType: + return self.dtype + @dataclasses.dataclass(frozen=True) class UnboundVariableExpression(Expression): @@ -83,6 +92,14 @@ def rename(self, name_mapping: dict[str, str]) -> Expression: def is_const(self) -> bool: return False + def output_type( + self, input_types: dict[str, bigframes.dtypes.Dtype] + ) -> dtypes.ExpressionType: + if self.id in input_types: + return input_types[self.id] + else: + raise ValueError("Type of variable has not been fixed.") + @dataclasses.dataclass(frozen=True) class OpExpression(Expression): @@ -110,3 +127,11 @@ def rename(self, name_mapping: dict[str, str]) -> Expression: @property def is_const(self) -> bool: return all(child.is_const for child in self.inputs) + + def output_type( + self, input_types: dict[str, dtypes.ExpressionType] + ) -> dtypes.ExpressionType: + operand_types = tuple( + map(lambda x: x.output_type(input_types=input_types), self.inputs) + ) + return self.op.output_type(*operand_types) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 6998d0e974..0a47c3a78e 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -294,7 +294,7 @@ def _loc_getitem_series_or_dataframe( keys_df = keys_df.set_index(temp_name, drop=True) return _perform_loc_list_join(series_or_dataframe, keys_df) elif isinstance(key, bigframes.core.indexes.Index): - block = key._data._get_block() + block = key._block block = block.select_columns(()) keys_df = bigframes.dataframe.DataFrame(block) return _perform_loc_list_join(series_or_dataframe, keys_df) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 8b3613d82c..78a4fc6f0b 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -17,8 +17,9 @@ from __future__ import annotations import typing -from typing import Mapping, Sequence, Tuple, Union +from typing import Hashable, Mapping, Optional, Sequence, Tuple, Union +import google.cloud.bigquery as bigquery import numpy as np import pandas @@ -33,16 +34,60 @@ import bigframes.core.utils as utils import bigframes.dtypes import bigframes.dtypes as bf_dtypes +import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import third_party.bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index +if typing.TYPE_CHECKING: + import bigframes.dataframe + import bigframes.series + class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ - def __init__(self, data: blocks.BlockHolder): - self._data = data + def __init__( + self, + data=None, + dtype=None, + *, + name=None, + ): + import bigframes.dataframe as df + import bigframes.series as series + + if isinstance(data, blocks.Block): + block = data.select_columns([]) + elif isinstance(data, df.DataFrame): + raise ValueError("Cannot construct index from dataframe.") + elif isinstance(data, series.Series) or isinstance(data, Index): + if isinstance(data, series.Series): + block = data._block + block = block.set_index( + col_ids=[data._value_column], + ) + elif isinstance(data, Index): + block = data._block + index = Index(data=block) + name = data.name if name is None else name + if name is not None: + index.name = name + if dtype is not None: + index = 
index.astype(dtype) + block = index._block + else: + pd_index = pandas.Index(data=data, dtype=dtype, name=name) + pd_df = pandas.DataFrame(index=pd_index) + block = df.DataFrame(pd_df)._block + self._query_job = None + self._block: blocks.Block = block + + @classmethod + def from_frame( + cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] + ) -> Index: + return FrameIndex(frame) @property def name(self) -> blocks.Label: @@ -55,15 +100,16 @@ def name(self, value: blocks.Label): @property def names(self) -> typing.Sequence[blocks.Label]: """Returns the names of the Index.""" - return self._data._get_block()._index_labels + return self._block._index_labels @names.setter def names(self, values: typing.Sequence[blocks.Label]): - return self._data._set_block(self._block.with_index_labels(values)) + new_block = self._block.with_index_labels(values) + self._block = new_block @property def nlevels(self) -> int: - return len(self._data._get_block().index_columns) + return len(self._block.index_columns) @property def values(self) -> np.ndarray: @@ -75,7 +121,7 @@ def ndim(self) -> int: @property def shape(self) -> typing.Tuple[int]: - return (self._data._get_block().shape[0],) + return (self._block.shape[0],) @property def dtype(self): @@ -107,9 +153,7 @@ def is_monotonic_increasing(self) -> bool: """ return typing.cast( bool, - self._data._get_block().is_monotonic_increasing( - self._data._get_block().index_columns - ), + self._block.is_monotonic_increasing(self._block.index_columns), ) @property @@ -122,9 +166,7 @@ def is_monotonic_decreasing(self) -> bool: """ return typing.cast( bool, - self._data._get_block().is_monotonic_decreasing( - self._data._get_block().index_columns - ), + self._block.is_monotonic_decreasing(self._block.index_columns), ) @property @@ -149,14 +191,65 @@ def has_duplicates(self) -> bool: duplicates_df = df.DataFrame(duplicates_block) return duplicates_df["is_duplicate"].any() - @property - def _block(self) -> blocks.Block: - return self._data._get_block() - @property def T(self) -> Index: return self.transpose() + @property + def query_job(self) -> Optional[bigquery.QueryJob]: + """BigQuery job metadata for the most recent query. + + Returns: + The most recent `QueryJob + `_. + """ + if self._query_job is None: + self._query_job = self._block._compute_dry_run() + return self._query_job + + def __repr__(self) -> str: + # TODO(swast): Add a timeout here? If the query is taking a long time, + # maybe we just print the job metadata that we have so far? + # TODO(swast): Avoid downloading the whole series by using job + # metadata, like we do with DataFrame. + opts = bigframes.options.display + max_results = opts.max_rows + if opts.repr_mode == "deferred": + return formatter.repr_query_job(self.query_job) + + pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) + self._query_job = query_job + return repr(pandas_df.index) + + def copy(self, name: Optional[Hashable] = None): + copy_index = Index(self._block) + if name is not None: + copy_index.name = name + return copy_index + + def to_series( + self, index: Optional[Index] = None, name: Optional[Hashable] = None + ) -> bigframes.series.Series: + if self.nlevels != 1: + NotImplementedError( + f"Converting multi-index to series is not yet supported. 
{constants.FEEDBACK_LINK}" + ) + + import bigframes.series + + name = self.name if name is None else name + if index is None: + return bigframes.series.Series(data=self, index=self, name=name) + else: + return bigframes.series.Series(data=self, index=Index(index), name=name) + + def get_level_values(self, level) -> Index: + level_n = level if isinstance(level, int) else self.names.index(level) + block = self._block.drop_levels( + [self._block.index_columns[i] for i in range(self.nlevels) if i != level_n] + ) + return Index(block) + def _memory_usage(self) -> int: (n_rows,) = self.shape return sum( @@ -180,7 +273,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"): order.OrderingColumnReference(column, direction=direction, na_last=na_last) for column in index_columns ] - return Index._from_block(self._block.order_by(ordering)) + return Index(self._block.order_by(ordering)) def astype( self, @@ -269,7 +362,7 @@ def rename(self, name: Union[str, Sequence[str]]) -> Index: names = [name] if isinstance(name, str) else list(name) if len(names) != self.nlevels: raise ValueError("'name' must be same length as levels") - return Index._from_block(self._block.with_index_labels(names)) + return Index(self._block.with_index_labels(names)) def drop( self, @@ -291,17 +384,17 @@ def drop( ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) - return Index._from_block(block) + return Index(block) def dropna(self, how: str = "any") -> Index: if how not in ("any", "all"): raise ValueError("'how' must be one of 'any', 'all'") result = block_ops.dropna(self._block, self._block.index_columns, how=how) # type: ignore - return Index._from_block(result) + return Index(result) def drop_duplicates(self, *, keep: str = "first") -> Index: block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep) - return Index._from_block(block) + return Index(block) def isin(self, values) -> Index: if not utils.is_list_like(values): @@ -330,7 +423,7 @@ def _apply_unary_expr( result_ids.append(result_id) block = block.set_index(result_ids, index_labels=self._block.index_labels) - return Index._from_block(block) + return Index(block) def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any: if self.nlevels > 1: @@ -344,7 +437,7 @@ def __getitem__(self, key: int) -> typing.Any: result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas() else: # special case, want [-1:] instead of [-1:0] result_pd_df, _ = self._block.slice(key).to_pandas() - if result_pd_df.empty: + if result_pd_df.index.empty: raise IndexError("single positional indexer is out-of-bounds") return result_pd_df.index[0] else: @@ -367,11 +460,36 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: def __len__(self): return self.shape[0] - @classmethod - def _from_block(cls, block: blocks.Block) -> Index: - import bigframes.dataframe as df - return Index(df.DataFrame(block)) +# Index that mutates the originating dataframe/series +class FrameIndex(Index): + def __init__( + self, + series_or_dataframe: typing.Union[ + bigframes.series.Series, bigframes.dataframe.DataFrame + ], + ): + super().__init__(series_or_dataframe._block) + self._whole_frame = series_or_dataframe + + @property + def name(self) -> blocks.Label: + return self.names[0] + + @name.setter + def name(self, value: blocks.Label): + self.names = [value] + + @property + def names(self) -> typing.Sequence[blocks.Label]: + """Returns the names of the Index.""" + return self._block._index_labels + + 
@names.setter + def names(self, values: typing.Sequence[blocks.Label]): + new_block = self._whole_frame._get_block().with_index_labels(values) + self._whole_frame._set_block(new_block) + self._block = new_block class IndexValue: @@ -406,15 +524,6 @@ def dtypes( def session(self) -> core.Session: return self._expr.session - def __repr__(self) -> str: - """Converts an Index to a string.""" - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? - # TODO(swast): Avoid downloading the whole index by using job - # metadata, like we do with DataFrame. - preview = self.to_pandas() - return repr(preview) - def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index bf261b62f4..e1882c3684 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -16,6 +16,7 @@ from dataclasses import dataclass, field, fields import functools +import itertools import typing from typing import Tuple @@ -74,6 +75,18 @@ def session(self): def _node_hash(self): return hash(tuple(hash(getattr(self, field.name)) for field in fields(self))) + @property + def peekable(self) -> bool: + """Indicates whether the node can be sampled efficiently""" + return all(child.peekable for child in self.child_nodes) + + @property + def roots(self) -> typing.Set[BigFrameNode]: + roots = itertools.chain.from_iterable( + map(lambda child: child.roots, self.child_nodes) + ) + return set(roots) + @dataclass(frozen=True) class UnaryNode(BigFrameNode): @@ -98,6 +111,12 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + children_peekable = all(child.peekable for child in self.child_nodes) + single_root = len(self.roots) == 1 + return children_peekable and single_root + @dataclass(frozen=True) class ConcatNode(BigFrameNode): @@ -119,6 +138,14 @@ class ReadLocalNode(BigFrameNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return True + + @property + def roots(self) -> typing.Set[BigFrameNode]: + return {self} + # TODO: Refactor to take raw gbq object reference @dataclass(frozen=True) @@ -136,6 +163,14 @@ def session(self): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return True + + @property + def roots(self) -> typing.Set[BigFrameNode]: + return {self} + # Unary nodes @dataclass(frozen=True) @@ -145,6 +180,10 @@ class PromoteOffsetsNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class FilterNode(UnaryNode): @@ -194,6 +233,10 @@ class AggregateNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + # TODO: Unify into aggregate @dataclass(frozen=True) @@ -203,6 +246,10 @@ class CorrNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class WindowOpNode(UnaryNode): @@ -216,6 +263,10 @@ class WindowOpNode(UnaryNode): def __hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class ReprojectOpNode(UnaryNode): @@ -239,6 +290,10 @@ class UnpivotNode(UnaryNode): def 
__hash__(self): return self._node_hash + @property + def peekable(self) -> bool: + return False + @dataclass(frozen=True) class RandomSampleNode(UnaryNode): diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index cadd8e5145..4a3bb16a39 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -129,12 +129,15 @@ def cut( if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") - if labels is not False: + if labels is not None and labels is not False: raise NotImplementedError( - f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" + "The 'labels' parameter must be either False or None. " + "Please provide a valid value for 'labels'." ) - return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) + return x._apply_window_op( + agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec() + ) def qcut( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1288117395..2a20a4aabb 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -146,10 +146,15 @@ def __init__( block = result_index._block if block: - if index: - raise NotImplementedError( - "DataFrame 'index' constructor parameter not supported " - f"when passing BigQuery-backed objects. {constants.FEEDBACK_LINK}" + if index is not None: + bf_index = indexes.Index(index) + idx_block = bf_index._block + idx_cols = idx_block.index_columns + join_idx, (_, r_mapping) = block.reset_index().index.join( + bf_index._block.reset_index().index, how="inner" + ) + block = join_idx._block.set_index( + [r_mapping[idx_col] for idx_col in idx_cols] ) if columns: block = block.select_columns(list(columns)) # type:ignore @@ -250,7 +255,7 @@ def _sql_names( def index( self, ) -> indexes.Index: - return indexes.Index(self) + return indexes.Index.from_frame(self) @index.setter def index(self, value): @@ -661,6 +666,14 @@ def _apply_binop( ): if isinstance(other, (float, int)): return self._apply_scalar_binop(other, op, reverse=reverse) + elif isinstance(other, indexes.Index): + return self._apply_series_binop( + other.to_series(index=self.index), + op, + axis=axis, + how=how, + reverse=reverse, + ) elif isinstance(other, bigframes.series.Series): return self._apply_series_binop( other, op, axis=axis, how=how, reverse=reverse @@ -1066,6 +1079,37 @@ def head(self, n: int = 5) -> DataFrame: def tail(self, n: int = 5) -> DataFrame: return typing.cast(DataFrame, self.iloc[-n:]) + def peek(self, n: int = 5, *, force: bool = False) -> pandas.DataFrame: + """ + Preview n arbitrary rows from the dataframe. No guarantees about row selection or ordering. + DataFrame.peek(force=False) will always be very fast, but will not succeed if data requires + full data scanning. Using force=True will always succeed, but may be perform expensive + computations. + + Args: + n (int, default 5): + The number of rows to select from the dataframe. Which N rows are returned is non-deterministic. + force (bool, default False): + If the data cannot be peeked efficiently, the dataframe will instead be fully materialized as part + of the operation if force=True. If force=False, the operation will throw a ValueError. + Returns: + pandas.DataFrame: A pandas DataFrame with n rows. + + Raises: + ValueError: If force=False and data cannot be efficiently peeked. 
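+
+        Example (an illustrative sketch only, assuming an existing DataFrame ``df``):
+
+            >>> preview = df.peek(3)              # fast path; raises ValueError if data is not peekable
+            >>> preview = df.peek(3, force=True)  # falls back to fully materializing the frame first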
+ """ + maybe_result = self._block.try_peek(n) + if maybe_result is None: + if force: + self._cached() + maybe_result = self._block.try_peek(n) + assert maybe_result is not None + else: + raise ValueError( + "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute dataframe." + ) + return maybe_result.set_axis(self._block.column_labels, axis=1, copy=False) + def nlargest( self, n: int, @@ -1152,7 +1196,7 @@ def drop( return DataFrame(block) def _drop_by_index(self, index: indexes.Index) -> DataFrame: - block = index._data._get_block() + block = index._block block, ordering_col = block.promote_offsets() joined_index, (get_column_left, get_column_right) = self._block.index.join( block.index @@ -1288,9 +1332,7 @@ def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame: f"Length of values ({given_rows}) does not match length of index ({actual_rows})" ) - local_df = bigframes.dataframe.DataFrame( - {k: v}, session=self._get_block().expr.session - ) + local_df = DataFrame({k: v}, session=self._get_block().expr.session) # local_df is likely (but not guaranteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE @@ -1591,7 +1633,7 @@ def _reindex_rows( raise ValueError("Original index must be unique to reindex") keep_original_names = False if isinstance(index, indexes.Index): - new_indexer = DataFrame(data=index._data._get_block())[[]] + new_indexer = DataFrame(data=index._block)[[]] else: if not isinstance(index, pandas.Index): keep_original_names = True diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 608885dec4..cb2210bec6 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -23,7 +23,9 @@ import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery import ibis +from ibis.backends.bigquery.datatypes import BigQueryType import ibis.expr.datatypes as ibis_dtypes +from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type import ibis.expr.types as ibis_types import numpy as np import pandas as pd @@ -42,6 +44,14 @@ pd.ArrowDtype, gpd.array.GeometryDtype, ] +# Represents both column types (dtypes) and local-only types +# None represents the type of a None scalar. 
+ExpressionType = typing.Optional[Dtype] + +INT_DTYPE = pd.Int64Dtype() +FLOAT_DTYPE = pd.Float64Dtype() +BOOL_DTYPE = pd.BooleanDtype() +STRING_DTYPE = pd.StringDtype(storage="pyarrow") # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable UNORDERED_DTYPES = [gpd.array.GeometryDtype()] @@ -539,20 +549,20 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]: return lcd_type(pd.Int64Dtype(), dtype) if isinstance(scalar, decimal.Decimal): # TODO: Check context to see if can use NUMERIC instead of BIGNUMERIC - return lcd_type(pd.ArrowDtype(pa.decimal128(76, 38)), dtype) + return lcd_type(pd.ArrowDtype(pa.decimal256(76, 38)), dtype) return None -def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]: +def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype: if dtype1 == dtype2: return dtype1 # Implicit conversion currently only supported for numeric types hierarchy: list[Dtype] = [ pd.BooleanDtype(), pd.Int64Dtype(), - pd.Float64Dtype(), pd.ArrowDtype(pa.decimal128(38, 9)), pd.ArrowDtype(pa.decimal256(76, 38)), + pd.Float64Dtype(), ] if (dtype1 not in hierarchy) or (dtype2 not in hierarchy): return None @@ -560,6 +570,14 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]: return hierarchy[lcd_index] +def lcd_etype(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType: + if etype1 is None: + return etype2 + if etype2 is None: + return etype1 + return lcd_type_or_throw(etype1, etype2) + + def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: result = lcd_type(dtype1, dtype2) if result is None: @@ -567,3 +585,44 @@ def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype: f"BigFrames cannot upcast {dtype1} and {dtype2} to common type. {constants.FEEDBACK_LINK}" ) return result + + +def infer_literal_type(literal) -> typing.Optional[Dtype]: + if pd.isna(literal): + return None # Null value without a definite type + # Temporary logic, use ibis inferred type + ibis_literal = literal_to_ibis_scalar(literal) + return ibis_dtype_to_bigframes_dtype(ibis_literal.type()) + + +# Input and output types supported by BigQuery DataFrames remote functions. 
+# TODO(shobs): Extend the support to all types supported by BQ remote functions +# https://cloud.google.com/bigquery/docs/remote-functions#limitations +SUPPORTED_IO_PYTHON_TYPES = {bool, float, int, str} +SUPPORTED_IO_BIGQUERY_TYPEKINDS = { + "BOOLEAN", + "BOOL", + "FLOAT", + "FLOAT64", + "INT64", + "INTEGER", + "STRING", +} + + +class UnsupportedTypeError(ValueError): + def __init__(self, type_, supported_types): + self.type = type_ + self.supported_types = supported_types + + +def ibis_type_from_python_type(t: type) -> ibis_dtypes.DataType: + if t not in SUPPORTED_IO_PYTHON_TYPES: + raise UnsupportedTypeError(t, SUPPORTED_IO_PYTHON_TYPES) + return python_type_to_bigquery_type(t) + + +def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> ibis_dtypes.DataType: + if tk not in SUPPORTED_IO_BIGQUERY_TYPEKINDS: + raise UnsupportedTypeError(tk, SUPPORTED_IO_BIGQUERY_TYPEKINDS) + return BigQueryType.to_ibis(tk) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index f54c26fa56..dfffbe65ac 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -46,12 +46,12 @@ from ibis.backends.bigquery.compiler import compiles from ibis.backends.bigquery.datatypes import BigQueryType from ibis.expr.datatypes.core import DataType as IbisDataType -from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type import ibis.expr.operations as ops import ibis.expr.rules as rlz from bigframes import clients import bigframes.constants as constants +import bigframes.dtypes logger = logging.getLogger(__name__) @@ -59,20 +59,6 @@ # https://docs.python.org/3/library/pickle.html#data-stream-format _pickle_protocol_version = 4 -# Input and output types supported by BigQuery DataFrames remote functions. 
-# TODO(shobs): Extend the support to all types supported by BQ remote functions -# https://cloud.google.com/bigquery/docs/remote-functions#limitations -SUPPORTED_IO_PYTHON_TYPES = {bool, float, int, str} -SUPPORTED_IO_BIGQUERY_TYPEKINDS = { - "BOOLEAN", - "BOOL", - "FLOAT", - "FLOAT64", - "INT64", - "INTEGER", - "STRING", -} - def get_remote_function_locations(bq_location): """Get BQ location and cloud functions region given a BQ client.""" @@ -558,24 +544,6 @@ def f(*args, **kwargs): return f -class UnsupportedTypeError(ValueError): - def __init__(self, type_, supported_types): - self.type = type_ - self.supported_types = supported_types - - -def ibis_type_from_python_type(t: type) -> IbisDataType: - if t not in SUPPORTED_IO_PYTHON_TYPES: - raise UnsupportedTypeError(t, SUPPORTED_IO_PYTHON_TYPES) - return python_type_to_bigquery_type(t) - - -def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> IbisDataType: - if tk not in SUPPORTED_IO_BIGQUERY_TYPEKINDS: - raise UnsupportedTypeError(tk, SUPPORTED_IO_BIGQUERY_TYPEKINDS) - return BigQueryType.to_ibis(tk) - - def ibis_signature_from_python_signature( signature: inspect.Signature, input_types: Sequence[type], @@ -583,8 +551,10 @@ def ibis_signature_from_python_signature( ) -> IbisSignature: return IbisSignature( parameter_names=list(signature.parameters.keys()), - input_types=[ibis_type_from_python_type(t) for t in input_types], - output_type=ibis_type_from_python_type(output_type), + input_types=[ + bigframes.dtypes.ibis_type_from_python_type(t) for t in input_types + ], + output_type=bigframes.dtypes.ibis_type_from_python_type(output_type), ) @@ -599,10 +569,14 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> IbisSignature: return IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - ibis_type_from_type_kind(arg.data_type.type_kind) if arg.data_type else None + bigframes.dtypes.ibis_type_from_type_kind(arg.data_type.type_kind) + if arg.data_type + else None for arg in routine.arguments ], - output_type=ibis_type_from_type_kind(routine.return_type.type_kind), + output_type=bigframes.dtypes.ibis_type_from_type_kind( + routine.return_type.type_kind + ), ) @@ -908,7 +882,7 @@ def read_gbq_function( raise ValueError( "Function return type must be specified. {constants.FEEDBACK_LINK}" ) - except UnsupportedTypeError as e: + except bigframes.dtypes.UnsupportedTypeError as e: raise ValueError( f"Type {e.type} not supported, supported types are {e.supported_types}. 
" f"{constants.FEEDBACK_LINK}" diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 1e2224c9bc..7c156b4cb7 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -136,6 +136,13 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None): return self._session.read_gbq(sql) + def arima_evaluate(self, show_all_candidate_models: bool = False): + sql = self._model_manipulation_sql_generator.ml_arima_evaluate( + show_all_candidate_models + ) + + return self._session.read_gbq(sql) + def centroids(self) -> bpd.DataFrame: assert self._model.model_type == "KMEANS" diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 03b9857cc5..8d448fbace 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -151,6 +151,31 @@ def score( input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) + def summary( + self, + show_all_candidate_models: bool = False, + ) -> bpd.DataFrame: + """Summary of the evaluation metrics of the time series model. + + .. note:: + + Output matches that of the BigQuery ML.ARIMA_EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-arima-evaluate + for the outputs relevant to this model type. + + Args: + show_all_candidate_models (bool, default to False): + Whether to show evaluation metrics or an error message for either + all candidate models or for only the best model with the lowest + AIC. Default to False. + + Returns: + bigframes.dataframe.DataFrame: A DataFrame as evaluation result. + """ + if not self._bqml_model: + raise RuntimeError("A model must be fitted before score") + return self._bqml_model.arima_evaluate(show_all_candidate_models) + def to_gbq(self, model_name: str, replace: bool = False) -> ARIMAPlus: """Save the model to BigQuery. 
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 25caaf1ac6..152f881ec0 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -260,6 +260,12 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str: return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`, ({source_sql}))""" + # ML evaluation TVFs + def ml_arima_evaluate(self, show_all_candidate_models: bool = False) -> str: + """Encode ML.ARMIA_EVALUATE for BQML""" + return f"""SELECT * FROM ML.ARIMA_EVALUATE(MODEL `{self._model_name}`, + STRUCT({show_all_candidate_models} AS show_all_candidate_models))""" + def ml_centroids(self) -> str: """Encode ML.CENTROIDS for BQML""" return f"""SELECT * FROM ML.CENTROIDS(MODEL `{self._model_name}`)""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 9737df94f9..b40f42a3e8 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -20,6 +20,7 @@ import numpy as np import bigframes.dtypes as dtypes +import bigframes.operations.type as op_typing if typing.TYPE_CHECKING: # Avoids circular dependency @@ -36,6 +37,9 @@ def arguments(self) -> int: """The number of column argument the operation takes""" raise NotImplementedError("RowOp abstract base class has no implementation") + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + raise NotImplementedError("Abstract typing rule has no output type") + # These classes can be used to create simple ops that don't take local parameters # All is needed is a unique name, and to register an implementation in ibis_mappings.py @@ -49,6 +53,9 @@ def name(self) -> str: def arguments(self) -> int: return 1 + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + raise NotImplementedError("Abstract operation has no output type") + def as_expr( self, input_id: typing.Union[str, bigframes.core.expression.Expression] = "arg" ) -> bigframes.core.expression.Expression: @@ -69,6 +76,9 @@ def name(self) -> str: def arguments(self) -> int: return 2 + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + raise NotImplementedError("Abstract operation has no output type") + def as_expr( self, left_input: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -95,6 +105,9 @@ def name(self) -> str: def arguments(self) -> int: return 3 + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + raise NotImplementedError("Abstract operation has no output type") + def as_expr( self, input1: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -126,28 +139,34 @@ def _convert_expr_input( # Operation Factories -def create_unary_op(name: str) -> UnaryOp: +def create_unary_op( + name: str, type_rule: op_typing.OpTypeRule = op_typing.INPUT_TYPE +) -> UnaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name)], # type: ignore + [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore bases=(UnaryOp,), frozen=True, )() -def create_binary_op(name: str) -> BinaryOp: +def create_binary_op( + name: str, type_rule: op_typing.OpTypeRule = op_typing.Supertype() +) -> BinaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name)], # type: ignore + [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore bases=(BinaryOp,), frozen=True, )() 
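+
+# Note: these factories attach type_rule.as_method as the generated dataclass's
+# output_type, so scalar ops resolve their result dtype from the shared rules in
+# bigframes.operations.type (e.g. INTEGER, STRING, PREDICATE, REAL_NUMERIC,
+# Supertype) rather than hard-coding it per op.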
-def create_ternary_op(name: str) -> TernaryOp: +def create_ternary_op( + name: str, type_rule: op_typing.OpTypeRule = op_typing.Supertype() +) -> TernaryOp: return dataclasses.make_dataclass( name, - [("name", typing.ClassVar[str], name)], # type: ignore + [("name", typing.ClassVar[str], name), ("output_type", typing.ClassVar[typing.Callable], type_rule.as_method)], # type: ignore bases=(TernaryOp,), frozen=True, )() @@ -155,57 +174,57 @@ def create_ternary_op(name: str) -> TernaryOp: # Unary Ops ## Generic Ops -invert_op = create_unary_op(name="invert") -isnull_op = create_unary_op(name="isnull") -notnull_op = create_unary_op(name="notnull") -hash_op = create_unary_op(name="hash") +invert_op = create_unary_op(name="invert", type_rule=op_typing.INPUT_TYPE) +isnull_op = create_unary_op(name="isnull", type_rule=op_typing.PREDICATE) +notnull_op = create_unary_op(name="notnull", type_rule=op_typing.PREDICATE) +hash_op = create_unary_op(name="hash", type_rule=op_typing.INTEGER) ## String Ops -len_op = create_unary_op(name="len") -reverse_op = create_unary_op(name="reverse") -lower_op = create_unary_op(name="lower") -upper_op = create_unary_op(name="upper") -strip_op = create_unary_op(name="strip") -isalnum_op = create_unary_op(name="isalnum") -isalpha_op = create_unary_op(name="isalpha") -isdecimal_op = create_unary_op(name="isdecimal") -isdigit_op = create_unary_op(name="isdigit") -isnumeric_op = create_unary_op(name="isnumeric") -isspace_op = create_unary_op(name="isspace") -islower_op = create_unary_op(name="islower") -isupper_op = create_unary_op(name="isupper") -rstrip_op = create_unary_op(name="rstrip") -lstrip_op = create_unary_op(name="lstrip") -capitalize_op = create_unary_op(name="capitalize") +len_op = create_unary_op(name="len", type_rule=op_typing.INTEGER) +reverse_op = create_unary_op(name="reverse", type_rule=op_typing.STRING) +lower_op = create_unary_op(name="lower", type_rule=op_typing.STRING) +upper_op = create_unary_op(name="upper", type_rule=op_typing.STRING) +strip_op = create_unary_op(name="strip", type_rule=op_typing.STRING) +isalnum_op = create_unary_op(name="isalnum", type_rule=op_typing.PREDICATE) +isalpha_op = create_unary_op(name="isalpha", type_rule=op_typing.PREDICATE) +isdecimal_op = create_unary_op(name="isdecimal", type_rule=op_typing.PREDICATE) +isdigit_op = create_unary_op(name="isdigit", type_rule=op_typing.PREDICATE) +isnumeric_op = create_unary_op(name="isnumeric", type_rule=op_typing.PREDICATE) +isspace_op = create_unary_op(name="isspace", type_rule=op_typing.PREDICATE) +islower_op = create_unary_op(name="islower", type_rule=op_typing.PREDICATE) +isupper_op = create_unary_op(name="isupper", type_rule=op_typing.PREDICATE) +rstrip_op = create_unary_op(name="rstrip", type_rule=op_typing.STRING) +lstrip_op = create_unary_op(name="lstrip", type_rule=op_typing.STRING) +capitalize_op = create_unary_op(name="capitalize", type_rule=op_typing.STRING) ## DateTime Ops -day_op = create_unary_op(name="day") -dayofweek_op = create_unary_op(name="dayofweek") +day_op = create_unary_op(name="day", type_rule=op_typing.INTEGER) +dayofweek_op = create_unary_op(name="dayofweek", type_rule=op_typing.INTEGER) date_op = create_unary_op(name="date") -hour_op = create_unary_op(name="hour") -minute_op = create_unary_op(name="minute") -month_op = create_unary_op(name="month") -quarter_op = create_unary_op(name="quarter") -second_op = create_unary_op(name="second") -time_op = create_unary_op(name="time") -year_op = create_unary_op(name="year") +hour_op = create_unary_op(name="hour", 
type_rule=op_typing.INTEGER) +minute_op = create_unary_op(name="minute", type_rule=op_typing.INTEGER) +month_op = create_unary_op(name="month", type_rule=op_typing.INTEGER) +quarter_op = create_unary_op(name="quarter", type_rule=op_typing.INTEGER) +second_op = create_unary_op(name="second", type_rule=op_typing.INTEGER) +time_op = create_unary_op(name="time", type_rule=op_typing.INTEGER) +year_op = create_unary_op(name="year", type_rule=op_typing.INTEGER) ## Trigonometry Ops -sin_op = create_unary_op(name="sin") -cos_op = create_unary_op(name="cos") -tan_op = create_unary_op(name="tan") -arcsin_op = create_unary_op(name="arcsin") -arccos_op = create_unary_op(name="arccos") -arctan_op = create_unary_op(name="arctan") -sinh_op = create_unary_op(name="sinh") -cosh_op = create_unary_op(name="cosh") -tanh_op = create_unary_op(name="tanh") -arcsinh_op = create_unary_op(name="arcsinh") -arccosh_op = create_unary_op(name="arccosh") -arctanh_op = create_unary_op(name="arctanh") +sin_op = create_unary_op(name="sin", type_rule=op_typing.REAL_NUMERIC) +cos_op = create_unary_op(name="cos", type_rule=op_typing.REAL_NUMERIC) +tan_op = create_unary_op(name="tan", type_rule=op_typing.REAL_NUMERIC) +arcsin_op = create_unary_op(name="arcsin", type_rule=op_typing.REAL_NUMERIC) +arccos_op = create_unary_op(name="arccos", type_rule=op_typing.REAL_NUMERIC) +arctan_op = create_unary_op(name="arctan", type_rule=op_typing.REAL_NUMERIC) +sinh_op = create_unary_op(name="sinh", type_rule=op_typing.REAL_NUMERIC) +cosh_op = create_unary_op(name="cosh", type_rule=op_typing.REAL_NUMERIC) +tanh_op = create_unary_op(name="tanh", type_rule=op_typing.REAL_NUMERIC) +arcsinh_op = create_unary_op(name="arcsinh", type_rule=op_typing.REAL_NUMERIC) +arccosh_op = create_unary_op(name="arccosh", type_rule=op_typing.REAL_NUMERIC) +arctanh_op = create_unary_op(name="arctanh", type_rule=op_typing.REAL_NUMERIC) ## Numeric Ops -abs_op = create_unary_op(name="abs") -exp_op = create_unary_op(name="exp") -ln_op = create_unary_op(name="log") -log10_op = create_unary_op(name="log10") -sqrt_op = create_unary_op(name="sqrt") +abs_op = create_unary_op(name="abs", type_rule=op_typing.INPUT_TYPE) +exp_op = create_unary_op(name="exp", type_rule=op_typing.REAL_NUMERIC) +ln_op = create_unary_op(name="log", type_rule=op_typing.REAL_NUMERIC) +log10_op = create_unary_op(name="log10", type_rule=op_typing.REAL_NUMERIC) +sqrt_op = create_unary_op(name="sqrt", type_rule=op_typing.REAL_NUMERIC) # Parameterized unary ops @@ -214,18 +233,27 @@ class StrContainsOp(UnaryOp): name: typing.ClassVar[str] = "str_contains" pat: str + def output_type(self, *input_types): + return dtypes.BOOL_DTYPE + @dataclasses.dataclass(frozen=True) class StrContainsRegexOp(UnaryOp): name: typing.ClassVar[str] = "str_contains_regex" pat: str + def output_type(self, *input_types): + return dtypes.BOOL_DTYPE + @dataclasses.dataclass(frozen=True) class StrGetOp(UnaryOp): name: typing.ClassVar[str] = "str_get" i: int + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + @dataclasses.dataclass(frozen=True) class StrPadOp(UnaryOp): @@ -234,6 +262,9 @@ class StrPadOp(UnaryOp): fillchar: str side: typing.Literal["both", "left", "right"] + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + @dataclasses.dataclass(frozen=True) class ReplaceStrOp(UnaryOp): @@ -241,6 +272,9 @@ class ReplaceStrOp(UnaryOp): pat: str repl: str + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + @dataclasses.dataclass(frozen=True) class RegexReplaceStrOp(UnaryOp): @@ 
-248,24 +282,36 @@ class RegexReplaceStrOp(UnaryOp): pat: str repl: str + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + @dataclasses.dataclass(frozen=True) class StartsWithOp(UnaryOp): name: typing.ClassVar[str] = "str_startswith" pat: typing.Sequence[str] + def output_type(self, *input_types): + return dtypes.BOOL_DTYPE + @dataclasses.dataclass(frozen=True) class EndsWithOp(UnaryOp): name: typing.ClassVar[str] = "str_endswith" pat: typing.Sequence[str] + def output_type(self, *input_types): + return dtypes.BOOL_DTYPE + @dataclasses.dataclass(frozen=True) class ZfillOp(UnaryOp): name: typing.ClassVar[str] = "str_zfill" width: int + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + @dataclasses.dataclass(frozen=True) class StrFindOp(UnaryOp): @@ -274,6 +320,9 @@ class StrFindOp(UnaryOp): start: typing.Optional[int] end: typing.Optional[int] + def output_type(self, *input_types): + return dtypes.BOOL_DTYPE + @dataclasses.dataclass(frozen=True) class StrExtractOp(UnaryOp): @@ -281,6 +330,9 @@ class StrExtractOp(UnaryOp): pat: str n: int = 1 + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + @dataclasses.dataclass(frozen=True) class StrSliceOp(UnaryOp): @@ -288,12 +340,18 @@ class StrSliceOp(UnaryOp): start: typing.Optional[int] end: typing.Optional[int] + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + @dataclasses.dataclass(frozen=True) class StrRepeatOp(UnaryOp): name: typing.ClassVar[str] = "str_repeat" repeats: int + def output_type(self, *input_types): + return dtypes.STRING_DTYPE + # Other parameterized unary operations @dataclasses.dataclass(frozen=True) @@ -305,8 +363,14 @@ class StructFieldOp(UnaryOp): @dataclasses.dataclass(frozen=True) class AsTypeOp(UnaryOp): name: typing.ClassVar[str] = "astype" + # TODO: Convert strings to dtype earlier to_type: dtypes.DtypeString | dtypes.Dtype + def output_type(self, *input_types): + if isinstance(self.to_type, str): + return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[self.to_type] + return self.to_type + @dataclasses.dataclass(frozen=True) class IsInOp(UnaryOp): @@ -314,6 +378,9 @@ class IsInOp(UnaryOp): values: typing.Tuple match_nulls: bool = True + def output_type(self, *input_types): + return dtypes.BOOL_DTYPE + @dataclasses.dataclass(frozen=True) class RemoteFunctionOp(UnaryOp): @@ -321,12 +388,21 @@ class RemoteFunctionOp(UnaryOp): func: typing.Callable apply_on_null: bool + def output_type(self, *input_types): + python_type = self.func.__signature__.output_type + ibis_type = dtypes.ibis_type_from_python_type(python_type) + dtype = dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) + return dtype + @dataclasses.dataclass(frozen=True) class MapOp(UnaryOp): name = "map_values" mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...] 
+ def output_type(self, *input_types): + return input_types[0] + # Binary Ops fillna_op = create_binary_op(name="fillna") @@ -334,34 +410,48 @@ class MapOp(UnaryOp): clipupper_op = create_binary_op(name="clip_upper") coalesce_op = create_binary_op(name="coalesce") ## Math Ops -add_op = create_binary_op(name="add") -sub_op = create_binary_op(name="sub") -mul_op = create_binary_op(name="mul") -div_op = create_binary_op(name="div") -floordiv_op = create_binary_op(name="floordiv") -pow_op = create_binary_op(name="pow") -mod_op = create_binary_op(name="mod") -round_op = create_binary_op(name="round") -unsafe_pow_op = create_binary_op(name="unsafe_pow_op") +add_op = create_binary_op(name="add", type_rule=op_typing.NUMERIC) +sub_op = create_binary_op(name="sub", type_rule=op_typing.NUMERIC) +mul_op = create_binary_op(name="mul", type_rule=op_typing.NUMERIC) +div_op = create_binary_op(name="div", type_rule=op_typing.REAL_NUMERIC) +floordiv_op = create_binary_op(name="floordiv", type_rule=op_typing.REAL_NUMERIC) +pow_op = create_binary_op(name="pow", type_rule=op_typing.REAL_NUMERIC) +mod_op = create_binary_op(name="mod", type_rule=op_typing.NUMERIC) +round_op = create_binary_op(name="round", type_rule=op_typing.REAL_NUMERIC) +unsafe_pow_op = create_binary_op(name="unsafe_pow_op", type_rule=op_typing.REAL_NUMERIC) # Logical Ops -and_op = create_binary_op(name="and") -or_op = create_binary_op(name="or") +and_op = create_binary_op(name="and", type_rule=op_typing.PREDICATE) +or_op = create_binary_op(name="or", type_rule=op_typing.PREDICATE) ## Comparison Ops -eq_op = create_binary_op(name="eq") -eq_null_match_op = create_binary_op(name="eq_nulls_match") -ne_op = create_binary_op(name="ne") -lt_op = create_binary_op(name="lt") -gt_op = create_binary_op(name="gt") -le_op = create_binary_op(name="le") -ge_op = create_binary_op(name="ge") +eq_op = create_binary_op(name="eq", type_rule=op_typing.PREDICATE) +eq_null_match_op = create_binary_op( + name="eq_nulls_match", type_rule=op_typing.PREDICATE +) +ne_op = create_binary_op(name="ne", type_rule=op_typing.PREDICATE) +lt_op = create_binary_op(name="lt", type_rule=op_typing.PREDICATE) +gt_op = create_binary_op(name="gt", type_rule=op_typing.PREDICATE) +le_op = create_binary_op(name="le", type_rule=op_typing.PREDICATE) +ge_op = create_binary_op(name="ge", type_rule=op_typing.PREDICATE) ## String Ops -strconcat_op = create_binary_op(name="strconcat") +strconcat_op = create_binary_op(name="strconcat", type_rule=op_typing.STRING) + # Ternary Ops -where_op = create_ternary_op(name="where") -clip_op = create_ternary_op(name="clip") +@dataclasses.dataclass(frozen=True) +class WhereOp(TernaryOp): + name: typing.ClassVar[str] = "where" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # Second input is boolean and doesn't affect output type + return dtypes.lcd_etype(input_types[0], input_types[2]) + + +where_op = WhereOp() + + +clip_op = create_ternary_op(name="clip", type_rule=op_typing.Supertype()) # Just parameterless unary ops for now diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 452abf047c..ba62ae28d2 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -16,21 +16,13 @@ import typing -import ibis -import ibis.expr.datatypes as ibis_dtypes -import ibis.expr.types as ibis_types from pandas import Int64Dtype import pandas as pd -import bigframes.constants as constants import bigframes.dtypes as dtypes -import 
third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops class WindowOp: - def _as_ibis(self, value: ibis_types.Column, window=None): - raise NotImplementedError("Base class WindowOp has no implementaiton.") - @property def skips_nulls(self): """Whether the window op skips null rows.""" @@ -45,191 +37,59 @@ def handles_ties(self): class AggregateOp(WindowOp): name = "abstract_aggregate" - def _as_ibis(self, value: ibis_types.Column, window=None): - raise NotImplementedError("Base class AggregateOp has no implementaiton.") - - -def numeric_op(operation): - def constrained_op(op, column: ibis_types.Column, window=None): - if column.type().is_boolean(): - column = typing.cast( - ibis_types.NumericColumn, column.cast(ibis_dtypes.int64) - ) - if column.type().is_numeric(): - return operation(op, column, window) - else: - raise ValueError( - f"Numeric operation cannot be applied to type {column.type()}. {constants.FEEDBACK_LINK}" - ) - - return constrained_op - class SumOp(AggregateOp): name = "sum" - @numeric_op - def _as_ibis( - self, column: ibis_types.NumericColumn, window=None - ) -> ibis_types.NumericValue: - # Will be null if all inputs are null. Pandas defaults to zero sum though. - bq_sum = _apply_window_if_present(column.sum(), window) - return ( - ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore - ) - class MedianOp(AggregateOp): name = "median" - @numeric_op - def _as_ibis( - self, column: ibis_types.NumericColumn, window=None - ) -> ibis_types.NumericValue: - # PERCENTILE_CONT has very few allowed windows. For example, "window - # framing clause is not allowed for analytic function percentile_cont". - if window is not None: - raise NotImplementedError( - f"Median with windowing is not supported. {constants.FEEDBACK_LINK}" - ) - - # TODO(swast): Allow switching between exact and approximate median. - # For now, the best we can do is an approximate median when we're doing - # an aggregation, as PERCENTILE_CONT is only an analytic function. - return typing.cast(ibis_types.NumericValue, column.approx_median()) - class ApproxQuartilesOp(AggregateOp): def __init__(self, quartile: int): self.name = f"{quartile*25}%" self._quartile = quartile - @numeric_op - def _as_ibis( - self, column: ibis_types.NumericColumn, window=None - ) -> ibis_types.NumericValue: - # PERCENTILE_CONT has very few allowed windows. For example, "window - # framing clause is not allowed for analytic function percentile_cont". - if window is not None: - raise NotImplementedError( - f"Approx Quartiles with windowing is not supported. {constants.FEEDBACK_LINK}" - ) - value = vendored_ibis_ops.ApproximateMultiQuantile( - column, num_bins=4 # type: ignore - ).to_expr()[self._quartile] - return typing.cast(ibis_types.NumericValue, value) - class MeanOp(AggregateOp): name = "mean" - @numeric_op - def _as_ibis( - self, column: ibis_types.NumericColumn, window=None - ) -> ibis_types.NumericValue: - return _apply_window_if_present(column.mean(), window) - class ProductOp(AggregateOp): name = "product" - @numeric_op - def _as_ibis( - self, column: ibis_types.NumericColumn, window=None - ) -> ibis_types.NumericValue: - # Need to short-circuit as log with zeroes is illegal sql - is_zero = typing.cast(ibis_types.BooleanColumn, (column == 0)) - - # There is no product sql aggregate function, so must implement as a sum of logs, and then - # apply power after. Note, log and power base must be equal! This impl uses base 2. 
- logs = typing.cast( - ibis_types.NumericColumn, - ibis.case().when(is_zero, 0).else_(column.abs().log2()).end(), - ) - logs_sum = _apply_window_if_present(logs.sum(), window) - magnitude = typing.cast(ibis_types.NumericValue, ibis_types.literal(2)).pow( - logs_sum - ) - - # Can't determine sign from logs, so have to determine parity of count of negative inputs - is_negative = typing.cast( - ibis_types.NumericColumn, - ibis.case().when(column.sign() == -1, 1).else_(0).end(), - ) - negative_count = _apply_window_if_present(is_negative.sum(), window) - negative_count_parity = negative_count % typing.cast( - ibis_types.NumericValue, ibis.literal(2) - ) # 1 if result should be negative, otherwise 0 - - any_zeroes = _apply_window_if_present(is_zero.any(), window) - float_result = ( - ibis.case() - .when(any_zeroes, ibis_types.literal(0)) - .else_(magnitude * pow(-1, negative_count_parity)) - .end() - ) - return float_result.cast(column.type()) # type: ignore - class MaxOp(AggregateOp): name = "max" - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present(column.max(), window) - class MinOp(AggregateOp): name = "min" - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present(column.min(), window) - class StdOp(AggregateOp): name = "std" - @numeric_op - def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present( - typing.cast(ibis_types.NumericColumn, x).std(), window - ) - class VarOp(AggregateOp): name = "var" - @numeric_op - def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present( - typing.cast(ibis_types.NumericColumn, x).var(), window - ) - class PopVarOp(AggregateOp): name = "popvar" - @numeric_op - def _as_ibis(self, x: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present( - typing.cast(ibis_types.NumericColumn, x).var(how="pop"), window - ) - class CountOp(AggregateOp): name = "count" - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.IntegerValue: - return _apply_window_if_present(column.count(), window) - @property def skips_nulls(self): return False class CutOp(WindowOp): - def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): + def __init__(self, bins: typing.Union[int, pd.IntervalIndex], labels=None): if isinstance(bins, int): if not bins > 0: raise ValueError("`bins` should be a positive integer.") @@ -239,28 +99,7 @@ def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): self._bins_int = 0 self._bins = bins - def _as_ibis(self, x: ibis_types.Column, window=None): - out = ibis.case() - - if self._bins_int > 0: - col_min = _apply_window_if_present(x.min(), window) - col_max = _apply_window_if_present(x.max(), window) - bin_width = (col_max - col_min) / self._bins - - for this_bin in range(self._bins_int - 1): - out = out.when( - x <= (col_min + (this_bin + 1) * bin_width), - dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), - ) - out = out.when(x.notnull(), self._bins - 1) - else: - for interval in self._bins: - condition = (x > interval.left) & (x <= interval.right) - interval_struct = ibis.struct( - {"left_exclusive": interval.left, "right_inclusive": interval.right} - ) - out = out.when(condition, interval_struct) - return out.end() + self._labels = labels @property def skips_nulls(self): @@ -276,39 +115,6 @@ def __init__(self, quantiles: typing.Union[int, 
typing.Sequence[float]]): self.name = f"qcut-{quantiles}" self._quantiles = quantiles - @numeric_op - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.IntegerValue: - if isinstance(self._quantiles, int): - quantiles_ibis = dtypes.literal_to_ibis_scalar(self._quantiles) - percent_ranks = typing.cast( - ibis_types.FloatingColumn, - _apply_window_if_present(column.percent_rank(), window), - ) - float_bucket = typing.cast( - ibis_types.FloatingColumn, (percent_ranks * quantiles_ibis) - ) - return float_bucket.ceil().clip(lower=_ibis_num(1)) - _ibis_num(1) - else: - percent_ranks = typing.cast( - ibis_types.FloatingColumn, - _apply_window_if_present(column.percent_rank(), window), - ) - out = ibis.case() - first_ibis_quantile = dtypes.literal_to_ibis_scalar(self._quantiles[0]) - out = out.when(percent_ranks < first_ibis_quantile, None) - for bucket_n in range(len(self._quantiles) - 1): - ibis_quantile = dtypes.literal_to_ibis_scalar( - self._quantiles[bucket_n + 1] - ) - out = out.when( - percent_ranks <= ibis_quantile, - dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), - ) - out = out.else_(None) - return out.end() # type: ignore - @property def skips_nulls(self): return False @@ -321,11 +127,6 @@ def handles_ties(self): class NuniqueOp(AggregateOp): name = "nunique" - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.IntegerValue: - return _apply_window_if_present(column.nunique(), window) - @property def skips_nulls(self): return False @@ -336,11 +137,6 @@ class AnyValueOp(AggregateOp): # Do not expose to users. For special cases only (e.g. pivot). name = "any_value" - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.IntegerValue: - return _apply_window_if_present(column.arbitrary(), window) - @property def skips_nulls(self): return True @@ -349,12 +145,6 @@ def skips_nulls(self): class RankOp(WindowOp): name = "rank" - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.IntegerValue: - # Ibis produces 0-based ranks, while pandas creates 1-based ranks - return _apply_window_if_present(column.rank(), window) + 1 - @property def skips_nulls(self): return False @@ -365,12 +155,6 @@ def handles_ties(self): class DenseRankOp(WindowOp): - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.IntegerValue: - # Ibis produces 0-based ranks, while pandas creates 1-based ranks - return _apply_window_if_present(column.dense_rank(), window) + 1 - @property def skips_nulls(self): return False @@ -381,8 +165,7 @@ def handles_ties(self): class FirstOp(WindowOp): - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present(column.first(), window) + name = "first" class FirstNonNullOp(WindowOp): @@ -390,15 +173,9 @@ class FirstNonNullOp(WindowOp): def skips_nulls(self): return False - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present( - vendored_ibis_ops.FirstNonNullValue(column).to_expr(), window # type: ignore - ) - class LastOp(WindowOp): - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present(column.last(), window) + name = "last" class LastNonNullOp(WindowOp): @@ -406,23 +183,11 @@ class LastNonNullOp(WindowOp): def skips_nulls(self): return False - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - return _apply_window_if_present( - 
vendored_ibis_ops.LastNonNullValue(column).to_expr(), window # type: ignore - ) - class ShiftOp(WindowOp): def __init__(self, periods: int): self._periods = periods - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - if self._periods == 0: # No-op - return column - if self._periods > 0: - return _apply_window_if_present(column.lag(self._periods), window) - return _apply_window_if_present(column.lead(-self._periods), window) - @property def skips_nulls(self): return False @@ -432,80 +197,18 @@ class DiffOp(WindowOp): def __init__(self, periods: int): self._periods = periods - def _as_ibis(self, column: ibis_types.Column, window=None) -> ibis_types.Value: - shifted = ShiftOp(self._periods)._as_ibis(column, window) - if column.type().is_boolean(): - return typing.cast(ibis_types.BooleanColumn, column) != typing.cast( - ibis_types.BooleanColumn, shifted - ) - elif column.type().is_numeric(): - return typing.cast(ibis_types.NumericColumn, column) - typing.cast( - ibis_types.NumericColumn, shifted - ) - else: - raise TypeError(f"Cannot perform diff on type{column.type()}") - @property def skips_nulls(self): return False class AllOp(AggregateOp): - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.BooleanValue: - # BQ will return null for empty column, result would be true in pandas. - result = _is_true(column).all() - return typing.cast( - ibis_types.BooleanScalar, - _apply_window_if_present(result, window).fillna(ibis_types.literal(True)), - ) + name = "all" class AnyOp(AggregateOp): name = "any" - def _as_ibis( - self, column: ibis_types.Column, window=None - ) -> ibis_types.BooleanValue: - # BQ will return null for empty column, result would be false in pandas. - result = _is_true(column).any() - return typing.cast( - ibis_types.BooleanScalar, - _apply_window_if_present(result, window).fillna(ibis_types.literal(True)), - ) - - -def _is_true(column: ibis_types.Column) -> ibis_types.BooleanColumn: - if column.type().is_boolean(): - return typing.cast(ibis_types.BooleanColumn, column) - elif column.type().is_numeric(): - result = typing.cast(ibis_types.NumericColumn, column).__ne__( - ibis_types.literal(0) - ) - return typing.cast(ibis_types.BooleanColumn, result) - elif column.type().is_string(): - result = typing.cast( - ibis_types.StringValue, column - ).length() > ibis_types.literal(0) - return typing.cast(ibis_types.BooleanColumn, result) - else: - # Time and geo values don't have a 'False' value - return typing.cast( - ibis_types.BooleanColumn, _map_to_literal(column, ibis_types.literal(True)) - ) - - -def _apply_window_if_present(value: ibis_types.Value, window): - return value.over(window) if (window is not None) else value - - -def _map_to_literal( - original: ibis_types.Value, literal: ibis_types.Scalar -) -> ibis_types.Column: - # Hack required to perform aggregations on literals in ibis, even though bigquery will let you directly aggregate literals (eg. 
'SELECT COUNT(1) from table1') - return ibis.ifelse(original.isnull(), literal, literal) # type: ignore - sum_op = SumOp() mean_op = MeanOp() @@ -560,7 +263,3 @@ def lookup_agg_func(key: str) -> AggregateOp: return _AGGREGATIONS_LOOKUP[key] else: raise ValueError(f"Unrecognize aggregate function: {key}") - - -def _ibis_num(number: float): - return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 077815a9d6..6829d3faab 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -21,6 +21,7 @@ import bigframes.constants as constants import bigframes.core.blocks as blocks import bigframes.core.expression as ex +import bigframes.core.indexes as indexes import bigframes.core.scalar as scalars import bigframes.dtypes import bigframes.operations as ops @@ -54,10 +55,34 @@ def __init__( if isinstance(data, blocks.Block): assert len(data.value_columns) == 1 assert len(data.column_labels) == 1 + assert index is None block = data elif isinstance(data, SeriesMethods): - block = data._get_block() + block = data._block + if index is not None: + # reindex + bf_index = indexes.Index(index) + idx_block = bf_index._block + idx_cols = idx_block.value_columns + block_idx, _ = idx_block.index.join(block.index, how="left") + block = block_idx._block.with_index_labels(bf_index.names) + + elif isinstance(data, indexes.Index): + if data.nlevels != 1: + raise NotImplementedError("Cannot interpret multi-index as Series.") + # Reset index to promote index columns to value columns, set default index + block = data._block.reset_index(drop=False) + if index is not None: + # Align by offset + bf_index = indexes.Index(index) + idx_block = bf_index._block.reset_index(drop=False) + idx_cols = idx_block.value_columns + block_idx, (l_mapping, _) = idx_block.index.join( + block.index, how="left" + ) + block = block_idx._block.set_index([l_mapping[col] for col in idx_cols]) + block = block.with_index_labels(bf_index.names) if block: if name: @@ -66,16 +91,10 @@ def __init__( f"BigQuery DataFrames only supports hashable series names. {constants.FEEDBACK_LINK}" ) block = block.with_column_labels([name]) - if index: - raise NotImplementedError( - f"Series 'index' constructor parameter not supported when passing BigQuery-backed objects. 
{constants.FEEDBACK_LINK}" - ) if dtype: block = block.multi_apply_unary_op( block.value_columns, ops.AsTypeOp(to_type=dtype) ) - self._block = block - else: import bigframes.pandas @@ -95,14 +114,15 @@ def __init__( if isinstance(dt, pd.ArrowDtype) ) ): - self._block = blocks.block_from_local(pd_dataframe) + block = blocks.block_from_local(pd_dataframe) elif session: - self._block = session.read_pandas(pd_dataframe)._get_block() + block = session.read_pandas(pd_dataframe)._get_block() else: # Uses default global session - self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() + block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() if pd_series.name is None: - self._block = self._block.with_column_labels([None]) + block = block.with_column_labels([None]) + self._block: blocks.Block = block @property def _value_column(self) -> str: diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py new file mode 100644 index 0000000000..3c16f0cbe9 --- /dev/null +++ b/bigframes/operations/type.py @@ -0,0 +1,80 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import functools + +import pandas as pd + +import bigframes.dtypes +from bigframes.dtypes import ExpressionType + +# TODO: Apply input type constraints to help pre-empt invalid expression construction + + +@dataclasses.dataclass +class OpTypeRule: + def output_type(self, *input_types: ExpressionType) -> ExpressionType: + raise NotImplementedError("Abstract typing rule has no output type") + + @property + def as_method(self): + def meth(_, *input_types: ExpressionType) -> ExpressionType: + return self.output_type(*input_types) + + return meth + + +@dataclasses.dataclass +class InputType(OpTypeRule): + def output_type(self, *input_types: ExpressionType) -> ExpressionType: + assert len(input_types) == 1 + return input_types[0] + + +@dataclasses.dataclass +class RealNumeric(OpTypeRule): + def output_type(self, *input_types: ExpressionType) -> ExpressionType: + all_ints = all(pd.api.types.is_integer(input) for input in input_types) + if all_ints: + return bigframes.dtypes.FLOAT_DTYPE + else: + return functools.reduce( + lambda t1, t2: bigframes.dtypes.lcd_etype(t1, t2), input_types + ) + + +@dataclasses.dataclass +class Supertype(OpTypeRule): + def output_type(self, *input_types: ExpressionType) -> ExpressionType: + return functools.reduce( + lambda t1, t2: bigframes.dtypes.lcd_etype(t1, t2), input_types + ) + + +@dataclasses.dataclass +class Fixed(OpTypeRule): + out_type: ExpressionType + + def output_type(self, *input_types: ExpressionType) -> ExpressionType: + return self.out_type + + +# Common type rules +NUMERIC = Supertype() +REAL_NUMERIC = RealNumeric() +PREDICATE = Fixed(bigframes.dtypes.BOOL_DTYPE) +INTEGER = Fixed(bigframes.dtypes.INT_DTYPE) +STRING = Fixed(bigframes.dtypes.STRING_DTYPE) +INPUT_TYPE = InputType() diff --git a/bigframes/series.py b/bigframes/series.py index 2371aad780..6a21727975 100644 --- a/bigframes/series.py 
+++ b/bigframes/series.py @@ -76,10 +76,6 @@ def dtype(self): def dtypes(self): return self._dtype - @property - def index(self) -> indexes.Index: - return indexes.Index(self) - @property def loc(self) -> bigframes.core.indexers.LocSeriesIndexer: return bigframes.core.indexers.LocSeriesIndexer(self) @@ -120,6 +116,10 @@ def empty(self) -> bool: def values(self) -> numpy.ndarray: return self.to_numpy() + @property + def index(self) -> indexes.Index: + return indexes.Index.from_frame(self) + @property def query_job(self) -> Optional[bigquery.QueryJob]: """BigQuery job metadata for the most recent query. @@ -155,7 +155,7 @@ def __len__(self): def __iter__(self) -> typing.Iterator: return itertools.chain.from_iterable( - map(lambda x: x.index, self._block.to_pandas_batches()) + map(lambda x: x.squeeze(axis=1), self._block.to_pandas_batches()) ) def copy(self) -> Series: @@ -978,7 +978,7 @@ def idxmax(self) -> blocks.Label: ] ) block = block.slice(0, 1) - return indexes.Index._from_block(block).to_pandas()[0] + return indexes.Index(block).to_pandas()[0] def idxmin(self) -> blocks.Label: block = self._block.order_by( @@ -991,7 +991,7 @@ def idxmin(self) -> blocks.Label: ] ) block = block.slice(0, 1) - return indexes.Index._from_block(block).to_pandas()[0] + return indexes.Index(block).to_pandas()[0] @property def is_monotonic_increasing(self) -> bool: @@ -1279,9 +1279,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): raise ValueError("Original index must be unique to reindex") keep_original_names = False if isinstance(index, indexes.Index): - new_indexer = bigframes.dataframe.DataFrame(data=index._data._get_block())[ - [] - ] + new_indexer = bigframes.dataframe.DataFrame(data=index._block)[[]] else: if not isinstance(index, pandas.Index): keep_original_names = True diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9e1e6b560a..bd813c8c6b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -597,9 +597,16 @@ def _get_snapshot_sql_and_primary_key( ).result() )[0][0] self._df_snapshot[table_ref] = snapshot_timestamp - table_expression = self.ibis_client.sql( - bigframes_io.create_snapshot_sql(table_ref, snapshot_timestamp) - ) + + try: + table_expression = self.ibis_client.sql( + bigframes_io.create_snapshot_sql(table_ref, snapshot_timestamp) + ) + except google.api_core.exceptions.Forbidden as ex: + if "Drive credentials" in ex.message: + ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." + raise + return table_expression, primary_keys def _read_gbq_table( @@ -1451,7 +1458,13 @@ def _start_query( job_config.labels = bigframes_io.create_job_configs_labels( job_configs_labels=job_config.labels, api_methods=api_methods ) - query_job = self.bqclient.query(sql, job_config=job_config) + + try: + query_job = self.bqclient.query(sql, job_config=job_config) + except google.api_core.exceptions.Forbidden as ex: + if "Drive credentials" in ex.message: + ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." 
+ raise opts = bigframes.options.display if opts.progress_bar is not None and not query_job.configuration.dry_run: @@ -1508,6 +1521,17 @@ def _execute( job_config=job_config, ) + def _peek( + self, array_value: core.ArrayValue, n_rows: int + ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """A 'peek' efficiently accesses a small number of rows in the dataframe.""" + if not array_value.node.peekable: + raise NotImplementedError("cannot efficient peek this dataframe") + sql = self._compile_unordered(array_value).peek_sql(n_rows) + return self._start_query( + sql=sql, + ) + def _to_sql( self, array_value: core.ArrayValue, @@ -1528,12 +1552,12 @@ def _to_sql( def _compile_ordered( self, array_value: core.ArrayValue ) -> bigframes.core.compile.OrderedIR: - return bigframes.core.compile.compile_ordered(array_value.node) + return bigframes.core.compile.compile_ordered_ir(array_value.node) def _compile_unordered( self, array_value: core.ArrayValue ) -> bigframes.core.compile.UnorderedIR: - return bigframes.core.compile.compile_unordered(array_value.node) + return bigframes.core.compile.compile_unordered_ir(array_value.node) def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) diff --git a/bigframes/version.py b/bigframes/version.py index 4d71f17d71..131f820e7d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.19.2" +__version__ = "0.20.0" diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index 18be5e48fd..a9b6aefe30 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -532,6 +532,10 @@ }, "outputs": [], "source": [ + "# BigQuery DataFrames can read directly from GCS.\n", + "fn = 'gs://cloud-samples-data/vertex-ai/bigframe/penguins.csv'\n", + "\n", + "# Or from a local file.\n", "# fn = 'penguins.csv'" ] }, @@ -580,7 +584,9 @@ }, "outputs": [], "source": [ - "df_from_local = bf.read_csv(fn)" + "# If order is not important, use the \"bigquery\" engine to\n", + "# allow BigQuery DataFrames to read directly from GCS.\n", + "df_from_local = bf.read_csv(fn, engine=\"bigquery\")" ] }, { @@ -658,7 +664,10 @@ }, "outputs": [], "source": [ - "df_from_local.to_gbq(PROJECT_ID + \".\" + DATASET_ID + \".penguins\")" + "df_from_local.to_gbq(\n", + " PROJECT_ID + \".\" + DATASET_ID + \".penguins\",\n", + " if_exists=\"replace\",\n", + ")" ] }, { @@ -771,7 +780,7 @@ }, "outputs": [], "source": [ - "bq_df[\"species\", \"body_mass_g\"].groupby(by=bq_df[\"species\"]).mean(numeric_only=True).head()" + "bq_df[[\"species\", \"body_mass_g\"]].groupby(by=bq_df[\"species\"]).mean(numeric_only=True).head()" ] }, { @@ -925,7 +934,7 @@ "# # Delete the BigQuery Connection\n", "# from google.cloud import bigquery_connection_v1 as bq_connection\n", "# client = bq_connection.ConnectionServiceClient()\n", - "# CONNECTION_ID = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/bigframes-rf-conn\"\n", + "# CONNECTION_ID = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/bigframes-default-connection\"\n", "# client.delete_connection(name=CONNECTION_ID)\n", "# print(\"Deleted connection '{}'.\".format(CONNECTION_ID))" ] diff --git a/noxfile.py b/noxfile.py index 62fbee2650..7cf9faf685 100644 --- a/noxfile.py +++ b/noxfile.py @@ -30,6 +30,10 
@@ BLACK_VERSION = "black==22.3.0" ISORT_VERSION = "isort==5.12.0" + +# pytest-retry is not yet compatible with pytest 8.x. +# https://github.com/str0zzapreti/pytest-retry/issues/32 +PYTEST_VERSION = "pytest<8.0.0dev" SPHINX_VERSION = "sphinx==4.5.0" LINT_PATHS = ["docs", "bigframes", "tests", "third_party", "noxfile.py", "setup.py"] @@ -39,7 +43,7 @@ UNIT_TEST_STANDARD_DEPENDENCIES = [ "mock", "asyncmock", - "pytest", + PYTEST_VERSION, "pytest-cov", "pytest-asyncio", "pytest-mock", @@ -55,7 +59,7 @@ "jinja2", "mock", "openpyxl", - "pytest", + PYTEST_VERSION, "pytest-cov", "pytest-retry", "pytest-timeout", @@ -542,7 +546,12 @@ def prerelease(session: nox.sessions.Session, tests_path): # TODO(shobs): Remove excluding version 2.2.0rc0 after # https://github.com/pandas-dev/pandas/issues/56646 and # https://github.com/pandas-dev/pandas/issues/56651 are resolved. - "pandas!=2.1.4,!=2.2.0rc0", + # + # TODO(shobs): Remove excluding version 2.2.0 after + # https://github.com/googleapis/python-bigquery-dataframes/issues/341 + # https://github.com/googleapis/python-bigquery-dataframes/issues/337 + # are resolved + "pandas!=2.1.4, !=2.2.0rc0, !=2.2.0", ) already_installed.add("pandas") @@ -648,9 +657,23 @@ def system_prerelease(session: nox.sessions.Session): @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) -def notebook(session): +def notebook(session: nox.Session): + GOOGLE_CLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") + if not GOOGLE_CLOUD_PROJECT: + session.error( + "Set GOOGLE_CLOUD_PROJECT environment variable to run notebook session." + ) + session.install("-e", ".[all]") - session.install("pytest", "pytest-xdist", "pytest-retry", "nbmake") + session.install( + "pytest", + "pytest-xdist", + "pytest-retry", + "nbmake", + "google-cloud-aiplatform", + "matplotlib", + "seaborn", + ) notebooks_list = list(Path("notebooks/").glob("*/*.ipynb")) @@ -660,19 +683,22 @@ def notebook(session): # These notebooks contain special colab `param {type:"string"}` # comments, which make it easy for customers to fill in their # own information. + # + # With the notebooks_fill_params.py script, we are able to find and + # replace the PROJECT_ID parameter, but not the others. + # # TODO(ashleyxu): Test these notebooks by replacing parameters with # appropriate values and omitting cleanup logic that may break # our test infrastructure. - "notebooks/getting_started/getting_started_bq_dataframes.ipynb", - "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb", - "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", - "notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb", - "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb", - "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", - "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", - "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb", - "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", - "notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb", + "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb", # Needs DATASET. + "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb", # Needs DATASET_ID. + "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. + # TODO(swast): investigate why we get 404 errors, even though + # bq_dataframes_llm_code_generation creates a bucket in the sample. 
+ "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. + "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", # Needs BUCKET_URI. + "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb", # Needs BUCKET_URI. + "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", # Needs BUCKET_URI. # The experimental notebooks imagine features that don't yet # exist or only exist as temporary prototypes. "notebooks/experimental/longer_ml_demo.ipynb", @@ -700,9 +726,9 @@ def notebook(session): for nb, regions in notebooks_reg.items() } - # For some reason nbmake exits silently with "no tests ran" message if + # The pytest --nbmake exits silently with "no tests ran" message if # one of the notebook paths supplied does not exist. Let's make sure that - # each path exists + # each path exists. for nb in notebooks + list(notebooks_reg): assert os.path.exists(nb), nb @@ -714,16 +740,33 @@ def notebook(session): pytest_command = [ "py.test", "--nbmake", - "--nbmake-timeout=600", + "--nbmake-timeout=900", # 15 minutes ] - # Run self-contained notebooks in single session.run - # achieve parallelization via -n - session.run( - *pytest_command, - "-nauto", - *notebooks, - ) + try: + # Populate notebook parameters and make a backup so that the notebooks + # are runnable. + session.run( + "python", + CURRENT_DIRECTORY / "scripts" / "notebooks_fill_params.py", + *notebooks, + ) + + # Run self-contained notebooks in single session.run + # achieve parallelization via -n + session.run( + *pytest_command, + "-nauto", + *notebooks, + ) + finally: + # Prevent our notebook changes from getting checked in to git + # accidentally. + session.run( + "python", + CURRENT_DIRECTORY / "scripts" / "notebooks_restore_from_backup.py", + *notebooks, + ) # Run regionalized notebooks in parallel session.run's, since each notebook # takes a different region via env param. diff --git a/scripts/manage_cloud_functions.py b/scripts/manage_cloud_functions.py new file mode 100644 index 0000000000..6b69089089 --- /dev/null +++ b/scripts/manage_cloud_functions.py @@ -0,0 +1,195 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +from datetime import datetime +import sys +import time + +import google.api_core.exceptions +from google.cloud import functions_v2 + +GCF_REGIONS_ALL = [ + "asia-east1", + "asia-east2", + "asia-northeast1", + "asia-northeast2", + "europe-north1", + "europe-southwest1", + "europe-west1", + "europe-west2", + "europe-west4", + "europe-west8", + "europe-west9", + "us-central1", + "us-east1", + "us-east4", + "us-east5", + "us-south1", + "us-west1", + "asia-east2", + "asia-northeast3", + "asia-southeast1", + "asia-southeast2", + "asia-south1", + "asia-south2", + "australia-southeast1", + "australia-southeast2", + "europe-central2", + "europe-west2", + "europe-west3", + "europe-west6", + "northamerica-northeast1", + "northamerica-northeast2", + "southamerica-east1", + "southamerica-west1", + "us-west2", + "us-west3", + "us-west4", +] + +GCF_CLIENT = functions_v2.FunctionServiceClient() + + +def get_bigframes_functions(project, region): + parent = f"projects/{args.project_id}/locations/{region}" + functions = GCF_CLIENT.list_functions( + functions_v2.ListFunctionsRequest(parent=parent) + ) + # Filter bigframes created functions + functions = [ + function + for function in functions + if function.name.startswith( + f"projects/{args.project_id}/locations/{region}/functions/bigframes-" + ) + ] + + return functions + + +def summarize_gcfs(args): + """Summarize number of bigframes cloud functions in various regions.""" + + region_counts = {} + for region in args.regions: + functions = get_bigframes_functions(args.project_id, region) + functions_count = len(functions) + + # Exclude reporting regions with 0 bigframes GCFs + if functions_count == 0: + continue + + # Count how many GCFs are newer than a day + recent = 0 + for f in functions: + age = datetime.now() - datetime.fromtimestamp(f.update_time.timestamp()) + if age.days <= 0: + recent += 1 + + region_counts[region] = (functions_count, recent) + + for item in sorted( + region_counts.items(), key=lambda item: item[1][0], reverse=True + ): + region = item[0] + count, recent = item[1] + print( + "{}: Total={}, Recent={}, OlderThanADay={}".format( + region, count, recent, count - recent + ) + ) + + +def cleanup_gcfs(args): + """Clean-up bigframes cloud functions in the given regions.""" + max_delete_per_region = args.number + + for region in args.regions: + functions = get_bigframes_functions(args.project_id, region) + count = 0 + for f in functions: + age = datetime.now() - datetime.fromtimestamp(f.update_time.timestamp()) + if age.days > 0: + try: + count += 1 + GCF_CLIENT.delete_function(name=f.name) + print( + f"[{region}]: deleted [{count}] {f.name} last updated on {f.update_time}" + ) + if count >= max_delete_per_region: + break + # Mostly there is a 60 mutations per minute quota, we want to use 10% of + # that for this clean-up, i.e. 6 mutations per minute. So wait for + # 60/6 = 10 seconds + time.sleep(10) + except google.api_core.exceptions.ResourceExhausted: + # Stop deleting in this region for now + print( + f"Cannot delete any more functions in region {region} due to quota exhaustion. Please try again later." + ) + break + + +def list_str(values): + return [val for val in values.split(",") if val] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Manage cloud functions created to serve bigframes remote functions." 
+ ) + parser.add_argument( + "-p", + "--project-id", + type=str, + required=True, + action="store", + help="GCP project-id.", + ) + parser.add_argument( + "-r", + "--regions", + type=list_str, + required=False, + default=GCF_REGIONS_ALL, + action="store", + help="Cloud functions region(s). If multiple regions, Specify comma separated (e.g. region1,region2)", + ) + + subparsers = parser.add_subparsers(title="subcommands", required=True) + parser_summary = subparsers.add_parser( + "summary", + help="BigFrames cloud functions summary.", + description="Show the bigframes cloud functions summary.", + ) + parser_summary.set_defaults(func=summarize_gcfs) + parser_cleanup = subparsers.add_parser( + "cleanup", + help="BigFrames cloud functions clean up.", + description="Delete the stale bigframes cloud functions.", + ) + parser_cleanup.add_argument( + "-n", + "--number", + type=int, + required=False, + default=100, + action="store", + help="Number of stale (more than a day old) cloud functions to clean up.", + ) + parser_cleanup.set_defaults(func=cleanup_gcfs) + + args = parser.parse_args(sys.argv[1:]) + args.func(args) diff --git a/scripts/notebooks_fill_params.py b/scripts/notebooks_fill_params.py new file mode 100644 index 0000000000..e0f7c8d687 --- /dev/null +++ b/scripts/notebooks_fill_params.py @@ -0,0 +1,65 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import re +import shutil +import sys + +GOOGLE_CLOUD_PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] + + +def make_backup(notebook_path: str): + shutil.copy( + notebook_path, + f"{notebook_path}.backup", + ) + + +def replace_project(line): + """ + Notebooks contain special colab `param {type:"string"}` + comments, which make it easy for customers to fill in their + own information. + """ + # Make sure we're robust to whitespace differences. 
+ cleaned = re.sub(r"\s", "", line) + if cleaned == 'PROJECT_ID=""#@param{type:"string"}': + return f'PROJECT_ID = "{GOOGLE_CLOUD_PROJECT}" # @param {{type:"string"}}\n' + else: + return line + + +def replace_params(notebook_path: str): + with open(notebook_path, "r", encoding="utf-8") as notebook_file: + notebook_json = json.load(notebook_file) + + for cell in notebook_json["cells"]: + lines = cell.get("source", []) + new_lines = [replace_project(line) for line in lines] + cell["source"] = new_lines + + with open(notebook_path, "w", encoding="utf-8") as notebook_file: + json.dump(notebook_json, notebook_file, indent=2, ensure_ascii=False) + + +def main(notebook_paths): + for notebook_path in notebook_paths: + make_backup(notebook_path) + replace_params(notebook_path) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/scripts/notebooks_restore_from_backup.py b/scripts/notebooks_restore_from_backup.py new file mode 100644 index 0000000000..4d3e0333e3 --- /dev/null +++ b/scripts/notebooks_restore_from_backup.py @@ -0,0 +1,35 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib +import shutil +import sys + + +def restore_from_backup(notebook_path): + backup_path = pathlib.Path(f"{notebook_path}.backup") + if backup_path.exists(): + shutil.move( + backup_path, + notebook_path, + ) + + +def main(notebook_paths): + for notebook_path in notebook_paths: + restore_from_backup(notebook_path) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index 33b835e852..2bb136b0f2 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -16,6 +16,20 @@ from bigframes.ml import forecasting +ARIMA_EVALUATE_OUTPUT_COL = [ + "non_seasonal_p", + "non_seasonal_d", + "non_seasonal_q", + "log_likelihood", + "AIC", + "variance", + "seasonal_periods", + "has_holiday_effect", + "has_spikes_and_dips", + "has_step_changes", + "error_message", +] + def test_arima_plus_model_fit_score( time_series_df_default_index, dataset_id, new_time_series_df @@ -42,7 +56,24 @@ def test_arima_plus_model_fit_score( pd.testing.assert_frame_equal(result, expected, check_exact=False, rtol=0.1) # save, load to ensure configuration was kept - reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) + reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True) + assert ( + f"{dataset_id}.temp_arima_plus_model" in reloaded_model._bqml_model.model_name + ) + + +def test_arima_plus_model_fit_summary(time_series_df_default_index, dataset_id): + model = forecasting.ARIMAPlus() + X_train = time_series_df_default_index[["parsed_date"]] + y_train = time_series_df_default_index[["total_visits"]] + model.fit(X_train, y_train) + + result = model.summary() + assert result.shape == (1, 12) + assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) + + # save, 
load to ensure configuration was kept + reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True) assert ( - f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name + f"{dataset_id}.temp_arima_plus_model" in reloaded_model._bqml_model.model_name ) diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index be8d9c2bac..4726d5ab21 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -20,6 +20,20 @@ from bigframes.ml import forecasting +ARIMA_EVALUATE_OUTPUT_COL = [ + "non_seasonal_p", + "non_seasonal_d", + "non_seasonal_q", + "log_likelihood", + "AIC", + "variance", + "seasonal_periods", + "has_holiday_effect", + "has_spikes_and_dips", + "has_step_changes", + "error_message", +] + def test_model_predict_default(time_series_arima_plus_model: forecasting.ARIMAPlus): utc = pytz.utc @@ -104,6 +118,24 @@ def test_model_score( ) +def test_model_summary( + time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +): + result = time_series_arima_plus_model.summary() + assert result.shape == (1, 12) + assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) + + +def test_model_summary_show_all_candidates( + time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +): + result = time_series_arima_plus_model.summary( + show_all_candidate_models=True, + ) + assert result.shape[0] > 1 + assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) + + def test_model_score_series( time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df ): @@ -126,3 +158,11 @@ def test_model_score_series( rtol=0.1, check_index_type=False, ) + + +def test_model_summary_series( + time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +): + result = time_series_arima_plus_model.summary() + assert result.shape == (1, 12) + assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9557475b46..4ae31fa4a0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -412,6 +412,37 @@ def test_rename(scalars_dfs): ) +def test_df_peek(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df.peek(n=3) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_exception(scalars_dfs): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + # Window ops aren't compatible with efficient peeking + scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=False) + + +def test_df_peek_force(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=True) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + def test_repr_w_all_rows(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index f7fa0f0855..2961884ebf 
100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -16,9 +16,44 @@ import pandas as pd import pytest +import bigframes.pandas as bpd from tests.system.utils import assert_pandas_index_equal_ignore_index_type +def test_index_construct_from_list(): + bf_result = bpd.Index( + [3, 14, 159], dtype=pd.Int64Dtype(), name="my_index" + ).to_pandas() + pd_result = pd.Index([3, 14, 159], dtype=pd.Int64Dtype(), name="my_index") + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_construct_from_series(): + bf_result = bpd.Index( + bpd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), + name="index_name", + dtype=pd.Int64Dtype(), + ).to_pandas() + pd_result = pd.Index( + pd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), + name="index_name", + dtype=pd.Int64Dtype(), + ) + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_construct_from_index(): + bf_index_input = bpd.Index( + [3, 14, 159], dtype=pd.Float64Dtype(), name="series_name" + ) + bf_result = bpd.Index( + bf_index_input, dtype=pd.Int64Dtype(), name="index_name" + ).to_pandas() + pd_index_input = pd.Index([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name") + pd_result = pd.Index(pd_index_input, dtype=pd.Int64Dtype(), name="index_name") + pd.testing.assert_index_equal(bf_result, pd_result) + + def test_get_index(scalars_df_index, scalars_pandas_df_index): index = scalars_df_index.index bf_result = index.to_pandas() @@ -240,6 +275,43 @@ def test_index_value_counts(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize( + ("level",), + [ + ("int64_too",), + ("rowindex_2",), + (1,), + ], +) +def test_index_get_level_values(scalars_df_index, scalars_pandas_df_index, level): + bf_result = ( + scalars_df_index.set_index(["int64_too", "rowindex_2"]) + .index.get_level_values(level) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index( + ["int64_too", "rowindex_2"] + ).index.get_level_values(level) + + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_to_series( + scalars_df_index, + scalars_pandas_df_index, +): + bf_result = ( + scalars_df_index.set_index(["int64_too"]) + .index.to_series(index=scalars_df_index["float64_col"], name="new_name") + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index(["int64_too"]).index.to_series( + index=scalars_pandas_df_index["float64_col"], name="new_name" + ) + + pd.testing.assert_series_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("how",), [ diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a79ddb64cd..0910c0b7e2 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -397,6 +397,30 @@ def test_cut(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +def test_cut_default_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5) + bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas() + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + 
+ @pytest.mark.parametrize( ("bins",), [ @@ -424,7 +448,6 @@ def test_cut_with_interval(scalars_dfs, bins): ], name=pd_result.name, ) - pd_result.index = pd_result.index.astype("Int64") pd.testing.assert_series_equal( bf_result, pd_result_converted, check_index=False, check_dtype=False diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6f919f740f..5d8fb0b29c 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -42,6 +42,40 @@ def test_series_construct_copy(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +def test_series_construct_copy_with_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_copy_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df.index, + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df.index, + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + def test_series_construct_pandas(scalars_dfs): _, scalars_pandas_df = scalars_dfs bf_result = series.Series( @@ -2987,3 +3021,13 @@ def test_sample(scalars_dfs, frac, n, random_state): n = 1 if n is None else n expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n assert bf_result.shape[0] == expected_sample_size + + +def test_series_iter( + scalars_df_index, + scalars_pandas_df_index, +): + for bf_i, pd_i in zip( + scalars_df_index["int64_too"], scalars_pandas_df_index["int64_too"] + ): + assert bf_i == pd_i diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py new file mode 100644 index 0000000000..f46c47a582 --- /dev/null +++ b/tests/unit/core/test_expression.py @@ -0,0 +1,49 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import bigframes.core.expression as ex +import bigframes.dtypes as dtypes +import bigframes.operations as ops + + +def test_expression_dtype_simple(): + expression = ops.add_op.as_expr("a", "b") + result = expression.output_type({"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE}) + assert result == dtypes.INT_DTYPE + + +def test_expression_dtype_nested(): + expression = ops.add_op.as_expr( + "a", ops.abs_op.as_expr(ops.sub_op.as_expr("b", ex.const(3.14))) + ) + + result = expression.output_type({"a": dtypes.INT_DTYPE, "b": dtypes.INT_DTYPE}) + + assert result == dtypes.FLOAT_DTYPE + + +def test_expression_dtype_where(): + expression = ops.where_op.as_expr(ex.const(3), ex.const(True), ex.const(None)) + + result = expression.output_type({}) + + assert result == dtypes.INT_DTYPE + + +def test_expression_dtype_astype(): + expression = ops.AsTypeOp("Int64").as_expr(ex.const(3.14159)) + + result = expression.output_type({}) + + assert result == dtypes.INT_DTYPE diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 73d19cc0bb..37cc33d33e 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -273,6 +273,19 @@ def test_ml_evaluate_produces_correct_sql( ) +def test_ml_arima_evaluate_produces_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, +): + sql = model_manipulation_sql_generator.ml_arima_evaluate( + show_all_candidate_models=True + ) + assert ( + sql + == """SELECT * FROM ML.ARIMA_EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`, + STRUCT(True AS show_all_candidate_models))""" + ) + + def test_ml_evaluate_no_source_produces_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index d38a393f27..5fc8996993 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -35,7 +35,7 @@ def test_read_gbq_missing_parts(missing_parts_table_id): "not_found_table_id", [("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")], ) -def test_read_gdb_not_found_tables(not_found_table_id): +def test_read_gbq_not_found_tables(not_found_table_id): bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.side_effect = google.api_core.exceptions.NotFound( @@ -47,6 +47,34 @@ def test_read_gdb_not_found_tables(not_found_table_id): session.read_gbq(not_found_table_id) +@pytest.mark.parametrize( + ("api_name", "query_or_table"), + [ + ("read_gbq", "project.dataset.table"), + ("read_gbq_table", "project.dataset.table"), + ("read_gbq", "SELECT * FROM project.dataset.table"), + ("read_gbq_query", "SELECT * FROM project.dataset.table"), + ], + ids=[ + "read_gbq_on_table", + "read_gbq_table", + "read_gbq_on_query", + "read_gbq_query", + ], +) +def test_read_gbq_external_table_no_drive_access(api_name, query_or_table): + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.side_effect = google.api_core.exceptions.Forbidden( + "Access Denied: BigQuery BigQuery: Permission denied while getting Drive credentials." 
+ ) + session = resources.create_bigquery_session(bqclient=bqclient) + + api = getattr(session, api_name) + with pytest.raises(google.api_core.exceptions.Forbidden): + api(query_or_table) + + @mock.patch.dict(os.environ, {}, clear=True) def test_session_init_fails_with_no_project(): with pytest.raises( diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 047ad5638d..dabbf11c6c 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -241,9 +241,7 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): def test_remote_function_io_types_are_supported_bigframes_types(): from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type - from bigframes.functions.remote_function import ( - SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types, - ) + from bigframes.dtypes import SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types for python_type in rf_supported_io_types: ibis_type = python_type_to_bigquery_type(python_type) diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 4835a24dc7..d6af223456 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -85,7 +85,10 @@ def test_method_matches_session(method_name: str): def test_cut_raises_with_labels(): - with pytest.raises(NotImplementedError, match="Only labels=False"): + with pytest.raises( + NotImplementedError, + match="The 'labels' parameter must be either False or None.", + ): mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py index ed24ada6c7..392872a7be 100644 --- a/tests/unit/test_remote_function.py +++ b/tests/unit/test_remote_function.py @@ -15,14 +15,17 @@ from ibis.backends.bigquery import datatypes as bq_types from ibis.expr import datatypes as ibis_types -from bigframes.functions import remote_function as rf +import bigframes.dtypes def test_supported_types_correspond(): # The same types should be representable by the supported Python and BigQuery types. 
- ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} + ibis_types_from_python = { + ibis_types.dtype(t) for t in bigframes.dtypes.SUPPORTED_IO_PYTHON_TYPES + } ibis_types_from_bigquery = { - bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS + bq_types.BigQueryType.to_ibis(tk) + for tk in bigframes.dtypes.SUPPORTED_IO_BIGQUERY_TYPEKINDS } assert ibis_types_from_python == ibis_types_from_bigquery diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 10cdbf8f7c..93fba9f3aa 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4714,7 +4714,7 @@ def index(self): [3 rows x 3 columns] >>> df.index # doctest: +ELLIPSIS - + Index([10, 20, 30], dtype='Int64') >>> df.index.values array([10, 20, 30], dtype=object) @@ -4731,7 +4731,10 @@ def index(self): [3 rows x 1 columns] >>> df1.index # doctest: +ELLIPSIS - + MultiIndex([( 'Alice', 'Seattle'), + ( 'Bob', 'New York'), + ('Aritra', 'Kona')], + name='Name') >>> df1.index.values array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], dtype=object) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 2ca51f6493..b55c7e23d8 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -50,10 +50,10 @@ def size(self) -> int: def __iter__(self) -> Iterator: """ - Iterate over info axis. + Iterate over column axis for DataFrame, or values for Series. - Returns - iterator: Info axis as iterator. + Returns: + iterator **Examples:** >>> import bigframes.pandas as bpd @@ -71,9 +71,9 @@ def __iter__(self) -> Iterator: >>> series = bpd.Series(["a", "b", "c"], index=[10, 20, 30]) >>> for x in series: ... print(x) - 10 - 20 - 30 + a + b + c """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index e8737341a3..3ad8729271 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -58,6 +58,23 @@ def T(self) -> Index: """Return the transpose, which is by definition self.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def copy( + self, + name=None, + ) -> Index: + """ + Make a copy of this object. + + Name is set on the new object. + + Args: + name (Label, optional): + Set name for new object. + Returns: + Index: Index refer to new object which is a copy of this object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def transpose(self) -> Index: """ Return the transpose, which is by definition self. @@ -81,6 +98,40 @@ def astype(self, dtype): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def get_level_values(self, level) -> Index: + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatibility. + + Args: + level (int or str): + It is either the integer position or the name of the level. + + Returns: + Index: Calling object, as there is only one level in the Index. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_series(self): + """ + Create a Series with both index and values equal to the index keys. + + Useful with map for returning an indexer based on an index. + + Args: + index (Index, optional): + Index of resulting Series. If None, defaults to original index. + name (str, optional): + Name of resulting Series. If None, defaults to name of original + index. + + Returns: + Series: The dtype will be based on the type of the Index values. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def isin(self, values): """ Return a boolean array where the index values are in `values`. diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 55975c3fc1..fbd1d2d052 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -38,6 +38,15 @@ def cut( Cut with an integer (equal-width bins): + >>> bpd.cut(s, bins=4) + 0 {'left_exclusive': -0.01, 'right_inclusive': 2.5} + 1 {'left_exclusive': -0.01, 'right_inclusive': 2.5} + 2 {'left_exclusive': 2.5, 'right_inclusive': 5.0} + 3 {'left_exclusive': 7.5, 'right_inclusive': 10.0} + dtype: struct[pyarrow] + + Cut with an integer (equal-width bins) and labels=False: + >>> bpd.cut(s, bins=4, labels=False) 0 0 1 0 @@ -50,7 +59,7 @@ def cut( >>> import pandas as pd >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) - >>> bpd.cut(s, bins=interval_index, labels=False) + >>> bpd.cut(s, bins=interval_index) 0 1 {'left_exclusive': 0, 'right_inclusive': 1} 2 {'left_exclusive': 1, 'right_inclusive': 5} @@ -60,7 +69,7 @@ def cut( Cut with an iterable of tuples: >>> bins_tuples = [(0, 1), (1, 4), (5, 20)] - >>> bpd.cut(s, bins=bins_tuples, labels=False) + >>> bpd.cut(s, bins=bins_tuples) 0 1 {'left_exclusive': 0, 'right_inclusive': 1} 2 @@ -82,9 +91,7 @@ def cut( labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the - bins. This affects the type of the output container (see below). - If True, raises an error. When `ordered=False`, labels must be - provided. + bins. This affects the type of the output container. Returns: Series: A Series representing the respective bin for each value diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1aa4ffffbb..9e376c713e 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -69,7 +69,7 @@ def index(self): 30 35 Name: Age, dtype: Int64 >>> s.index # doctest: +ELLIPSIS - + Index([10, 20, 30], dtype='Int64') >>> s.index.values array([10, 20, 30], dtype=object) @@ -84,7 +84,10 @@ def index(self): Aritra Kona 35 Name: Age, dtype: Int64 >>> s1.index # doctest: +ELLIPSIS - + MultiIndex([( 'Alice', 'Seattle'), + ( 'Bob', 'New York'), + ('Aritra', 'Kona')], + name='Name') >>> s1.index.values array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], dtype=object) @@ -1774,6 +1777,42 @@ def between( corresponding Series element is between the boundary values `left` and `right`. NA values are treated as `False`. 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+        Boundary values are included by default:
+
+            >>> s = bpd.Series([2, 0, 4, 8, np.nan])
+            >>> s.between(1, 4)
+            0     True
+            1    False
+            2     True
+            3    False
+            4     <NA>
+            dtype: boolean
+
+        With `inclusive` set to "neither", boundary values are excluded:
+
+            >>> s.between(1, 4, inclusive="neither")
+            0     True
+            1    False
+            2    False
+            3    False
+            4     <NA>
+            dtype: boolean
+
+        `left` and `right` can be any scalar value:
+
+            >>> s = bpd.Series(['Alice', 'Bob', 'Carol', 'Eve'])
+            >>> s.between('Anna', 'Daniel')
+            0    False
+            1     True
+            2     True
+            3    False
+            dtype: boolean
+
         Args:
             left (scalar or list-like):
                 Left boundary.
@@ -1796,6 +1835,30 @@ def cumprod(self):
         Returns a DataFrame or Series of the same size containing the cumulative product.
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> s = bpd.Series([2, np.nan, 5, -1, 0])
+            >>> s
+            0     2.0
+            1    <NA>
+            2     5.0
+            3    -1.0
+            4     0.0
+            dtype: Float64
+
+        By default, NA values are ignored.
+
+            >>> s.cumprod()
+            0      2.0
+            1     <NA>
+            2     10.0
+            3    -10.0
+            4      0.0
+            dtype: Float64
+
         Returns:
             bigframes.series.Series: Return cumulative product of scalar or Series.
         """
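# --- Illustrative usage sketch (not part of the patch above) ---
# A minimal, hedged example of the user-facing behavior exercised by the new
# tests and docstrings in this diff: DataFrame.peek() for cheap result
# previews, plus the newly documented Series.between() and Series.cumprod().
# The table id "my-project.my_dataset.my_table" and the column name "int_col"
# are placeholders, not names from the repository.
import bigframes.pandas as bpd

df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table id

# peek() previews a few rows without computing a full, ordered result.
preview = df.peek(n=3)

# Expressions built from window operations (e.g. cumsum) are not "peekable";
# as the new tests show, peek(force=True) falls back to running the full query.
forced_preview = df[["int_col"]].cumsum().peek(n=3, force=True)

# Series.between and Series.cumprod, as shown in the new code samples.
s = bpd.Series([2, 0, 4, 8])
mask = s.between(1, 4)   # boundary values are included by default
prod = s.cumprod()       # cumulative product; NA values are ignored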