diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f189bb704e..010eb96f75 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1014,6 +1014,34 @@ def aggregate_all_and_stack( index_labels=self.index.names, ) + def aggregate_size( + self, + by_column_ids: typing.Sequence[str] = (), + *, + dropna: bool = True, + ): + """Returns a block object to compute the size(s) of groups.""" + agg_specs = [ + (ex.NullaryAggregation(agg_ops.SizeOp()), guid.generate_guid()), + ] + output_col_ids = [agg_spec[1] for agg_spec in agg_specs] + result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna) + names: typing.List[Label] = [] + for by_col_id in by_column_ids: + if by_col_id in self.value_columns: + names.append(self.col_id_to_label[by_col_id]) + else: + names.append(self.col_id_to_index_name[by_col_id]) + return ( + Block( + result_expr, + index_columns=by_column_ids, + column_labels=["size"], + index_labels=names, + ), + output_col_ids, + ) + def select_column(self, id: str) -> Block: return self.select_columns([id]) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index c0b0562a54..fada4ebbd8 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -36,6 +36,8 @@ def compile_aggregate( bindings: typing.Dict[str, ibis_types.Value], order_by: typing.Sequence[ibis_types.Value] = [], ) -> ibis_types.Value: + if isinstance(aggregate, ex.NullaryAggregation): + return compile_nullary_agg(aggregate.op) if isinstance(aggregate, ex.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) if aggregate.op.can_order_by: @@ -55,7 +57,9 @@ def compile_analytic( window: window_spec.WindowSpec, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if isinstance(aggregate, ex.UnaryAggregation): + if isinstance(aggregate, ex.NullaryAggregation): + return compile_nullary_agg(aggregate.op, window) + elif isinstance(aggregate, ex.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) return compile_unary_agg(aggregate.op, input, window) elif isinstance(aggregate, ex.BinaryAggregation): @@ -93,6 +97,14 @@ def compile_ordered_unary_agg( raise ValueError(f"Can't compile unrecognized operation: {op}") +@functools.singledispatch +def compile_nullary_agg( + op: agg_ops.WindowOp, + window: Optional[window_spec.WindowSpec] = None, +) -> ibis_types.Value: + raise ValueError(f"Can't compile unrecognized operation: {op}") + + def numeric_op(operation): @functools.wraps(operation) def constrained_op( @@ -118,6 +130,11 @@ def constrained_op( ### Specific Op implementations Below +@compile_nullary_agg.register +def _(op: agg_ops.SizeOp, window=None) -> ibis_types.NumericValue: + return _apply_window_if_present(vendored_ibis_ops.count(1), window) + + @compile_unary_agg.register @numeric_op def _( diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 70eb519a1b..c216c29717 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -46,6 +46,16 @@ def output_type( ... +@dataclasses.dataclass(frozen=True) +class NullaryAggregation(Aggregation): + op: agg_ops.NullaryWindowOp = dataclasses.field() + + def output_type( + self, input_types: dict[str, bigframes.dtypes.Dtype] + ) -> dtypes.ExpressionType: + return self.op.output_type() + + @dataclasses.dataclass(frozen=True) class UnaryAggregation(Aggregation): op: agg_ops.UnaryWindowOp = dataclasses.field() diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 41d0750030..91c5e54d89 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -104,6 +104,20 @@ def __getitem__( dropna=self._dropna, ) + def size(self) -> typing.Union[df.DataFrame, series.Series]: + agg_block, _ = self._block.aggregate_size( + by_column_ids=self._by_col_ids, + dropna=self._dropna, + ) + agg_block = agg_block.with_column_labels(pd.Index(["size"])) + dataframe = df.DataFrame(agg_block) + + if self._as_index: + series = dataframe["size"] + return series.rename(None) + else: + return self._convert_index(dataframe) + def sum(self, numeric_only: bool = False, *args) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("sum") @@ -520,6 +534,13 @@ def std(self, *args, **kwargs) -> series.Series: def var(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.var_op) + def size(self) -> series.Series: + agg_block, _ = self._block.aggregate_size( + by_column_ids=self._by_col_ids, + dropna=self._dropna, + ) + return series.Series(agg_block, name=self._value_name) + def skew(self, *args, **kwargs) -> series.Series: block = block_ops.skew(self._block, [self._value_column], self._by_col_ids) return series.Series(block) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 3b5310554b..783abfd788 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -47,6 +47,13 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ... +@dataclasses.dataclass(frozen=True) +class NullaryWindowOp(WindowOp): + @property + def arguments(self) -> int: + return 0 + + @dataclasses.dataclass(frozen=True) class UnaryWindowOp(WindowOp): @property @@ -72,6 +79,13 @@ def arguments(self) -> int: ... +@dataclasses.dataclass(frozen=True) +class NullaryAggregateOp(AggregateOp, NullaryWindowOp): + @property + def arguments(self) -> int: + return 0 + + @dataclasses.dataclass(frozen=True) class UnaryAggregateOp(AggregateOp, UnaryWindowOp): @property @@ -86,6 +100,14 @@ def arguments(self) -> int: return 2 +@dataclasses.dataclass(frozen=True) +class SizeOp(NullaryAggregateOp): + name: ClassVar[str] = "size" + + def output_type(self, *input_types: dtypes.ExpressionType): + return dtypes.INT_DTYPE + + @dataclasses.dataclass(frozen=True) class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" @@ -446,6 +468,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT ) +size_op = SizeOp() sum_op = SumOp() mean_op = MeanOp() median_op = MedianOp() diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 7b36a06f49..02d9bf9725 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -18,6 +18,10 @@ import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal +# ================= +# DataFrame.groupby +# ================= + @pytest.mark.parametrize( ("operator"), @@ -269,21 +273,26 @@ def test_dataframe_groupby_analytic( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) -def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas() - pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew() +def test_dataframe_groupby_size_as_index_false( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.groupby("string_col", as_index=False).size() + bf_result_computed = bf_result.to_pandas() + pd_result = scalars_pandas_df_index.groupby("string_col", as_index=False).size() - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + pd.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False, check_index_type=False + ) -def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.groupby("bool_col")["int64_too"].kurt().to_pandas() - # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 - pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply( - pd.Series.kurt - ) +def test_dataframe_groupby_size_as_index_true( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.groupby("string_col", as_index=True).size() + pd_result = scalars_pandas_df_index.groupby("string_col", as_index=True).size() + bf_result_computed = bf_result.to_pandas() - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index): @@ -356,6 +365,30 @@ def test_dataframe_groupby_getitem_list( pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +def test_dataframe_groupby_nonnumeric_with_mean(): + df = pd.DataFrame( + { + "key1": ["a", "a", "a", "b"], + "key2": ["a", "a", "c", "c"], + "key3": [1, 2, 3, 4], + "key4": [1.6, 2, 3, 4], + } + ) + pd_result = df.groupby(["key1", "key2"]).mean() + + with bpd.option_context("bigquery.location", "US"): + bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() + + pd.testing.assert_frame_equal( + pd_result, bf_result, check_index_type=False, check_dtype=False + ) + + +# ============== +# Series.groupby +# ============== + + def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index): bf_result = ( scalars_df_index["int64_col"] @@ -392,21 +425,49 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): ) -def test_dataframe_groupby_nonnumeric_with_mean(): - df = pd.DataFrame( - { - "key1": ["a", "a", "a", "b"], - "key2": ["a", "a", "c", "c"], - "key3": [1, 2, 3, 4], - "key4": [1.6, 2, 3, 4], - } +def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_too"] + .groupby(scalars_df_index["bool_col"]) + .kurt() + .to_pandas() + ) + # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 + pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply( + pd.Series.kurt ) - pd_result = df.groupby(["key1", "key2"]).mean() - bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() - pd.testing.assert_frame_equal( - pd_result, bf_result, check_index_type=False, check_dtype=False + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + +def test_series_groupby_size(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_too"].groupby(scalars_df_index["bool_col"]).size() ) + pd_result = ( + scalars_pandas_df_index["int64_too"] + .groupby(scalars_pandas_df_index["bool_col"]) + .size() + ) + bf_result_computed = bf_result.to_pandas() + + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) + + +def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_too"] + .groupby(scalars_df_index["bool_col"]) + .skew() + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index["int64_too"] + .groupby(scalars_pandas_df_index["bool_col"]) + .skew() + ) + + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) @pytest.mark.parametrize( diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 3d6a3b37b1..2e1373533e 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,10 +2,19 @@ from __future__ import annotations +import ibis import ibis.expr.operations as ops import ibis.expr.rules as rlz +# TODO(swast): We can remove this if ibis adds aggregates over scalar values. +# See: https://github.com/ibis-project/ibis/issues/8698 +@ibis.udf.agg.builtin +def count(value: int) -> int: + """Count of a scalar.""" + return 0 # pragma: NO COVER + + class FirstNonNullValue(ops.Analytic): """Retrieve the first element.""" @@ -21,6 +30,7 @@ class LastNonNullValue(ops.Analytic): __all__ = [ + "count", "FirstNonNullValue", "LastNonNullValue", ]