From 027d406c166b3cb95ca6e9b57278f754a061eb19 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 20 Mar 2024 16:32:20 +0000 Subject: [PATCH 1/4] feat: add `GroupBy.size()` to get number of rows in each group --- bigframes/core/blocks.py | 29 +++++ bigframes/core/compile/aggregate_compiler.py | 19 +++- bigframes/core/expression.py | 5 + bigframes/core/groupby/__init__.py | 21 ++++ bigframes/operations/aggregations.py | 20 ++++ tests/system/small/test_groupby.py | 103 ++++++++++++++---- .../ibis/expr/operations/analytic.py | 8 ++ 7 files changed, 181 insertions(+), 24 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 0ebbe48cc4..b560eb811b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -933,6 +933,35 @@ def aggregate_all_and_stack( index_labels=self.index.names, ) + def aggregate_size( + self, + by_column_ids: typing.Sequence[str] = (), + *, + dropna: bool = True, + ): + """Returns a block object to compute the size(s) of groups.""" + agg_specs = [ + (ex.NullaryAggregation(agg_ops.SizeOp()), guid.generate_guid()), + ] + output_col_ids = [agg_spec[1] for agg_spec in agg_specs] + result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna) + aggregate_labels = self._get_labels_for_columns(["size"]) + names: typing.List[Label] = [] + for by_col_id in by_column_ids: + if by_col_id in self.value_columns: + names.append(self.col_id_to_label[by_col_id]) + else: + names.append(self.col_id_to_index_name[by_col_id]) + return ( + Block( + result_expr, + index_columns=by_column_ids, + column_labels=aggregate_labels, + index_labels=names, + ), + output_col_ids, + ) + def select_column(self, id: str) -> Block: return self.select_columns([id]) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 9c1db0f162..682ab93850 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -35,6 +35,8 @@ def 
compile_aggregate( aggregate: ex.Aggregation, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: + if isinstance(aggregate, ex.NullaryAggregation): + return compile_nullary_agg(aggregate.op) if isinstance(aggregate, ex.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) return compile_unary_agg( @@ -54,7 +56,9 @@ def compile_analytic( window: window_spec.WindowSpec, bindings: typing.Dict[str, ibis_types.Value], ) -> ibis_types.Value: - if isinstance(aggregate, ex.UnaryAggregation): + if isinstance(aggregate, ex.NullaryAggregation): + return compile_nullary_agg(aggregate.op, window) + elif isinstance(aggregate, ex.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) return compile_unary_agg(aggregate.op, input, window) elif isinstance(aggregate, ex.BinaryAggregation): @@ -81,6 +85,14 @@ def compile_unary_agg( raise ValueError(f"Can't compile unrecognized operation: {op}") +@functools.singledispatch +def compile_nullary_agg( + op: agg_ops.WindowOp, + window: Optional[window_spec.WindowSpec] = None, +) -> ibis_types.Value: + raise ValueError(f"Can't compile unrecognized operation: {op}") + + def numeric_op(operation): @functools.wraps(operation) def constrained_op(op, column: ibis_types.Column, window=None): @@ -101,6 +113,11 @@ def constrained_op(op, column: ibis_types.Column, window=None): ### Specific Op implementations Below +@compile_nullary_agg.register +def _(op: agg_ops.SizeOp, window=None) -> ibis_types.NumericValue: + return _apply_window_if_present(vendored_ibis_ops.count(1), window) + + @compile_unary_agg.register @numeric_op def _( diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 4c2ae461fd..b5842977ef 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -40,6 +40,11 @@ class Aggregation(abc.ABC): op: agg_ops.WindowOp = dataclasses.field() +@dataclasses.dataclass(frozen=True) +class 
NullaryAggregation(Aggregation): + op: agg_ops.NullaryWindowOp = dataclasses.field() + + @dataclasses.dataclass(frozen=True) class UnaryAggregation(Aggregation): op: agg_ops.UnaryWindowOp = dataclasses.field() diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 2b447a0190..6f7b4a0e6c 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -102,6 +102,20 @@ def __getitem__( dropna=self._dropna, ) + def size(self) -> typing.Union[df.DataFrame, series.Series]: + agg_block, _ = self._block.aggregate_size( + by_column_ids=self._by_col_ids, + dropna=self._dropna, + ) + agg_block = agg_block.with_column_labels(pd.Index(["size"])) + dataframe = df.DataFrame(agg_block) + + if self._as_index: + series = dataframe["size"] + return series.rename(None) + else: + return self._convert_index(dataframe) + def sum(self, numeric_only: bool = False, *args) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("sum") @@ -475,6 +489,13 @@ def std(self, *args, **kwargs) -> series.Series: def var(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.var_op) + def size(self) -> series.Series: + agg_block, _ = self._block.aggregate_size( + by_column_ids=self._by_col_ids, + dropna=self._dropna, + ) + return series.Series(agg_block, name=self._value_name) + def skew(self, *args, **kwargs) -> series.Series: block = block_ops.skew(self._block, [self._value_column], self._by_col_ids) return series.Series(block) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 6301ece865..efbbeae310 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -33,6 +33,13 @@ def handles_ties(self): return False +@dataclasses.dataclass(frozen=True) +class NullaryWindowOp(WindowOp): + @property + def arguments(self) -> int: + return 0 + + @dataclasses.dataclass(frozen=True) class UnaryWindowOp(WindowOp): @property @@ -55,6 +62,13 @@ 
def arguments(self) -> int: ... +@dataclasses.dataclass(frozen=True) +class NullaryAggregateOp(AggregateOp, NullaryWindowOp): + @property + def arguments(self) -> int: + return 0 + + @dataclasses.dataclass(frozen=True) class UnaryAggregateOp(AggregateOp, UnaryWindowOp): @property @@ -69,6 +83,11 @@ def arguments(self) -> int: return 2 +@dataclasses.dataclass(frozen=True) +class SizeOp(NullaryAggregateOp): + name: ClassVar[str] = "size" + + @dataclasses.dataclass(frozen=True) class SumOp(UnaryAggregateOp): name: ClassVar[str] = "sum" @@ -270,6 +289,7 @@ class CovOp(BinaryAggregateOp): name: ClassVar[str] = "cov" +size_op = SizeOp() sum_op = SumOp() mean_op = MeanOp() median_op = MedianOp() diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index b38dcaf5d1..f70c96af8e 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -19,6 +19,10 @@ from tests.system.utils import assert_pandas_df_equal +# ================= +# DataFrame.groupby +# ================= + @pytest.mark.parametrize( ("operator"), [ @@ -250,21 +254,26 @@ def test_dataframe_groupby_analytic( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) -def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas() - pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew() +def test_dataframe_groupby_size_as_index_false( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.groupby("string_col", as_index=False).size() + bf_result_computed = bf_result.to_pandas() + pd_result = scalars_pandas_df_index.groupby("string_col", as_index=False).size() - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + pd.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False, check_index_type=False + ) -def test_series_groupby_kurt(scalars_df_index, 
scalars_pandas_df_index): - bf_result = scalars_df_index.groupby("bool_col")["int64_too"].kurt().to_pandas() - # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 - pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply( - pd.Series.kurt - ) +def test_dataframe_groupby_size_as_index_true( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.groupby("string_col", as_index=True).size() + pd_result = scalars_pandas_df_index.groupby("string_col", as_index=True).size() + bf_result_computed = bf_result.to_pandas() - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index): @@ -337,6 +346,26 @@ def test_dataframe_groupby_getitem_list( pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +def test_dataframe_groupby_nonnumeric_with_mean(): + df = pd.DataFrame( + { + "key1": ["a", "a", "a", "b"], + "key2": ["a", "a", "c", "c"], + "key3": [1, 2, 3, 4], + "key4": [1.6, 2, 3, 4], + } + ) + pd_result = df.groupby(["key1", "key2"]).mean() + bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() + + pd.testing.assert_frame_equal( + pd_result, bf_result, check_index_type=False, check_dtype=False + ) + +# ============== +# Series.groupby +# ============== + def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index): bf_result = ( scalars_df_index["int64_col"] @@ -373,18 +402,46 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): ) -def test_dataframe_groupby_nonnumeric_with_mean(): - df = pd.DataFrame( - { - "key1": ["a", "a", "a", "b"], - "key2": ["a", "a", "c", "c"], - "key3": [1, 2, 3, 4], - "key4": [1.6, 2, 3, 4], - } +def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index): + bf_result = ( +
scalars_df_index["int64_too"] + .groupby(scalars_df_index["bool_col"]) + .kurt() + .to_pandas() + ) + # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139 + pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply( + pd.Series.kurt ) - pd_result = df.groupby(["key1", "key2"]).mean() - bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() - pd.testing.assert_frame_equal( - pd_result, bf_result, check_index_type=False, check_dtype=False + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + +def test_series_groupby_size(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_too"].groupby(scalars_df_index["bool_col"]).size() ) + pd_result = ( + scalars_pandas_df_index["int64_too"] + .groupby(scalars_pandas_df_index["bool_col"]) + .size() + ) + bf_result_computed = bf_result.to_pandas() + + pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False) + + +def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index["int64_too"] + .groupby(scalars_df_index["bool_col"]) + .skew() + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index["int64_too"] + .groupby(scalars_pandas_df_index["bool_col"]) + .skew() + ) + + pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 3d6a3b37b1..6373c0b952 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,10 +2,17 @@ from __future__ import annotations +import ibis import ibis.expr.operations as ops import ibis.expr.rules as rlz +@ibis.udf.agg.builtin +def count(value: int) -> int: + """Count of a scalar.""" + return 0 # pragma: NO COVER + + class
FirstNonNullValue(ops.Analytic): """Retrieve the first element.""" @@ -21,6 +28,7 @@ class LastNonNullValue(ops.Analytic): __all__ = [ + "count", "FirstNonNullValue", "LastNonNullValue", ] From 713c5316b2ddc1fb79e2800853afc5be8d68071c Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 20 Mar 2024 19:02:21 +0000 Subject: [PATCH 2/4] add TODO --- third_party/bigframes_vendored/ibis/expr/operations/analytic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 6373c0b952..2e1373533e 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -7,6 +7,8 @@ import ibis.expr.rules as rlz +# TODO(swast): We can remove this if ibis adds aggregates over scalar values. +# See: https://github.com/ibis-project/ibis/issues/8698 @ibis.udf.agg.builtin def count(value: int) -> int: """Count of a scalar.""" From 62be2165e263f0c6b1a560fa06398f37f5bbe5d5 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 21 Mar 2024 22:07:18 +0000 Subject: [PATCH 3/4] format --- bigframes/core/expression.py | 5 +++++ bigframes/operations/aggregations.py | 3 +++ tests/system/small/test_groupby.py | 4 +++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 26c58aea1e..9648664b71 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -50,6 +50,11 @@ def output_type( class NullaryAggregation(Aggregation): op: agg_ops.NullaryWindowOp = dataclasses.field() + def output_type( + self, input_types: dict[str, bigframes.dtypes.Dtype] + ) -> dtypes.ExpressionType: + return self.op.output_type() + @dataclasses.dataclass(frozen=True) class UnaryAggregation(Aggregation): diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index
d29b6318b2..216af796bd 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -99,6 +99,9 @@ def arguments(self) -> int: class SizeOp(NullaryAggregateOp): name: ClassVar[str] = "size" + def output_type(self, *input_types: dtypes.ExpressionType): + return dtypes.INT_DTYPE + @dataclasses.dataclass(frozen=True) class SumOp(UnaryAggregateOp): diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index f70c96af8e..7a7320be59 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -18,11 +18,11 @@ import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal - # ================= # DataFrame.groupby # ================= + @pytest.mark.parametrize( ("operator"), [ @@ -362,10 +362,12 @@ def test_dataframe_groupby_nonnumeric_with_mean(): pd_result, bf_result, check_index_type=False, check_dtype=False ) + # ============== # Series.groupby # ============== + def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index): bf_result = ( scalars_df_index["int64_col"] From e9d1a63d08fcf7541f2df3842cf1e14c29744782 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 22 Mar 2024 15:36:13 +0000 Subject: [PATCH 4/4] fix bad merge --- bigframes/core/blocks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1e0306e558..dd79fd7995 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -952,7 +952,6 @@ def aggregate_size( ] output_col_ids = [agg_spec[1] for agg_spec in agg_specs] result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna) - aggregate_labels = self._get_labels_for_columns(["size"]) names: typing.List[Label] = [] for by_col_id in by_column_ids: if by_col_id in self.value_columns: @@ -963,7 +962,7 @@ def aggregate_size( Block( result_expr, index_columns=by_column_ids, - column_labels=aggregate_labels, + 
column_labels=["size"], index_labels=names, ), output_col_ids,