From 027d406c166b3cb95ca6e9b57278f754a061eb19 Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Wed, 20 Mar 2024 16:32:20 +0000
Subject: [PATCH 1/4] feat: add `GroupBy.size()` to get number of rows in each
 group

---
 bigframes/core/blocks.py                      |  29 +++++
 bigframes/core/compile/aggregate_compiler.py  |  19 +++-
 bigframes/core/expression.py                  |   5 +
 bigframes/core/groupby/__init__.py            |  21 ++++
 bigframes/operations/aggregations.py          |  20 ++++
 tests/system/small/test_groupby.py            | 103 ++++++++++++++----
 .../ibis/expr/operations/analytic.py          |   8 ++
 7 files changed, 181 insertions(+), 24 deletions(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 0ebbe48cc4..b560eb811b 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -933,6 +933,35 @@ def aggregate_all_and_stack(
                 index_labels=self.index.names,
             )
 
+    def aggregate_size(
+        self,
+        by_column_ids: typing.Sequence[str] = (),
+        *,
+        dropna: bool = True,
+    ):
+        """Returns a block object to compute the size(s) of groups."""
+        agg_specs = [
+            (ex.NullaryAggregation(agg_ops.SizeOp()), guid.generate_guid()),
+        ]
+        output_col_ids = [agg_spec[1] for agg_spec in agg_specs]
+        result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna)
+        aggregate_labels = self._get_labels_for_columns(["size"])
+        names: typing.List[Label] = []
+        for by_col_id in by_column_ids:
+            if by_col_id in self.value_columns:
+                names.append(self.col_id_to_label[by_col_id])
+            else:
+                names.append(self.col_id_to_index_name[by_col_id])
+        return (
+            Block(
+                result_expr,
+                index_columns=by_column_ids,
+                column_labels=aggregate_labels,
+                index_labels=names,
+            ),
+            output_col_ids,
+        )
+
     def select_column(self, id: str) -> Block:
         return self.select_columns([id])
 
diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py
index 9c1db0f162..682ab93850 100644
--- a/bigframes/core/compile/aggregate_compiler.py
+++ b/bigframes/core/compile/aggregate_compiler.py
@@ -35,6 +35,8 @@ def compile_aggregate(
     aggregate: ex.Aggregation,
     bindings: typing.Dict[str, ibis_types.Value],
 ) -> ibis_types.Value:
+    if isinstance(aggregate, ex.NullaryAggregation):
+        return compile_nullary_agg(aggregate.op)
     if isinstance(aggregate, ex.UnaryAggregation):
         input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings)
         return compile_unary_agg(
@@ -54,7 +56,9 @@ def compile_analytic(
     window: window_spec.WindowSpec,
     bindings: typing.Dict[str, ibis_types.Value],
 ) -> ibis_types.Value:
-    if isinstance(aggregate, ex.UnaryAggregation):
+    if isinstance(aggregate, ex.NullaryAggregation):
+        return compile_nullary_agg(aggregate.op, window)
+    elif isinstance(aggregate, ex.UnaryAggregation):
         input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings)
         return compile_unary_agg(aggregate.op, input, window)
     elif isinstance(aggregate, ex.BinaryAggregation):
@@ -81,6 +85,14 @@ def compile_unary_agg(
     raise ValueError(f"Can't compile unrecognized operation: {op}")
 
 
+@functools.singledispatch
+def compile_nullary_agg(
+    op: agg_ops.WindowOp,
+    window: Optional[window_spec.WindowSpec] = None,
+) -> ibis_types.Value:
+    raise ValueError(f"Can't compile unrecognized operation: {op}")
+
+
 def numeric_op(operation):
     @functools.wraps(operation)
     def constrained_op(op, column: ibis_types.Column, window=None):
@@ -101,6 +113,11 @@ def constrained_op(op, column: ibis_types.Column, window=None):
 ### Specific Op implementations Below
 
 
+@compile_nullary_agg.register
+def _(op: agg_ops.SizeOp, window=None) -> ibis_types.NumericValue:
+    return _apply_window_if_present(vendored_ibis_ops.count(1), window)
+
+
 @compile_unary_agg.register
 @numeric_op
 def _(
diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py
index 4c2ae461fd..b5842977ef 100644
--- a/bigframes/core/expression.py
+++ b/bigframes/core/expression.py
@@ -40,6 +40,11 @@ class Aggregation(abc.ABC):
     op: agg_ops.WindowOp = dataclasses.field()
 
 
+@dataclasses.dataclass(frozen=True)
+class NullaryAggregation(Aggregation):
+    op: agg_ops.NullaryWindowOp = dataclasses.field()
+
+
 @dataclasses.dataclass(frozen=True)
 class UnaryAggregation(Aggregation):
     op: agg_ops.UnaryWindowOp = dataclasses.field()
diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py
index 2b447a0190..6f7b4a0e6c 100644
--- a/bigframes/core/groupby/__init__.py
+++ b/bigframes/core/groupby/__init__.py
@@ -102,6 +102,20 @@ def __getitem__(
                 dropna=self._dropna,
             )
 
+    def size(self) -> typing.Union[df.DataFrame, series.Series]:
+        agg_block, _ = self._block.aggregate_size(
+            by_column_ids=self._by_col_ids,
+            dropna=self._dropna,
+        )
+        agg_block = agg_block.with_column_labels(pd.Index(["size"]))
+        dataframe = df.DataFrame(agg_block)
+
+        if self._as_index:
+            series = dataframe["size"]
+            return series.rename(None)
+        else:
+            return self._convert_index(dataframe)
+
     def sum(self, numeric_only: bool = False, *args) -> df.DataFrame:
         if not numeric_only:
             self._raise_on_non_numeric("sum")
@@ -475,6 +489,13 @@ def std(self, *args, **kwargs) -> series.Series:
     def var(self, *args, **kwargs) -> series.Series:
         return self._aggregate(agg_ops.var_op)
 
+    def size(self) -> series.Series:
+        agg_block, _ = self._block.aggregate_size(
+            by_column_ids=self._by_col_ids,
+            dropna=self._dropna,
+        )
+        return series.Series(agg_block, name=self._value_name)
+
     def skew(self, *args, **kwargs) -> series.Series:
         block = block_ops.skew(self._block, [self._value_column], self._by_col_ids)
         return series.Series(block)
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
index 6301ece865..efbbeae310 100644
--- a/bigframes/operations/aggregations.py
+++ b/bigframes/operations/aggregations.py
@@ -33,6 +33,13 @@ def handles_ties(self):
         return False
 
 
+@dataclasses.dataclass(frozen=True)
+class NullaryWindowOp(WindowOp):
+    @property
+    def arguments(self) -> int:
+        return 0
+
+
 @dataclasses.dataclass(frozen=True)
 class UnaryWindowOp(WindowOp):
     @property
@@ -55,6 +62,13 @@ def arguments(self) -> int:
         ...
 
 
+@dataclasses.dataclass(frozen=True)
+class NullaryAggregateOp(AggregateOp, NullaryWindowOp):
+    @property
+    def arguments(self) -> int:
+        return 0
+
+
 @dataclasses.dataclass(frozen=True)
 class UnaryAggregateOp(AggregateOp, UnaryWindowOp):
     @property
@@ -69,6 +83,11 @@ def arguments(self) -> int:
         return 2
 
 
+@dataclasses.dataclass(frozen=True)
+class SizeOp(NullaryAggregateOp):
+    name: ClassVar[str] = "size"
+
+
 @dataclasses.dataclass(frozen=True)
 class SumOp(UnaryAggregateOp):
     name: ClassVar[str] = "sum"
@@ -270,6 +289,7 @@ class CovOp(BinaryAggregateOp):
     name: ClassVar[str] = "cov"
 
 
+size_op = SizeOp()
 sum_op = SumOp()
 mean_op = MeanOp()
 median_op = MedianOp()
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
index b38dcaf5d1..f70c96af8e 100644
--- a/tests/system/small/test_groupby.py
+++ b/tests/system/small/test_groupby.py
@@ -19,6 +19,10 @@
 from tests.system.utils import assert_pandas_df_equal
 
 
+# =================
+# DataFrame.groupby
+# =================
+
 @pytest.mark.parametrize(
     ("operator"),
     [
@@ -250,21 +254,26 @@ def test_dataframe_groupby_analytic(
     pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
 
 
-def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index):
-    bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas()
-    pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew()
+def test_dataframe_groupby_size_as_index_false(
+    scalars_df_index, scalars_pandas_df_index
+):
+    bf_result = scalars_df_index.groupby("string_col", as_index=False).size()
+    bf_result_computed = bf_result.to_pandas()
+    pd_result = scalars_pandas_df_index.groupby("string_col", as_index=False).size()
 
-    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+    pd.testing.assert_frame_equal(
+        pd_result, bf_result_computed, check_dtype=False, check_index_type=False
+    )
 
 
-def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
-    bf_result = scalars_df_index.groupby("bool_col")["int64_too"].kurt().to_pandas()
-    # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139
-    pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply(
-        pd.Series.kurt
-    )
+def test_dataframe_groupby_size_as_index_true(
+    scalars_df_index, scalars_pandas_df_index
+):
+    bf_result = scalars_df_index.groupby("string_col", as_index=True).size()
+    pd_result = scalars_pandas_df_index.groupby("string_col", as_index=True).size()
+    bf_result_computed = bf_result.to_pandas()
 
-    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+    pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False)
 
 
 def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index):
@@ -337,6 +346,26 @@ def test_dataframe_groupby_getitem_list(
     pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
 
 
+def test_dataframe_groupby_nonnumeric_with_mean():
+    df = pd.DataFrame(
+        {
+            "key1": ["a", "a", "a", "b"],
+            "key2": ["a", "a", "c", "c"],
+            "key3": [1, 2, 3, 4],
+            "key4": [1.6, 2, 3, 4],
+        }
+    )
+    pd_result = df.groupby(["key1", "key2"]).mean()
+    bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas()
+
+    pd.testing.assert_frame_equal(
+        pd_result, bf_result, check_index_type=False, check_dtype=False
+    )
+
+# ==============
+# Series.groupby
+# ==============
+
 def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index):
     bf_result = (
         scalars_df_index["int64_col"]
@@ -373,18 +402,46 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
     )
 
 
-def test_dataframe_groupby_nonnumeric_with_mean():
-    df = pd.DataFrame(
-        {
-            "key1": ["a", "a", "a", "b"],
-            "key2": ["a", "a", "c", "c"],
-            "key3": [1, 2, 3, 4],
-            "key4": [1.6, 2, 3, 4],
-        }
+def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
+    bf_result = (
+        scalars_df_index["int64_too"]
+        .groupby(scalars_df_index["bool_col"])
+        .kurt()
+        .to_pandas()
+    )
+    # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139
+    pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply(
+        pd.Series.kurt
     )
-    pd_result = df.groupby(["key1", "key2"]).mean()
-    bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas()
 
-    pd.testing.assert_frame_equal(
-        pd_result, bf_result, check_index_type=False, check_dtype=False
+    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+
+
+def test_series_groupby_size(scalars_df_index, scalars_pandas_df_index):
+    bf_result = (
+        scalars_df_index["int64_too"].groupby(scalars_df_index["bool_col"]).size()
     )
+    pd_result = (
+        scalars_pandas_df_index["int64_too"]
+        .groupby(scalars_pandas_df_index["bool_col"])
+        .size()
+    )
+    bf_result_computed = bf_result.to_pandas()
+
+    pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False)
+
+
+def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index):
+    bf_result = (
+        scalars_df_index["int64_too"]
+        .groupby(scalars_df_index["bool_col"])
+        .skew()
+        .to_pandas()
+    )
+    pd_result = (
+        scalars_pandas_df_index["int64_too"]
+        .groupby(scalars_pandas_df_index["bool_col"])
+        .skew()
+    )
+
+    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
index 3d6a3b37b1..6373c0b952 100644
--- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
+++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
@@ -2,10 +2,17 @@
 
 from __future__ import annotations
 
+import ibis
 import ibis.expr.operations as ops
 import ibis.expr.rules as rlz
 
 
+@ibis.udf.agg.builtin
+def count(value: int) -> int:
+    """Count of a scalar."""
+    return 0  # pragma: NO COVER
+
+
 class FirstNonNullValue(ops.Analytic):
     """Retrieve the first element."""
 
@@ -21,6 +28,7 @@ class LastNonNullValue(ops.Analytic):
 
 
 __all__ = [
+    "count",
     "FirstNonNullValue",
     "LastNonNullValue",
 ]

From 713c5316b2ddc1fb79e2800853afc5be8d68071c Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Wed, 20 Mar 2024 19:02:21 +0000
Subject: [PATCH 2/4] add TODO

---
 third_party/bigframes_vendored/ibis/expr/operations/analytic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
index 6373c0b952..2e1373533e 100644
--- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
+++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
@@ -7,6 +7,8 @@
 import ibis.expr.rules as rlz
 
 
+# TODO(swast): We can remove this if ibis adds aggregates over scalar values.
+# See: https://github.com/ibis-project/ibis/issues/8698
 @ibis.udf.agg.builtin
 def count(value: int) -> int:
     """Count of a scalar."""

From 62be2165e263f0c6b1a560fa06398f37f5bbe5d5 Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Thu, 21 Mar 2024 22:07:18 +0000
Subject: [PATCH 3/4] format

---
 bigframes/core/expression.py         | 5 +++++
 bigframes/operations/aggregations.py | 3 +++
 tests/system/small/test_groupby.py   | 4 +++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py
index 26c58aea1e..9648664b71 100644
--- a/bigframes/core/expression.py
+++ b/bigframes/core/expression.py
@@ -50,6 +50,11 @@ def output_type(
 class NullaryAggregation(Aggregation):
     op: agg_ops.NullaryWindowOp = dataclasses.field()
 
+    def output_type(
+        self, input_types: dict[str, bigframes.dtypes.Dtype]
+    ) -> dtypes.ExpressionType:
+        return self.op.output_type()
+
 
 @dataclasses.dataclass(frozen=True)
 class UnaryAggregation(Aggregation):
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
index d29b6318b2..216af796bd 100644
--- a/bigframes/operations/aggregations.py
+++ b/bigframes/operations/aggregations.py
@@ -99,6 +99,9 @@ def arguments(self) -> int:
 class SizeOp(NullaryAggregateOp):
     name: ClassVar[str] = "size"
 
+    def output_type(self, *input_types: dtypes.ExpressionType):
+        return dtypes.INT_DTYPE
+
 
 @dataclasses.dataclass(frozen=True)
 class SumOp(UnaryAggregateOp):
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
index f70c96af8e..7a7320be59 100644
--- a/tests/system/small/test_groupby.py
+++ b/tests/system/small/test_groupby.py
@@ -18,11 +18,11 @@
 import bigframes.pandas as bpd
 from tests.system.utils import assert_pandas_df_equal
 
-
 # =================
 # DataFrame.groupby
 # =================
 
+
 @pytest.mark.parametrize(
     ("operator"),
     [
@@ -362,10 +362,12 @@ def test_dataframe_groupby_nonnumeric_with_mean():
         pd_result, bf_result, check_index_type=False, check_dtype=False
     )
 
+
 # ==============
 # Series.groupby
 # ==============
 
+
 def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index):
     bf_result = (
         scalars_df_index["int64_col"]

From e9d1a63d08fcf7541f2df3842cf1e14c29744782 Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Fri, 22 Mar 2024 15:36:13 +0000
Subject: [PATCH 4/4] fix bad merge

---
 bigframes/core/blocks.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 1e0306e558..dd79fd7995 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -952,7 +952,6 @@ def aggregate_size(
         ]
         output_col_ids = [agg_spec[1] for agg_spec in agg_specs]
         result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna)
-        aggregate_labels = self._get_labels_for_columns(["size"])
         names: typing.List[Label] = []
         for by_col_id in by_column_ids:
             if by_col_id in self.value_columns:
@@ -963,7 +962,7 @@ def aggregate_size(
             Block(
                 result_expr,
                 index_columns=by_column_ids,
-                column_labels=aggregate_labels,
+                column_labels=["size"],
                 index_labels=names,
             ),
             output_col_ids,