From bc82804da43c03c2311cd56f47a2316d3aae93d2 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 16 Apr 2024 16:32:17 -0700 Subject: [PATCH 01/15] feat: Add quantile statistic (#613) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bigframes/constants.py | 3 + bigframes/core/block_transforms.py | 34 +++++++++++ bigframes/core/blocks.py | 13 +++-- bigframes/core/compile/aggregate_compiler.py | 8 +++ bigframes/core/groupby/__init__.py | 57 +++++++++++++++++-- bigframes/dataframe.py | 30 +++++++++- bigframes/operations/aggregations.py | 12 ++++ bigframes/series.py | 19 +++++-- tests/system/small/test_dataframe.py | 30 +++++++++- tests/system/small/test_groupby.py | 35 ++++++++++++ tests/system/small/test_series.py | 21 +++++++ .../ibis/backends/bigquery/registry.py | 8 +++ .../bigframes_vendored/pandas/core/frame.py | 40 ++++++++++++- .../pandas/core/groupby/__init__.py | 30 ++++++++++ .../bigframes_vendored/pandas/core/series.py | 42 +++++++++++++- .../pandas/plotting/_core.py | 4 ++ 16 files changed, 366 insertions(+), 20 deletions(-) diff --git a/bigframes/constants.py b/bigframes/constants.py index 0751501085..c6d8f3acc2 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -92,3 +92,6 @@ LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset( ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS ) + +# BigQuery default is 10000, leave 100 for overhead +MAX_COLUMNS = 9900 diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index c789b2a69c..1eae73014c 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -15,6 +15,7 @@ import functools import typing +from typing import Sequence import pandas as pd @@ -105,6 +106,39 @@ def indicate_duplicates( ) +def quantile( + block: blocks.Block, + columns: Sequence[str], + qs: Sequence[float], + grouping_column_ids: Sequence[str] = (), +) -> blocks.Block: + # TODO: handle windowing and more interpolation methods + window = core.WindowSpec( + grouping_keys=tuple(grouping_column_ids), + ) + quantile_cols = [] + labels = [] + if len(columns) * len(qs) > constants.MAX_COLUMNS: + raise NotImplementedError("Too many aggregates requested.") + for col in columns: + for q in qs: + label = block.col_id_to_label[col] + new_label = (*label, q) if isinstance(label, tuple) else (label, q) + labels.append(new_label) + block, quantile_col = block.apply_window_op( + col, + agg_ops.QuantileOp(q), + window_spec=window, + ) + quantile_cols.append(quantile_col) + block, results = block.aggregate( + grouping_column_ids, + tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), + dropna=True, + ) + return block.select_columns(results).with_column_labels(labels) + + def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: supported_methods = [ "linear", diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 5b411e5416..f6850020df 100644 --- 
a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1498,12 +1498,17 @@ def stack(self, how="left", levels: int = 1): row_label_tuples = utils.index_as_tuples(row_labels) - if col_labels is not None: + if col_labels is None: + result_index: pd.Index = pd.Index([None]) + result_col_labels: Sequence[Tuple] = list([()]) + elif (col_labels.nlevels == 1) and all( + col_labels.isna() + ): # isna not implemented for MultiIndex for newer pandas versions + result_index = pd.Index([None]) + result_col_labels = utils.index_as_tuples(col_labels.drop_duplicates()) + else: result_index = col_labels.drop_duplicates().dropna(how="all") result_col_labels = utils.index_as_tuples(result_index) - else: - result_index = pd.Index([None]) - result_col_labels = list([()]) # Get matching columns unpivot_columns: List[Tuple[str, List[str]]] = [] diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index ae21243506..98d296c779 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -148,6 +148,14 @@ def _( return cast(ibis_types.NumericValue, value) +@compile_unary_agg.register +@numeric_op +def _( + op: agg_ops.QuantileOp, column: ibis_types.NumericColumn, window=None +) -> ibis_types.NumericValue: + return _apply_window_if_present(column.quantile(op.q), window) + + @compile_unary_agg.register @numeric_op def _( diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index e2b28553c6..0f53342352 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -15,6 +15,7 @@ from __future__ import annotations import typing +from typing import Sequence, Union import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby import pandas as pd @@ -115,14 +116,35 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: def median( self, numeric_only: bool = False, *, exact: bool = False ) -> df.DataFrame: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" - ) if not numeric_only: self._raise_on_non_numeric("median") + if exact: + return self.quantile(0.5) return self._aggregate_all(agg_ops.median_op, numeric_only=True) + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("quantile") + q_cols = tuple( + col + for col in self._selected_cols + if self._column_type(col) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + q_cols, + qs=tuple(q) if multi_q else (q,), # type: ignore + grouping_column_ids=self._by_col_ids, + ) + result_df = df.DataFrame(result) + if multi_q: + return result_df.stack() + else: + return result_df.droplevel(-1, 1) + def min(self, numeric_only: bool = False, *args) -> df.DataFrame: return self._aggregate_all(agg_ops.min_op, numeric_only=numeric_only) @@ -466,8 +488,31 @@ def sum(self, *args) -> series.Series: def mean(self, *args) -> series.Series: return self._aggregate(agg_ops.mean_op) - def median(self, *args, **kwargs) -> series.Series: - return self._aggregate(agg_ops.mean_op) + def median( + self, + *args, + exact: bool = False, + **kwargs, + ) -> series.Series: + if exact: + return self.quantile(0.5) + else: + return self._aggregate(agg_ops.median_op) + + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ) -> series.Series: + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + self._block, + (self._value_column,), + qs=tuple(q) if multi_q else (q,), # type: ignore + grouping_column_ids=self._by_col_ids, + ) + if multi_q: + return series.Series(result.stack()) + else: + return series.Series(result.stack()).droplevel(-1) def std(self, *args, **kwargs) -> series.Series: return self._aggregate(agg_ops.std_op) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2deef95277..953a89c34f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2009,8 +2009,34 @@ def median( frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() - block = frame._block.aggregate_all_and_stack(agg_ops.median_op) - return bigframes.series.Series(block.select_column("values")) + if exact: + return self.quantile() + else: + block = frame._block.aggregate_all_and_stack(agg_ops.median_op) + return bigframes.series.Series(block.select_column("values")) + + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): + if not numeric_only: + frame = self._raise_on_non_numeric("quantile") + else: + frame = self._drop_non_numeric() + multi_q = utils.is_list_like(q) + result = block_ops.quantile( + frame._block, frame._block.value_columns, qs=tuple(q) if multi_q else (q,) # type: ignore + ) + if multi_q: + return DataFrame(result.stack()).droplevel(0) + else: + result_df = ( + DataFrame(result) + .stack(list(range(0, frame.columns.nlevels))) + .droplevel(0) + ) + result_series = bigframes.series.Series(result_df._block) + result_series.name = q + return result_series def std( self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index f33dc16e30..0d27d1d75d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -109,6 +109,18 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return input_types[0] 
+@dataclasses.dataclass(frozen=True) +class QuantileOp(UnaryAggregateOp): + q: float + + @property + def name(self): + return f"{int(self.q*100)}%" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) + + @dataclasses.dataclass(frozen=True) class ApproxQuartilesOp(UnaryAggregateOp): quartile: int diff --git a/bigframes/series.py b/bigframes/series.py index 2f9123f9a3..b834411bce 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -23,7 +23,7 @@ import os import textwrap import typing -from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union +from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union import bigframes_vendored.pandas.core.series as vendored_pandas_series import google.cloud.bigquery as bigquery @@ -968,10 +968,19 @@ def mean(self) -> float: def median(self, *, exact: bool = False) -> float: if exact: - raise NotImplementedError( - f"Only approximate median is supported. {constants.FEEDBACK_LINK}" - ) - return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + return typing.cast(float, self.quantile(0.5)) + else: + return typing.cast(float, self._apply_aggregation(agg_ops.median_op)) + + def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> Union[Series, float]: + qs = tuple(q) if utils.is_list_like(q) else (q,) + result = block_ops.quantile(self._block, (self._value_column,), qs=qs) + if utils.is_list_like(q): + result = result.stack() + result = result.drop_levels([result.index_columns[0]]) + return Series(result) + else: + return cast(float, Series(result).to_pandas().squeeze()) def sum(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.sum_op)) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e70764fcc0..7fef7a9dc7 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2504,7 +2504,10 @@ def test_df_melt_default(scalars_dfs): # Pandas produces int64 index, Bigframes produces Int64 (nullable) pd.testing.assert_frame_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, ) @@ -3029,6 +3032,31 @@ def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): ) +def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): + q = 0.45 + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): + q = [0, 0.33, 0.67, 1.0] + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pd_result.index = pd_result.index.astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("op"), [ diff --git a/tests/system/small/test_groupby.py 
b/tests/system/small/test_groupby.py index ba79ba1ab1..7b36a06f49 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -65,6 +65,24 @@ def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): assert ((pd_min <= bf_result_computed) & (bf_result_computed <= pd_max)).all().all() +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + col_names = ["int64_too", "float64_col", "int64_col", "string_col"] + bf_result = ( + scalars_df_index[col_names].groupby("string_col").quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("string_col").quantile(q) + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("operator"), [ @@ -389,3 +407,20 @@ def test_dataframe_groupby_nonnumeric_with_mean(): pd.testing.assert_frame_equal( pd_result, bf_result, check_index_type=False, check_dtype=False ) + + +@pytest.mark.parametrize( + ("q"), + [ + ([0.2, 0.4, 0.6, 0.8]), + (0.11), + ], +) +def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q): + bf_result = ( + scalars_df_index.groupby("string_col")["int64_col"].quantile(q) + ).to_pandas() + pd_result = scalars_pandas_df_index.groupby("string_col")["int64_col"].quantile(q) + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d27cd0a236..87267696ba 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1320,6 +1320,27 @@ def test_median(scalars_dfs): assert pd_min < bf_result < pd_max +def test_median_exact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_result = scalars_df[col_name].median(exact=True) + pd_result = scalars_pandas_df[col_name].median() + assert math.isclose(pd_result, bf_result) + + +def test_series_quantile(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + bf_series = scalars_df[col_name] + pd_series = scalars_pandas_df[col_name] + + pd_result = pd_series.quantile([0.0, 0.4, 0.6, 1.0]) + bf_result = bf_series.quantile([0.0, 0.4, 0.6, 1.0]) + pd.testing.assert_series_equal( + pd_result, bf_result.to_pandas(), check_dtype=False, check_index_type=False + ) + + def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index 88826b31ce..fddeab19a2 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -3,6 +3,7 @@ import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops from ibis.backends.bigquery.registry import OPERATION_REGISTRY +import ibis.expr.operations.reductions as ibis_reductions def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile): @@ -31,12 +32,19 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray): return f"GENERATE_ARRAY(0, {arg})" +def _quantile(translator, op: ibis_reductions.Quantile): + arg = translator.translate(op.arg) + quantile = translator.translate(op.quantile) + return f"PERCENTILE_CONT({arg}, {quantile})" + + patched_ops = { 
vendored_ibis_ops.ApproximateMultiQuantile: _approx_quantiles, # type:ignore vendored_ibis_ops.FirstNonNullValue: _first_non_null_value, # type:ignore vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore + ibis_reductions.Quantile: _quantile, # type:ignore } OPERATION_REGISTRY.update(patched_ops) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6707dc1403..e894900646 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4509,13 +4509,51 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Default False. Include only float, int, boolean columns. exact (bool. default False): Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + one. Returns: bigframes.series.Series: Series with the median of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False + ): + """ + Return values at the given quantile over requested axis. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df.quantile(.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: Float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + + [2 rows x 2 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value between 0 <= q <= 1, the quantile(s) to compute. + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame: + If ``q`` is an array, a DataFrame will be returned where the + index is ``q``, the columns are the columns of self, and the + values are the quantiles. + If ``q`` is a float, a Series will be returned where the + index is the columns of self and the values are the quantiles. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def var(self, axis=0, *, numeric_only: bool = False): """Return unbiased variance over requested axis. diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ed4ca66f38..6310d7e271 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -85,6 +85,36 @@ def median( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile(self, q=0.5, *, numeric_only: bool = False): + """ + Return group values at the given quantile, a la numpy.percentile. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> df = bpd.DataFrame([ + ... ['a', 1], ['a', 2], ['a', 3], + ... ['b', 1], ['b', 3], ['b', 5] + ... ], columns=['key', 'val']) + >>> df.groupby('key').quantile() + val + key + a 2.0 + b 3.0 + + [2 rows x 1 columns] + + Args: + q (float or array-like, default 0.5 (50% quantile)): + Value(s) between 0 and 1 providing the quantile(s) to compute. + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + Series or DataFrame: Return type determined by caller of GroupBy object. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def std( self, *, diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 46bc9714f8..5e3b4c46ef 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3,7 +3,16 @@ """ from __future__ import annotations -from typing import Hashable, IO, Literal, Mapping, Optional, Sequence, TYPE_CHECKING +from typing import ( + Hashable, + IO, + Literal, + Mapping, + Optional, + Sequence, + TYPE_CHECKING, + Union, +) from bigframes_vendored.pandas.core.generic import NDFrame import numpy @@ -3151,6 +3160,37 @@ def median(self, *, exact: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def quantile( + self, + q: Union[float, Sequence[float]] = 0.5, + ) -> Union[Series, float]: + """ + Return value at the given quantile. + + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([1, 2, 3, 4]) + >>> s.quantile(.5) + 2.5 + >>> s.quantile([.25, .5, .75]) + 0.25 1.75 + 0.5 2.5 + 0.75 3.25 + dtype: Float64 + + Args: + q (float or array-like, default 0.5 (50% quantile)): + The quantile(s) to compute, which can lie in range: 0 <= q <= 1. + + Returns: + float or Series: + If ``q`` is an array, a Series will be returned where the + index is ``q`` and the values are the quantiles, otherwise + a float will be returned. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def prod(self): """Return the product of the values over the requested axis. diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py index 19f56965df..bf016357a6 100644 --- a/third_party/bigframes_vendored/pandas/plotting/_core.py +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -11,6 +11,7 @@ class PlotAccessor: For Series: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> ser = bpd.Series([1, 2, 3, 3]) >>> plot = ser.plot(kind='hist', title="My plot") @@ -57,6 +58,7 @@ def hist( >>> import bigframes.pandas as bpd >>> import numpy as np + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -93,6 +95,7 @@ def line( **Examples:** >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 'one': [1, 2, 3, 4], @@ -160,6 +163,7 @@ def area( Draw an area plot based on basic business metrics: >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame( ... { ... 
'sales': [3, 2, 3, 9, 10, 6], From 250548c248fe3a4fdfa92494aa0d550de8608612 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Tue, 16 Apr 2024 17:42:09 -0700 Subject: [PATCH 02/15] test: add a bigquery usage report to notebook test session (#604) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: add a bigquery usage report to notebook test session * filter out mocks * remove pointless type hint * fix replace statement * account for dry runs * ipynb only * use env var via nox * don't import bigframes from noxfile * address comments * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- bigframes/session/_io/bigquery.py | 31 +++++++++++++++++ noxfile.py | 58 ++++++++++++++++++++++++++----- 2 files changed, 80 insertions(+), 9 deletions(-) diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index 75283a060a..ac6ba4bae4 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -18,6 +18,7 @@ import datetime import itertools +import os import textwrap import types from typing import Dict, Iterable, Optional, Sequence, Tuple, Union @@ -34,6 +35,8 @@ MAX_LABELS_COUNT = 64 TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}" +LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" + def create_job_configs_labels( job_configs_labels: Optional[Dict[str, str]], @@ -243,4 +246,32 @@ def start_query_with_client( ) else: results_iterator = query_job.result(max_results=max_results) + + if LOGGING_NAME_ENV_VAR in os.environ: + # when running notebooks via pytest nbmake + pytest_log_job(query_job) + return results_iterator, query_job + + +def pytest_log_job(query_job: bigquery.QueryJob): + """For pytest runs only, log information about the query job + to a file in order to create a performance report. + """ + if LOGGING_NAME_ENV_VAR not in os.environ: + raise EnvironmentError( + "Environment variable {env_var} is not set".format( + env_var=LOGGING_NAME_ENV_VAR ) + ) + test_name = os.environ[LOGGING_NAME_ENV_VAR] + current_directory = os.getcwd() + bytes_processed = query_job.total_bytes_processed + if not isinstance(bytes_processed, int): + return # filter out mocks + if query_job.configuration.dry_run: + # dry runs don't process their total_bytes_processed + bytes_processed = 0 + bytes_file = os.path.join(current_directory, test_name + ".bytesprocessed") + with open(bytes_file, "a") as f: + f.write(str(bytes_processed) + "\n") diff --git a/noxfile.py b/noxfile.py index fa9c0a57d8..9479a7a318 100644 --- a/noxfile.py +++ b/noxfile.py @@ -764,6 +764,8 @@ def notebook(session: nox.Session): "--nbmake-timeout=900", # 15 minutes ] + logging_name_env_var = "BIGFRAMES_PERFORMANCE_LOG_NAME" + try: # Populate notebook parameters and make a backup so that the notebooks # are runnable. 
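The report file these two files cooperate on follows a deliberately simple contract: each query appends one integer (total bytes processed) to `<BIGFRAMES_PERFORMANCE_LOG_NAME>.bytesprocessed` in the working directory, and the nox session later sums those files. A standalone sketch of the write path, using a hypothetical `FakeJob` stand-in for `google.cloud.bigquery.QueryJob` (the names here are illustrative, not part of the patch):

import os

os.environ["BIGFRAMES_PERFORMANCE_LOG_NAME"] = "my_notebook.ipynb"  # nox sets this per notebook

class FakeJob:
    # hypothetical stand-in for google.cloud.bigquery.QueryJob
    total_bytes_processed = 1024

    class configuration:
        dry_run = False

def log_job(job) -> None:
    # Same contract as pytest_log_job above: append one integer per query.
    name = os.environ["BIGFRAMES_PERFORMANCE_LOG_NAME"]
    bytes_processed = job.total_bytes_processed
    if not isinstance(bytes_processed, int):
        return  # mocked jobs report None and are filtered out
    if job.configuration.dry_run:
        bytes_processed = 0  # dry runs process no bytes
    with open(name + ".bytesprocessed", "a") as f:
        f.write(str(bytes_processed) + "\n")

log_job(FakeJob())  # my_notebook.ipynb.bytesprocessed now contains "1024"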
@@ -773,13 +775,21 @@ def notebook(session: nox.Session): *notebooks, ) - # Run self-contained notebooks in single session.run - # achieve parallelization via -n - session.run( - *pytest_command, - "-nauto", - *notebooks, - ) + # Run notebooks in parallel session.run's, since each notebook + # takes an environment variable for performance logging + processes = [] + for notebook in notebooks: + session.env[logging_name_env_var] = os.path.basename(notebook) + process = Process( + target=session.run, + args=(*pytest_command, notebook), + ) + process.start() + processes.append(process) + + for process in processes: + process.join() + finally: # Prevent our notebook changes from getting checked in to git # accidentally. @@ -789,11 +799,12 @@ def notebook(session: nox.Session): *notebooks, ) - # Run regionalized notebooks in parallel session.run's, since each notebook - # takes a different region via env param. + # Additionally run regionalized notebooks in parallel session.run's. + # Each notebook takes a different region via env param. processes = [] for notebook, regions in notebooks_reg.items(): for region in regions: + session.env[logging_name_env_var] = os.path.basename(notebook) process = Process( target=session.run, args=(*pytest_command, notebook), @@ -805,6 +816,35 @@ def notebook(session: nox.Session): for process in processes: process.join() + # when run via pytest, notebooks output a .bytesprocessed report + # collect those reports and print a summary + _print_bytes_processed_report() + + +def _print_bytes_processed_report(): + """Add an informational report about http queries and bytes + processed to the testlog output for purposes of measuring + bigquery-related performance changes. + """ + print("---BIGQUERY USAGE REPORT---") + cumulative_queries = 0 + cumulative_bytes = 0 + for report in Path("notebooks/").glob("*/*.bytesprocessed"): + with open(report, "r") as f: + filename = report.stem + lines = f.read().splitlines() + query_count = len(lines) + total_bytes = sum([int(line) for line in lines]) + format_string = f"{filename} - query count: {query_count}, bytes processed sum: {total_bytes}" + print(format_string) + cumulative_bytes += total_bytes + cumulative_queries += query_count + print( + "---total queries: {total_queries}, total bytes: {total_bytes}---".format( + total_queries=cumulative_queries, total_bytes=cumulative_bytes + ) + ) + @nox.session(python="3.10") def release_dry_run(session): From 34f9f61eee6878c74f50197f657682e37474becc Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Wed, 17 Apr 2024 10:50:46 -0400 Subject: [PATCH 03/15] chore(python): bump idna from 3.4 to 3.7 in .kokoro (#608) Source-Link: https://github.com/googleapis/synthtool/commit/d50980e704793a2d3310bfb3664f3a82f24b5796 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:5a4c19d17e597b92d786e569be101e636c9c2817731f80a5adec56b2aa8fe070 Co-authored-by: Owl Bot Co-authored-by: Anthonios Partheniou --- .github/.OwlBot.lock.yaml | 4 ++-- .github/auto-label.yaml | 5 +++++ .github/blunderbuss.yml | 17 +++++++++++++++++ .kokoro/requirements.txt | 6 +++--- 4 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 .github/blunderbuss.yml diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 4bdeef3904..81f87c5691 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. 
docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:a8a80fc6456e433df53fc2a0d72ca0345db0ddefb409f1b75b118dfd1babd952 -# created: 2024-03-15T16:25:47.905264637Z + digest: sha256:5a4c19d17e597b92d786e569be101e636c9c2817731f80a5adec56b2aa8fe070 +# created: 2024-04-12T11:35:58.922854369Z diff --git a/.github/auto-label.yaml b/.github/auto-label.yaml index b2016d119b..8b37ee8971 100644 --- a/.github/auto-label.yaml +++ b/.github/auto-label.yaml @@ -13,3 +13,8 @@ # limitations under the License. requestsize: enabled: true + +path: + pullrequest: true + paths: + samples: "samples" diff --git a/.github/blunderbuss.yml b/.github/blunderbuss.yml new file mode 100644 index 0000000000..8d9cb1008e --- /dev/null +++ b/.github/blunderbuss.yml @@ -0,0 +1,17 @@ +# Blunderbuss config +# +# This file controls who is assigned for pull requests and issues. +# Note: This file is autogenerated. To make changes to the assignee +# team, please update `codeowner_team` in `.repo-metadata.json`. +assign_issues: + - googleapis/api-bigquery-dataframe + +assign_issues_by: + - labels: + - "samples" + to: + - googleapis/python-samples-reviewers + - googleapis/api-bigquery-dataframe + +assign_prs: + - googleapis/api-bigquery-dataframe diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index dd61f5f320..51f92b8e12 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -252,9 +252,9 @@ googleapis-common-protos==1.61.0 \ --hash=sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0 \ --hash=sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b # via google-api-core -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.7 \ + --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ + --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 # via requests importlib-metadata==6.8.0 \ --hash=sha256:3ebb78df84a805d7698245025b975d9d67053cd94c79245ba4b3eb694abe68bb \ From 9f8f181279133abdb7da3aa045df6fa278587013 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Wed, 17 Apr 2024 09:40:51 -0700 Subject: [PATCH 04/15] fix: address technical writers fb (#611) * fix: address technical writers fb --- bigframes/ml/model_selection.py | 2 +- .../pandas/core/indexes/accessor.py | 2 +- .../bigframes_vendored/sklearn/base.py | 2 +- .../sklearn/metrics/_classification.py | 8 +++--- .../bigframes_vendored/sklearn/pipeline.py | 7 ++--- .../sklearn/preprocessing/_encoder.py | 26 ++++++++++++------- .../sklearn/preprocessing/_label.py | 2 +- 7 files changed, 28 insertions(+), 21 deletions(-) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 42c13fdb40..48eb5a93a7 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -35,7 +35,7 @@ def train_test_split( Args: *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series): A sequence of BigQuery DataFrames or Series that can be joined on - their indexes + their indexes. test_size (default None): The proportion of the dataset to include in the test split. If None, this will default to the complement of train_size. 
If both diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 3f0175359a..f34612cb11 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -37,7 +37,7 @@ def dayofweek(self): """The day of the week with Monday=0, Sunday=6. Return the day of the week. It is assumed the week starts on - Monday, which is denoted by 0 and ends on Sunday which is denoted + Monday, which is denoted by 0 and ends on Sunday, which is denoted by 6. **Examples:** diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index fd8db7a227..1a151a1119 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -153,7 +153,7 @@ def fit_transform(self, X, y=None): Target values (None for unsupervised transformations). Returns: - bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new) + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new). Transformed DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index 00bbf8cd60..8e8b2c1952 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -122,7 +122,7 @@ def recall_score( ): """Compute the recall. - The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + The recall is the ratio ``tp / (tp + fn)``, where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. @@ -170,7 +170,7 @@ def precision_score( ): """Compute the precision. - The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + The precision is the ratio ``tp / (tp + fp)``, where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. @@ -244,9 +244,9 @@ def f1_score( dtype: float64 Args: - y_true: Series or DataFrame of shape (n_samples,) + y_true: Series or DataFrame of shape (n_samples,). Ground truth (correct) target values. - y_pred: Series or DataFrame of shape (n_samples,) + y_pred: Series or DataFrame of shape (n_samples,). Estimated targets as returned by a classifier. average: {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ default='binary' diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py index aed1565960..8a98ee4141 100644 --- a/third_party/bigframes_vendored/sklearn/pipeline.py +++ b/third_party/bigframes_vendored/sklearn/pipeline.py @@ -20,13 +20,14 @@ class Pipeline(BaseEstimator, metaclass=ABCMeta): """Pipeline of transforms with a final estimator. Sequentially apply a list of transforms and a final estimator. - Intermediate steps of the pipeline must be `transforms`, that is, they + Intermediate steps of the pipeline must be `transforms`. That is, they must implement `fit` and `transform` methods. The final estimator only needs to implement `fit`. 
The purpose of the pipeline is to assemble several steps that can be - cross-validated together while setting different parameters. This simplifies code, and allows deploying an estimator - and peprocessing together, e.g. with `Pipeline.to_gbq(...).` + cross-validated together while setting different parameters. This + simplifies code and allows for deploying an estimator and preprocessing + together, e.g. with `Pipeline.to_gbq(...).` """ def fit( diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 5e5e8ac042..b883e82249 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -23,15 +23,21 @@ class OneHotEncoder(BaseEstimator): Given a dataset with two features, we let the encoder find the unique values per feature and transform the data to a binary one-hot encoding. - .. code-block:: - - from bigframes.ml.preprocessing import OneHotEncoder - import bigframes.pandas as bpd - - enc = OneHotEncoder() - X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) - enc.fit(X) - print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + >>> from bigframes.ml.preprocessing import OneHotEncoder + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> enc = OneHotEncoder() + >>> X = bpd.DataFrame({"a": ["Male", "Female", "Female"], "b": ["1", "3", "2"]}) + >>> enc.fit(X) + OneHotEncoder() + + >>> print(enc.transform(bpd.DataFrame({"a": ["Female", "Male"], "b": ["1", "4"]}))) + onehotencoded_a onehotencoded_b + 0 [{'index': 1, 'value': 1.0}] [{'index': 1, 'value': 1.0}] + 1 [{'index': 2, 'value': 1.0}] [{'index': 0, 'value': 1.0}] + + [2 rows x 2 columns] Args: drop (Optional[Literal["most_frequent"]], default None): @@ -52,7 +58,7 @@ class OneHotEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. - Default None, set limit to 1,000,000. + Default None. Set limit to 1,000,000. diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index cc6b995c8c..61a44db92f 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -26,7 +26,7 @@ class LabelEncoder(BaseEstimator): Specifies an upper limit to the number of output features for each input feature when considering infrequent categories. If there are infrequent categories, max_categories includes the category representing the infrequent categories along with the frequent categories. - Default None, set limit to 1,000,000. + Default None. Set limit to 1,000,000. 
""" def fit(self, y): From 8f9ece6d13f57f02d677bf0e3fea97dea94ae240 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 17 Apr 2024 15:48:38 -0700 Subject: [PATCH 05/15] fix: infer narrowest numeric type when combining numeric columns (#602) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Swe帽a (Swast) --- bigframes/core/__init__.py | 88 +++++- bigframes/core/block_transforms.py | 2 +- bigframes/core/blocks.py | 16 +- bigframes/core/compile/compiled.py | 284 ------------------ bigframes/core/compile/compiler.py | 12 - bigframes/core/compile/scalar_op_compiler.py | 38 +++ bigframes/core/expression.py | 3 - bigframes/core/join_def.py | 5 + bigframes/core/nodes.py | 84 ------ bigframes/dataframe.py | 8 +- bigframes/operations/__init__.py | 100 +++--- tests/system/small/test_dataframe.py | 10 +- .../bigframes_vendored/pandas/core/frame.py | 84 +++--- .../bigframes_vendored/pandas/core/series.py | 6 +- 14 files changed, 233 insertions(+), 507 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 3fa690ef37..9e6b86fc30 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -354,10 +354,7 @@ def unpivot( *, passthrough_columns: typing.Sequence[str] = (), index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] - ] = pandas.Float64Dtype(), - how: typing.Literal["left", "right"] = "left", + join_side: typing.Literal["left", "right"] = "left", ) -> ArrayValue: """ Unpivot ArrayValue columns. @@ -367,23 +364,88 @@ def unpivot( unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. index_col_id (str): The column id to be used for the row labels. - dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
Returns: ArrayValue: The unpivoted ArrayValue """ + # There will be N labels, used to disambiguate which of N source columns produced each output row + explode_offsets_id = bigframes.core.guid.generate_guid("unpivot_offsets_") + labels_array = self._create_unpivot_labels_array(row_labels, index_col_ids) + labels_array = labels_array.promote_offsets(explode_offsets_id) + + # Unpivot creates N output rows for each input row, labels disambiguate these N rows + joined_array = self._cross_join_w_labels(labels_array, join_side) + + # Build the output rows as a case statement that selects between the N input columns + unpivot_exprs = [] + # Supports producing multiple stacked output columns for stacking only part of hierarchical index + for col_id, input_ids in unpivot_columns: + # row explode offset used to choose the input column + # we use offset instead of label as labels are not necessarily unique + cases = tuple( + ( + ops.eq_op.as_expr(explode_offsets_id, ex.const(i)), + ex.free_var(id_or_null) + if (id_or_null is not None) + else ex.const(None), + ) + for i, id_or_null in enumerate(input_ids) + ) + col_expr = ops.case_when_op.as_expr(*cases) + unpivot_exprs.append((col_expr, col_id)) + + label_exprs = ((ex.free_var(id), id) for id in index_col_ids) + # passthrough columns are unchanged, just repeated N times each + passthrough_exprs = ((ex.free_var(id), id) for id in passthrough_columns) return ArrayValue( - nodes.UnpivotNode( - child=self.node, - row_labels=tuple(row_labels), - unpivot_columns=tuple(unpivot_columns), - passthrough_columns=tuple(passthrough_columns), - index_col_ids=tuple(index_col_ids), - dtype=dtype, - how=how, + nodes.ProjectionNode( + child=joined_array.node, + assignments=(*label_exprs, *unpivot_exprs, *passthrough_exprs), ) ) + def _cross_join_w_labels( + self, labels_array: ArrayValue, join_side: typing.Literal["left", "right"] + ) -> ArrayValue: + """ + Convert each row in self to N rows, one for each label in labels array. 
+ """ + table_join_side = ( + join_def.JoinSide.LEFT if join_side == "left" else join_def.JoinSide.RIGHT + ) + labels_join_side = table_join_side.inverse() + labels_mappings = tuple( + join_def.JoinColumnMapping(labels_join_side, id, id) + for id in labels_array.schema.names + ) + table_mappings = tuple( + join_def.JoinColumnMapping(table_join_side, id, id) + for id in self.schema.names + ) + join = join_def.JoinDefinition( + conditions=(), mappings=(*labels_mappings, *table_mappings), type="cross" + ) + if join_side == "left": + joined_array = self.join(labels_array, join_def=join) + else: + joined_array = labels_array.join(self, join_def=join) + return joined_array + + def _create_unpivot_labels_array( + self, + former_column_labels: typing.Sequence[typing.Hashable], + col_ids: typing.Sequence[str], + ) -> ArrayValue: + """Create an ArrayValue from a list of label tuples.""" + rows = [] + for row_offset in range(len(former_column_labels)): + row_label = former_column_labels[row_offset] + row_label = (row_label,) if not isinstance(row_label, tuple) else row_label + row = {col_ids[i]: row_label[i] for i in range(len(col_ids))} + rows.append(row) + + return ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=self.session) + def join( self, other: ArrayValue, diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 1eae73014c..562689a736 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -857,5 +857,5 @@ def _idx_extrema( # Stack the entire column axis to produce single-column result # Assumption: uniform dtype for stackability return block.aggregate_all_and_stack( - agg_ops.AnyValueOp(), dtype=block.dtypes[0] + agg_ops.AnyValueOp(), ).with_column_labels([original_block.index.name]) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f6850020df..0f9cacd83d 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -914,9 +914,6 @@ def aggregate_all_and_stack( axis: int | str = 0, value_col_id: str = "values", dropna: bool = True, - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
- ] = pd.Float64Dtype(), ) -> Block: axis_n = utils.get_axis_number(axis) if axis_n == 0: @@ -931,7 +928,6 @@ def aggregate_all_and_stack( row_labels=self.column_labels.to_list(), index_col_ids=index_col_ids, unpivot_columns=tuple([(value_col_id, tuple(self.value_columns))]), - dtype=dtype, ) return Block( result_expr, @@ -949,7 +945,6 @@ def aggregate_all_and_stack( index_col_ids=[guid.generate_guid()], unpivot_columns=[(value_col_id, tuple(self.value_columns))], passthrough_columns=[*self.index_columns, offset_col], - dtype=dtype, ) index_aggregations = [ (ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.free_var(col_id)), col_id) @@ -1512,13 +1507,10 @@ def stack(self, how="left", levels: int = 1): # Get matching columns unpivot_columns: List[Tuple[str, List[str]]] = [] - dtypes = [] for val in result_col_labels: col_id = guid.generate_guid("unpivot_") input_columns, dtype = self._create_stack_column(val, row_label_tuples) unpivot_columns.append((col_id, input_columns)) - if dtype: - dtypes.append(dtype or pd.Float64Dtype()) added_index_columns = [guid.generate_guid() for _ in range(row_labels.nlevels)] unpivot_expr = self._expr.unpivot( @@ -1526,8 +1518,7 @@ def stack(self, how="left", levels: int = 1): passthrough_columns=self.index_columns, unpivot_columns=unpivot_columns, index_col_ids=added_index_columns, - dtype=tuple(dtypes), - how=how, + join_side=how, ) new_index_level_names = self.column_labels.names[-levels:] if how == "left": @@ -1559,15 +1550,12 @@ def melt( value_labels = [self.col_id_to_label[col_id] for col_id in value_vars] id_labels = [self.col_id_to_label[col_id] for col_id in id_vars] - dtype = self._expr.get_column_type(value_vars[0]) - unpivot_expr = self._expr.unpivot( row_labels=value_labels, passthrough_columns=id_vars, unpivot_columns=(unpivot_col,), index_col_ids=var_col_ids, - dtype=dtype, - how="right", + join_side="right", ) index_id = guid.generate_guid() unpivot_expr = unpivot_expr.promote_offsets(index_id) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index f1c5d62010..a59d599679 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -40,10 +40,8 @@ OrderingExpression, ) import bigframes.core.schema as schemata -import bigframes.core.utils as utils from bigframes.core.window_spec import WindowSpec import bigframes.dtypes -import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops ORDER_ID_COLUMN = "bigframes_ordering_id" @@ -109,36 +107,6 @@ def filter(self: T, predicate: ex.Expression) -> T: """Filter the table on a given expression, the predicate must be a boolean expression.""" ... - @abc.abstractmethod - def unpivot( - self: T, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> T: - """ - Unpivot ArrayValue columns. - - Args: - row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. - unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. - passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. - index_col_id (str): The column id to be used for the row labels. 
- dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. - - Returns: - ArrayValue: The unpivoted ArrayValue - """ - ... - @abc.abstractmethod def _reproject_to_table(self: T) -> T: """ @@ -332,115 +300,6 @@ def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: expr.predicates = [*self._predicates, predicate_value] return expr.build() - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> UnorderedIR: - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr() - row_n = len(row_labels) - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - op_compiler.compile_row_op( - ops.AsTypeOp(col_dtype), (unpivot_table[col],) - ) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - unpivot_offset_id, - ) - - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - return UnorderedIR( - 
table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - ) - def aggregate( self, aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], @@ -920,149 +779,6 @@ def project_window_op( # TODO(tbergeron): Automatically track analytic expression usage and defer reprojection until required for valid query generation. return result._reproject_to_table() if not skip_reproject_unsafe else result - def unpivot( - self, - row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[ - typing.Tuple[str, typing.Sequence[typing.Optional[str]]] - ], - *, - passthrough_columns: typing.Sequence[str] = (), - index_col_ids: typing.Sequence[str] = ["index"], - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] - ] = pandas.Float64Dtype(), - how="left", - ) -> OrderedIR: - if how not in ("left", "right"): - raise ValueError("'how' must be 'left' or 'right'") - table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) - row_n = len(row_labels) - hidden_col_ids = self._hidden_ordering_column_names.keys() - if not all( - len(source_columns) == row_n for _, source_columns in unpivot_columns - ): - raise ValueError("Columns and row labels must all be same length.") - - unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_") - unpivot_table = table.cross_join( - ibis.memtable({unpivot_offset_id: range(row_n)}) - ) - # Use ibis memtable to infer type of rowlabels (if possible) - # TODO: Allow caller to specify dtype - if isinstance(row_labels[0], tuple): - labels_table = ibis.memtable(row_labels) - labels_ibis_types = [ - labels_table[col].type() for col in labels_table.columns - ] - else: - labels_ibis_types = [ibis.memtable({"col": row_labels})["col"].type()] - labels_dtypes = [ - bigframes.dtypes.ibis_dtype_to_bigframes_dtype(ibis_type) - for ibis_type in labels_ibis_types - ] - - label_columns = [] - for label_part, (col_id, label_dtype) in enumerate( - zip(index_col_ids, labels_dtypes) - ): - # interpret as tuples even if it wasn't originally so can apply same logic for multi-column labels - labels_as_tuples = [ - label if isinstance(label, tuple) else (label,) for label in row_labels - ] - cases = [ - ( - i, - bigframes.dtypes.literal_to_ibis_scalar( - label_tuple[label_part], # type:ignore - force_dtype=label_dtype, # type:ignore - ), - ) - for i, label_tuple in enumerate(labels_as_tuples) - ] - labels_value = ( - typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]) - .cases(cases, default=None) # type:ignore - .name(col_id) - ) - label_columns.append(labels_value) - - unpivot_values = [] - for j in range(len(unpivot_columns)): - col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype - result_col, source_cols = unpivot_columns[j] - null_value = bigframes.dtypes.literal_to_ibis_scalar( - None, force_dtype=col_dtype - ) - ibis_values = [ - op_compiler.compile_row_op( - ops.AsTypeOp(col_dtype), (unpivot_table[col],) - ) - if col is not None - else null_value - for col in source_cols - ] - cases = [(i, ibis_values[i]) for i in range(len(ibis_values))] - unpivot_value = typing.cast( - ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id] - ).cases( - cases, default=null_value # type:ignore - ) - unpivot_values.append(unpivot_value.name(result_col)) - - unpivot_table = unpivot_table.select( - passthrough_columns, - *label_columns, - *unpivot_values, - *hidden_col_ids, - unpivot_offset_id, - ) - - # 
Extend the original ordering using unpivot_offset_id - old_ordering = self._ordering - if how == "left": - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - *old_ordering.ordering_value_columns, - ascending_over(unpivot_offset_id), - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - else: # how=="right" - new_ordering = ExpressionOrdering( - ordering_value_columns=tuple( - [ - ascending_over(unpivot_offset_id), - *old_ordering.ordering_value_columns, - ] - ), - total_ordering_columns=frozenset( - [*old_ordering.total_ordering_columns, unpivot_offset_id] - ), - ) - value_columns = [ - unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns - ] - passthrough_values = [unpivot_table[col] for col in passthrough_columns] - hidden_ordering_columns = [ - unpivot_table[unpivot_offset_id], - *[unpivot_table[hidden_col] for hidden_col in hidden_col_ids], - ] - return OrderedIR( - table=unpivot_table, - columns=[ - *[unpivot_table[col_id] for col_id in index_col_ids], - *value_columns, - *passthrough_values, - ], - hidden_ordering_columns=hidden_ordering_columns, - ordering=new_ordering, - ) - def _reproject_to_table(self) -> OrderedIR: table = self._to_ibis_expr( ordering_mode="unordered", diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 638e3eacdd..a68023d13d 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -179,18 +179,6 @@ def compile_reproject(node: nodes.ReprojectOpNode, ordered: bool = True): return compile_node(node.child, ordered)._reproject_to_table() -@_compile_node.register -def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True): - return compile_node(node.child, ordered).unpivot( - node.row_labels, - node.unpivot_columns, - passthrough_columns=node.passthrough_columns, - index_col_ids=node.index_col_ids, - dtype=node.dtype, - how=node.how, - ) - - @_compile_node.register def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True): return compile_node(node.child, ordered).explode(node.column_ids) diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 53a25d63ed..072d974b39 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -189,6 +189,25 @@ def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): return decorator + def register_nary_op(self, op_ref: typing.Union[ops.NaryOp, type[ops.NaryOp]]): + """ + Decorator to register a nary op implementation. + + Args: + op_ref (NaryOp or NaryOp type): + Class or instance of operator that is implemented by the decorated function. 
+ """ + key = typing.cast(str, op_ref.name) + + def decorator(impl: typing.Callable[..., ibis_types.Value]): + def normalized_impl(args: typing.Sequence[ibis_types.Value], op: ops.RowOp): + return impl(*args) + + self._register(key, normalized_impl) + return impl + + return decorator + def _register( self, op_name: str, @@ -1346,6 +1365,25 @@ def clip_op( ) +@scalar_op_compiler.register_nary_op(ops.case_when_op) +def switch_op(*cases_and_outputs: ibis_types.Value) -> ibis_types.Value: + # ibis can handle most type coercions, but we need to force bool -> int + # TODO: dispatch coercion depending on bigframes dtype schema + result_values = cases_and_outputs[1::2] + do_upcast_bool = any(t.type().is_numeric() for t in result_values) + if do_upcast_bool: + # Just need to upcast to int, ibis can handle further coercion + result_values = tuple( + val.cast(ibis_dtypes.int64) if val.type().is_boolean() else val + for val in result_values + ) + + case_val = ibis.case() + for predicate, output in zip(cases_and_outputs[::2], result_values): + case_val = case_val.when(predicate, output) + return case_val.end() + + # Helpers def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 4980f5369d..70eb519a1b 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -190,9 +190,6 @@ class OpExpression(Expression): op: bigframes.operations.RowOp inputs: typing.Tuple[Expression, ...] - def __post_init__(self): - assert self.op.arguments == len(self.inputs) - @property def unbound_variables(self) -> typing.Tuple[str, ...]: return tuple( diff --git a/bigframes/core/join_def.py b/bigframes/core/join_def.py index 4646a0d6ae..632a1864da 100644 --- a/bigframes/core/join_def.py +++ b/bigframes/core/join_def.py @@ -22,6 +22,11 @@ class JoinSide(enum.Enum): LEFT = 0 RIGHT = 1 + def inverse(self) -> JoinSide: + if self == JoinSide.LEFT: + return JoinSide.RIGHT + return JoinSide.LEFT + JoinType = Literal["inner", "outer", "left", "right", "cross"] diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index a1072b0d68..688e165732 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -21,8 +21,6 @@ import typing from typing import Callable, Tuple -import pandas - import bigframes.core.expression as ex import bigframes.core.guid from bigframes.core.join_def import JoinColumnMapping, JoinDefinition, JoinSide @@ -579,88 +577,6 @@ def relation_ops_created(self) -> int: return 0 -@dataclass(frozen=True) -class UnpivotNode(UnaryNode): - # TODO: Refactor unpivot - row_labels: typing.Tuple[typing.Hashable, ...] - unpivot_columns: typing.Tuple[ - typing.Tuple[str, typing.Tuple[typing.Optional[str], ...]], ... - ] - passthrough_columns: typing.Tuple[str, ...] = () - index_col_ids: typing.Tuple[str, ...] = ("index",) - dtype: typing.Union[ - bigframes.dtypes.Dtype, typing.Tuple[bigframes.dtypes.Dtype, ...] 
- ] = (pandas.Float64Dtype(),) - how: typing.Literal["left", "right"] = "left" - - def __hash__(self): - return self._node_hash - - @property - def row_preserving(self) -> bool: - return False - - @property - def non_local(self) -> bool: - return True - - @property - def joins(self) -> bool: - return True - - @functools.cached_property - def schema(self) -> schemata.ArraySchema: - def infer_dtype( - values: typing.Iterable[typing.Hashable], - ) -> bigframes.dtypes.Dtype: - item_types = map(lambda x: bigframes.dtypes.infer_literal_type(x), values) - etype = functools.reduce( - lambda t1, t2: bigframes.dtypes.lcd_type(t1, t2) - if (t1 and t2) - else None, - item_types, - ) - return bigframes.dtypes.dtype_for_etype(etype) - - label_tuples = [ - label if isinstance(label, tuple) else (label,) for label in self.row_labels - ] - idx_dtypes = [ - infer_dtype(map(lambda x: typing.cast(tuple, x)[i], label_tuples)) - for i in range(len(self.index_col_ids)) - ] - - index_items = [ - schemata.SchemaItem(id, dtype) - for id, dtype in zip(self.index_col_ids, idx_dtypes) - ] - value_dtypes = ( - self.dtype - if isinstance(self.dtype, tuple) - else (self.dtype,) * len(self.unpivot_columns) - ) - value_items = [ - schemata.SchemaItem(col[0], dtype) - for col, dtype in zip(self.unpivot_columns, value_dtypes) - ] - passthrough_items = [ - schemata.SchemaItem(id, self.child.schema.get_type(id)) - for id in self.passthrough_columns - ] - return schemata.ArraySchema((*index_items, *value_items, *passthrough_items)) - - @property - def variables_introduced(self) -> int: - return ( - len(self.schema.items) - len(self.passthrough_columns) + OVERHEAD_VARIABLES - ) - - @property - def relation_ops_created(self) -> int: - # Unpivot is essentially a cross join and a projection. - return 2 - - @dataclass(frozen=True) class RandomSampleNode(UnaryNode): fraction: float diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 953a89c34f..11e592542c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1961,9 +1961,7 @@ def any( frame = self._raise_on_non_boolean("any") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_stack( - agg_ops.any_op, dtype=pandas.BooleanDtype(), axis=axis - ) + block = frame._block.aggregate_all_and_stack(agg_ops.any_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def all( @@ -1973,9 +1971,7 @@ def all( frame = self._raise_on_non_boolean("all") else: frame = self._drop_non_bool() - block = frame._block.aggregate_all_and_stack( - agg_ops.all_op, dtype=pandas.BooleanDtype(), axis=axis - ) + block = frame._block.aggregate_all_and_stack(agg_ops.all_op, axis=axis) return bigframes.series.Series(block.select_column("values")) def sum( diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index d631ba8508..a7c385a2b8 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -15,7 +15,9 @@ from __future__ import annotations import dataclasses +import functools import typing +from typing import Tuple, Union import numpy as np import pandas as pd @@ -34,11 +36,6 @@ class RowOp(typing.Protocol): def name(self) -> str: ... - @property - def arguments(self) -> int: - """The number of column argument the operation takes""" - ... - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: ... @@ -48,21 +45,29 @@ def order_preserving(self) -> bool: ... 
-# These classes can be used to create simple ops that don't take local parameters -# All is needed is a unique name, and to register an implementation in ibis_mappings.py @dataclasses.dataclass(frozen=True) -class UnaryOp: +class NaryOp: @property def name(self) -> str: raise NotImplementedError("RowOp abstract base class has no implementation") + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + raise NotImplementedError("Abstract operation has no output type") + + @property + def order_preserving(self) -> bool: + """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" + return False + + +# These classes can be used to create simple ops that don't take local parameters +# All is needed is a unique name, and to register an implementation in ibis_mappings.py +@dataclasses.dataclass(frozen=True) +class UnaryOp(NaryOp): @property def arguments(self) -> int: return 1 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, input_id: typing.Union[str, bigframes.core.expression.Expression] = "arg" ) -> bigframes.core.expression.Expression: @@ -72,25 +77,13 @@ def as_expr( self, (_convert_expr_input(input_id),) ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - @dataclasses.dataclass(frozen=True) -class BinaryOp: - @property - def name(self) -> str: - raise NotImplementedError("RowOp abstract base class has no implementation") - +class BinaryOp(NaryOp): @property def arguments(self) -> int: return 2 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, left_input: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -106,25 +99,13 @@ def as_expr( ), ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - @dataclasses.dataclass(frozen=True) -class TernaryOp: - @property - def name(self) -> str: - raise NotImplementedError("RowOp abstract base class has no implementation") - +class TernaryOp(NaryOp): @property def arguments(self) -> int: return 3 - def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - raise NotImplementedError("Abstract operation has no output type") - def as_expr( self, input1: typing.Union[str, bigframes.core.expression.Expression] = "arg1", @@ -142,11 +123,6 @@ def as_expr( ), ) - @property - def order_preserving(self) -> bool: - """Whether the row operation preserves total ordering. Can be pruned from ordering expressions.""" - return False - def _convert_expr_input( input: typing.Union[str, bigframes.core.expression.Expression] @@ -664,6 +640,46 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT clip_op = ClipOp() + +class CaseWhenOp(NaryOp): + name: typing.ClassVar[str] = "switch" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + assert len(input_types) % 2 == 0 + # predicate1, output1, predicate2, output2... 
+ if not all(map(lambda x: x == dtypes.BOOL_DTYPE, input_types[::2])): + raise TypeError(f"Case inputs {input_types[::2]} must be boolean-valued") + output_expr_types = input_types[1::2] + return functools.reduce( + lambda t1, t2: dtypes.coerce_to_common(t1, t2), + output_expr_types, + ) + + def as_expr( + self, + *case_output_pairs: Tuple[ + Union[str | bigframes.core.expression.Expression], + Union[str | bigframes.core.expression.Expression], + ], + ) -> bigframes.core.expression.Expression: + import bigframes.core.expression + + # Keep this in sync with output_type and compilers + inputs: list[bigframes.core.expression.Expression] = [] + + for case, output in case_output_pairs: + inputs.append(_convert_expr_input(case)) + inputs.append(_convert_expr_input(output)) + + return bigframes.core.expression.OpExpression( + self, + tuple(inputs), + ) + + +case_when_op = CaseWhenOp() + + # Just parameterless unary ops for now # TODO: Parameter mappings NUMPY_TO_OP: typing.Final = { diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 7fef7a9dc7..4c598a682d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2987,10 +2987,14 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ord bf_result = bf_series.to_pandas(ordered=ordered) # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_series = pd_series.astype("Float64") # Pandas has object index type + pd_series.index = pd_series.index.astype(pd.StringDtype(storage="pyarrow")) assert_series_equal( - pd_series, bf_result, check_index_type=False, ignore_order=not ordered + pd_series, + bf_result, + check_index_type=False, + ignore_order=not ordered, + check_dtype=False, ) @@ -3079,7 +3083,7 @@ def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op pd_series = op(scalars_pandas_df_index).astype("boolean") bf_result = bf_series.to_pandas() - # Pandas has object index type + pd_series.index = pd_series.index.astype(bf_result.index.dtype) pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index e894900646..c692bdbfec 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4327,16 +4327,16 @@ def min(self, axis=0, *, numeric_only: bool = False): Finding the minimum value in each column (the default behavior without an explicit axis parameter). >>> df.min() - A 1.0 - B 2.0 - dtype: Float64 + A 1 + B 2 + dtype: Int64 Finding the minimum value in each row. >>> df.min(axis=1) - 0 1.0 - 1 3.0 - dtype: Float64 + 0 1 + 1 3 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4372,16 +4372,16 @@ def max(self, axis=0, *, numeric_only: bool = False): Finding the maximum value in each column (the default behavior without an explicit axis parameter). >>> df.max() - A 3.0 - B 4.0 - dtype: Float64 + A 3 + B 4 + dtype: Int64 Finding the maximum value in each row. >>> df.max(axis=1) - 0 2.0 - 1 4.0 - dtype: Float64 + 0 2 + 1 4 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4416,16 +4416,16 @@ def sum(self, axis=0, *, numeric_only: bool = False): Calculating the sum of each column (the default behavior without an explicit axis parameter). >>> df.sum() - A 4.0 - B 6.0 - dtype: Float64 + A 4 + B 6 + dtype: Int64 Calculating the sum of each row. 
>>> df.sum(axis=1) - 0 3.0 - 1 7.0 - dtype: Float64 + 0 3 + 1 7 + dtype: Int64 Args: axis ({index (0), columns (1)}): @@ -4500,9 +4500,9 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Finding the median value of each column. >>> df.median() - A 1.0 - B 2.0 - dtype: Float64 + A 1 + B 2 + dtype: Int64 Args: numeric_only (bool. default False): @@ -4748,10 +4748,10 @@ def count(self, *, numeric_only: bool = False): Counting non-NA values for each column: >>> df.count() - A 4.0 - B 5.0 - C 3.0 - dtype: Float64 + A 4 + B 5 + C 3 + dtype: Int64 Args: numeric_only (bool, default False): @@ -5051,17 +5051,17 @@ def melt(self, id_vars, value_vars, var_name, value_name): Using `melt` with `id_vars` and `value_vars`: >>> df.melt(id_vars='A', value_vars=['B', 'C']) - A variable value - 0 1.0 B 1 - 1 B 2 - 2 3.0 B 3 - 3 4.0 B 4 - 4 5.0 B 5 - 5 1.0 C - 6 C 3 - 7 3.0 C - 8 4.0 C 4 - 9 5.0 C 5 + A variable value + 0 1.0 B 1.0 + 1 B 2.0 + 2 3.0 B 3.0 + 3 4.0 B 4.0 + 4 5.0 B 5.0 + 5 1.0 C + 6 C 3.5 + 7 3.0 C + 8 4.0 C 4.5 + 9 5.0 C 5.0 [10 rows x 3 columns] @@ -5102,9 +5102,9 @@ def nunique(self): [3 rows x 2 columns] >>> df.nunique() - A 3.0 - B 2.0 - dtype: Float64 + A 3 + B 2 + dtype: Int64 Returns: bigframes.series.Series: Series with number of distinct elements. @@ -5313,9 +5313,9 @@ def agg(self, func): Using a single function: >>> df.agg('sum') - A 6.0 - B 6.0 - dtype: Float64 + A 6 + B 6 + dtype: Int64 Using a list of functions: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 5e3b4c46ef..edefb334b3 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -593,9 +593,9 @@ def agg(self, func): 1 >>> s.agg(['min', 'max']) - min 1.0 - max 4.0 - dtype: Float64 + min 1 + max 4 + dtype: Int64 Args: func (function): From 9c106bd24482620ef5ff3c85f94be9da76c49716 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 18 Apr 2024 09:56:38 -0700 Subject: [PATCH 06/15] feat: Add fine tuning `fit()` for Palm2TextGenerator (#616) * feat: support list of numerics in pandas.cut (#580) An internal user encountered this missing overload * move the tests to load-testing * add predict tests * address comments * address comments --------- Co-authored-by: Henry Solberg --- bigframes/ml/core.py | 40 +++++++++++++++++ bigframes/ml/llm.py | 71 ++++++++++++++++++++++++++++++- bigframes/ml/sql.py | 17 ++++++++ tests/system/load/test_llm.py | 68 +++++++++++++++++++++++++++++ tests/system/small/ml/test_llm.py | 2 +- tests/unit/ml/test_sql.py | 23 ++++++++++ 6 files changed, 219 insertions(+), 2 deletions(-) create mode 100644 tests/system/load/test_llm.py diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 04aaeec1bc..b94ae39687 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -321,6 +321,46 @@ def create_model( return self._create_model_with_sql(session=session, sql=sql) + def create_llm_remote_model( + self, + X_train: bpd.DataFrame, + y_train: bpd.DataFrame, + connection_name: str, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> BqmlModel: + """Create a session-temporary BQML model with the CREATE OR REPLACE MODEL statement + + Args: + X_train: features columns for training + y_train: labels columns for training + options: a dict of options to configure the model. 
Generates a BQML OPTIONS + clause + connection_name: + a BQ connection to talk with Vertex AI, of the format ... https://cloud.google.com/bigquery/docs/create-cloud-resource-connection + + Returns: a BqmlModel, wrapping a trained model in BigQuery + """ + options = dict(options) + # Cache dataframes to make sure base table is not a snapshot + # cached dataframe creates a full copy, never uses snapshot + input_data = X_train._cached(force=True).join( + y_train._cached(force=True), how="outer" + ) + options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) + + session = X_train._session + + model_ref = self._create_model_ref(session._anonymous_dataset) + + sql = self._model_creation_sql_generator.create_llm_remote_model( + source_df=input_data, + model_ref=model_ref, + options=options, + connection_name=connection_name, + ) + + return self._create_model_with_sql(session=session, sql=sql) + def create_time_series_model( self, X_train: bpd.DataFrame, diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 31c691fd51..37a38cdd5c 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -27,6 +27,10 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd +_BQML_PARAMS_MAPPING = { + "max_iterations": "maxIterations", +} + _TEXT_GENERATOR_BISON_ENDPOINT = "text-bison" _TEXT_GENERATOR_BISON_32K_ENDPOINT = "text-bison-32k" _TEXT_GENERATOR_ENDPOINTS = ( @@ -62,6 +66,8 @@ class PaLM2TextGenerator(base.BaseEstimator): Connection to connect with remote service. str of the format ... if None, use default connection in session context. BigQuery DataFrame will try to create the connection and attach permission if the connection isn't fully setup. + max_iterations (Optional[int], Default to 300): + The number of steps to run when performing supervised tuning. """ def __init__( @@ -70,9 +76,11 @@ def __init__( model_name: Literal["text-bison", "text-bison-32k"] = "text-bison", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, + max_iterations: int = 300, ): self.model_name = model_name self.session = session or bpd.get_global_session() + self.max_iterations = max_iterations self._bq_connection_manager = self.session.bqconnectionmanager connection_name = connection_name or self.session._bq_connection @@ -132,12 +140,73 @@ def _from_bq( model_connection = model._properties["remoteModelInfo"]["connection"] model_endpoint = bqml_endpoint.split("/")[-1] + # Get the optional params + kwargs: dict = {} + last_fitting = model.training_runs[-1]["trainingOptions"] + + dummy_text_generator = cls() + for bf_param, _ in dummy_text_generator.__dict__.items(): + bqml_param = _BQML_PARAMS_MAPPING.get(bf_param) + if bqml_param in last_fitting: + # Convert types + if bf_param in ["max_iterations"]: + kwargs[bf_param] = int(last_fitting[bqml_param]) + text_generator_model = cls( - session=session, model_name=model_endpoint, connection_name=model_connection + **kwargs, + session=session, + model_name=model_endpoint, + connection_name=model_connection, ) text_generator_model._bqml_model = core.BqmlModel(session, model) return text_generator_model + @property + def _bqml_options(self) -> dict: + """The model options as they will be set for BQML""" + options = { + "max_iterations": self.max_iterations, + "data_split_method": "NO_SPLIT", + } + return options + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y: Union[bpd.DataFrame, bpd.Series], + ) -> PaLM2TextGenerator: + """Fine tune PaLM2TextGenerator model. + + .. 
note:: + + This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the + Service Specific Terms(https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is" + and might have limited support. For more information, see the launch stage descriptions + (https://cloud.google.com/products#product-launch-stages). + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + DataFrame of shape (n_samples, n_features). Training data. + y (bigframes.dataframe.DataFrame or bigframes.series.Series: + Training labels. + + Returns: + PaLM2TextGenerator: Fitted Estimator. + """ + X, y = utils.convert_to_dataframe(X, y) + + options = self._bqml_options + options["endpoint"] = self.model_name + "@001" + options["prompt_col"] = X.columns.tolist()[0] + + self._bqml_model = self._bqml_model_factory.create_llm_remote_model( + X, + y, + options=options, + connection_name=self.connection_name, + ) + return self + def predict( self, X: Union[bpd.DataFrame, bpd.Series], diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index fab358cce3..59c768ce81 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -177,6 +177,23 @@ def create_model( parts.append(f"AS {source_sql}") return "\n".join(parts) + def create_llm_remote_model( + self, + source_df: bpd.DataFrame, + connection_name: str, + model_ref: google.cloud.bigquery.ModelReference, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + ) -> str: + """Encode the CREATE OR REPLACE MODEL statement for BQML""" + source_sql = source_df.sql + + parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"] + parts.append(self.connection(connection_name)) + if options: + parts.append(self.options(**options)) + parts.append(f"AS {source_sql}") + return "\n".join(parts) + def create_remote_model( self, connection_name: str, diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py new file mode 100644 index 0000000000..62ef7d5c72 --- /dev/null +++ b/tests/system/load/test_llm.py @@ -0,0 +1,68 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +import bigframes.ml.llm + + +@pytest.fixture(scope="session") +def llm_fine_tune_df_default_index( + session: bigframes.Session, +) -> bigframes.dataframe.DataFrame: + sql = """ +SELECT + CONCAT("Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. 
Text: ", text) as prompt, + CAST(label AS STRING) as label +FROM `llm_tuning.emotion_classification_train` +""" + return session.read_gbq(sql) + + +@pytest.fixture(scope="session") +def llm_remote_text_pandas_df(): + """Additional data matching the penguins dataset, with a new index""" + return pd.DataFrame( + { + "prompt": [ + "Please do sentiment analysis on the following text and only output a number from 0 to 5where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey", + "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: i was feeling a little vain when i did this one", + "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: a father of children killed in an accident", + ], + } + ) + + +def test_llm_palm_configure_fit( + llm_fine_tune_df_default_index, llm_remote_text_pandas_df +): + model = bigframes.ml.llm.PaLM2TextGenerator( + model_name="text-bison", max_iterations=1 + ) + + df = llm_fine_tune_df_default_index.dropna() + X_train = df[["prompt"]] + y_train = df[["label"]] + model.fit(X_train, y_train) + + assert model is not None + + df = model.predict(llm_remote_text_pandas_df).to_pandas() + assert df.shape == (3, 4) + assert "ml_generate_text_llm_result" in df.columns + series = df["ml_generate_text_llm_result"] + assert all(series.str.len() == 1) + + # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b9e4889801..6f6b67597a 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 5b1ff37775..3560f05cb6 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -181,6 +181,29 @@ def test_create_model_transform_correct( ) +def test_create_llm_remote_model_correct( + model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_creation_sql_generator.create_llm_remote_model( + source_df=mock_df, + connection_name="my_project.us.my_connection", + model_ref=bigquery.ModelReference.from_string( + "test-proj._anonXYZ.create_remote_model" + ), + options={"option_key1": "option_value1", "option_key2": 2}, + ) + assert ( + sql + == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` +REMOTE WITH CONNECTION `my_project.us.my_connection` +OPTIONS( + option_key1="option_value1", + option_key2=2) +AS input_X_y_sql""" + ) + + def test_create_remote_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): From 9665e39ef288841f03a9d823bd2210ef58394ad3 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 18 Apr 2024 10:10:30 -0700 Subject: [PATCH 07/15] docs: Fix rendering of examples for multiple apis (#620) --- third_party/bigframes_vendored/pandas/core/frame.py | 1 + third_party/bigframes_vendored/pandas/core/generic.py | 1 + .../bigframes_vendored/pandas/core/groupby/__init__.py | 1 + third_party/bigframes_vendored/pandas/core/series.py | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c692bdbfec..f06128f150 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4523,6 +4523,7 @@ def quantile( Return values at the given quantile over requested axis. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 9c6120fd6c..54c876ef3c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -61,6 +61,7 @@ def __iter__(self) -> Iterator: iterator **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 6310d7e271..7347963d17 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -90,6 +90,7 @@ def quantile(self, q=0.5, *, numeric_only: bool = False): Return group values at the given quantile, a la numpy.percentile. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> df = bpd.DataFrame([ diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index edefb334b3..a5e14c5b1c 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -862,6 +862,7 @@ def autocorr(self, lag: int = 1) -> float: the Series and its shifted self. 
**Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -2812,6 +2813,7 @@ def combine_first(self, other) -> Series: of the two indexes. **Examples:** + >>> import bigframes.pandas as bpd >>> import numpy as np >>> bpd.options.display.progress_bar = None @@ -2852,6 +2854,7 @@ def update(self, other) -> None: on index. **Examples:** + >>> import bigframes.pandas as bpd >>> import pandas as pd >>> import numpy as np @@ -3168,6 +3171,7 @@ def quantile( Return value at the given quantile. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> s = bpd.Series([1, 2, 3, 4]) From 3706b4f9dde65788b5e6343a6428fb1866499461 Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:29:22 -0500 Subject: [PATCH 08/15] feat: warn if location is set to unknown location (#609) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: warn if location is set to unknown location * tests error message --------- Co-authored-by: Shobhit Singh Co-authored-by: Tim Swe帽a (Swast) --- bigframes/_config/bigquery_options.py | 23 ++++++++++ bigframes/exceptions.py | 17 +++++++ tests/unit/_config/test_bigquery_options.py | 51 +++++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 bigframes/exceptions.py diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 50e14eaf28..74561e6f24 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -22,11 +22,33 @@ import google.api_core.exceptions import google.auth.credentials +import bigframes.constants +import bigframes.exceptions + SESSION_STARTED_MESSAGE = ( "Cannot change '{attribute}' once a session has started. " "Call bigframes.pandas.close_session() first, if you are using the bigframes.pandas API." ) +UNKNOWN_LOCATION_MESSAGE = "The location '{location}' is set to an unknown value." + + +def _validate_location(value: Optional[str]): + + if value is None: + return + + if value not in bigframes.constants.ALL_BIGQUERY_LOCATIONS: + warnings.warn( + UNKNOWN_LOCATION_MESSAGE.format(location=value), + # There are many layers before we get to (possibly) the user's code: + # -> bpd.options.bigquery.location = "us-central-1" + # -> location.setter + # -> _validate_location + stacklevel=3, + category=bigframes.exceptions.UnknownLocationWarning, + ) + class BigQueryOptions: """Encapsulates configuration for working with a session.""" @@ -93,6 +115,7 @@ def location(self) -> Optional[str]: def location(self, value: Optional[str]): if self._session_started and self._location != value: raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="location")) + _validate_location(value) self._location = value @property diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py new file mode 100644 index 0000000000..62122e79d2 --- /dev/null +++ b/bigframes/exceptions.py @@ -0,0 +1,17 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +class UnknownLocationWarning(Warning): + """The location is set to an unknown value.""" diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index cf13084610..7d9a452f42 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -13,10 +13,13 @@ # limitations under the License. import re +import warnings import pytest +import bigframes import bigframes._config.bigquery_options as bigquery_options +import bigframes.exceptions @pytest.mark.parametrize( @@ -78,3 +81,51 @@ def test_setter_if_session_started_but_setting_the_same_value(attribute): setattr(options, attribute, original_object) assert getattr(options, attribute) is original_object + + +@pytest.mark.parametrize( + [ + "valid_location", + ], + [ + (None,), + ("us-central1",), + ], +) +def test_location_set_to_valid_no_warning(valid_location): + options = bigquery_options.BigQueryOptions() + # Ensure that no warnings are emitted. + # https://docs.pytest.org/en/7.0.x/how-to/capture-warnings.html#additional-use-cases-of-warnings-in-tests + with warnings.catch_warnings(): + # Turn matching UnknownLocationWarning into exceptions. + # https://docs.python.org/3/library/warnings.html#warning-filter + warnings.simplefilter( + "error", category=bigframes.exceptions.UnknownLocationWarning + ) + options.location = valid_location + + +@pytest.mark.parametrize( + [ + "invalid_location", + ], + [ + # Test with common mistakes, see article. + # https://en.wikipedia.org/wiki/Edit_distance#Formal_definition_and_properties + # Substitution + ("us-wist-3",), + # Insertion + ("us-central-1",), + # Deletion + ("asia-suth2",), + ], +) +def test_location_set_to_invalid_warning(invalid_location): + options = bigquery_options.BigQueryOptions() + with pytest.warns( + bigframes.exceptions.UnknownLocationWarning, + match=re.escape( + f"The location '{invalid_location}' is set to an unknown value." 
+ ), + ): + options.location = invalid_location From 9d205aecb77f35baeec82a8f6e1b72c2d852ca46 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 18 Apr 2024 15:31:03 -0700 Subject: [PATCH 09/15] fix: Use exact median implementation by default (#619) --- bigframes/core/block_transforms.py | 3 +- bigframes/core/groupby/__init__.py | 8 ++--- bigframes/dataframe.py | 10 +++---- bigframes/series.py | 2 +- tests/system/small/test_series.py | 29 +++++++++++++++---- .../bigframes_vendored/pandas/core/frame.py | 12 ++++---- .../pandas/core/groupby/__init__.py | 7 ++--- .../bigframes_vendored/pandas/core/series.py | 8 ++--- 8 files changed, 48 insertions(+), 31 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 562689a736..a221b343a5 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -111,6 +111,7 @@ def quantile( columns: Sequence[str], qs: Sequence[float], grouping_column_ids: Sequence[str] = (), + dropna: bool = False, ) -> blocks.Block: # TODO: handle windowing and more interpolation methods window = core.WindowSpec( @@ -134,7 +135,7 @@ def quantile( block, results = block.aggregate( grouping_column_ids, tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols), - dropna=True, + dropna=dropna, ) return block.select_columns(results).with_column_labels(labels) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 0f53342352..05b1cc7f41 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -113,9 +113,7 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: self._raise_on_non_numeric("mean") return self._aggregate_all(agg_ops.mean_op, numeric_only=True) - def median( - self, numeric_only: bool = False, *, exact: bool = False - ) -> df.DataFrame: + def median(self, numeric_only: bool = False, *, exact: bool = True) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("median") if exact: @@ -138,6 +136,7 @@ def quantile( q_cols, qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) result_df = df.DataFrame(result) if multi_q: @@ -491,7 +490,7 @@ def mean(self, *args) -> series.Series: def median( self, *args, - exact: bool = False, + exact: bool = True, **kwargs, ) -> series.Series: if exact: @@ -508,6 +507,7 @@ def quantile( (self._value_column,), qs=tuple(q) if multi_q else (q,), # type: ignore grouping_column_ids=self._by_col_ids, + dropna=self._dropna, ) if multi_q: return series.Series(result.stack()) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 11e592542c..ff8404761c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1995,18 +1995,16 @@ def mean( return bigframes.series.Series(block.select_column("values")) def median( - self, *, numeric_only: bool = False, exact: bool = False + self, *, numeric_only: bool = False, exact: bool = True ) -> bigframes.series.Series: - if exact: - raise NotImplementedError( - f"Only approximate median is supported. 
{constants.FEEDBACK_LINK}" - ) if not numeric_only: frame = self._raise_on_non_numeric("median") else: frame = self._drop_non_numeric() if exact: - return self.quantile() + result = frame.quantile() + result.name = None + return result else: block = frame._block.aggregate_all_and_stack(agg_ops.median_op) return bigframes.series.Series(block.select_column("values")) diff --git a/bigframes/series.py b/bigframes/series.py index b834411bce..47acfd0afb 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -966,7 +966,7 @@ def mode(self) -> Series: def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) - def median(self, *, exact: bool = False) -> float: + def median(self, *, exact: bool = True) -> float: if exact: return typing.cast(float, self.quantile(0.5)) else: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 87267696ba..9cb615fdcb 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1345,10 +1345,9 @@ def test_numeric_literal(scalars_dfs): scalars_df, _ = scalars_dfs col_name = "numeric_col" assert scalars_df[col_name].dtype == pd.ArrowDtype(pa.decimal128(38, 9)) - bf_result = scalars_df[col_name] - scalars_df[col_name].median() + bf_result = scalars_df[col_name] + 42 assert bf_result.size == scalars_df[col_name].size - # TODO(b/323387826): The precision increased by 1 unexpectedly. - # assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) + assert bf_result.dtype == pd.ArrowDtype(pa.decimal128(38, 9)) def test_repr(scalars_dfs): @@ -1523,12 +1522,32 @@ def test_groupby_mean(scalars_dfs): ) -def test_groupby_median(scalars_dfs): +def test_groupby_median_exact(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_too" - bf_series = ( + bf_result = ( scalars_df[col_name].groupby(scalars_df["string_col"], dropna=False).median() ) + pd_result = ( + scalars_pandas_df[col_name] + .groupby(scalars_pandas_df["string_col"], dropna=False) + .median() + ) + + assert_series_equal( + pd_result, + bf_result.to_pandas(), + ) + + +def test_groupby_median_inexact(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_too" + bf_series = ( + scalars_df[col_name] + .groupby(scalars_df["string_col"], dropna=False) + .median(exact=False) + ) pd_max = ( scalars_pandas_df[col_name] .groupby(scalars_pandas_df["string_col"], dropna=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f06128f150..0515f690e3 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4481,7 +4481,7 @@ def mean(self, axis=0, *, numeric_only: bool = False): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, numeric_only: bool = False, exact: bool = False): + def median(self, *, numeric_only: bool = False, exact: bool = True): """Return the median of the values over colunms. **Examples:** @@ -4500,15 +4500,15 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Finding the median value of each column. >>> df.median() - A 1 - B 2 - dtype: Int64 + A 2.0 + B 3.0 + dtype: Float64 Args: numeric_only (bool. default False): Default False. Include only float, int, boolean columns. - exact (bool. default False): - Default False. Get the exact median instead of an approximate + exact (bool. default True): + Default True. 
Get the exact median instead of an approximate one. Returns: diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 7347963d17..f3f7748e34 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -68,7 +68,7 @@ def median( self, numeric_only: bool = False, *, - exact: bool = False, + exact: bool = True, ): """ Compute median of groups, excluding missing values. @@ -76,9 +76,8 @@ def median( Args: numeric_only (bool, default False): Include only float, int, boolean columns. - exact (bool, default False): - Calculate the exact median instead of an approximation. Note: - ``exact=True`` is not supported. + exact (bool, default True): + Calculate the exact median instead of an approximation. Returns: pandas.Series or pandas.DataFrame: Median of groups. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a5e14c5b1c..0c5b8d4521 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3150,13 +3150,13 @@ def mean(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def median(self, *, exact: bool = False): + def median(self, *, exact: bool = True): """Return the median of the values over the requested axis. Args: - exact (bool. default False): - Default False. Get the exact median instead of an approximate - one. Note: ``exact=True`` not yet supported. + exact (bool. default True): + Default True. Get the exact median instead of an approximate + one. Returns: scalar: Scalar. From 240a1ac6fa914550bb6216cd5d179a36009f2657 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 19 Apr 2024 21:54:50 +0000 Subject: [PATCH 10/15] feat: expose `max_batching_rows` in `remote_function` (#622) * feat: expose `max_batching_rows` in `remote_function` * fix option formation, add tests * fix type annotation * assert max_batching_rows after routing creation * add forgotten assert --- bigframes/functions/remote_function.py | 46 +++++++++++++++++++--- bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 11 ++++++ tests/system/large/test_remote_function.py | 36 +++++++++++++++++ 4 files changed, 89 insertions(+), 6 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 178c911591..f866575a26 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -145,7 +145,13 @@ def __init__( self._cloud_function_docker_repository = cloud_function_docker_repository def create_bq_remote_function( - self, input_args, input_types, output_type, endpoint, bq_function_name + self, + input_args, + input_types, + output_type, + endpoint, + bq_function_name, + max_batching_rows, ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" @@ -169,14 +175,25 @@ def create_bq_remote_function( bq_function_args.append( f"{name} {third_party_ibis_bqtypes.BigQueryType.from_ibis(input_types[idx])}" ) + + remote_function_options = { + "endpoint": endpoint, + "max_batching_rows": max_batching_rows, + } + + remote_function_options_str = ", ".join( + [ + f'{key}="{val}"' if isinstance(val, str) else f"{key}={val}" + for key, val in remote_function_options.items() + if val is not None + ] + ) + 
create_function_ddl = f""" CREATE OR REPLACE FUNCTION `{self._gcp_project_id}.{self._bq_dataset}`.{bq_function_name}({','.join(bq_function_args)}) RETURNS {bq_function_return_type} REMOTE WITH CONNECTION `{self._gcp_project_id}.{self._bq_location}.{self._bq_connection_id}` - OPTIONS ( - endpoint = "{endpoint}", - max_batching_rows = 1000 - )""" + OPTIONS ({remote_function_options_str})""" logger.info(f"Creating BQ remote function: {create_function_ddl}") @@ -438,6 +455,7 @@ def provision_bq_remote_function( reuse, name, package_requirements, + max_batching_rows, ): """Provision a BigQuery remote function.""" # If reuse of any existing function with the same name (indicated by the @@ -485,7 +503,12 @@ def provision_bq_remote_function( "Exactly one type should be provided for every input arg." ) self.create_bq_remote_function( - input_args, input_types, output_type, cf_endpoint, remote_function_name + input_args, + input_types, + output_type, + cf_endpoint, + remote_function_name, + max_batching_rows, ) else: logger.info(f"Remote function {remote_function_name} already exists.") @@ -607,6 +630,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -723,6 +747,15 @@ def remote_function( projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. 
""" import bigframes.pandas as bpd @@ -846,6 +879,7 @@ def wrapper(f): reuse, name, packages, + max_batching_rows, ) # TODO: Move ibis logic to compiler step diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 91c3eb603b..96af6ab1b3 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -643,6 +643,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -656,6 +657,7 @@ def remote_function( cloud_function_service_account=cloud_function_service_account, cloud_function_kms_key_name=cloud_function_kms_key_name, cloud_function_docker_repository=cloud_function_docker_repository, + max_batching_rows=max_batching_rows, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b6d56006be..64bcebb6cc 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1541,6 +1541,7 @@ def remote_function( cloud_function_service_account: Optional[str] = None, cloud_function_kms_key_name: Optional[str] = None, cloud_function_docker_repository: Optional[str] = None, + max_batching_rows: Optional[int] = 1000, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1635,6 +1636,15 @@ def remote_function( projects/PROJECT_ID/locations/LOCATION/repositories/REPOSITORY_NAME. For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin. + max_batching_rows (int, Optional): + The maximum number of rows to be batched for processing in the + BQ remote function. Default value is 1000. A lower number can be + passed to avoid timeouts in case the user code is too complex to + process large number of rows fast enough. A higher number can be + used to increase throughput in case the user code is fast enough. + `None` can be passed to let BQ remote functions service apply + default batching. See for more details + https://cloud.google.com/bigquery/docs/remote-functions#limiting_number_of_rows_in_a_batch_request. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. 
The cloud assets can be @@ -1656,6 +1666,7 @@ def remote_function( cloud_function_service_account=cloud_function_service_account, cloud_function_kms_key_name=cloud_function_kms_key_name, cloud_function_docker_repository=cloud_function_docker_repository, + max_batching_rows=max_batching_rows, ) def read_gbq_function( diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index cf6b2a01f8..ec9acc292e 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1300,3 +1300,39 @@ def square_num(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square_num ) + + +@pytest.mark.parametrize( + ("max_batching_rows"), + [ + 10_000, + None, + ], +) +@pytest.mark.flaky(retries=2, delay=120) +def test_remote_function_max_batching_rows(session, scalars_dfs, max_batching_rows): + try: + + def square(x): + return x * x + + square_remote = session.remote_function( + [int], int, reuse=False, max_batching_rows=max_batching_rows + )(square) + + bq_routine = session.bqclient.get_routine( + square_remote.bigframes_remote_function + ) + assert bq_routine.remote_function_options.max_batching_rows == max_batching_rows + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df["int64_too"].apply(square_remote).to_pandas() + pd_result = scalars_pandas_df["int64_too"].apply(square) + + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + session.bqclient, session.cloudfunctionsclient, square_remote + ) From b66e3e6b221ea18d944ac478330bb009fe1a2c93 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 22 Apr 2024 18:03:25 +0000 Subject: [PATCH 11/15] chore: add synthetic data generation notebook (#615) * chore: add synthetic data generation notebook * markdown improvements * add copyright header * add a title --- .../apps/synthetic_data_generation.ipynb | 1133 +++++++++++++++++ noxfile.py | 4 + 2 files changed, 1137 insertions(+) create mode 100644 notebooks/apps/synthetic_data_generation.ipynb diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb new file mode 100644 index 0000000000..a6e8444aac --- /dev/null +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -0,0 +1,1133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery DataFrames: Synthetic Data Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to BigQuery DataFrames (installing which also installs `pandas` as a dependency) we will use\n", + "`faker` library as a building block for synthetic data generation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "suoG7eWDZARj", + "outputId": "b5c620a9-8f5b-413f-dd38-93448f941846" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting faker\n", + " Downloading Faker-24.9.0-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣鈹佲攣\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.4 in /usr/local/lib/python3.10/dist-packages (from faker) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.4->faker) (1.16.0)\n", + "Installing collected packages: faker\n", + "Successfully installed faker-24.9.0\n" + ] + } + ], + "source": [ + "!pip install faker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "m3q1oeJALhsG" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "bpd.options.bigquery.project = PROJECT_ID" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use `GeminiTextGenerator` for our purpose, which is BigQuery DataFrame's state-of-the-art LLM integration at the time of writing this notebook (Apr 16 2024)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 69 + }, + "id": "lIYdn1woOS1n", + "outputId": "be474338-44c2-4ce0-955e-d525b8b9c84b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/session/__init__.py:1907: UserWarning: No explicit location is set, so using location US for the session.\n", + " return Session(context)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 3e8423da-737c-42e2-a3d2-d2180ca18579 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from bigframes.ml.llm import GeminiTextGenerator\n", + "\n", + "model = GeminiTextGenerator()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Craft a prompt for the LLM to indicate the schema of the desired data and hints for the code that could generate such data. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "id": "SSR-lLScLa95", + "outputId": "cbaec34e-6fa6-45b4-e54a-f11ca06b61e1" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d651d0bf-300c-4b1d-9e3c-03310b71287c is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c67b9bb9-2f3e-4b9e-b680-0b7b6e9d2279 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prompt
0Write python code to generate a pandas datafra...
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]"
      ],
      "text/plain": [
       " prompt\n",
       "0 Write python code to generate a pandas datafra...\n",
       "\n",
       "[1 rows x 1 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prompt = \"\"\"\\\n",
    "Write python code to generate a pandas dataframe based on the requirements:\n",
    "  Column name: Name, type: string, Description: Latin American Names\n",
    "  Column name: Age, type: int\n",
    "  Column name: Gender, type: string, Description: Inclusive\n",
    "\n",
    "Note:\n",
    "  - Return the code only, no additional texts or comments\n",
    "  - Use faker library\n",
    "  - Generate 100 rows\n",
    "  - The final dataframe should be named 'result_df'.\n",
    "\"\"\"\n",
    "\n",
    "df_prompt = bpd.DataFrame({\"prompt\" : [prompt]})\n",
    "df_prompt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Keep in mind that the LLM may not produce runnable code on the first try and may need some nudging. We will retry by adding the failing code and the exception it throws as additional context in the prompt."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 277
    },
    "id": "miDe3K4GNvOo",
    "outputId": "f2039e80-5ad7-4551-f8b2-7ef714a89d63"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job d5c0725d-9070-4712-adfd-8a9bd86eefc3 is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 4eb581a3-7f97-411a-bee1-91e8c150cef4 is DONE. 8 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job f3d5503d-a3e7-49ce-b985-5ffbdbd856e3 is DONE. 2 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 8ef76041-f077-4a05-bc03-63e6983ef853 is DONE. 332 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "import pandas as pd\n", + "from faker import Faker\n", + "\n", + "fake = Faker('es_ES')\n", + "result_df = pd.DataFrame({\n", + " 'Name': [fake.name() for _ in range(100)],\n", + " 'Age': [fake.random_int(min=18, max=65) for _ in range(100)],\n", + " 'Gender': [fake.random_element(elements=['Male', 'Female', 'Non-binary']) for _ in range(100)]\n", + "})\n", + "\n" + ] + } + ], + "source": [ + "max_tries = 5\n", + "for i in range(max_tries):\n", + " # Get LLM generated code\n", + " df_result = model.predict(df_prompt)\n", + " llm_result = df_result['ml_generate_text_llm_result'].iloc[0]\n", + "\n", + " # Python code comes back as a markdown code block,\n", + " # remove the prefix \"```python\" and suffix \"```\"\n", + " code = llm_result[9:-3]\n", + " print(code)\n", + "\n", + " # Check if the generated code is runnable\n", + " try:\n", + " exec(code)\n", + " break\n", + " except Exception as ex:\n", + " print(ex)\n", + " error_context = f\"\"\"\n", + "Previous code:\n", + "{code}\n", + "\n", + "Had this exception:\n", + "{ex}\"\"\"\n", + "\n", + " # Update the prompt to help LLM correct error\n", + " df_prompt[\"prompt\"] += error_context\n", + "\n", + " # If we have exhausted max tries then stop trying\n", + " if i+1 == max_tries:\n", + " raise Exception(\"Failed to generate runnable code\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the generated code and verify that it produced the desired data." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "GODcPwX2PBEu", + "outputId": "dec4c872-c464-49e4-cd7f-9442fc977d18" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"execution_context\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"Renata Pla Cases\",\n \"Guiomar Carnero-Paz\",\n \"Luciano Garmendia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 18,\n \"max\": 64,\n \"num_unique_values\": 39,\n \"samples\": [\n 56,\n 31,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Male\",\n \"Non-binary\",\n \"Female\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Pastora Acuña Company21Male
1León Reig-Salom39Non-binary
2Aura Tomás Llobet30Female
3Vicente Correa Palomar64Female
4Benito del Fuster34Female
............
95Eduardo Cabrera27Non-binary
96Nazaret de Izaguirre40Non-binary
97Manuela Agullo Bustamante27Female
98Eugenio Mateo Naranjo Blazquez36Non-binary
99Heriberto Vicens Baeza53Female
\n", + "

100 rows × 3 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n",
       "
"
      ],
      "text/plain": [
       " Name Age Gender\n",
       "0 Pastora Acuña Company 21 Male\n",
       "1 León Reig-Salom 39 Non-binary\n",
       "2 Aura Tomás Llobet 30 Female\n",
       "3 Vicente Correa Palomar 64 Female\n",
       "4 Benito del Fuster 34 Female\n",
       ".. ... ... ...\n",
       "95 Eduardo Cabrera 27 Non-binary\n",
       "96 Nazaret de Izaguirre 40 Non-binary\n",
       "97 Manuela Agullo Bustamante 27 Female\n",
       "98 Eugenio Mateo Naranjo Blazquez 36 Non-binary\n",
       "99 Heriberto Vicens Baeza 53 Female\n",
       "\n",
       "[100 rows x 3 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "execution_context = {}\n",
    "exec(code, execution_context)\n",
    "execution_context.get(\"result_df\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We want to run this code at scale to generate a large amount of data. Let's deploy a `remote_function` for this purpose."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 36
    },
    "id": "n-BsGciNqSwU",
    "outputId": "996e5639-a49c-4542-a0dc-ede450e0eb6d"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      },
      "text/plain": [
       "'projects/bigframes-dev/locations/us-central1/functions/bigframes-19f2f35637098969770261a2974bef32'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "@bpd.remote_function([int], str, packages=['faker', 'pandas'])\n",
    "def data_generator(id):\n",
    "    context = {}\n",
    "    exec(code, context)\n",
    "    result_df = context.get(\"result_df\")\n",
    "    return result_df.to_json(orient=\"records\")\n",
    "\n",
    "data_generator.bigframes_cloud_function"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's say we want to generate 1 million rows of synthetic data. Since our generated code produces 100 rows in one run, we can initialize an indicator dataframe with 1M/100 = 10K indicator rows. Then we can apply the remote function to produce 100 synthetic data rows for each indicator row."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "id": "Odkmev9nsYqA",
    "outputId": "4aa7a1fd-0c0d-4412-f326-a20e19f583b5"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Load job 40b9c3a8-27fc-40a8-9edf-4aa2e0fec332 is DONE. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "desired_num_rows = 1_000_000  # 1 million rows\n",
    "batch_size = 100  # used in the prompt\n",
    "num_batches = int(desired_num_rows/batch_size)\n",
    "\n",
    "df = bpd.DataFrame({\"row_id\": range(num_batches)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "id": "UyBhlJFVsmQC",
    "outputId": "29748df5-673b-4320-bb1f-53abaace3b81"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job 9dd49b50-2dbf-4351-b9ad-b17aeb627caf is DONE. 240.0 kB processed. 
Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df[\"json_data\"] = df[\"row_id\"].apply(data_generator)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At this point each item in `df[\"json_data\"]` is a JSON-serialized array of 100 records. Let's flatten that into 1 record per row using SQL directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 932
    },
    "id": "6p3eM21qvRvy",
    "outputId": "333f4e49-a555-4d2f-b527-02142782b3a7"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job 3f8d2133-b01d-402d-a731-79592810ca1c is DONE. 63.7 MB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 4a613aa3-6323-4914-8e34-93323885d458 is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 0deb03be-725b-40b4-a7a1-1023b0477f35 is DONE. 40.1 MB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Eloy Santiago-Aragón31Male
1Amanda Mata Abril20Non-binary
2Danilo Velázquez Salcedo58Male
3Leyre Alba España61Female
4Paulina Amores Pastor41Male
5Jorge Cuadrado Mena50Female
6Chucho Catalán36Non-binary
7Vidal Benavente Lerma38Male
8Clementina Álamo32Female
9Petrona Roselló-Valls61Male
10Luís Camilo Sastre Marin45Male
11Gil Baudelio Carbajo Ordóñez58Non-binary
12David del Donoso44Female
13Dolores Arnau Ros21Non-binary
14Febe de León46Non-binary
15Ariadna Almazán34Female
16Blas Serna Aguiló24Non-binary
17Paulino Barreda Almeida59Female
18Eligio Valcárcel Tormo35Non-binary
19Toño Amador Torres Portillo48Female
20Florencia del Bejarano65Non-binary
21Clímaco Andreu Gómez18Male
22Xiomara Dominguez Solana35Female
23Leire Castilla Borrego19Non-binary
24Angelita Garmendia Carpio21Non-binary
\n", + "

25 rows × 3 columns

\n", + "
[1000000 rows x 3 columns in total]"
      ],
      "text/plain": [
       " Name Age Gender\n",
       "0 Eloy Santiago-Aragón 31 Male\n",
       "1 Amanda Mata Abril 20 Non-binary\n",
       "2 Danilo Velázquez Salcedo 58 Male\n",
       "3 Leyre Alba España 61 Female\n",
       "4 Paulina Amores Pastor 41 Male\n",
       "5 Jorge Cuadrado Mena 50 Female\n",
       "6 Chucho Catalán 36 Non-binary\n",
       "7 Vidal Benavente Lerma 38 Male\n",
       "8 Clementina Álamo 32 Female\n",
       "9 Petrona Roselló-Valls 61 Male\n",
       "10 Luís Camilo Sastre Marin 45 Male\n",
       "11 Gil Baudelio Carbajo Ordóñez 58 Non-binary\n",
       "12 David del Donoso 44 Female\n",
       "13 Dolores Arnau Ros 21 Non-binary\n",
       "14 Febe de León 46 Non-binary\n",
       "15 Ariadna Almazán 34 Female\n",
       "16 Blas Serna Aguiló 24 Non-binary\n",
       "17 Paulino Barreda Almeida 59 Female\n",
       "18 Eligio Valcárcel Tormo 35 Non-binary\n",
       "19 Toño Amador Torres Portillo 48 Female\n",
       "20 Florencia del Bejarano 65 Non-binary\n",
       "21 Clímaco Andreu Gómez 18 Male\n",
       "22 Xiomara Dominguez Solana 35 Female\n",
       "23 Leire Castilla Borrego 19 Non-binary\n",
       "24 Angelita Garmendia Carpio 21 Non-binary\n",
       "...\n",
       "\n",
       "[1000000 rows x 3 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sql = f\"\"\"\n",
    "WITH T0 AS ({df.sql}),\n",
    "T1 AS (\n",
    "  SELECT PARSE_JSON(json_row) AS json_row\n",
    "  FROM T0, UNNEST(JSON_EXTRACT_ARRAY(json_data)) AS json_row\n",
    ")\n",
    "SELECT STRING(json_row.Name) AS Name,\n",
    "  INT64(json_row.Age) AS Age,\n",
    "  STRING(json_row.Gender) AS Gender\n",
    "FROM T1\n",
    "\"\"\"\n",
    "df_result = bpd.read_gbq(sql)\n",
    "df_result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There you have it: 1 million synthetic data rows ready to use, or to save in a BigQuery table for future use."
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
+}
diff --git a/noxfile.py b/noxfile.py
index 9479a7a318..91ad6bc0e6 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -723,6 +723,10 @@ def notebook(session: nox.Session):
         # The experimental notebooks imagine features that don't yet
         # exist or only exist as temporary prototypes.
         "notebooks/experimental/longer_ml_demo.ipynb",
+        # Notebooks added for more use cases, such as backing a
+        # blog post; these may take longer to execute and need not be
+        # continuously tested.
+        "notebooks/apps/synthetic_data_generation.ipynb",
     ]
 
     # Convert each Path notebook object to a string using a list comprehension.
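The notebook above hinges on one pattern: each remote function call returns a batch of rows as a JSON-serialized array, and a final query flattens those arrays into one record per row. Below is a minimal local sketch of that same pattern, assuming only `pandas` and `faker` are installed; plain Python stands in for the BigQuery remote function and the `UNNEST` query, and the batch size and column spec mirror the notebook.

```python
import json

import pandas as pd
from faker import Faker


def generate_batch(batch_size: int = 100) -> str:
    # Mirrors the remote function: build one batch of synthetic rows and
    # serialize it as a JSON array of records.
    fake = Faker("es_ES")
    batch = pd.DataFrame(
        {
            "Name": [fake.name() for _ in range(batch_size)],
            "Age": [fake.random_int(min=18, max=65) for _ in range(batch_size)],
            "Gender": [
                fake.random_element(elements=["Male", "Female", "Non-binary"])
                for _ in range(batch_size)
            ],
        }
    )
    return batch.to_json(orient="records")


# Mirrors the JSON_EXTRACT_ARRAY + UNNEST step: flatten the per-batch JSON
# arrays into one record per row. 5 batches here; the notebook uses 10,000.
records = [
    record
    for json_data in (generate_batch() for _ in range(5))
    for record in json.loads(json_data)
]
result_df = pd.DataFrame.from_records(records)
print(result_df.shape)  # (500, 3)
```

This decomposition is what makes the BigQuery version scale: generation is embarrassingly parallel across indicator rows, and the flattening is a single set-based query.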
From d924ec2937c158644b5d1bbae4f82476de2c1655 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:20:14 -0700 Subject: [PATCH 12/15] feat: add `Series.struct.dtypes` property (#599) --- bigframes/operations/structs.py | 12 ++++++++ .../pandas/core/arrays/arrow/accessors.py | 29 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index e8a1af9602..d222f0993b 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -15,9 +15,11 @@ from __future__ import annotations import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors +import pandas as pd from bigframes.core import log_adapter import bigframes.dataframe +import bigframes.dtypes import bigframes.operations import bigframes.operations.base import bigframes.series @@ -45,3 +47,13 @@ def explode(self) -> bigframes.dataframe.DataFrame: return bigframes.pandas.concat( [self.field(i) for i in range(pa_type.num_fields)], axis="columns" ) + + def dtypes(self) -> pd.Series: + pa_type = self._dtype.pyarrow_dtype + return pd.Series( + data=[ + bigframes.dtypes.arrow_dtype_to_bigframes_dtype(pa_type.field(i).type) + for i in range(pa_type.num_fields) + ], + index=[pa_type.field(i).name for i in range(pa_type.num_fields)], + ) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 8e3ea06a3d..bd6e50d096 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -92,3 +92,32 @@ def explode(self): The data corresponding to all child fields. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def dtypes(self): + """ + Return the dtype object of each child field of the struct. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=bpd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.dtypes() + version Int64 + project string[pyarrow] + dtype: object + + Returns: + A *pandas* Series with the data type of all child fields. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 70015b79e8cff16ff1b36c5e3f019fe099750a9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 22 Apr 2024 15:05:33 -0500 Subject: [PATCH 13/15] docs: set `index_cols` in `read_gbq` as a best practice (#624) --- .../bigframes_vendored/pandas/io/gbq.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index b5feeb13c5..c60a276338 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -27,13 +27,17 @@ def read_gbq( ): """Loads a DataFrame from BigQuery. - BigQuery tables are an unordered, unindexed data source. By default, - the DataFrame will have an arbitrary index and ordering. - - Set the `index_col` argument to one or more columns to choose an - index. The resulting DataFrame is sorted by the index columns. 
For the
-    best performance, ensure the index columns don't contain duplicate
-    values.
+    BigQuery tables are an unordered, unindexed data source. To support
+    pandas compatibility, the following indexing options are supported:
+
+    * (Default behavior) Add an arbitrary sequential index and ordering
+      using an analytic windowed operation that prevents filtering
+      push down.
+    * (Recommended) Set the ``index_col`` argument to one or more columns.
+      Unique values for the row labels are recommended. Duplicate labels
+      are possible, but note that joins on a non-unique index can duplicate
+      rows and operations like ``cumsum()`` that window across a non-unique
+      index can have some non-determinism.
 
     .. note::
         By default, even SQL query inputs with an ORDER BY clause create a

From 75bb2409532e80de742030d05ffcbacacf5ffba2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Mon, 22 Apr 2024 17:00:36 -0500
Subject: [PATCH 14/15] feat: support primary key(s) in `read_gbq` by using as
 the `index_col` by default (#625)

* feat: support primary key(s) in `read_gbq` by using as the `index_col` by
default

* revert WIP commit

* address type error in tests

---
 bigframes/session/__init__.py | 25 ++++++------
 setup.py | 2 +-
 testing/constraints-3.9.txt | 2 +-
 tests/system/small/test_session.py | 13 +++----
 tests/unit/resources.py | 7 ++--
 tests/unit/session/test_session.py | 39 +++++++++++++++++++
 .../bigframes_vendored/pandas/io/gbq.py | 3 ++
 7 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index 64bcebb6cc..f3f1ffce16 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -708,13 +708,15 @@ def _get_snapshot_sql_and_primary_key(
                 f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}"
             )
 
-        # TODO(b/305264153): Use public properties to fetch primary keys once
-        # added to google-cloud-bigquery.
-        primary_keys = (
-            table._properties.get("tableConstraints", {})
-            .get("primaryKey", {})
-            .get("columns")
-        )
+        primary_keys = None
+        if (
+            (table_constraints := getattr(table, "table_constraints", None)) is not None
+            and (primary_key := table_constraints.primary_key) is not None
+            # This will be False for either None or empty list.
+            # We want primary_keys = None if no primary keys are set. 
+ and (columns := primary_key.columns) + ): + primary_keys = columns job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name @@ -777,12 +779,13 @@ def _read_gbq_table( query, default_project=self.bqclient.project ) - ( - table_expression, - total_ordering_cols, - ) = self._get_snapshot_sql_and_primary_key( + (table_expression, primary_keys,) = self._get_snapshot_sql_and_primary_key( table_ref, api_name=api_name, use_cache=use_cache ) + total_ordering_cols = primary_keys + + if not index_col and primary_keys is not None: + index_col = primary_keys for key in columns: if key not in table_expression.columns: diff --git a/setup.py b/setup.py index 83049f9715..2ccf63259c 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ "gcsfs >=2023.3.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0dev", - "google-cloud-bigquery[bqstorage,pandas] >=3.10.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.16.0", "google-cloud-functions >=1.12.0", "google-cloud-bigquery-connection >=1.12.0", "google-cloud-iam >=2.12.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 1e1f3a3e66..f5007ed564 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -4,7 +4,7 @@ fsspec==2023.3.0 gcsfs==2023.3.0 geopandas==0.12.2 google-auth==2.15.0 -google-cloud-bigquery==3.10.0 +google-cloud-bigquery==3.16.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index ce415f9324..1e76a8bd8b 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -236,14 +236,13 @@ def test_read_gbq_w_anonymous_query_results_table(session: bigframes.Session): def test_read_gbq_w_primary_keys_table( session: bigframes.Session, usa_names_grouped_table: bigquery.Table ): + # Validate that the table we're querying has a primary key. table = usa_names_grouped_table - # TODO(b/305264153): Use public properties to fetch primary keys once - # added to google-cloud-bigquery. - primary_keys = ( - table._properties.get("tableConstraints", {}) - .get("primaryKey", {}) - .get("columns") - ) + table_constraints = table.table_constraints + assert table_constraints is not None + primary_key = table_constraints.primary_key + assert primary_key is not None + primary_keys = primary_key.columns assert len(primary_keys) != 0 df = session.read_gbq(f"{table.project}.{table.dataset_id}.{table.table_id}") diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 6846659930..28b08e49dc 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -13,7 +13,7 @@ # limitations under the License. 
import datetime -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence import unittest.mock as mock import google.auth.credentials @@ -37,6 +37,7 @@ def create_bigquery_session( bqclient: Optional[mock.Mock] = None, session_id: str = "abcxyz", + table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA, anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None, ) -> bigframes.Session: credentials = mock.create_autospec( @@ -51,7 +52,7 @@ def create_bigquery_session( table = mock.create_autospec(google.cloud.bigquery.Table, instance=True) table._properties = {} type(table).location = mock.PropertyMock(return_value="test-region") - type(table).schema = mock.PropertyMock(return_value=TEST_SCHEMA) + type(table).schema = mock.PropertyMock(return_value=table_schema) bqclient.get_table.return_value = table if anonymous_dataset is None: @@ -72,7 +73,7 @@ def query_mock(query, *args, **kwargs): if query.startswith("SELECT CURRENT_TIMESTAMP()"): query_job.result = mock.MagicMock(return_value=[[datetime.datetime.now()]]) else: - type(query_job).schema = mock.PropertyMock(return_value=TEST_SCHEMA) + type(query_job).schema = mock.PropertyMock(return_value=table_schema) return query_job diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 3e2b28c200..543196066a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -19,9 +19,11 @@ import google.api_core.exceptions import google.cloud.bigquery +import google.cloud.bigquery.table import pytest import bigframes +import bigframes.exceptions from .. import resources @@ -50,6 +52,43 @@ def test_read_gbq_cached_table(): assert "1999-01-02T03:04:05.678901" in df.sql +def test_read_gbq_clustered_table_ok_default_index_with_primary_key(): + """If a primary key is set on the table, we use that as the index column + by default, no error should be raised in this case. + + See internal issue 335727141. + """ + table = google.cloud.bigquery.Table("my-project.my_dataset.my_table") + table.clustering_fields = ["col1", "col2"] + table.schema = ( + google.cloud.bigquery.SchemaField("pk_1", "INT64"), + google.cloud.bigquery.SchemaField("pk_2", "INT64"), + google.cloud.bigquery.SchemaField("col_1", "INT64"), + google.cloud.bigquery.SchemaField("col_2", "INT64"), + ) + + # TODO(b/305264153): use setter for table_constraints in client library + # when available. + table._properties["tableConstraints"] = { + "primaryKey": { + "columns": ["pk_1", "pk_2"], + }, + } + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.return_value = table + session = resources.create_bigquery_session( + bqclient=bqclient, table_schema=table.schema + ) + table._properties["location"] = session._location + + df = session.read_gbq("my-project.my_dataset.my_table") + + # There should be no analytic operators to prevent row filtering pushdown. 
+        assert "OVER" not in df.sql
+        assert tuple(df.index.names) == ("pk_1", "pk_2")
+
+
 @pytest.mark.parametrize(
     "not_found_table_id",
     [("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")],
diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py
index c60a276338..93cee71289 100644
--- a/third_party/bigframes_vendored/pandas/io/gbq.py
+++ b/third_party/bigframes_vendored/pandas/io/gbq.py
@@ -109,6 +109,9 @@ def read_gbq(
             In that case, all the matched tables will be read as one DataFrame.
         index_col (Iterable[str] or str):
             Name of result column(s) to use for index in results DataFrame.
+
+            **New in bigframes version 1.3.0**: If ``index_col`` is not
+            set, the primary key(s) of the table are used as the index.
         columns (Iterable[str]):
             List of BigQuery column names in the desired order for results
             DataFrame.

From 7227a6af37a3c0553db2d9a5a6d86c1e37d33b21 Mon Sep 17 00:00:00 2001
From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com>
Date: Mon, 22 Apr 2024 23:16:17 +0000
Subject: [PATCH 15/15] chore(main): release 1.3.0 (#617)

:robot: I have created a release *beep* *boop*
---


## [1.3.0](https://togithub.com/googleapis/python-bigquery-dataframes/compare/v1.2.0...v1.3.0) (2024-04-22)


### Features

* Add `Series.struct.dtypes` property ([#599](https://togithub.com/googleapis/python-bigquery-dataframes/issues/599)) ([d924ec2](https://togithub.com/googleapis/python-bigquery-dataframes/commit/d924ec2937c158644b5d1bbae4f82476de2c1655))
* Add fine tuning `fit()` for Palm2TextGenerator ([#616](https://togithub.com/googleapis/python-bigquery-dataframes/issues/616)) ([9c106bd](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9c106bd24482620ef5ff3c85f94be9da76c49716))
* Add quantile statistic ([#613](https://togithub.com/googleapis/python-bigquery-dataframes/issues/613)) ([bc82804](https://togithub.com/googleapis/python-bigquery-dataframes/commit/bc82804da43c03c2311cd56f47a2316d3aae93d2))
* Expose `max_batching_rows` in `remote_function` ([#622](https://togithub.com/googleapis/python-bigquery-dataframes/issues/622)) ([240a1ac](https://togithub.com/googleapis/python-bigquery-dataframes/commit/240a1ac6fa914550bb6216cd5d179a36009f2657))
* Support primary key(s) in `read_gbq` by using as the `index_col` by default ([#625](https://togithub.com/googleapis/python-bigquery-dataframes/issues/625)) ([75bb240](https://togithub.com/googleapis/python-bigquery-dataframes/commit/75bb2409532e80de742030d05ffcbacacf5ffba2))
* Warn if location is set to unknown location ([#609](https://togithub.com/googleapis/python-bigquery-dataframes/issues/609)) ([3706b4f](https://togithub.com/googleapis/python-bigquery-dataframes/commit/3706b4f9dde65788b5e6343a6428fb1866499461))


### Bug Fixes

* Address technical writers fb ([#611](https://togithub.com/googleapis/python-bigquery-dataframes/issues/611)) ([9f8f181](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9f8f181279133abdb7da3aa045df6fa278587013))
* Infer narrowest numeric type when combining numeric columns
([#602](https://togithub.com/googleapis/python-bigquery-dataframes/issues/602)) ([8f9ece6](https://togithub.com/googleapis/python-bigquery-dataframes/commit/8f9ece6d13f57f02d677bf0e3fea97dea94ae240)) * Use exact median implementation by default ([#619](https://togithub.com/googleapis/python-bigquery-dataframes/issues/619)) ([9d205ae](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9d205aecb77f35baeec82a8f6e1b72c2d852ca46)) ### Documentation * Fix rendering of examples for multiple apis ([#620](https://togithub.com/googleapis/python-bigquery-dataframes/issues/620)) ([9665e39](https://togithub.com/googleapis/python-bigquery-dataframes/commit/9665e39ef288841f03a9d823bd2210ef58394ad3)) * Set `index_cols` in `read_gbq` as a best practice ([#624](https://togithub.com/googleapis/python-bigquery-dataframes/issues/624)) ([70015b7](https://togithub.com/googleapis/python-bigquery-dataframes/commit/70015b79e8cff16ff1b36c5e3f019fe099750a9d)) --- This PR was generated with [Release Please](https://togithub.com/googleapis/release-please). See [documentation](https://togithub.com/googleapis/release-please#release-please). --- CHANGELOG.md | 25 +++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3314c976e..a96c902835 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,31 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.3.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.2.0...v1.3.0) (2024-04-22) + + +### Features + +* Add `Series.struct.dtypes` property ([#599](https://github.com/googleapis/python-bigquery-dataframes/issues/599)) ([d924ec2](https://github.com/googleapis/python-bigquery-dataframes/commit/d924ec2937c158644b5d1bbae4f82476de2c1655)) +* Add fine tuning `fit()` for Palm2TextGenerator ([#616](https://github.com/googleapis/python-bigquery-dataframes/issues/616)) ([9c106bd](https://github.com/googleapis/python-bigquery-dataframes/commit/9c106bd24482620ef5ff3c85f94be9da76c49716)) +* Add quantile statistic ([#613](https://github.com/googleapis/python-bigquery-dataframes/issues/613)) ([bc82804](https://github.com/googleapis/python-bigquery-dataframes/commit/bc82804da43c03c2311cd56f47a2316d3aae93d2)) +* Expose `max_batching_rows` in `remote_function` ([#622](https://github.com/googleapis/python-bigquery-dataframes/issues/622)) ([240a1ac](https://github.com/googleapis/python-bigquery-dataframes/commit/240a1ac6fa914550bb6216cd5d179a36009f2657)) +* Support primary key(s) in `read_gbq` by using as the `index_col` by default ([#625](https://github.com/googleapis/python-bigquery-dataframes/issues/625)) ([75bb240](https://github.com/googleapis/python-bigquery-dataframes/commit/75bb2409532e80de742030d05ffcbacacf5ffba2)) +* Warn if location is set to unknown location ([#609](https://github.com/googleapis/python-bigquery-dataframes/issues/609)) 
([3706b4f](https://github.com/googleapis/python-bigquery-dataframes/commit/3706b4f9dde65788b5e6343a6428fb1866499461)) + + +### Bug Fixes + +* Address technical writers fb ([#611](https://github.com/googleapis/python-bigquery-dataframes/issues/611)) ([9f8f181](https://github.com/googleapis/python-bigquery-dataframes/commit/9f8f181279133abdb7da3aa045df6fa278587013)) +* Infer narrowest numeric type when combining numeric columns ([#602](https://github.com/googleapis/python-bigquery-dataframes/issues/602)) ([8f9ece6](https://github.com/googleapis/python-bigquery-dataframes/commit/8f9ece6d13f57f02d677bf0e3fea97dea94ae240)) +* Use exact median implementation by default ([#619](https://github.com/googleapis/python-bigquery-dataframes/issues/619)) ([9d205ae](https://github.com/googleapis/python-bigquery-dataframes/commit/9d205aecb77f35baeec82a8f6e1b72c2d852ca46)) + + +### Documentation + +* Fix rendering of examples for multiple apis ([#620](https://github.com/googleapis/python-bigquery-dataframes/issues/620)) ([9665e39](https://github.com/googleapis/python-bigquery-dataframes/commit/9665e39ef288841f03a9d823bd2210ef58394ad3)) +* Set `index_cols` in `read_gbq` as a best practice ([#624](https://github.com/googleapis/python-bigquery-dataframes/issues/624)) ([70015b7](https://github.com/googleapis/python-bigquery-dataframes/commit/70015b79e8cff16ff1b36c5e3f019fe099750a9d)) + ## [1.2.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.1.0...v1.2.0) (2024-04-15) diff --git a/bigframes/version.py b/bigframes/version.py index ec2105b648..1f103401e4 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.2.0" +__version__ = "1.3.0"
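Taken together, the features in this release can be exercised along the following lines. This is a hedged sketch rather than anything taken from the repository's tests: the table `my-project.my_dataset.my_table`, its primary keys `pk_1`/`pk_2`, and the columns `col_1`/`col_2` are hypothetical (borrowed from the unit test in #625 above), and the quantile call shapes are assumed to mirror pandas.

```python
import pyarrow as pa

import bigframes.pandas as bpd

# Assumes bpd.options.bigquery.project is already configured and that the
# hypothetical table below exists with primary keys (pk_1, pk_2).

# #625: read_gbq now defaults index_col to the table's primary key(s),
# so no analytic (OVER) clause is needed to synthesize an index.
df = bpd.read_gbq("my-project.my_dataset.my_table")
print(df.index.names)  # ['pk_1', 'pk_2'] for the hypothetical table

# #613: quantile statistic, on a Series and on a groupby.
print(df["col_1"].quantile(0.5))
print(df.groupby("col_2")["col_1"].quantile([0.25, 0.75]))

# #599: Series.struct.dtypes, shown on the struct series from its docstring.
s = bpd.Series(
    [
        {"version": 1, "project": "pandas"},
        {"version": 2, "project": "pandas"},
    ],
    dtype=bpd.ArrowDtype(
        pa.struct([("version", pa.int64()), ("project", pa.string())])
    ),
)
print(s.struct.dtypes())  # version: Int64, project: string[pyarrow]
```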