feat: add series.sample (identical to existing dataframe.sample) (#187)

milkshakeiii · web-flow · commit 37914a4077c6 · 2023-11-09T02:08:13.000Z
We're duplicating some argument-parsing logic here (discussed briefly with Trevor). The same is true of other methods as well, so we might want to add a sharing mechanism for DataFrame/Series (a common superclass, as in pandas?) in the future.

The documentation already exists in third_party/core/generic.py, which is actually what prompted this feat/fix.
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -1447,6 +1447,22 @@ def map(
         result_df = self_df.join(map_df, on="series")
         return result_df[self.name]
 
+    def sample(
+        self,
+        n: Optional[int] = None,
+        frac: Optional[float] = None,
+        *,
+        random_state: Optional[int] = None,
+    ) -> Series:
+        # The sample size comes from either a row count or a fraction, never both.
+        if not (n is None or frac is None):
+            raise ValueError("Only one of 'n' or 'frac' parameter can be specified.")
+        # An omitted option is forwarded to the block as an empty tuple.
+        sizes = () if n is None else (n,)
+        shares = () if frac is None else (frac,)
+        pieces = self._block._split(ns=sizes, fracs=shares, random_state=random_state)
+        return Series(pieces[0])
+
     def __array_ufunc__(
         self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs
     ) -> Series:
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -2922,3 +2922,30 @@ def test_map_series_input_duplicates_error(scalars_dfs):
         scalars_pandas_df.int64_too.map(pd_map_series)
     with pytest.raises(pd.errors.InvalidIndexError):
         scalars_df.int64_too.map(bf_map_series, verify_integrity=True)
+
+
+@pytest.mark.parametrize(
+    ("frac", "n", "random_state"),
+    [
+        pytest.param(None, 4, None, id="n_wo_random_state"),
+        pytest.param(0.5, None, None, id="frac_wo_random_state"),
+        pytest.param(None, 4, 10, id="n_w_random_state"),
+        pytest.param(0.5, None, 10, id="frac_w_random_state"),
+        pytest.param(None, None, None, id="n_default"),
+    ],
+)
+def test_sample(scalars_dfs, frac, n, random_state):
+    # The second element of the fixture pair is not needed here.
+    scalars_df, _ = scalars_dfs
+    sampled = scalars_df.int64_col.sample(frac=frac, n=n, random_state=random_state)
+    bf_result = sampled.to_pandas()
+
+    # Work out how many rows the sample should hold.
+    if frac is not None:
+        expected_sample_size = round(frac * scalars_df.shape[0])
+    elif n is not None:
+        expected_sample_size = n
+    else:
+        # Neither sizing argument was given: a single row is expected.
+        expected_sample_size = 1
+    assert bf_result.shape[0] == expected_sample_size