Skip to content

Commit 4ff53db

Browse files
authored
feat: update cut to work without labels=False and show intervals as dicts (#335)
* Test version. * Add test and adjustment. * Update test and docstring. * Remove unused import. * Update code examples. * Code formatted. * Update error and unit test. * Update labels selection.
1 parent ae43905 commit 4ff53db

File tree

5 files changed

+80
-17
lines changed

5 files changed

+80
-17
lines changed

bigframes/core/reshape/__init__.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -129,12 +129,15 @@ def cut(
129129
if bins.is_overlapping:
130130
raise ValueError("Overlapping IntervalIndex is not accepted.")
131131

132-
if labels is not False:
132+
if labels is not None and labels is not False:
133133
raise NotImplementedError(
134-
f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}"
134+
"The 'labels' parameter must be either False or None. "
135+
"Please provide a valid value for 'labels'."
135136
)
136137

137-
return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec())
138+
return x._apply_window_op(
139+
agg_ops.CutOp(bins, labels=labels), window_spec=core.WindowSpec()
140+
)
138141

139142

140143
def qcut(

bigframes/operations/aggregations.py

+34-7
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def skips_nulls(self):
229229

230230

231231
class CutOp(WindowOp):
232-
def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
232+
def __init__(self, bins: typing.Union[int, pd.IntervalIndex], labels=None):
233233
if isinstance(bins, int):
234234
if not bins > 0:
235235
raise ValueError("`bins` should be a positive integer.")
@@ -239,6 +239,8 @@ def __init__(self, bins: typing.Union[int, pd.IntervalIndex]):
239239
self._bins_int = 0
240240
self._bins = bins
241241

242+
self._labels = labels
243+
242244
def _as_ibis(self, x: ibis_types.Column, window=None):
243245
out = ibis.case()
244246

@@ -247,12 +249,37 @@ def _as_ibis(self, x: ibis_types.Column, window=None):
247249
col_max = _apply_window_if_present(x.max(), window)
248250
bin_width = (col_max - col_min) / self._bins
249251

250-
for this_bin in range(self._bins_int - 1):
251-
out = out.when(
252-
x <= (col_min + (this_bin + 1) * bin_width),
253-
dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()),
254-
)
255-
out = out.when(x.notnull(), self._bins - 1)
252+
if self._labels is False:
253+
for this_bin in range(self._bins_int - 1):
254+
out = out.when(
255+
x <= (col_min + (this_bin + 1) * bin_width),
256+
dtypes.literal_to_ibis_scalar(
257+
this_bin, force_dtype=Int64Dtype()
258+
),
259+
)
260+
out = out.when(x.notnull(), self._bins - 1)
261+
else:
262+
interval_struct = None
263+
adj = (col_max - col_min) * 0.001
264+
for this_bin in range(self._bins_int):
265+
left_edge = (
266+
col_min + this_bin * bin_width - (0 if this_bin > 0 else adj)
267+
)
268+
right_edge = col_min + (this_bin + 1) * bin_width
269+
interval_struct = ibis.struct(
270+
{
271+
"left_exclusive": left_edge,
272+
"right_inclusive": right_edge,
273+
}
274+
)
275+
276+
if this_bin < self._bins_int - 1:
277+
out = out.when(
278+
x <= (col_min + (this_bin + 1) * bin_width),
279+
interval_struct,
280+
)
281+
else:
282+
out = out.when(x.notnull(), interval_struct)
256283
else:
257284
for interval in self._bins:
258285
condition = (x > interval.left) & (x <= interval.right)

tests/system/small/test_pandas.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,30 @@ def test_cut(scalars_dfs):
397397
pd.testing.assert_series_equal(bf_result, pd_result)
398398

399399

400+
def test_cut_default_labels(scalars_dfs):
401+
scalars_df, scalars_pandas_df = scalars_dfs
402+
403+
pd_result = pd.cut(scalars_pandas_df["float64_col"], 5)
404+
bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas()
405+
406+
# Convert to match data format
407+
pd_result_converted = pd.Series(
408+
[
409+
{"left_exclusive": interval.left, "right_inclusive": interval.right}
410+
if pd.notna(val)
411+
else pd.NA
412+
for val, interval in zip(
413+
pd_result, pd_result.cat.categories[pd_result.cat.codes]
414+
)
415+
],
416+
name=pd_result.name,
417+
)
418+
419+
pd.testing.assert_series_equal(
420+
bf_result, pd_result_converted, check_index=False, check_dtype=False
421+
)
422+
423+
400424
@pytest.mark.parametrize(
401425
("bins",),
402426
[
@@ -424,7 +448,6 @@ def test_cut_with_interval(scalars_dfs, bins):
424448
],
425449
name=pd_result.name,
426450
)
427-
pd_result.index = pd_result.index.astype("Int64")
428451

429452
pd.testing.assert_series_equal(
430453
bf_result, pd_result_converted, check_index=False, check_dtype=False

tests/unit/test_pandas.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,10 @@ def test_method_matches_session(method_name: str):
8585

8686

8787
def test_cut_raises_with_labels():
88-
with pytest.raises(NotImplementedError, match="Only labels=False"):
88+
with pytest.raises(
89+
NotImplementedError,
90+
match="The 'labels' parameter must be either False or None.",
91+
):
8992
mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
9093
bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"])
9194

third_party/bigframes_vendored/pandas/core/reshape/tile.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ def cut(
3838
3939
Cut with an integer (equal-width bins):
4040
41+
>>> bpd.cut(s, bins=4)
42+
0 {'left_exclusive': -0.01, 'right_inclusive': 2.5}
43+
1 {'left_exclusive': -0.01, 'right_inclusive': 2.5}
44+
2 {'left_exclusive': 2.5, 'right_inclusive': 5.0}
45+
3 {'left_exclusive': 7.5, 'right_inclusive': 10.0}
46+
dtype: struct<left_exclusive: double, right_inclusive: double>[pyarrow]
47+
48+
Cut with an integer (equal-width bins) and labels=False:
49+
4150
>>> bpd.cut(s, bins=4, labels=False)
4251
0 0
4352
1 0
@@ -50,7 +59,7 @@ def cut(
5059
>>> import pandas as pd
5160
5261
>>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
53-
>>> bpd.cut(s, bins=interval_index, labels=False)
62+
>>> bpd.cut(s, bins=interval_index)
5463
0 <NA>
5564
1 {'left_exclusive': 0, 'right_inclusive': 1}
5665
2 {'left_exclusive': 1, 'right_inclusive': 5}
@@ -60,7 +69,7 @@ def cut(
6069
Cut with an iterable of tuples:
6170
6271
>>> bins_tuples = [(0, 1), (1, 4), (5, 20)]
63-
>>> bpd.cut(s, bins=bins_tuples, labels=False)
72+
>>> bpd.cut(s, bins=bins_tuples)
6473
0 <NA>
6574
1 {'left_exclusive': 0, 'right_inclusive': 1}
6675
2 <NA>
@@ -82,9 +91,7 @@ def cut(
8291
labels (None):
8392
Specifies the labels for the returned bins. Must be the same length as
8493
the resulting bins. If False, returns only integer indicators of the
85-
bins. This affects the type of the output container (see below).
86-
If True, raises an error. When `ordered=False`, labels must be
87-
provided.
94+
bins. This affects the type of the output container.
8895
8996
Returns:
9097
Series: A Series representing the respective bin for each value

0 commit comments

Comments
 (0)