Skip to content

Commit bc82804

Browse files
feat: Add quantile statistic (#613)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea.
- [ ] Ensure the tests and linter pass.
- [ ] Code coverage does not decrease (if any source code was changed).
- [ ] Appropriate docs were updated (if necessary).

Fixes #<issue_number_goes_here> 🦕
1 parent 458bfb2 commit bc82804

File tree

16 files changed

+366
-20
lines changed

16 files changed

+366
-20
lines changed

bigframes/constants.py

+3
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,6 @@
9292
LEP_ENABLED_BIGQUERY_LOCATIONS = frozenset(
9393
ALL_BIGQUERY_LOCATIONS - REP_ENABLED_BIGQUERY_LOCATIONS
9494
)
95+
96+
# BigQuery's default limit is 10,000 columns per table; keep 100 in reserve for overhead.
97+
MAX_COLUMNS = 9900

bigframes/core/block_transforms.py

+34
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import functools
1717
import typing
18+
from typing import Sequence
1819

1920
import pandas as pd
2021

@@ -105,6 +106,39 @@ def indicate_duplicates(
105106
)
106107

107108

109+
def quantile(
    block: blocks.Block,
    columns: Sequence[str],
    qs: Sequence[float],
    grouping_column_ids: Sequence[str] = (),
) -> blocks.Block:
    """Compute every requested quantile of every requested column.

    Each (column, q) pair is first materialized as a ``QuantileOp`` window
    column partitioned by ``grouping_column_ids``; those columns are then
    collapsed with ``AnyValueOp`` in a single aggregation over the same
    grouping columns.

    Args:
        block: Block holding the input columns.
        columns: Ids of the columns to take quantiles of.
        qs: Quantiles to compute for each column.
        grouping_column_ids: Ids of the columns to group by; empty means no
            grouping keys.

    Returns:
        A block with one value column per (column, q) pair, ordered by column
        and then by q. Each is labelled with the source column's label
        extended by q.

    Raises:
        NotImplementedError: If ``len(columns) * len(qs)`` exceeds
            ``constants.MAX_COLUMNS``.
    """
    # TODO: handle windowing and more interpolation methods
    partition = core.WindowSpec(
        grouping_keys=tuple(grouping_column_ids),
    )
    requested = len(columns) * len(qs)
    if requested > constants.MAX_COLUMNS:
        raise NotImplementedError("Too many aggregates requested.")

    windowed_ids = []
    result_labels = []
    for column_id in columns:
        for quantile_value in qs:
            source_label = block.col_id_to_label[column_id]
            # Append q as an extra label level, flattening existing tuple labels.
            if isinstance(source_label, tuple):
                result_labels.append((*source_label, quantile_value))
            else:
                result_labels.append((source_label, quantile_value))
            block, windowed_id = block.apply_window_op(
                column_id,
                agg_ops.QuantileOp(quantile_value),
                window_spec=partition,
            )
            windowed_ids.append(windowed_id)

    any_value_aggregations = tuple(
        (windowed_id, agg_ops.AnyValueOp()) for windowed_id in windowed_ids
    )
    block, aggregated_ids = block.aggregate(
        grouping_column_ids,
        any_value_aggregations,
        dropna=True,
    )
    selected = block.select_columns(aggregated_ids)
    return selected.with_column_labels(result_labels)
140+
141+
108142
def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
109143
supported_methods = [
110144
"linear",

bigframes/core/blocks.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -1498,12 +1498,17 @@ def stack(self, how="left", levels: int = 1):
14981498

14991499
row_label_tuples = utils.index_as_tuples(row_labels)
15001500

1501-
if col_labels is not None:
1501+
if col_labels is None:
1502+
result_index: pd.Index = pd.Index([None])
1503+
result_col_labels: Sequence[Tuple] = list([()])
1504+
elif (col_labels.nlevels == 1) and all(
1505+
col_labels.isna()
1506+
): # isna not implemented for MultiIndex for newer pandas versions
1507+
result_index = pd.Index([None])
1508+
result_col_labels = utils.index_as_tuples(col_labels.drop_duplicates())
1509+
else:
15021510
result_index = col_labels.drop_duplicates().dropna(how="all")
15031511
result_col_labels = utils.index_as_tuples(result_index)
1504-
else:
1505-
result_index = pd.Index([None])
1506-
result_col_labels = list([()])
15071512

15081513
# Get matching columns
15091514
unpivot_columns: List[Tuple[str, List[str]]] = []

bigframes/core/compile/aggregate_compiler.py

+8
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,14 @@ def _(
148148
return cast(ibis_types.NumericValue, value)
149149

150150

151+
@compile_unary_agg.register
@numeric_op
def _(
    op: agg_ops.QuantileOp, column: ibis_types.NumericColumn, window=None
) -> ibis_types.NumericValue:
    # Compile QuantileOp: build the column's quantile expression for op.q,
    # then hand it to the shared window helper together with ``window``.
    quantile_expr = column.quantile(op.q)
    return _apply_window_if_present(quantile_expr, window)
157+
158+
151159
@compile_unary_agg.register
152160
@numeric_op
153161
def _(

bigframes/core/groupby/__init__.py

+51-6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from __future__ import annotations
1616

1717
import typing
18+
from typing import Sequence, Union
1819

1920
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
2021
import pandas as pd
@@ -115,14 +116,35 @@ def mean(self, numeric_only: bool = False, *args) -> df.DataFrame:
115116
def median(
    self, numeric_only: bool = False, *, exact: bool = False
) -> df.DataFrame:
    # Per-group median of the selected columns.
    #
    # numeric_only: when False, the selection is validated first via
    #     _raise_on_non_numeric("median"); when True that check is skipped.
    # exact: when True the result comes from quantile(0.5); otherwise the
    #     median_op aggregation is used.
    if not numeric_only:
        self._raise_on_non_numeric("median")
    if exact:
        # Validation (if requested) already happened above under the name
        # "median". Pass numeric_only=True so quantile() restricts itself to
        # numeric columns instead of validating again; otherwise
        # median(numeric_only=True, exact=True) would be re-checked with
        # quantile's default numeric_only=False. This mirrors the
        # approximate path below, which also passes numeric_only=True.
        return self.quantile(0.5, numeric_only=True)
    return self._aggregate_all(agg_ops.median_op, numeric_only=True)
125124

125+
def quantile(
    self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False
) -> df.DataFrame:
    # Per-group quantile(s) of the numeric selected columns.
    #
    # A scalar q drops the innermost column level of the result; a list-like
    # q stacks the result instead.
    if not numeric_only:
        self._raise_on_non_numeric("quantile")

    numeric_col_ids = []
    for col_id in self._selected_cols:
        if self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
            numeric_col_ids.append(col_id)

    is_multi = utils.is_list_like(q)
    if is_multi:
        quantiles = tuple(q)  # type: ignore
    else:
        quantiles = (q,)  # type: ignore

    quantile_block = block_ops.quantile(
        self._block,
        tuple(numeric_col_ids),
        qs=quantiles,
        grouping_column_ids=self._by_col_ids,
    )
    result_df = df.DataFrame(quantile_block)
    if not is_multi:
        return result_df.droplevel(-1, 1)
    return result_df.stack()
147+
126148
def min(self, numeric_only: bool = False, *args) -> df.DataFrame:
    # Aggregate with min_op, forwarding numeric_only; extra positional
    # arguments are accepted but not used.
    min_aggregation = agg_ops.min_op
    return self._aggregate_all(min_aggregation, numeric_only=numeric_only)
128150

@@ -466,8 +488,31 @@ def sum(self, *args) -> series.Series:
466488
def mean(self, *args) -> series.Series:
    # Aggregate with mean_op; positional arguments are accepted but not used.
    mean_aggregation = agg_ops.mean_op
    return self._aggregate(mean_aggregation)
468490

469-
def median(self, *args, **kwargs) -> series.Series:
470-
return self._aggregate(agg_ops.mean_op)
491+
def median(
    self,
    *args,
    exact: bool = False,
    **kwargs,
) -> series.Series:
    # Per-group median: the median_op aggregation by default, or
    # quantile(0.5) when exact=True. Other positional and keyword arguments
    # are accepted but not used.
    if not exact:
        return self._aggregate(agg_ops.median_op)
    return self.quantile(0.5)
501+
502+
def quantile(
    self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False
) -> series.Series:
    # Per-group quantile(s) of the value column. numeric_only is accepted but
    # not read here.
    is_multi = utils.is_list_like(q)
    quantiles = tuple(q) if is_multi else (q,)  # type: ignore
    quantile_block = block_ops.quantile(
        self._block,
        (self._value_column,),
        qs=quantiles,
        grouping_column_ids=self._by_col_ids,
    )
    stacked = series.Series(quantile_block.stack())
    if is_multi:
        return stacked
    # Scalar q: drop the innermost index level that stacking introduced.
    return stacked.droplevel(-1)
471516

472517
def std(self, *args, **kwargs) -> series.Series:
    # Aggregate with std_op; extra arguments are accepted but not used.
    std_aggregation = agg_ops.std_op
    return self._aggregate(std_aggregation)

bigframes/dataframe.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -2009,8 +2009,34 @@ def median(
20092009
frame = self._raise_on_non_numeric("median")
20102010
else:
20112011
frame = self._drop_non_numeric()
2012-
block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
2013-
return bigframes.series.Series(block.select_column("values"))
2012+
if exact:
2013+
return self.quantile()
2014+
else:
2015+
block = frame._block.aggregate_all_and_stack(agg_ops.median_op)
2016+
return bigframes.series.Series(block.select_column("values"))
2017+
2018+
def quantile(
    self, q: Union[float, Sequence[float]] = 0.5, *, numeric_only: bool = False
):
    # Quantile(s) of each column.
    #
    # q: a single quantile or a list-like of quantiles.
    # numeric_only: when False the frame is validated with
    #     _raise_on_non_numeric; when True non-numeric columns are dropped
    #     via _drop_non_numeric.
    #
    # Returns a DataFrame for list-like q, and a Series named q for scalar q.
    if not numeric_only:
        # Pass this operation's own name (previously the copy-pasted
        # "median") so any failure is presumably reported against
        # "quantile" -- NOTE(review): confirm the op name is only used for
        # the error message.
        frame = self._raise_on_non_numeric("quantile")
    else:
        frame = self._drop_non_numeric()
    multi_q = utils.is_list_like(q)
    result = block_ops.quantile(
        frame._block,
        frame._block.value_columns,
        qs=tuple(q) if multi_q else (q,),  # type: ignore
    )
    if multi_q:
        return DataFrame(result.stack()).droplevel(0)

    # Scalar q: stack every original column level into the index, then drop
    # the leading index level, and present the result as a Series named q.
    result_df = (
        DataFrame(result)
        .stack(list(range(0, frame.columns.nlevels)))
        .droplevel(0)
    )
    result_series = bigframes.series.Series(result_df._block)
    result_series.name = q
    return result_series
20142040

20152041
def std(
20162042
self, axis: typing.Union[str, int] = 0, *, numeric_only: bool = False

bigframes/operations/aggregations.py

+12
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,18 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
109109
return input_types[0]
110110

111111

112+
@dataclasses.dataclass(frozen=True)
class QuantileOp(UnaryAggregateOp):
    """Unary aggregation that computes the ``q`` quantile of its input."""

    # Quantile to compute, expressed as a fraction (``name`` renders it as a
    # percentage by multiplying by 100).
    q: float

    @property
    def name(self):
        # round() rather than int(): ``q * 100`` is inexact in binary floating
        # point (0.29 * 100 == 28.999999999999996), so truncating would label
        # the 0.29 quantile "28%".
        return f"{round(self.q * 100)}%"

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        # The result type is derived from the first input type by the
        # UNARY_REAL_NUMERIC signature.
        return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0])
122+
123+
112124
@dataclasses.dataclass(frozen=True)
113125
class ApproxQuartilesOp(UnaryAggregateOp):
114126
quartile: int

bigframes/series.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import os
2424
import textwrap
2525
import typing
26-
from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union
26+
from typing import Any, cast, Literal, Mapping, Optional, Sequence, Tuple, Union
2727

2828
import bigframes_vendored.pandas.core.series as vendored_pandas_series
2929
import google.cloud.bigquery as bigquery
@@ -968,10 +968,19 @@ def mean(self) -> float:
968968

969969
def median(self, *, exact: bool = False) -> float:
    # Median of the series: the median_op aggregation by default, or
    # quantile(0.5) when exact=True.
    if not exact:
        return typing.cast(float, self._apply_aggregation(agg_ops.median_op))
    return typing.cast(float, self.quantile(0.5))
974+
975+
def quantile(self, q: Union[float, Sequence[float]] = 0.5) -> Union[Series, float]:
    # Quantile(s) of the series: a scalar for scalar q, a Series for
    # list-like q.
    is_multi = utils.is_list_like(q)
    quantiles = tuple(q) if is_multi else (q,)
    quantile_block = block_ops.quantile(
        self._block, (self._value_column,), qs=quantiles
    )
    if not is_multi:
        return cast(float, Series(quantile_block).to_pandas().squeeze())
    # List-like q: stack, then drop the first index level of the stacked block.
    stacked = quantile_block.stack()
    leading_level = stacked.index_columns[0]
    stacked = stacked.drop_levels([leading_level])
    return Series(stacked)
975984

976985
def sum(self) -> float:
    # Apply the sum_op aggregation and present the result as a float to
    # type checkers.
    total = self._apply_aggregation(agg_ops.sum_op)
    return typing.cast(float, total)

tests/system/small/test_dataframe.py

+29-1
Original file line numberDiff line numberDiff line change
@@ -2504,7 +2504,10 @@ def test_df_melt_default(scalars_dfs):
25042504

25052505
# Pandas produces int64 index, Bigframes produces Int64 (nullable)
25062506
pd.testing.assert_frame_equal(
2507-
bf_result, pd_result, check_index_type=False, check_dtype=False
2507+
bf_result,
2508+
pd_result,
2509+
check_index_type=False,
2510+
check_dtype=False,
25082511
)
25092512

25102513

@@ -3029,6 +3032,31 @@ def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index):
30293032
)
30303033

30313034

3035+
def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index):
    """A scalar quantile of a DataFrame matches pandas, compared as a Series."""
    quantile = 0.45
    numeric_columns = ["int64_too", "int64_col", "float64_col"]

    bf_frame = scalars_df_index[numeric_columns]
    bf_result = bf_frame.quantile(q=quantile).to_pandas()

    pd_frame = scalars_pandas_df_index[numeric_columns]
    # Pandas may produce narrower numeric types, but bigframes always produces Float64
    pd_result = pd_frame.quantile(q=quantile).astype("Float64")

    pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
3045+
3046+
3047+
def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index):
    """A list of quantiles of a DataFrame matches pandas, compared as a frame."""
    quantiles = [0, 0.33, 0.67, 1.0]
    numeric_columns = ["int64_too", "int64_col", "float64_col"]

    bf_frame = scalars_df_index[numeric_columns]
    bf_result = bf_frame.quantile(q=quantiles).to_pandas()

    pd_frame = scalars_pandas_df_index[numeric_columns]
    # Pandas may produce narrower numeric types, but bigframes always produces Float64
    pd_result = pd_frame.quantile(q=quantiles).astype("Float64")
    # The expected index is cast to Float64 as well before the strict comparison.
    pd_result.index = pd_result.index.astype("Float64")

    pd.testing.assert_frame_equal(bf_result, pd_result)
3058+
3059+
30323060
@pytest.mark.parametrize(
30333061
("op"),
30343062
[

tests/system/small/test_groupby.py

+35
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,24 @@ def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index):
6565
assert ((pd_min <= bf_result_computed) & (bf_result_computed <= pd_max)).all().all()
6666

6767

68+
@pytest.mark.parametrize(
    "q",
    [
        [0.2, 0.4, 0.6, 0.8],
        0.11,
    ],
)
def test_dataframe_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q):
    """DataFrame groupby quantile matches pandas for scalar and list q."""
    selected = ["int64_too", "float64_col", "int64_col", "string_col"]

    bf_grouped = scalars_df_index[selected].groupby("string_col")
    bf_result = bf_grouped.quantile(q).to_pandas()

    pd_grouped = scalars_pandas_df_index[selected].groupby("string_col")
    pd_result = pd_grouped.quantile(q)

    pd.testing.assert_frame_equal(
        pd_result,
        bf_result,
        check_dtype=False,
        check_index_type=False,
    )
84+
85+
6886
@pytest.mark.parametrize(
6987
("operator"),
7088
[
@@ -389,3 +407,20 @@ def test_dataframe_groupby_nonnumeric_with_mean():
389407
pd.testing.assert_frame_equal(
390408
pd_result, bf_result, check_index_type=False, check_dtype=False
391409
)
410+
411+
412+
@pytest.mark.parametrize(
    "q",
    [
        [0.2, 0.4, 0.6, 0.8],
        0.11,
    ],
)
def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q):
    """Series groupby quantile matches pandas for scalar and list q."""
    bf_grouped = scalars_df_index.groupby("string_col")["int64_col"]
    bf_result = bf_grouped.quantile(q).to_pandas()

    pd_grouped = scalars_pandas_df_index.groupby("string_col")["int64_col"]
    pd_result = pd_grouped.quantile(q)

    pd.testing.assert_series_equal(
        pd_result,
        bf_result,
        check_dtype=False,
        check_index_type=False,
    )
)

tests/system/small/test_series.py

+21
Original file line numberDiff line numberDiff line change
@@ -1320,6 +1320,27 @@ def test_median(scalars_dfs):
13201320
assert pd_min < bf_result < pd_max
13211321

13221322

1323+
def test_median_exact(scalars_dfs):
    """The exact median of a series is close to the pandas median."""
    bf_frame, pd_frame = scalars_dfs
    column = "int64_col"
    actual = bf_frame[column].median(exact=True)
    expected = pd_frame[column].median()
    assert math.isclose(expected, actual)
1329+
1330+
1331+
def test_series_quantile(scalars_dfs):
    """Series.quantile with a list of quantiles matches pandas."""
    bf_frame, pd_frame = scalars_dfs
    column = "int64_col"
    bf_series = bf_frame[column]
    pd_series = pd_frame[column]

    expected = pd_series.quantile([0.0, 0.4, 0.6, 1.0])
    actual = bf_series.quantile([0.0, 0.4, 0.6, 1.0])

    pd.testing.assert_series_equal(
        expected,
        actual.to_pandas(),
        check_dtype=False,
        check_index_type=False,
    )
1342+
1343+
13231344
def test_numeric_literal(scalars_dfs):
13241345
scalars_df, _ = scalars_dfs
13251346
col_name = "numeric_col"

0 commit comments

Comments
 (0)