Skip to content

Commit ca26fe5

Browse files
feat: Allow windowing in 'partial' ordering mode (#861)
1 parent c415eb9 commit ca26fe5

File tree

13 files changed

+205
-117
lines changed

13 files changed

+205
-117
lines changed

bigframes/core/__init__.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -194,8 +194,17 @@ def promote_offsets(self, col_id: str) -> ArrayValue:
194194
"""
195195
Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
196196
"""
197-
if self.node.order_ambiguous and not self.session._strictly_ordered:
198-
raise ValueError("Generating offsets not supported in unordered mode")
197+
if self.node.order_ambiguous and not (self.session._strictly_ordered):
198+
if not self.session._allows_ambiguity:
199+
raise ValueError(
200+
"Generating offsets not supported in partial ordering mode"
201+
)
202+
else:
203+
warnings.warn(
204+
"Window ordering may be ambiguous, this can cause unstable results.",
205+
bigframes.exceptions.AmbiguousWindowWarning,
206+
)
207+
199208
return ArrayValue(nodes.PromoteOffsetsNode(child=self.node, col_id=col_id))
200209

201210
def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue:
@@ -347,9 +356,16 @@ def project_window_op(
347356
# TODO: Support non-deterministic windowing
348357
if window_spec.row_bounded or not op.order_independent:
349358
if self.node.order_ambiguous and not self.session._strictly_ordered:
350-
raise ValueError(
351-
"Order-dependent windowed ops not supported in unordered mode"
352-
)
359+
if not self.session._allows_ambiguity:
360+
raise ValueError(
361+
"Generating offsets not supported in partial ordering mode"
362+
)
363+
else:
364+
warnings.warn(
365+
"Window ordering may be ambiguous, this can cause unstable results.",
366+
bigframes.exceptions.AmbiguousWindowWarning,
367+
)
368+
353369
return ArrayValue(
354370
nodes.WindowOpNode(
355371
child=self.node,

bigframes/core/blocks.py

+4
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,10 @@ def index_name_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]:
280280
mapping[label] = (*mapping.get(label, ()), id)
281281
return mapping
282282

283+
@property
284+
def explicitly_ordered(self) -> bool:
285+
return self.expr.node.explicitly_ordered
286+
283287
def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]:
284288
"""
285289
Unlike label_to_col_id, this works with partial labels for multi-index.

bigframes/core/compile/compiled.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ def to_sql(
263263
ordered: bool = False,
264264
) -> str:
265265
if offset_column or ordered:
266-
raise ValueError("Cannot produce sorted sql in unordered mode")
266+
raise ValueError("Cannot produce sorted sql in partial ordering mode")
267267
sql = ibis_bigquery.Backend().compile(
268268
self._to_ibis_expr(
269269
col_id_overrides=col_id_overrides,

bigframes/core/groupby/__init__.py

+19-19
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def __getitem__(
109109
dropna=self._dropna,
110110
)
111111

112-
@validations.requires_strict_ordering()
112+
@validations.requires_ordering()
113113
def head(self, n: int = 5) -> df.DataFrame:
114114
block = self._block
115115
if self._dropna:
@@ -235,25 +235,25 @@ def count(self) -> df.DataFrame:
235235
def nunique(self) -> df.DataFrame:
236236
return self._aggregate_all(agg_ops.nunique_op)
237237

238-
@validations.requires_strict_ordering()
238+
@validations.requires_ordering()
239239
def cumsum(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame:
240240
if not numeric_only:
241241
self._raise_on_non_numeric("cumsum")
242242
return self._apply_window_op(agg_ops.sum_op, numeric_only=True)
243243

244-
@validations.requires_strict_ordering()
244+
@validations.requires_ordering()
245245
def cummin(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame:
246246
return self._apply_window_op(agg_ops.min_op, numeric_only=numeric_only)
247247

248-
@validations.requires_strict_ordering()
248+
@validations.requires_ordering()
249249
def cummax(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame:
250250
return self._apply_window_op(agg_ops.max_op, numeric_only=numeric_only)
251251

252-
@validations.requires_strict_ordering()
252+
@validations.requires_ordering()
253253
def cumprod(self, *args, **kwargs) -> df.DataFrame:
254254
return self._apply_window_op(agg_ops.product_op, numeric_only=True)
255255

256-
@validations.requires_strict_ordering()
256+
@validations.requires_ordering()
257257
def shift(self, periods=1) -> series.Series:
258258
window = window_specs.rows(
259259
grouping_keys=tuple(self._by_col_ids),
@@ -262,7 +262,7 @@ def shift(self, periods=1) -> series.Series:
262262
)
263263
return self._apply_window_op(agg_ops.ShiftOp(periods), window=window)
264264

265-
@validations.requires_strict_ordering()
265+
@validations.requires_ordering()
266266
def diff(self, periods=1) -> series.Series:
267267
window = window_specs.rows(
268268
grouping_keys=tuple(self._by_col_ids),
@@ -271,7 +271,7 @@ def diff(self, periods=1) -> series.Series:
271271
)
272272
return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
273273

274-
@validations.requires_strict_ordering()
274+
@validations.requires_ordering()
275275
def rolling(self, window: int, min_periods=None) -> windows.Window:
276276
# To get n size window, need current row and n-1 preceding rows.
277277
window_spec = window_specs.rows(
@@ -287,7 +287,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
287287
block, window_spec, self._selected_cols, drop_null_groups=self._dropna
288288
)
289289

290-
@validations.requires_strict_ordering()
290+
@validations.requires_ordering()
291291
def expanding(self, min_periods: int = 1) -> windows.Window:
292292
window_spec = window_specs.cumulative_rows(
293293
grouping_keys=tuple(self._by_col_ids),
@@ -532,7 +532,7 @@ def __init__(
532532
def _session(self) -> core.Session:
533533
return self._block.session
534534

535-
@validations.requires_strict_ordering()
535+
@validations.requires_ordering()
536536
def head(self, n: int = 5) -> series.Series:
537537
block = self._block
538538
if self._dropna:
@@ -650,31 +650,31 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
650650

651651
aggregate = agg
652652

653-
@validations.requires_strict_ordering()
653+
@validations.requires_ordering()
654654
def cumsum(self, *args, **kwargs) -> series.Series:
655655
return self._apply_window_op(
656656
agg_ops.sum_op,
657657
)
658658

659-
@validations.requires_strict_ordering()
659+
@validations.requires_ordering()
660660
def cumprod(self, *args, **kwargs) -> series.Series:
661661
return self._apply_window_op(
662662
agg_ops.product_op,
663663
)
664664

665-
@validations.requires_strict_ordering()
665+
@validations.requires_ordering()
666666
def cummax(self, *args, **kwargs) -> series.Series:
667667
return self._apply_window_op(
668668
agg_ops.max_op,
669669
)
670670

671-
@validations.requires_strict_ordering()
671+
@validations.requires_ordering()
672672
def cummin(self, *args, **kwargs) -> series.Series:
673673
return self._apply_window_op(
674674
agg_ops.min_op,
675675
)
676676

677-
@validations.requires_strict_ordering()
677+
@validations.requires_ordering()
678678
def cumcount(self, *args, **kwargs) -> series.Series:
679679
return (
680680
self._apply_window_op(
@@ -684,7 +684,7 @@ def cumcount(self, *args, **kwargs) -> series.Series:
684684
- 1
685685
)
686686

687-
@validations.requires_strict_ordering()
687+
@validations.requires_ordering()
688688
def shift(self, periods=1) -> series.Series:
689689
"""Shift index by desired number of periods."""
690690
window = window_specs.rows(
@@ -694,7 +694,7 @@ def shift(self, periods=1) -> series.Series:
694694
)
695695
return self._apply_window_op(agg_ops.ShiftOp(periods), window=window)
696696

697-
@validations.requires_strict_ordering()
697+
@validations.requires_ordering()
698698
def diff(self, periods=1) -> series.Series:
699699
window = window_specs.rows(
700700
grouping_keys=tuple(self._by_col_ids),
@@ -703,7 +703,7 @@ def diff(self, periods=1) -> series.Series:
703703
)
704704
return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
705705

706-
@validations.requires_strict_ordering()
706+
@validations.requires_ordering()
707707
def rolling(self, window: int, min_periods=None) -> windows.Window:
708708
# To get n size window, need current row and n-1 preceding rows.
709709
window_spec = window_specs.rows(
@@ -723,7 +723,7 @@ def rolling(self, window: int, min_periods=None) -> windows.Window:
723723
is_series=True,
724724
)
725725

726-
@validations.requires_strict_ordering()
726+
@validations.requires_ordering()
727727
def expanding(self, min_periods: int = 1) -> windows.Window:
728728
window_spec = window_specs.cumulative_rows(
729729
grouping_keys=tuple(self._by_col_ids),

bigframes/core/indexes/base.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def empty(self) -> bool:
184184
return self.shape[0] == 0
185185

186186
@property
187-
@validations.requires_strict_ordering()
187+
@validations.requires_ordering()
188188
def is_monotonic_increasing(self) -> bool:
189189
"""
190190
Return a boolean if the values are equal or increasing.
@@ -198,7 +198,7 @@ def is_monotonic_increasing(self) -> bool:
198198
)
199199

200200
@property
201-
@validations.requires_strict_ordering()
201+
@validations.requires_ordering()
202202
def is_monotonic_decreasing(self) -> bool:
203203
"""
204204
Return a boolean if the values are equal or decreasing.
@@ -348,7 +348,7 @@ def max(self) -> typing.Any:
348348
def min(self) -> typing.Any:
349349
return self._apply_aggregation(agg_ops.min_op)
350350

351-
@validations.requires_strict_ordering()
351+
@validations.requires_ordering()
352352
def argmax(self) -> int:
353353
block, row_nums = self._block.promote_offsets()
354354
block = block.order_by(
@@ -361,7 +361,7 @@ def argmax(self) -> int:
361361

362362
return typing.cast(int, series.Series(block.select_column(row_nums)).iloc[0])
363363

364-
@validations.requires_strict_ordering()
364+
@validations.requires_ordering()
365365
def argmin(self) -> int:
366366
block, row_nums = self._block.promote_offsets()
367367
block = block.order_by(

bigframes/core/nodes.py

+42
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,14 @@ def order_ambiguous(self) -> bool:
135135
"""
136136
...
137137

138+
@property
139+
@abc.abstractmethod
140+
def explicitly_ordered(self) -> bool:
141+
"""
142+
Whether an explicit row ordering has been defined for this node, rather than left arbitrary. For example, a table read with no ordering columns is not explicitly ordered.
143+
"""
144+
...
145+
138146
@functools.cached_property
139147
def total_variables(self) -> int:
140148
return self.variables_introduced + sum(
@@ -180,6 +188,10 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]:
180188
def schema(self) -> schemata.ArraySchema:
181189
return self.child.schema
182190

191+
@property
192+
def explicitly_ordered(self) -> bool:
193+
return self.child.explicitly_ordered
194+
183195
def transform_children(
184196
self, t: Callable[[BigFrameNode], BigFrameNode]
185197
) -> BigFrameNode:
@@ -212,6 +224,10 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]:
212224
def order_ambiguous(self) -> bool:
213225
return True
214226

227+
@property
228+
def explicitly_ordered(self) -> bool:
229+
return False
230+
215231
def __hash__(self):
216232
return self._node_hash
217233

@@ -267,6 +283,10 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]:
267283
def order_ambiguous(self) -> bool:
268284
return any(child.order_ambiguous for child in self.children)
269285

286+
@property
287+
def explicitly_ordered(self) -> bool:
288+
return all(child.explicitly_ordered for child in self.children)
289+
270290
def __hash__(self):
271291
return self._node_hash
272292

@@ -317,6 +337,10 @@ def variables_introduced(self) -> int:
317337
def order_ambiguous(self) -> bool:
318338
return False
319339

340+
@property
341+
def explicitly_ordered(self) -> bool:
342+
return True
343+
320344
def transform_children(
321345
self, t: Callable[[BigFrameNode], BigFrameNode]
322346
) -> BigFrameNode:
@@ -378,6 +402,10 @@ def relation_ops_created(self) -> int:
378402
def order_ambiguous(self) -> bool:
379403
return len(self.total_order_cols) == 0
380404

405+
@property
406+
def explicitly_ordered(self) -> bool:
407+
return len(self.total_order_cols) > 0
408+
381409
@functools.cached_property
382410
def variables_introduced(self) -> int:
383411
return len(self.schema.items) + 1
@@ -449,6 +477,12 @@ def hidden_columns(self) -> typing.Tuple[str, ...]:
449477
def order_ambiguous(self) -> bool:
450478
return not isinstance(self.ordering, orderings.TotalOrdering)
451479

480+
@property
481+
def explicitly_ordered(self) -> bool:
482+
return (self.ordering is not None) and len(
483+
self.ordering.all_ordering_columns
484+
) > 0
485+
452486
def transform_children(
453487
self, t: Callable[[BigFrameNode], BigFrameNode]
454488
) -> BigFrameNode:
@@ -523,6 +557,10 @@ def relation_ops_created(self) -> int:
523557
# Doesn't directly create any relational operations
524558
return 0
525559

560+
@property
561+
def explicitly_ordered(self) -> bool:
562+
return True
563+
526564

527565
@dataclass(frozen=True)
528566
class ReversedNode(UnaryNode):
@@ -636,6 +674,10 @@ def variables_introduced(self) -> int:
636674
def order_ambiguous(self) -> bool:
637675
return False
638676

677+
@property
678+
def explicitly_ordered(self) -> bool:
679+
return True
680+
639681

640682
@dataclass(frozen=True)
641683
class WindowOpNode(UnaryNode):

bigframes/core/validations.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,20 @@
2424

2525
if TYPE_CHECKING:
2626
from bigframes import Session
27+
from bigframes.core.blocks import Block
2728

2829

2930
class HasSession(Protocol):
3031
@property
3132
def _session(self) -> Session:
3233
...
3334

35+
@property
36+
def _block(self) -> Block:
37+
...
38+
3439

35-
def requires_strict_ordering(suggestion: Optional[str] = None):
40+
def requires_ordering(suggestion: Optional[str] = None):
3641
def decorator(meth):
3742
@functools.wraps(meth)
3843
def guarded_meth(object: HasSession, *args, **kwargs):
@@ -47,8 +52,16 @@ def guarded_meth(object: HasSession, *args, **kwargs):
4752
def enforce_ordered(
4853
object: HasSession, opname: str, suggestion: Optional[str] = None
4954
) -> None:
50-
if not object._session._strictly_ordered:
55+
session = object._session
56+
if session._strictly_ordered or not object._block.expr.node.order_ambiguous:
57+
# No ambiguity for how to calculate ordering, so no error or warning
58+
return None
59+
if not session._allows_ambiguity:
5160
suggestion_substr = suggestion + " " if suggestion else ""
5261
raise bigframes.exceptions.OrderRequiredError(
5362
f"Op {opname} not supported when strict ordering is disabled. {suggestion_substr}{bigframes.constants.FEEDBACK_LINK}"
5463
)
64+
if not object._block.explicitly_ordered:
65+
raise bigframes.exceptions.OrderRequiredError(
66+
f"Op {opname} requires an ordering. Use .sort_values or .sort_index to provide an ordering. {bigframes.constants.FEEDBACK_LINK}"
67+
)

0 commit comments

Comments
 (0)