Skip to content

feat: (Series|DataFrame).explode #556

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,15 @@ def join(
return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node))
return ArrayValue(join_node)

def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
    """Return a new ArrayValue that unnests the given array-typed columns."""
    # Internal invariants — public entry points validate user input before
    # reaching this layer.
    assert len(column_ids) > 0
    assert all(
        bigframes.dtypes.is_array_like(self.get_column_type(col_id))
        for col_id in column_ids
    )
    node = nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
    return ArrayValue(node)

def _uniform_sampling(self, fraction: float) -> ArrayValue:
"""Sampling the table on given fraction.

Expand Down
30 changes: 30 additions & 0 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,36 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
index_labels=self.column_labels.names,
)

def explode(
    self,
    column_ids: typing.Sequence[str],
    ignore_index: Optional[bool],
) -> Block:
    """Explode array-valued columns into one row per array element.

    Args:
        column_ids:
            Value-column ids to unnest. Ids whose dtype is not array-like are
            silently skipped (exploding them would be a no-op).
        ignore_index:
            If truthy, drop the existing index and build a default sequential
            index; otherwise the original index columns are preserved.

    Returns:
        A new Block with the selected columns exploded.
    """
    # Only array-like columns can be unnested; skip the rest.
    column_ids = [
        column_id
        for column_id in column_ids
        if bigframes.dtypes.is_array_like(self.expr.get_column_type(column_id))
    ]
    expr = self.expr.explode(column_ids) if column_ids else self.expr

    if ignore_index:
        return Block(
            expr.drop_columns(self.index_columns),
            column_labels=self.column_labels,
            # Initiates default index creation using the block constructor.
            index_columns=[],
        )
    # Preserve the existing index columns with their own labels. (Previously
    # this passed column_labels.names, mislabeling the index with the
    # value-column labels — copy-paste from calculate_pairwise_metric.)
    return Block(
        expr,
        column_labels=self.column_labels,
        index_columns=self.index_columns,
        index_labels=self._index_labels,
    )

def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]:
"""
Gets a standard set of stats to preemptively fetch for a column if
Expand Down
118 changes: 118 additions & 0 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import typing
from typing import Collection, Iterable, Literal, Optional, Sequence

import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
import ibis.backends.bigquery as ibis_bigquery
import ibis.common.deferred # type: ignore
Expand Down Expand Up @@ -502,6 +503,51 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
columns=columns,
)

def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
    """Unnest each array column in `column_ids`, one output row per element.

    All listed columns are unnested in lockstep against a single shared
    offset array, keeping rows aligned across the exploded columns. The
    offset array is floored at offset 0 so a row of empty arrays still
    produces one output row (with NULLs from out-of-range indexing).
    """
    table = self._to_ibis_expr()

    # Offsets [0 .. min(len(col)) - 1]; greatest(0, ...) keeps the array
    # nonempty so empty input arrays still contribute a row.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    # NOTE: removed a stray trailing comma that wrapped this expression in a
    # 1-tuple and relied on select() flattening it.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(
                0,
                ibis.least(
                    *[table[column_id].length() - 1 for column_id in column_ids]
                ),
            )
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
    )

    # Unnest the offsets: each source row fans out into one row per offset.
    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
    )

    # Replace each exploded column with its element at the current offset;
    # pass every other column through unchanged.
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in column_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]
    table_w_unnest = table_w_offset.select(*unnested_columns)

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    return UnorderedIR(
        table_w_unnest,
        columns=columns,
    )

## Helpers
def _set_or_replace_by_id(
self, id: str, new_value: ibis_types.Value
Expand Down Expand Up @@ -719,6 +765,78 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
ordering=self._ordering,
)

def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
    """Unnest each array column in `column_ids`, one output row per element.

    Same lockstep-offset scheme as the UnorderedIR version, but ordering is
    preserved: the unnest offset is appended as the least-significant
    ordering key so exploded elements stay in array order within each
    source row.
    """
    table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)

    # Offsets [0 .. min(len(col)) - 1], floored at 0 so rows with empty
    # arrays still yield one (NULL) output row after unnesting.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    # NOTE: removed a stray trailing comma that wrapped this expression in a
    # 1-tuple and relied on select() flattening it.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(
                0,
                ibis.least(
                    *[table[column_id].length() - 1 for column_id in column_ids]
                ),
            )
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    # Index exploded columns by the unnested offset; keep others as-is.
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in column_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]

    table_w_unnest = table_w_offset.select(
        table_w_offset[unnest_offset_id],
        *unnested_columns,
        *self._hidden_ordering_column_names,
    )

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    # The offset becomes a hidden ordering column and is added to both the
    # ordering keys and the total-ordering set, since source-row keys alone
    # no longer uniquely identify a row after fan-out.
    hidden_ordering_columns = [
        *[
            table_w_unnest[column_name]
            for column_name in self._hidden_ordering_column_names
        ],
        table_w_unnest[unnest_offset_id],
    ]
    ordering = ExpressionOrdering(
        ordering_value_columns=tuple(
            [
                *self._ordering.ordering_value_columns,
                ascending_over(unnest_offset_id),
            ]
        ),
        total_ordering_columns=frozenset(
            [*self._ordering.total_ordering_columns, unnest_offset_id]
        ),
    )

    return OrderedIR(
        table_w_unnest,
        columns=columns,
        hidden_ordering_columns=hidden_ordering_columns,
        ordering=ordering,
    )

def promote_offsets(self, col_id: str) -> OrderedIR:
"""
Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
Expand Down
5 changes: 5 additions & 0 deletions bigframes/core/compile/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True):
)


@_compile_node.register
def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True):
    # Compile the child node, then delegate to the compiled IR's explode.
    # Both OrderedIR and UnorderedIR implement explode(), so `ordered` only
    # affects which child compilation (and IR type) we get back.
    return compile_node(node.child, ordered).explode(node.column_ids)


@_compile_node.register
def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True):
    # Compile the child node, then apply uniform random sampling at the
    # requested fraction on the resulting IR.
    return compile_node(node.child, ordered)._uniform_sampling(node.fraction)
27 changes: 27 additions & 0 deletions bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,3 +484,30 @@ def row_preserving(self) -> bool:

def __hash__(self):
return self._node_hash


@dataclass(frozen=True)
class ExplodeNode(UnaryNode):
    """Plan node that unnests the given array-valued columns of its child.

    Each array element becomes its own row, so this node does not preserve
    the child's row count.
    """

    # Ids of the child columns to unnest (all must be array-typed).
    column_ids: typing.Tuple[str, ...]

    @property
    def row_preserving(self) -> bool:
        # Arrays fan out into multiple rows (or collapse when empty).
        return False

    def __hash__(self):
        return self._node_hash

    @functools.cached_property
    def schema(self) -> schemata.ArraySchema:
        def item_for(name: str) -> schemata.SchemaItem:
            # Exploded columns take their element dtype; others pass through.
            if name in self.column_ids:
                element_type = self.child.schema.get_type(
                    name
                ).pyarrow_dtype.value_type
                return schemata.SchemaItem(
                    name,
                    bigframes.dtypes.arrow_dtype_to_bigframes_dtype(element_type),
                )
            return schemata.SchemaItem(name, self.child.schema.get_type(name))

        return schemata.ArraySchema(
            tuple(item_for(name) for name in self.child.schema.names)
        )
30 changes: 30 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2532,6 +2532,36 @@ def sample(
)[0]
)

def explode(
    self,
    column: typing.Union[blocks.Label, typing.Sequence[blocks.Label]],
    *,
    ignore_index: Optional[bool] = False,
) -> DataFrame:
    """Transform each element of the given array column(s) to a row.

    Args:
        column: a single column label or a sequence of labels to explode.
        ignore_index: if True, the result gets a default sequential index.

    Raises:
        ValueError: if no columns are given or labels repeat.
        KeyError: if any label does not resolve to a column.
    """
    # Normalize the single-label form to a sequence of labels.
    if utils.is_list_like(column):
        column_labels = typing.cast(typing.Sequence[blocks.Label], tuple(column))
    else:
        column_labels = typing.cast(typing.Sequence[blocks.Label], (column,))

    if not column_labels:
        raise ValueError("column must be nonempty")
    if len(column_labels) > len(set(column_labels)):
        raise ValueError("column must be unique")

    # Resolve labels to internal column ids; collect any that don't resolve.
    column_ids = [self._resolve_label_exact(label) for label in column_labels]
    missing = [
        label for label, col_id in zip(column_labels, column_ids) if col_id is None
    ]
    if len(missing) > 0:
        raise KeyError(f"None of {missing} are in the columns")

    block = self._block.explode(
        column_ids=typing.cast(typing.Sequence[str], tuple(column_ids)),
        ignore_index=ignore_index,
    )
    return DataFrame(block)

def _split(
self,
ns: Iterable[int] = (),
Expand Down
15 changes: 11 additions & 4 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,16 +129,19 @@ def is_string_like(type: ExpressionType) -> bool:


def is_array_like(type: ExpressionType) -> bool:
if isinstance(type, pd.ArrowDtype) and isinstance(type.pyarrow_dtype, pa.ListType):
return True
else:
return type in (STRING_DTYPE, BYTES_DTYPE)
return isinstance(type, pd.ArrowDtype) and isinstance(
type.pyarrow_dtype, pa.ListType
)


def is_numeric(type: ExpressionType) -> bool:
    """Returns True if the dtype is in the permissive numeric dtype set."""
    return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE


def is_iterable(type: ExpressionType) -> bool:
    """Returns True for element-iterable dtypes: strings, bytes, and arrays."""
    return type in (STRING_DTYPE, BYTES_DTYPE) or is_array_like(type)


def is_comparable(type: ExpressionType) -> bool:
    """Returns True if values of the dtype support ordering comparisons."""
    return (type is not None) and (type not in UNORDERED_DTYPES)

Expand Down Expand Up @@ -348,6 +351,10 @@ def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
)


def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
    """Converts a pyarrow dtype to a BigFrames dtype, bridging through ibis."""
    return ibis_dtype_to_bigframes_dtype(arrow_dtype_to_ibis_dtype(arrow_dtype))


def bigframes_dtype_to_ibis_dtype(
bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]]
) -> ibis_dtypes.DataType:
Expand Down
2 changes: 1 addition & 1 deletion bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def create_binary_op(
# len accepts any iterable dtype (strings, bytes, and arrays) and yields an
# integer element count.
len_op = create_unary_op(
    name="len",
    type_signature=op_typing.FixedOutputType(
        dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable"
    ),
)
reverse_op = create_unary_op(name="reverse", type_signature=op_typing.STRING_TRANSFORM)
Expand Down
7 changes: 7 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1547,6 +1547,13 @@ def sample(
)[0]
)

def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
    """Transform each element of this array-valued Series into a row.

    Args:
        ignore_index: if True, the result gets a default sequential index
            instead of repeating the original index values.
    """
    block = self._block.explode(
        column_ids=[self._value_column], ignore_index=ignore_index
    )
    return Series(block)

def __array_ufunc__(
self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs
) -> Series:
Expand Down
Loading