Skip to content

feat: (Series|DataFrame).explode #556

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,15 @@ def join(
return ArrayValue(bigframes.core.rewrite.maybe_rewrite_join(join_node))
return ArrayValue(join_node)

def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
    """Return a new ArrayValue that unnests the given array-typed columns."""
    # Internal invariants — public entry points validate user input before
    # reaching this layer.
    assert len(column_ids) > 0
    assert all(
        bigframes.dtypes.is_array_like(self.get_column_type(col_id))
        for col_id in column_ids
    )
    node = nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
    return ArrayValue(node)

def _uniform_sampling(self, fraction: float) -> ArrayValue:
"""Sampling the table on given fraction.

Expand Down
30 changes: 30 additions & 0 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,36 @@ def calculate_pairwise_metric(self, op=agg_ops.CorrOp()):
index_labels=self.column_labels.names,
)

def explode(
    self,
    column_ids: typing.Sequence[str],
    ignore_index: Optional[bool],
) -> Block:
    """Explode array-valued columns into one row per array element.

    Args:
        column_ids:
            Value-column ids to unnest. Ids whose dtype is not array-like are
            silently skipped (exploding them would be a no-op).
        ignore_index:
            If truthy, drop the existing index and build a default sequential
            index; otherwise the original index columns are preserved.

    Returns:
        A new Block with the selected columns exploded.
    """
    # Only array-like columns can be unnested; skip the rest.
    column_ids = [
        column_id
        for column_id in column_ids
        if bigframes.dtypes.is_array_like(self.expr.get_column_type(column_id))
    ]
    expr = self.expr.explode(column_ids) if column_ids else self.expr

    if ignore_index:
        return Block(
            expr.drop_columns(self.index_columns),
            column_labels=self.column_labels,
            # Initiates default index creation using the block constructor.
            index_columns=[],
        )
    # Preserve the existing index columns with their own labels. (Previously
    # this passed column_labels.names, mislabeling the index with the
    # value-column labels — copy-paste from calculate_pairwise_metric.)
    return Block(
        expr,
        column_labels=self.column_labels,
        index_columns=self.index_columns,
        index_labels=self._index_labels,
    )

def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.UnaryAggregateOp]:
"""
Gets a standard set of stats to preemptively fetch for a column if
Expand Down
118 changes: 118 additions & 0 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import typing
from typing import Collection, Iterable, Literal, Optional, Sequence

import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
import ibis.backends.bigquery as ibis_bigquery
import ibis.common.deferred # type: ignore
Expand Down Expand Up @@ -502,6 +503,51 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
columns=columns,
)

def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
    """Unnest each array column in `column_ids`, one output row per element.

    All listed columns are unnested in lockstep against a single shared
    offset array, keeping rows aligned across the exploded columns. The
    offset array is floored at offset 0 so a row of empty arrays still
    produces one output row (with NULLs from out-of-range indexing).
    """
    table = self._to_ibis_expr()

    # Offsets [0 .. min(len(col)) - 1]; greatest(0, ...) keeps the array
    # nonempty so empty input arrays still contribute a row.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    # NOTE: removed a stray trailing comma that wrapped this expression in a
    # 1-tuple and relied on select() flattening it.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(
                0,
                ibis.least(
                    *[table[column_id].length() - 1 for column_id in column_ids]
                ),
            )
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
    )

    # Unnest the offsets: each source row fans out into one row per offset.
    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
    )

    # Replace each exploded column with its element at the current offset;
    # pass every other column through unchanged.
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in column_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]
    table_w_unnest = table_w_offset.select(*unnested_columns)

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    return UnorderedIR(
        table_w_unnest,
        columns=columns,
    )

## Helpers
def _set_or_replace_by_id(
self, id: str, new_value: ibis_types.Value
Expand Down Expand Up @@ -719,6 +765,78 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
ordering=self._ordering,
)

def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
    """Unnest each array column in `column_ids`, one output row per element.

    Same lockstep-offset scheme as the UnorderedIR version, but ordering is
    preserved: the unnest offset is appended as the least-significant
    ordering key so exploded elements stay in array order within each
    source row.
    """
    table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)

    # Offsets [0 .. min(len(col)) - 1], floored at 0 so rows with empty
    # arrays still yield one (NULL) output row after unnesting.
    offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
    # NOTE: removed a stray trailing comma that wrapped this expression in a
    # 1-tuple and relied on select() flattening it.
    offset_array = (
        vendored_ibis_ops.GenerateArray(
            ibis.greatest(
                0,
                ibis.least(
                    *[table[column_id].length() - 1 for column_id in column_ids]
                ),
            )
        )
        .to_expr()
        .name(offset_array_id)
    )
    table_w_offset_array = table.select(
        offset_array,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
    unnest_offset = (
        table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
    )
    table_w_offset = table_w_offset_array.select(
        unnest_offset,
        *self._column_names,
        *self._hidden_ordering_column_names,
    )

    # Index exploded columns by the unnested offset; keep others as-is.
    unnested_columns = [
        table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
        if column_id in column_ids
        else table_w_offset[column_id]
        for column_id in self._column_names
    ]

    table_w_unnest = table_w_offset.select(
        table_w_offset[unnest_offset_id],
        *unnested_columns,
        *self._hidden_ordering_column_names,
    )

    columns = [table_w_unnest[column_name] for column_name in self._column_names]
    # The offset becomes a hidden ordering column and is added to both the
    # ordering keys and the total-ordering set, since source-row keys alone
    # no longer uniquely identify a row after fan-out.
    hidden_ordering_columns = [
        *[
            table_w_unnest[column_name]
            for column_name in self._hidden_ordering_column_names
        ],
        table_w_unnest[unnest_offset_id],
    ]
    ordering = ExpressionOrdering(
        ordering_value_columns=tuple(
            [
                *self._ordering.ordering_value_columns,
                ascending_over(unnest_offset_id),
            ]
        ),
        total_ordering_columns=frozenset(
            [*self._ordering.total_ordering_columns, unnest_offset_id]
        ),
    )

    return OrderedIR(
        table_w_unnest,
        columns=columns,
        hidden_ordering_columns=hidden_ordering_columns,
        ordering=ordering,
    )

def promote_offsets(self, col_id: str) -> OrderedIR:
"""
Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
Expand Down
5 changes: 5 additions & 0 deletions bigframes/core/compile/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ def compile_unpivot(node: nodes.UnpivotNode, ordered: bool = True):
)


@_compile_node.register
def compiler_explode(node: nodes.ExplodeNode, ordered: bool = True):
    # Compile the child node, then delegate to the compiled IR's explode.
    # Both OrderedIR and UnorderedIR implement explode(), so `ordered` only
    # affects which child compilation (and IR type) we get back.
    return compile_node(node.child, ordered).explode(node.column_ids)


@_compile_node.register
def compiler_random_sample(node: nodes.RandomSampleNode, ordered: bool = True):
    # Compile the child node, then apply uniform random sampling at the
    # requested fraction on the resulting IR.
    return compile_node(node.child, ordered)._uniform_sampling(node.fraction)
27 changes: 27 additions & 0 deletions bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,3 +484,30 @@ def row_preserving(self) -> bool:

def __hash__(self):
return self._node_hash


@dataclass(frozen=True)
class ExplodeNode(UnaryNode):
    """Plan node that unnests the given array-valued columns of its child.

    Each array element becomes its own row, so this node does not preserve
    the child's row count.
    """

    # Ids of the child columns to unnest (all must be array-typed).
    column_ids: typing.Tuple[str, ...]

    @property
    def row_preserving(self) -> bool:
        # Arrays fan out into multiple rows (or collapse when empty).
        return False

    def __hash__(self):
        return self._node_hash

    @functools.cached_property
    def schema(self) -> schemata.ArraySchema:
        def item_for(name: str) -> schemata.SchemaItem:
            # Exploded columns take their element dtype; others pass through.
            if name in self.column_ids:
                element_type = self.child.schema.get_type(
                    name
                ).pyarrow_dtype.value_type
                return schemata.SchemaItem(
                    name,
                    bigframes.dtypes.arrow_dtype_to_bigframes_dtype(element_type),
                )
            return schemata.SchemaItem(name, self.child.schema.get_type(name))

        return schemata.ArraySchema(
            tuple(item_for(name) for name in self.child.schema.names)
        )
30 changes: 30 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2532,6 +2532,36 @@ def sample(
)[0]
)

def explode(
    self,
    column: typing.Union[blocks.Label, typing.Sequence[blocks.Label]],
    *,
    ignore_index: Optional[bool] = False,
) -> DataFrame:
    """Transform each element of the given array column(s) to a row.

    Args:
        column: a single column label or a sequence of labels to explode.
        ignore_index: if True, the result gets a default sequential index.

    Raises:
        ValueError: if no columns are given or labels repeat.
        KeyError: if any label does not resolve to a column.
    """
    # Normalize the single-label form to a sequence of labels.
    if utils.is_list_like(column):
        column_labels = typing.cast(typing.Sequence[blocks.Label], tuple(column))
    else:
        column_labels = typing.cast(typing.Sequence[blocks.Label], (column,))

    if not column_labels:
        raise ValueError("column must be nonempty")
    if len(column_labels) > len(set(column_labels)):
        raise ValueError("column must be unique")

    # Resolve labels to internal column ids; collect any that don't resolve.
    column_ids = [self._resolve_label_exact(label) for label in column_labels]
    missing = [
        label for label, col_id in zip(column_labels, column_ids) if col_id is None
    ]
    if len(missing) > 0:
        raise KeyError(f"None of {missing} are in the columns")

    block = self._block.explode(
        column_ids=typing.cast(typing.Sequence[str], tuple(column_ids)),
        ignore_index=ignore_index,
    )
    return DataFrame(block)

def _split(
self,
ns: Iterable[int] = (),
Expand Down
15 changes: 11 additions & 4 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,16 +129,19 @@ def is_string_like(type: ExpressionType) -> bool:


def is_array_like(type: ExpressionType) -> bool:
if isinstance(type, pd.ArrowDtype) and isinstance(type.pyarrow_dtype, pa.ListType):
return True
else:
return type in (STRING_DTYPE, BYTES_DTYPE)
return isinstance(type, pd.ArrowDtype) and isinstance(
type.pyarrow_dtype, pa.ListType
)


def is_numeric(type: ExpressionType) -> bool:
    """Returns True if the dtype is in the permissive numeric dtype set."""
    return type in NUMERIC_BIGFRAMES_TYPES_PERMISSIVE


def is_iterable(type: ExpressionType) -> bool:
    """Returns True for element-iterable dtypes: strings, bytes, and arrays."""
    return type in (STRING_DTYPE, BYTES_DTYPE) or is_array_like(type)


def is_comparable(type: ExpressionType) -> bool:
    """Returns True if values of the dtype support ordering comparisons."""
    return (type is not None) and (type not in UNORDERED_DTYPES)

Expand Down Expand Up @@ -348,6 +351,10 @@ def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType:
)


def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
    """Converts a pyarrow dtype to a BigFrames dtype, bridging through ibis."""
    return ibis_dtype_to_bigframes_dtype(arrow_dtype_to_ibis_dtype(arrow_dtype))


def bigframes_dtype_to_ibis_dtype(
bigframes_dtype: Union[DtypeString, Dtype, np.dtype[Any]]
) -> ibis_dtypes.DataType:
Expand Down
2 changes: 1 addition & 1 deletion bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def create_binary_op(
# len accepts any iterable dtype (strings, bytes, and arrays) and yields an
# integer element count.
len_op = create_unary_op(
    name="len",
    type_signature=op_typing.FixedOutputType(
        dtypes.is_iterable, dtypes.INT_DTYPE, description="iterable"
    ),
)
reverse_op = create_unary_op(name="reverse", type_signature=op_typing.STRING_TRANSFORM)
Expand Down
7 changes: 7 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1547,6 +1547,13 @@ def sample(
)[0]
)

def explode(self, *, ignore_index: Optional[bool] = False) -> Series:
    """Transform each element of this array-valued Series into a row.

    Args:
        ignore_index: if True, the result gets a default sequential index
            instead of repeating the original index values.
    """
    block = self._block.explode(
        column_ids=[self._value_column], ignore_index=ignore_index
    )
    return Series(block)

def __array_ufunc__(
self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs
) -> Series:
Expand Down
Loading