Skip to content

Commit 97b8bec

Browse files
feat: add level param to DataFrame.stack (#88)
* feat: add level param to DataFrame.stack
1 parent 228aeba commit 97b8bec

File tree

5 files changed: +72 -33 lines changed

bigframes/core/block_transforms.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ def equals(block1: blocks.Block, block2: blocks.Block) -> bool:
5353
joined_block = joined_block.select_columns(equality_ids).with_column_labels(
5454
list(range(len(equality_ids)))
5555
)
56-
stacked_block = joined_block.stack(dropna=False, sort=False)
56+
stacked_block = joined_block.stack()
5757
result = stacked_block.get_stat(stacked_block.value_columns[0], agg_ops.all_op)
5858
return typing.cast(bool, result)
5959

bigframes/core/blocks.py

+6 -6
Original file line number | Diff line number | Diff line change
@@ -1284,20 +1284,20 @@ def pivot(
12841284

12851285
return result_block.with_column_labels(column_index)
12861286

1287-
def stack(self, how="left", dropna=True, sort=True, levels: int = 1):
1287+
def stack(self, how="left", levels: int = 1):
12881288
"""Unpivot last column axis level into row axis"""
1289+
if levels == 0:
1290+
return self
1291+
12891292
# These are the values that will be turned into rows
12901293

12911294
col_labels, row_labels = utils.split_index(self.column_labels, levels=levels)
1292-
if dropna:
1293-
row_labels = row_labels.drop_duplicates()
1294-
if sort:
1295-
row_labels = row_labels.sort_values()
1295+
row_labels = row_labels.drop_duplicates()
12961296

12971297
row_label_tuples = utils.index_as_tuples(row_labels)
12981298

12991299
if col_labels is not None:
1300-
result_index = col_labels.drop_duplicates().sort_values().dropna(how="all")
1300+
result_index = col_labels.drop_duplicates().dropna(how="all")
13011301
result_col_labels = utils.index_as_tuples(result_index)
13021302
else:
13031303
result_index = pd.Index([None])

bigframes/dataframe.py

+37 -12
Original file line number | Diff line number | Diff line change
@@ -1741,24 +1741,49 @@ def pivot(
17411741
)
17421742
return DataFrame(pivot_block)
17431743

1744-
def stack(self):
1745-
# TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack.
1746-
# TODO: match impl to pandas future_stack as described in pandas 2.1 release notes
1747-
stack_block = self._block.stack()
1748-
result_block = block_ops.dropna(
1749-
stack_block, stack_block.value_columns, how="all"
1750-
)
1744+
def stack(self, level: LevelsType = -1):
17511745
if not isinstance(self.columns, pandas.MultiIndex):
1752-
return bigframes.series.Series(result_block)
1753-
return DataFrame(result_block)
1746+
if level not in [0, -1, self.columns.name]:
1747+
raise IndexError(f"Invalid level {level} for single-level index")
1748+
return self._stack_mono()
1749+
return self._stack_multi(level)
1750+
1751+
def _stack_mono(self):
1752+
result_block = self._block.stack()
1753+
return bigframes.series.Series(result_block)
1754+
1755+
def _stack_multi(self, level: LevelsType = -1):
1756+
n_levels = self.columns.nlevels
1757+
if isinstance(level, int) or isinstance(level, str):
1758+
level = [level]
1759+
level_indices = []
1760+
for level_ref in level:
1761+
if isinstance(level_ref, int):
1762+
if level_ref < 0:
1763+
level_indices.append(n_levels + level_ref)
1764+
else:
1765+
level_indices.append(level_ref)
1766+
else: # str
1767+
level_indices.append(self.columns.names.index(level_ref))
1768+
1769+
new_order = [
1770+
*[i for i in range(n_levels) if i not in level_indices],
1771+
*level_indices,
1772+
]
1773+
1774+
original_columns = typing.cast(pandas.MultiIndex, self.columns)
1775+
new_columns = original_columns.reorder_levels(new_order)
1776+
1777+
block = self._block.with_column_labels(new_columns)
1778+
1779+
block = block.stack(levels=len(level))
1780+
return DataFrame(block)
17541781

17551782
def unstack(self):
17561783
block = self._block
17571784
# Special case, unstack with mono-index transpose into a series
17581785
if self.index.nlevels == 1:
1759-
block = block.stack(
1760-
how="right", dropna=False, sort=False, levels=self.columns.nlevels
1761-
)
1786+
block = block.stack(how="right", levels=self.columns.nlevels)
17621787
return bigframes.series.Series(block)
17631788

17641789
# Pivot by last level of index

tests/system/small/test_dataframe.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -1885,6 +1885,8 @@ def test_df_describe(scalars_dfs):
18851885

18861886

18871887
def test_df_stack(scalars_dfs):
1888+
if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"):
1889+
pytest.skip("pandas <2.1 uses different stack implementation")
18881890
scalars_df, scalars_pandas_df = scalars_dfs
18891891
# To match bigquery dataframes
18901892
scalars_pandas_df = scalars_pandas_df.copy()
@@ -1893,7 +1895,7 @@ def test_df_stack(scalars_dfs):
18931895
columns = ["int64_col", "int64_too", "rowindex_2"]
18941896

18951897
bf_result = scalars_df[columns].stack().to_pandas()
1896-
pd_result = scalars_pandas_df[columns].stack()
1898+
pd_result = scalars_pandas_df[columns].stack(future_stack=True)
18971899

18981900
# Pandas produces NaN, where bq dataframes produces pd.NA
18991901
pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)

tests/system/small/test_multiindex.py

+25 -13
Original file line number | Diff line number | Diff line change
@@ -718,25 +718,37 @@ def test_column_multi_index_cumsum(scalars_df_index, scalars_pandas_df_index):
718718
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
719719

720720

721-
def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index):
722-
columns = ["int64_too", "int64_col", "rowindex_2"]
721+
@pytest.mark.parametrize(
722+
("level",),
723+
[(["l3", "l1"],), ([-2, -1],), (["l3"],), ("l2",), (-3,)],
724+
)
725+
def test_column_multi_index_stack(level):
726+
if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"):
727+
pytest.skip("pandas <2.1 uses different stack implementation")
728+
723729
level1 = pandas.Index(["b", "a", "b"])
724-
# Need resulting column to be pyarrow string rather than object dtype
725-
level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]")
726-
multi_columns = pandas.MultiIndex.from_arrays([level1, level2])
727-
bf_df = scalars_df_index[columns].copy()
728-
bf_df.columns = multi_columns
729-
pd_df = scalars_pandas_df_index[columns].copy()
730-
pd_df.columns = multi_columns
730+
level2 = pandas.Index(["a", "b", "b"])
731+
level3 = pandas.Index(["b", "b", "a"])
731732

732-
bf_result = bf_df.stack().to_pandas()
733-
# Shifting sort behavior in stack
734-
pd_result = pd_df.stack()
733+
multi_columns = pandas.MultiIndex.from_arrays(
734+
[level1, level2, level3], names=["l1", "l2", "l3"]
735+
)
736+
pd_df = pandas.DataFrame(
737+
[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
738+
index=[5, 2, None],
739+
columns=multi_columns,
740+
dtype="Int64",
741+
)
742+
bf_df = bpd.DataFrame(pd_df)
743+
744+
bf_result = bf_df.stack(level=level).to_pandas()
745+
# BigFrames emulates future_stack impl
746+
pd_result = pd_df.stack(level=level, future_stack=True)
735747

736748
# Pandas produces NaN, where bq dataframes produces pd.NA
737749
# Column ordering seems to depend on pandas version
738750
pandas.testing.assert_frame_equal(
739-
bf_result.sort_index(axis=1), pd_result.sort_index(axis=1), check_dtype=False
751+
bf_result, pd_result, check_dtype=False, check_index_type=False
740752
)
741753

742754

0 commit comments

Comments
 (0)