Skip to content

Commit 4a27f44

Browse files
TrevorBergeron authored and ashleyxuu committed
feat: add unstack to series, add level param (#115)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
1 parent c4c1e6e commit 4a27f44

File tree

7 files changed

+112
-48
lines changed

7 files changed

+112
-48
lines changed

bigframes/core/blocks.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@
6767
_MONOTONIC_DECREASING = "monotonic_decreasing"
6868

6969

70+
LevelType = typing.Union[str, int]
71+
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
72+
73+
7074
class BlockHolder(typing.Protocol):
7175
"""Interface for mutable objects with state represented by a block value object."""
7276

@@ -1423,9 +1427,7 @@ def _get_unique_values(
14231427
raise ValueError(f"Too many unique values: {pd_values}")
14241428

14251429
if len(columns) > 1:
1426-
return pd.MultiIndex.from_frame(
1427-
pd_values.sort_values(by=list(pd_values.columns), na_position="first")
1428-
)
1430+
return pd.MultiIndex.from_frame(pd_values)
14291431
else:
14301432
return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first"))
14311433

@@ -1611,6 +1613,24 @@ def cached(self) -> Block:
16111613
index_labels=self.index_labels,
16121614
)
16131615

1616+
def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]:
1617+
if utils.is_list_like(level):
1618+
levels = list(level)
1619+
else:
1620+
levels = [level]
1621+
resolved_level_ids = []
1622+
for level_ref in levels:
1623+
if isinstance(level_ref, int):
1624+
resolved_level_ids.append(self.index_columns[level_ref])
1625+
elif isinstance(level_ref, typing.Hashable):
1626+
matching_ids = self.index_name_to_col_id.get(level_ref, [])
1627+
if len(matching_ids) != 1:
1628+
raise ValueError("level name cannot be found or is ambiguous")
1629+
resolved_level_ids.append(matching_ids[0])
1630+
else:
1631+
raise ValueError(f"Unexpected level: {level_ref}")
1632+
return resolved_level_ids
1633+
16141634
def _is_monotonic(
16151635
self, column_ids: typing.Union[str, Sequence[str]], increasing: bool
16161636
) -> bool:

bigframes/dataframe.py

+11-21
Original file line numberDiff line numberDiff line change
@@ -1040,22 +1040,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
10401040
raise ValueError("Columns must be a multiindex to reorder levels.")
10411041

10421042
def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
1043-
if utils.is_list_like(level):
1044-
levels = list(level)
1045-
else:
1046-
levels = [level]
1047-
resolved_level_ids = []
1048-
for level_ref in levels:
1049-
if isinstance(level_ref, int):
1050-
resolved_level_ids.append(self._block.index_columns[level_ref])
1051-
elif isinstance(level_ref, typing.Hashable):
1052-
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
1053-
if len(matching_ids) != 1:
1054-
raise ValueError("level name cannot be found or is ambiguous")
1055-
resolved_level_ids.append(matching_ids[0])
1056-
else:
1057-
raise ValueError(f"Unexpected level: {level_ref}")
1058-
return resolved_level_ids
1043+
return self._block.resolve_index_level(level)
10591044

10601045
def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame:
10611046
block = self._block.rename(columns=columns)
@@ -1804,20 +1789,25 @@ def _stack_multi(self, level: LevelsType = -1):
18041789
block = block.stack(levels=len(level))
18051790
return DataFrame(block)
18061791

1807-
def unstack(self):
1792+
def unstack(self, level: LevelsType = -1):
1793+
if isinstance(level, int) or isinstance(level, str):
1794+
level = [level]
1795+
18081796
block = self._block
18091797
# Special case, unstack with mono-index transpose into a series
18101798
if self.index.nlevels == 1:
18111799
block = block.stack(how="right", levels=self.columns.nlevels)
18121800
return bigframes.series.Series(block)
18131801

1814-
# Pivot by last level of index
1815-
index_ids = block.index_columns
1802+
# Pivot by index levels
1803+
unstack_ids = self._resolve_levels(level)
18161804
block = block.reset_index(drop=False)
1817-
block = block.set_index(index_ids[:-1])
1805+
block = block.set_index(
1806+
[col for col in self._block.index_columns if col not in unstack_ids]
1807+
)
18181808

18191809
pivot_block = block.pivot(
1820-
columns=[index_ids[-1]],
1810+
columns=unstack_ids,
18211811
values=self._block.value_columns,
18221812
values_in_index=True,
18231813
)

bigframes/series.py

+24-16
Original file line numberDiff line numberDiff line change
@@ -354,22 +354,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
354354
return Series(self._block.reorder_levels(resolved_level_ids))
355355

356356
def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
357-
if _is_list_like(level):
358-
levels = list(level)
359-
else:
360-
levels = [level]
361-
resolved_level_ids = []
362-
for level_ref in levels:
363-
if isinstance(level_ref, int):
364-
resolved_level_ids.append(self._block.index_columns[level_ref])
365-
elif isinstance(level_ref, typing.Hashable):
366-
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
367-
if len(matching_ids) != 1:
368-
raise ValueError("level name cannot be found or is ambiguous")
369-
resolved_level_ids.append(matching_ids[0])
370-
else:
371-
raise ValueError(f"Unexpected level: {level_ref}")
372-
return resolved_level_ids
357+
return self._block.resolve_index_level(level)
373358

374359
def between(self, left, right, inclusive="both"):
375360
if inclusive not in ["both", "neither", "left", "right"]:
@@ -920,6 +905,29 @@ def argmin(self) -> int:
920905
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
921906
)
922907

908+
def unstack(self, level: LevelsType = -1):
909+
if isinstance(level, int) or isinstance(level, str):
910+
level = [level]
911+
912+
block = self._block
913+
914+
if self.index.nlevels == 1:
915+
raise ValueError("Series must have multi-index to unstack")
916+
917+
# Pivot by index levels
918+
unstack_ids = self._resolve_levels(level)
919+
block = block.reset_index(drop=False)
920+
block = block.set_index(
921+
[col for col in self._block.index_columns if col not in unstack_ids]
922+
)
923+
924+
pivot_block = block.pivot(
925+
columns=unstack_ids,
926+
values=self._block.value_columns,
927+
values_in_index=False,
928+
)
929+
return bigframes.dataframe.DataFrame(pivot_block)
930+
923931
def idxmax(self) -> blocks.Label:
924932
block = self._block.order_by(
925933
[

tests/system/conftest.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,11 @@ def hockey_df(
400400
hockey_table_id: str, session: bigframes.Session
401401
) -> bigframes.dataframe.DataFrame:
402402
"""DataFrame pointing at test data."""
403-
return session.read_gbq(hockey_table_id)
403+
return (
404+
session.read_gbq(hockey_table_id)
405+
.set_index(["player_name", "season"])
406+
.sort_index()
407+
)
404408

405409

406410
@pytest.fixture(scope="session")
@@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame:
419423
"season": pd.Int64Dtype(),
420424
},
421425
)
422-
df.index = df.index.astype("Int64")
426+
df = df.set_index(["player_name", "season"]).sort_index()
423427
return df
424428

425429

tests/system/small/test_dataframe.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns):
19491949
],
19501950
)
19511951
def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
1952-
bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas()
1953-
pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns)
1952+
bf_result = (
1953+
hockey_df.reset_index()
1954+
.pivot(values=values, index=index, columns=columns)
1955+
.to_pandas()
1956+
)
1957+
pd_result = hockey_pandas_df.reset_index().pivot(
1958+
values=values, index=index, columns=columns
1959+
)
19541960

19551961
# Pandas produces NaN, where bq dataframes produces pd.NA
19561962
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)

tests/system/small/test_multiindex.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -909,13 +909,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i
909909
pandas.testing.assert_frame_equal(bf_result, pd_result)
910910

911911

912-
def test_multi_index_unstack(hockey_df, hockey_pandas_df):
912+
@pytest.mark.parametrize(
913+
("level",),
914+
[(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
915+
)
916+
def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level):
913917
bf_result = (
914-
hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas()
918+
hockey_df.set_index(["team_name", "position"], append=True)
919+
.unstack(level=level)
920+
.to_pandas()
915921
)
916922
pd_result = hockey_pandas_df.set_index(
917-
["team_name", "season", "position"]
918-
).unstack()
923+
["team_name", "position"], append=True
924+
).unstack(level=level)
925+
926+
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
927+
928+
929+
@pytest.mark.parametrize(
930+
("level",),
931+
[(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
932+
)
933+
def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level):
934+
bf_result = (
935+
hockey_df.set_index(["team_name", "position"], append=True)["number"]
936+
.unstack(level=level)
937+
.to_pandas()
938+
)
939+
pd_result = hockey_pandas_df.set_index(["team_name", "position"], append=True)[
940+
"number"
941+
].unstack(level=level)
919942

920943
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
921944

third_party/bigframes_vendored/pandas/core/series.py

+13
Original file line numberDiff line numberDiff line change
@@ -1654,6 +1654,19 @@ def clip(self):
16541654
"""
16551655
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
16561656

1657+
def unstack(self, level):
1658+
"""
1659+
Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
1660+
1661+
Args:
1662+
level (int, str, or list of these, default last level):
1663+
Level(s) to unstack, can pass level name.
1664+
1665+
Returns:
1666+
DataFrame: Unstacked Series.
1667+
"""
1668+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1669+
16571670
def argmax(self):
16581671
"""
16591672
Return int position of the smallest value in the Series.

0 commit comments

Comments
 (0)