Skip to content

Commit 5a3ac6c

Browse files
feat: add unstack to Series, add level param to DataFrame.unstack
1 parent 752a1d6 commit 5a3ac6c

File tree

6 files changed

+104
-48
lines changed

6 files changed

+104
-48
lines changed

bigframes/core/blocks.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@
6969
_MONOTONIC_DECREASING = "monotonic_decreasing"
7070

7171

72+
LevelType = typing.Union[str, int]
73+
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
74+
75+
7276
class BlockHolder(typing.Protocol):
7377
"""Interface for mutable objects with state represented by a block value object."""
7478

@@ -1433,9 +1437,7 @@ def _get_unique_values(
14331437
raise ValueError(f"Too many unique values: {pd_values}")
14341438

14351439
if len(columns) > 1:
1436-
return pd.MultiIndex.from_frame(
1437-
pd_values.sort_values(by=list(pd_values.columns), na_position="first")
1438-
)
1440+
return pd.MultiIndex.from_frame(pd_values)
14391441
else:
14401442
return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first"))
14411443

@@ -1621,6 +1623,24 @@ def cached(self) -> Block:
16211623
index_labels=self.index_labels,
16221624
)
16231625

1626+
def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]:
    """Translate index level references into index column ids.

    Args:
        level: A single level reference or a list-like of references. An
            ``int`` is used as a position into ``self.index_columns``; any
            other hashable value is looked up as an index name in
            ``self.index_name_to_col_id``.

    Returns:
        The resolved column ids, in the same order as the references.

    Raises:
        ValueError: If a name matches zero or more than one index column,
            or if a reference is neither an ``int`` nor hashable.
    """
    level_refs = list(level) if utils.is_list_like(level) else [level]

    column_ids = []
    for ref in level_refs:
        # Integers are tested first: they are hashable as well, but are
        # always interpreted as positions rather than as names.
        if isinstance(ref, int):
            column_ids.append(self.index_columns[ref])
            continue
        if not isinstance(ref, typing.Hashable):
            raise ValueError(f"Unexpected level: {ref}")
        candidates = self.index_name_to_col_id.get(ref, [])
        if len(candidates) != 1:
            raise ValueError("level name cannot be found or is ambiguous")
        column_ids.append(candidates[0])
    return column_ids
1643+
16241644
def _is_monotonic(
16251645
self, column_ids: typing.Union[str, Sequence[str]], increasing: bool
16261646
) -> bool:

bigframes/dataframe.py

+11-21
Original file line numberDiff line numberDiff line change
@@ -1034,22 +1034,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
10341034
raise ValueError("Columns must be a multiindex to reorder levels.")
10351035

10361036
def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
1037-
if utils.is_list_like(level):
1038-
levels = list(level)
1039-
else:
1040-
levels = [level]
1041-
resolved_level_ids = []
1042-
for level_ref in levels:
1043-
if isinstance(level_ref, int):
1044-
resolved_level_ids.append(self._block.index_columns[level_ref])
1045-
elif isinstance(level_ref, typing.Hashable):
1046-
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
1047-
if len(matching_ids) != 1:
1048-
raise ValueError("level name cannot be found or is ambiguous")
1049-
resolved_level_ids.append(matching_ids[0])
1050-
else:
1051-
raise ValueError(f"Unexpected level: {level_ref}")
1052-
return resolved_level_ids
1037+
return self._block.resolve_index_level(level)
10531038

10541039
def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame:
10551040
block = self._block.rename(columns=columns)
@@ -1781,20 +1766,25 @@ def _stack_multi(self, level: LevelsType = -1):
17811766
block = block.stack(levels=len(level))
17821767
return DataFrame(block)
17831768

1784-
def unstack(self):
1769+
def unstack(self, level: LevelsType = -1):
1770+
if isinstance(level, int) or isinstance(level, str):
1771+
level = [level]
1772+
17851773
block = self._block
17861774
# Special case, unstack with mono-index transpose into a series
17871775
if self.index.nlevels == 1:
17881776
block = block.stack(how="right", levels=self.columns.nlevels)
17891777
return bigframes.series.Series(block)
17901778

1791-
# Pivot by last level of index
1792-
index_ids = block.index_columns
1779+
# Pivot by index levels
1780+
unstack_ids = self._resolve_levels(level)
17931781
block = block.reset_index(drop=False)
1794-
block = block.set_index(index_ids[:-1])
1782+
block = block.set_index(
1783+
[col for col in self._block.index_columns if col not in unstack_ids]
1784+
)
17951785

17961786
pivot_block = block.pivot(
1797-
columns=[index_ids[-1]],
1787+
columns=unstack_ids,
17981788
values=self._block.value_columns,
17991789
values_in_index=True,
18001790
)

bigframes/series.py

+24-16
Original file line numberDiff line numberDiff line change
@@ -348,22 +348,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
348348
return Series(self._block.reorder_levels(resolved_level_ids))
349349

350350
def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
351-
if _is_list_like(level):
352-
levels = list(level)
353-
else:
354-
levels = [level]
355-
resolved_level_ids = []
356-
for level_ref in levels:
357-
if isinstance(level_ref, int):
358-
resolved_level_ids.append(self._block.index_columns[level_ref])
359-
elif isinstance(level_ref, typing.Hashable):
360-
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
361-
if len(matching_ids) != 1:
362-
raise ValueError("level name cannot be found or is ambiguous")
363-
resolved_level_ids.append(matching_ids[0])
364-
else:
365-
raise ValueError(f"Unexpected level: {level_ref}")
366-
return resolved_level_ids
351+
return self._block.resolve_index_level(level)
367352

368353
def between(self, left, right, inclusive="both"):
369354
if inclusive not in ["both", "neither", "left", "right"]:
@@ -914,6 +899,29 @@ def argmin(self) -> int:
914899
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
915900
)
916901

902+
def unstack(self, level: LevelsType = -1):
    """Pivot one or more index levels into columns, producing a DataFrame.

    Args:
        level: Index level(s) to move into the columns, given by position
            or by name. A lone ``int`` or ``str`` is treated as a
            one-element list. Defaults to ``-1``.

    Returns:
        A ``bigframes.dataframe.DataFrame`` wrapping the pivoted block.

    Raises:
        ValueError: If the index of this series has only one level.
    """
    if self.index.nlevels == 1:
        raise ValueError("Series must have multi-index to unstack")

    levels = [level] if isinstance(level, (int, str)) else level

    # The index columns being unstacked become the pivot columns; every
    # other index column stays in the row index.
    pivot_column_ids = self._resolve_levels(levels)
    source_block = self._block
    remaining_index_ids = [
        column_id
        for column_id in source_block.index_columns
        if column_id not in pivot_column_ids
    ]
    flattened = source_block.reset_index(drop=False).set_index(remaining_index_ids)

    pivoted = flattened.pivot(
        columns=pivot_column_ids,
        values=source_block.value_columns,
        values_in_index=False,
    )
    return bigframes.dataframe.DataFrame(pivoted)
924+
917925
def idxmax(self) -> blocks.Label:
918926
block = self._block.order_by(
919927
[

tests/system/conftest.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,11 @@ def hockey_df(
387387
hockey_table_id: str, session: bigframes.Session
388388
) -> bigframes.dataframe.DataFrame:
389389
"""DataFrame pointing at test data."""
390-
return session.read_gbq(hockey_table_id)
390+
return (
391+
session.read_gbq(hockey_table_id)
392+
.set_index(["player_name", "season"])
393+
.sort_index()
394+
)
391395

392396

393397
@pytest.fixture(scope="session")
@@ -406,7 +410,7 @@ def hockey_pandas_df() -> pd.DataFrame:
406410
"season": pd.Int64Dtype(),
407411
},
408412
)
409-
df.index = df.index.astype("Int64")
413+
df = df.set_index(["player_name", "season"]).sort_index()
410414
return df
411415

412416

tests/system/small/test_dataframe.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns):
19491949
],
19501950
)
19511951
def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
1952-
bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas()
1953-
pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns)
1952+
bf_result = (
1953+
hockey_df.reset_index()
1954+
.pivot(values=values, index=index, columns=columns)
1955+
.to_pandas()
1956+
)
1957+
pd_result = hockey_pandas_df.reset_index().pivot(
1958+
values=values, index=index, columns=columns
1959+
)
19541960

19551961
# Pandas produces NaN, where bq dataframes produces pd.NA
19561962
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)

tests/system/small/test_multiindex.py

+32-4
Original file line numberDiff line numberDiff line change
@@ -909,13 +909,41 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i
909909
pandas.testing.assert_frame_equal(bf_result, pd_result)
910910

911911

912-
def test_multi_index_unstack(hockey_df, hockey_pandas_df):
912+
@pytest.mark.parametrize(
913+
("level",),
914+
[(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
915+
)
916+
def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level):
913917
bf_result = (
914-
hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas()
918+
hockey_df.set_index(["team_name", "position"], append=True)
919+
.unstack(level=level)
920+
.to_pandas()
915921
)
916922
pd_result = hockey_pandas_df.set_index(
917-
["team_name", "season", "position"]
918-
).unstack()
923+
["team_name", "position"], append=True
924+
).unstack(level=level)
925+
926+
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
927+
928+
929+
@pytest.mark.parametrize(
    ("level",),
    [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
)
def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level):
    """Check Series.unstack against pandas for names, positions and lists of levels."""
    bf_result = (
        hockey_df.set_index(["team_name", "position"], append=True)["number"]
        .unstack(level=level)
        .to_pandas()
    )
    pd_result = hockey_pandas_df.set_index(["team_name", "position"], append=True)[
        "number"
    ].unstack(level=level)

    # No debug printing of the frames: assert_frame_equal already reports the
    # differing values when the comparison fails.
    pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
921949

0 commit comments

Comments
 (0)