Skip to content

feat: add unstack to series, add level param #115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 26, 2023
26 changes: 23 additions & 3 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@
_MONOTONIC_DECREASING = "monotonic_decreasing"


# A single index level reference: either a level name (str) or a position (int).
LevelType = typing.Union[str, int]
# One level reference, or a sequence of them.
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]


class BlockHolder(typing.Protocol):
"""Interface for mutable objects with state represented by a block value object."""

Expand Down Expand Up @@ -1423,9 +1427,7 @@ def _get_unique_values(
raise ValueError(f"Too many unique values: {pd_values}")

if len(columns) > 1:
return pd.MultiIndex.from_frame(
pd_values.sort_values(by=list(pd_values.columns), na_position="first")
)
return pd.MultiIndex.from_frame(pd_values)
else:
return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first"))

Expand Down Expand Up @@ -1611,6 +1613,24 @@ def cached(self) -> Block:
index_labels=self.index_labels,
)

def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]:
    """Translate index level references into index column ids.

    Args:
        level: A single level reference or a list-like of references. An
            ``int`` is used as a position into ``self.index_columns``; any
            other hashable value is looked up as an index name in
            ``self.index_name_to_col_id``.

    Returns:
        The resolved column ids, in the same order as the references.

    Raises:
        ValueError: If a name matches zero or more than one index column,
            or if a reference is neither an ``int`` nor hashable.
    """
    level_refs = list(level) if utils.is_list_like(level) else [level]

    column_ids = []
    for ref in level_refs:
        # Positional reference: index straight into the index columns.
        if isinstance(ref, int):
            column_ids.append(self.index_columns[ref])
            continue
        if not isinstance(ref, typing.Hashable):
            raise ValueError(f"Unexpected level: {ref}")
        # Named reference: the name must map to exactly one column id.
        candidates = self.index_name_to_col_id.get(ref, [])
        if len(candidates) != 1:
            raise ValueError("level name cannot be found or is ambiguous")
        column_ids.append(candidates[0])
    return column_ids

def _is_monotonic(
self, column_ids: typing.Union[str, Sequence[str]], increasing: bool
) -> bool:
Expand Down
32 changes: 11 additions & 21 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1038,22 +1038,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
raise ValueError("Columns must be a multiindex to reorder levels.")

def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
if utils.is_list_like(level):
levels = list(level)
else:
levels = [level]
resolved_level_ids = []
for level_ref in levels:
if isinstance(level_ref, int):
resolved_level_ids.append(self._block.index_columns[level_ref])
elif isinstance(level_ref, typing.Hashable):
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
if len(matching_ids) != 1:
raise ValueError("level name cannot be found or is ambiguous")
resolved_level_ids.append(matching_ids[0])
else:
raise ValueError(f"Unexpected level: {level_ref}")
return resolved_level_ids
return self._block.resolve_index_level(level)

def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame:
block = self._block.rename(columns=columns)
Expand Down Expand Up @@ -1802,20 +1787,25 @@ def _stack_multi(self, level: LevelsType = -1):
block = block.stack(levels=len(level))
return DataFrame(block)

def unstack(self):
def unstack(self, level: LevelsType = -1):
if isinstance(level, int) or isinstance(level, str):
level = [level]

block = self._block
# Special case, unstack with mono-index transpose into a series
if self.index.nlevels == 1:
block = block.stack(how="right", levels=self.columns.nlevels)
return bigframes.series.Series(block)

# Pivot by last level of index
index_ids = block.index_columns
# Pivot by index levels
unstack_ids = self._resolve_levels(level)
block = block.reset_index(drop=False)
block = block.set_index(index_ids[:-1])
block = block.set_index(
[col for col in self._block.index_columns if col not in unstack_ids]
)

pivot_block = block.pivot(
columns=[index_ids[-1]],
columns=unstack_ids,
values=self._block.value_columns,
values_in_index=True,
)
Expand Down
40 changes: 24 additions & 16 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,22 +352,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
return Series(self._block.reorder_levels(resolved_level_ids))

def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
if _is_list_like(level):
levels = list(level)
else:
levels = [level]
resolved_level_ids = []
for level_ref in levels:
if isinstance(level_ref, int):
resolved_level_ids.append(self._block.index_columns[level_ref])
elif isinstance(level_ref, typing.Hashable):
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
if len(matching_ids) != 1:
raise ValueError("level name cannot be found or is ambiguous")
resolved_level_ids.append(matching_ids[0])
else:
raise ValueError(f"Unexpected level: {level_ref}")
return resolved_level_ids
return self._block.resolve_index_level(level)

def between(self, left, right, inclusive="both"):
if inclusive not in ["both", "neither", "left", "right"]:
Expand Down Expand Up @@ -918,6 +903,29 @@ def argmin(self) -> int:
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
)

def unstack(self, level: LevelsType = -1):
    """Pivot the requested index level(s) of this Series into columns.

    Args:
        level: Index level(s) to pivot, given by position or by name.
            Defaults to ``-1``.

    Returns:
        A DataFrame built from the pivoted block.

    Raises:
        ValueError: If the index has only one level.
    """
    # A lone int/str reference is wrapped so it is always passed on as a list.
    levels = [level] if isinstance(level, (int, str)) else level

    if self.index.nlevels == 1:
        raise ValueError("Series must have multi-index to unstack")

    source_block = self._block
    pivot_ids = self._resolve_levels(levels)
    # Index levels that are not being pivoted stay as the row index.
    remaining_ids = [
        col_id for col_id in source_block.index_columns if col_id not in pivot_ids
    ]
    reindexed = source_block.reset_index(drop=False).set_index(remaining_ids)

    pivoted = reindexed.pivot(
        columns=pivot_ids,
        values=source_block.value_columns,
        values_in_index=False,
    )
    return bigframes.dataframe.DataFrame(pivoted)

def idxmax(self) -> blocks.Label:
block = self._block.order_by(
[
Expand Down
8 changes: 6 additions & 2 deletions tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,11 @@ def hockey_df(
hockey_table_id: str, session: bigframes.Session
) -> bigframes.dataframe.DataFrame:
"""DataFrame pointing at test data."""
return session.read_gbq(hockey_table_id)
return (
session.read_gbq(hockey_table_id)
.set_index(["player_name", "season"])
.sort_index()
)


@pytest.fixture(scope="session")
Expand All @@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame:
"season": pd.Int64Dtype(),
},
)
df.index = df.index.astype("Int64")
df = df.set_index(["player_name", "season"]).sort_index()
return df


Expand Down
10 changes: 8 additions & 2 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns):
],
)
def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas()
pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns)
bf_result = (
hockey_df.reset_index()
.pivot(values=values, index=index, columns=columns)
.to_pandas()
)
pd_result = hockey_pandas_df.reset_index().pivot(
values=values, index=index, columns=columns
)

# Pandas produces NaN, where bq dataframes produces pd.NA
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
Expand Down
31 changes: 27 additions & 4 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,13 +909,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_multi_index_unstack(hockey_df, hockey_pandas_df):
@pytest.mark.parametrize(
("level",),
[(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
)
def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level):
bf_result = (
hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas()
hockey_df.set_index(["team_name", "position"], append=True)
.unstack(level=level)
.to_pandas()
)
pd_result = hockey_pandas_df.set_index(
["team_name", "season", "position"]
).unstack()
["team_name", "position"], append=True
).unstack(level=level)

pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)


@pytest.mark.parametrize(
    ("level",),
    [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
)
def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level):
    """Check Series.unstack(level=...) against pandas on the "number" column.

    Both fixtures get "team_name" and "position" appended to their existing
    index before the "number" column is selected and unstacked.
    """
    extra_levels = ["team_name", "position"]

    bf_series = hockey_df.set_index(extra_levels, append=True)["number"]
    pd_series = hockey_pandas_df.set_index(extra_levels, append=True)["number"]

    bf_result = bf_series.unstack(level=level).to_pandas()
    pd_result = pd_series.unstack(level=level)

    pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)

Expand Down
13 changes: 13 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1654,6 +1654,19 @@ def clip(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def unstack(self, level=-1):
    """
    Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.

    Args:
        level (int, str, or list of these, default -1 (last level)):
            Level(s) to unstack, can pass level name.

    Returns:
        DataFrame: Unstacked Series.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def argmax(self):
"""
Return int position of the largest value in the Series.
Expand Down