Skip to content

Commit 4e4409c

Browse files
feat: add dataframe melt (#116)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
1 parent 2d7128d commit 4e4409c

File tree

5 files changed

+176
-2
lines changed

5 files changed

+176
-2
lines changed

bigframes/core/blocks.py

+39-2
Original file line numberDiff line numberDiff line change
@@ -1356,13 +1356,50 @@ def stack(self, how="left", levels: int = 1):
13561356
index_columns = [*added_index_columns, *self.index_columns]
13571357
index_labels = [*new_index_level_names, *self._index_labels]
13581358

1359-
block = Block(
1359+
return Block(
13601360
unpivot_expr,
13611361
index_columns=index_columns,
13621362
column_labels=result_index,
13631363
index_labels=index_labels,
13641364
)
1365-
return block
1365+
1366+
def melt(
    self,
    id_vars: typing.Sequence[str],
    value_vars: typing.Sequence[str],
    var_names: typing.Sequence[typing.Hashable],
    value_name: typing.Hashable = "value",
):
    """Unpivot this block from wide to long format.

    Bug fix: the original signature used ``=`` instead of ``:`` for the first
    three parameters, so each one silently *defaulted to a typing object*
    (e.g. ``id_vars=typing.Sequence[str]``) rather than being annotated.
    They are now properly annotated, required parameters.

    Args:
        id_vars: Column ids kept as identifier (passthrough) columns.
        value_vars: Column ids to unpivot into rows; all must share one dtype.
        var_names: Labels for the new "variable" column(s), one per column
            index level being unpivoted.
        value_name: Label for the new "value" column. Defaults to ``"value"``.

    Returns:
        Block: A new block with one row per (original row, value column) pair.
    """
    # TODO: Implement col_level and ignore_index
    unpivot_col_id = guid.generate_guid()
    var_col_ids = tuple(guid.generate_guid() for _ in var_names)
    # A single unpivot output column gathers the values of all value_vars.
    unpivot_col = (unpivot_col_id, tuple(value_vars))
    value_labels = [self.col_id_to_label[col_id] for col_id in value_vars]
    id_labels = [self.col_id_to_label[col_id] for col_id in id_vars]

    # All value columns must share this dtype for the unpivot to be valid.
    dtype = self._expr.get_column_type(value_vars[0])

    unpivot_expr = self._expr.unpivot(
        row_labels=value_labels,
        passthrough_columns=id_vars,
        unpivot_columns=(unpivot_col,),
        index_col_ids=var_col_ids,
        dtype=dtype,
        how="right",
    )
    # Materialize a fresh default index (pandas melt ignores the old index).
    index_id = guid.generate_guid()
    unpivot_expr = unpivot_expr.promote_offsets(index_id)
    # Need to reorder to get id_vars before var_col and unpivot_col
    unpivot_expr = unpivot_expr.select_columns(
        [index_id, *id_vars, *var_col_ids, unpivot_col_id]
    )

    return Block(
        unpivot_expr,
        column_labels=[*id_labels, *var_names, value_name],
        index_columns=[index_id],
    )
13661403

13671404
def _create_stack_column(
13681405
self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple]

bigframes/dataframe.py

+38
Original file line numberDiff line numberDiff line change
@@ -1673,6 +1673,44 @@ def idxmin(self) -> bigframes.series.Series:
16731673
def idxmax(self) -> bigframes.series.Series:
16741674
return bigframes.series.Series(block_ops.idxmax(self._block))
16751675

1676+
def melt(
    self,
    id_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None,
    value_vars: typing.Optional[typing.Iterable[typing.Hashable]] = None,
    var_name: typing.Union[
        typing.Hashable, typing.Sequence[typing.Hashable]
    ] = None,
    value_name: typing.Hashable = "value",
):
    """Unpivot from wide to long format by delegating to the block layer.

    Resolves user-facing column labels to internal column ids, derives a
    default ``var_name`` from the column index when none is given, then
    builds the result via ``Block.melt``.
    """
    if var_name is None:
        level_names = self.columns.names
        if self.columns.nlevels <= 1:
            # Single-level columns: use the index name, else pandas' default.
            var_name = self.columns.name or "variable"
        elif len(level_names) == len(set(level_names)):
            # Multi-level columns with unique level names: reuse them.
            var_name = level_names
        else:
            # Duplicate level names: fall back to positional defaults.
            var_name = [f"variable_{i}" for i in range(len(level_names))]

    # Normalize to a tuple of labels, one per unpivoted column level.
    var_name = tuple(var_name) if utils.is_list_like(var_name) else (var_name,)

    identifier_ids = (
        []
        if id_vars is None
        else [self._resolve_label_exact(col) for col in id_vars]
    )
    if value_vars is None:
        # Default: unpivot every value column not already used as an id.
        value_ids = [
            col_id
            for col_id in self._block.value_columns
            if col_id not in identifier_ids
        ]
    else:
        value_ids = [self._resolve_label_exact(col) for col in value_vars]

    return DataFrame(
        self._block.melt(identifier_ids, value_ids, var_name, value_name)
    )
1713+
16761714
def describe(self) -> DataFrame:
16771715
df_numeric = self._drop_non_numeric(keep_bool=False)
16781716
if len(df_numeric.columns) == 0:

tests/system/small/test_dataframe.py

+43
Original file line numberDiff line numberDiff line change
@@ -1919,6 +1919,49 @@ def test_df_stack(scalars_dfs):
19191919
pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
19201920

19211921

1922+
def test_df_melt_default(scalars_dfs):
    """melt() with no arguments should match pandas' default behavior."""
    bf_df, pd_df = scalars_dfs
    # Align pandas' column labels with BigQuery DataFrames' string dtype.
    pd_df = pd_df.copy()
    pd_df.columns = pd_df.columns.astype("string[pyarrow]")
    # Restrict to identically-typed columns; only those can be stacked.
    cols = ["int64_col", "int64_too", "rowindex_2"]

    bf_result = bf_df[cols].melt().to_pandas()
    pd_result = pd_df[cols].melt()

    # Pandas produces an int64 index while BigFrames uses nullable Int64.
    pd.testing.assert_frame_equal(
        bf_result, pd_result, check_index_type=False, check_dtype=False
    )
1937+
1938+
1939+
def test_df_melt_parameterized(scalars_dfs):
    """melt() with explicit id/value vars and names should match pandas."""
    bf_df, pd_df = scalars_dfs
    # Align pandas' column labels with BigQuery DataFrames' string dtype.
    pd_df = pd_df.copy()
    pd_df.columns = pd_df.columns.astype("string[pyarrow]")

    # Same keyword arguments drive both implementations.
    melt_kwargs = dict(
        var_name="alice",
        value_name="bob",
        id_vars=["string_col"],
        value_vars=["int64_col", "int64_too"],
    )
    bf_result = bf_df.melt(**melt_kwargs).to_pandas()
    pd_result = pd_df.melt(**melt_kwargs)

    # Pandas produces an int64 index while BigFrames uses nullable Int64.
    pd.testing.assert_frame_equal(
        bf_result, pd_result, check_index_type=False, check_dtype=False
    )
1964+
19221965
def test_df_unstack(scalars_dfs):
19231966
scalars_df, scalars_pandas_df = scalars_dfs
19241967
# To match bigquery dataframes

tests/system/small/test_multiindex.py

+28
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,34 @@ def test_column_multi_index_stack(level):
752752
)
753753

754754

755+
def test_column_multi_index_melt():
    """melt() over a three-level column MultiIndex should match pandas."""
    if pandas.__version__.startswith(("1.", "2.0")):
        pytest.skip("pandas <2.1 uses different stack implementation")

    multi_columns = pandas.MultiIndex.from_arrays(
        [
            pandas.Index(["b", "a", "b"]),
            pandas.Index(["a", "b", "b"]),
            pandas.Index(["b", "b", "a"]),
        ],
        names=["l1", "l2", "l3"],
    )
    pd_df = pandas.DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=[5, 2, None],
        columns=multi_columns,
        dtype="Int64",
    )
    bf_df = bpd.DataFrame(pd_df)

    bf_result = bf_df.melt().to_pandas()
    pd_result = pd_df.melt()

    # BigFrames uses different string and int types, but values are identical
    pandas.testing.assert_frame_equal(
        bf_result, pd_result, check_index_type=False, check_dtype=False
    )
782+
755783
def test_column_multi_index_unstack(scalars_df_index, scalars_pandas_df_index):
756784
columns = ["int64_too", "int64_col", "rowindex_2"]
757785
level1 = pandas.Index(["b", "a", "b"], dtype="string[pyarrow]")

third_party/bigframes_vendored/pandas/core/frame.py

+28
Original file line numberDiff line numberDiff line change
@@ -2010,6 +2010,34 @@ def idxmax(self):
20102010
"""
20112011
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
20122012

2013+
def melt(self, id_vars, value_vars, var_name, value_name):
    """
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    Args:
        id_vars (tuple, list, or ndarray, optional):
            Column(s) to use as identifier variables.
        value_vars (tuple, list, or ndarray, optional):
            Column(s) to unpivot. If not specified, uses all columns that
            are not set as `id_vars`.
        var_name (scalar):
            Name to use for the 'variable' column. If None it uses
            ``frame.columns.name`` or 'variable'.
        value_name (scalar, default 'value'):
            Name to use for the 'value' column.

    Returns:
        DataFrame: Unpivoted DataFrame.
    """
    # Abstract: concrete subclasses provide the implementation.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
2040+
20132041
def nunique(self):
20142042
"""
20152043
Count number of distinct elements in specified axis.

0 commit comments

Comments
 (0)