Skip to content

Commit 94d7cc9

Browse files
Add items, apply methods to dataframe.
Change-Id: Id3a0e78da3bb9ccce64e190f7797f737b239c33f
1 parent edabdbb commit 94d7cc9

File tree

3 files changed

+103
-0
lines changed

3 files changed

+103
-0
lines changed

bigframes/dataframe.py

+18
Original file line numberDiff line numberDiff line change
@@ -1448,6 +1448,12 @@ def isin(self, values) -> DataFrame:
14481448
f"isin(), you passed a [{type(values).__name__}]"
14491449
)
14501450

1451+
def items(self):
    """Lazily yield a ``(column label, Series)`` pair for each value column.

    Column ids and labels are read from the underlying block and paired
    positionally; each column id is then selected from the block and
    wrapped in a ``bigframes.series.Series``.
    """
    id_label_pairs = zip(self._block.value_columns, self._block.column_labels)
    for column_id, label in id_label_pairs:
        single_column_block = self._block.select_column(column_id)
        yield label, bigframes.series.Series(single_column_block)
1456+
14511457
def dropna(
14521458
self,
14531459
*,
@@ -2415,6 +2421,18 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
24152421
ops.RemoteFunctionOp(func, apply_on_null=(na_action is None))
24162422
)
24172423

2424+
def apply(self, func, *, args: typing.Tuple = (), **kwargs):
    """Apply ``func`` to every column and combine the per-column results.

    Each ``(label, column)`` pair produced by ``self.items()`` is processed
    by calling ``func(column, *args, **kwargs)``.

    Args:
        func: Callable invoked once per column.
        args: Extra positional arguments forwarded to ``func``. Keyword-only.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        A ``DataFrame`` built from the results when every result is a
        ``bigframes.series.Series`` or list-like; otherwise a
        ``pandas.Series`` of the results keyed by column label.
    """
    # NOTE(review): results are keyed by column label, so columns sharing a
    # label would overwrite one another here -- confirm this is acceptable.
    results = {
        label: func(column, *args, **kwargs) for label, column in self.items()
    }

    # A generator lets all() stop at the first scalar-like result instead of
    # materializing a temporary list of booleans.
    every_result_is_column_like = all(
        isinstance(result, bigframes.series.Series) or utils.is_list_like(result)
        for result in results.values()
    )
    if every_result_is_column_like:
        return DataFrame(data=results)
    return pandas.Series(data=results)
2435+
24182436
def drop_duplicates(
24192437
self,
24202438
subset: typing.Union[blocks.Label, typing.Sequence[blocks.Label]] = None,

tests/system/small/test_dataframe.py

+51
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,57 @@ def test_df_bfill(scalars_dfs):
663663
pandas.testing.assert_frame_equal(bf_result, pd_result)
664664

665665

666+
def test_apply_series_series_callable(
    scalars_df_index,
    scalars_pandas_df_index,
):
    """apply() with a Series -> Series callable must match pandas."""
    selected = ["int64_too", "int64_col"]
    positional = (33, 61)
    keywords = {"kwarg1": 52, "kwarg2": 21}

    def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0):
        positional_term = arg1 * arg2 % 4
        keyword_term = kwarg1 * kwarg2 % 7
        return series**2 + positional_term + keyword_term

    bf_frame = scalars_df_index[selected].apply(foo, args=positional, **keywords)
    actual = bf_frame.to_pandas()

    expected = scalars_pandas_df_index[selected].apply(
        foo, args=positional, **keywords
    )

    pandas.testing.assert_frame_equal(actual, expected)
686+
687+
688+
def test_apply_series_listlike_callable(
    scalars_df_index,
    scalars_pandas_df_index,
):
    """apply() with a callable returning a list per column must match pandas."""
    selected = ["int64_too", "int64_col"]

    def summarize(column):
        return [len(column), column.min(), 24]

    actual = scalars_df_index[selected].apply(summarize).to_pandas()

    expected = scalars_pandas_df_index[selected].apply(summarize)

    # pandas defaults to `int64`; cast both the index and the values to the
    # nullable `Int64` dtype so they line up with BigQuery DataFrames output.
    expected.index = expected.index.astype("Int64")
    expected = expected.astype("Int64")

    pandas.testing.assert_frame_equal(actual, expected)
703+
704+
705+
def test_apply_series_scalar_callable(
    scalars_df_index,
    scalars_pandas_df_index,
):
    """apply() with a callable returning a scalar per column must match pandas."""
    selected = ["int64_too", "int64_col"]

    def column_total(column):
        return column.sum()

    # No to_pandas() call here: the result is passed to assert_series_equal as-is.
    actual = scalars_df_index[selected].apply(column_total)

    expected = scalars_pandas_df_index[selected].apply(column_total)

    pandas.testing.assert_series_equal(actual, expected)
715+
716+
666717
def test_df_isin_list(scalars_dfs):
667718
scalars_df, scalars_pandas_df = scalars_dfs
668719
values = ["Hello, World!", 55555, 2.51, pd.NA, True]

third_party/bigframes_vendored/pandas/core/frame.py

+34
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,18 @@ def isin(self, values):
734734
"""
735735
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
736736

737+
def items(self):
    """
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Returns:
        Iterator: Iterator of (label, Series) tuples, one for each column.
    """
    # Abstract placeholder: this definition itself only raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
748+
737749
# ----------------------------------------------------------------------
738750
# Sorting
739751

@@ -1420,6 +1432,28 @@ def merge(
14201432
"""
14211433
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
14221434

1435+
def apply(self, func, *, args=(), **kwargs):
    """Apply a function along an axis of the DataFrame.

    Objects passed to the function are Series objects whose index is
    the DataFrame's index (``axis=0``). The final return type
    is inferred from the return type of the applied function.

    Args:
        func (function):
            Function to apply to each column or row.
        args (tuple):
            Positional arguments to pass to `func` in addition to the
            array/series. Must be supplied by keyword (``args=...``).
        **kwargs:
            Additional keyword arguments to pass as keyword arguments to
            `func`.

    Returns:
        pandas.Series or bigframes.DataFrame: Result of applying ``func`` along the given axis of the DataFrame.
    """
    # Abstract placeholder: this definition itself only raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
1456+
14231457
# ----------------------------------------------------------------------
14241458
# ndarray-like stats methods
14251459

0 commit comments

Comments
 (0)