Skip to content

feat: Add hasnans, combine_first, update to Series #600

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 10, 2024
28 changes: 27 additions & 1 deletion bigframes/core/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,27 @@
# limitations under the License.
from __future__ import annotations

from typing import Optional

import pandas as pd

import bigframes.core.indexes as index
import bigframes.series as series


def to_bf_series(obj, default_index: index.Index) -> series.Series:
def to_bf_series(obj, default_index: Optional[index.Index]) -> series.Series:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may be worth adding some docs on how the default_index will be handled. I couldn't tell without reading the code.

"""
Convert an object to a bigframes Series

Args:
obj (list-like or Series):
Object to convert to bigframes Series
default_index (list-like or Index or None):
Index to use if obj has no index

Returns:
bigframes.pandas.Series
"""
if isinstance(obj, series.Series):
return obj
if isinstance(obj, pd.Series):
Expand All @@ -35,6 +49,18 @@ def to_bf_series(obj, default_index: index.Index) -> series.Series:


def to_pd_series(obj, default_index: pd.Index) -> pd.Series:
"""
Convert an object to a pandas Series

Args:
obj (list-like or Series):
Object to convert to pandas Series
default_index (list-like or Index or None):
Index to use if obj has no index

Returns:
pandas.Series
"""
if isinstance(obj, series.Series):
return obj.to_pandas()
if isinstance(obj, pd.Series):
Expand Down
21 changes: 20 additions & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import os
import textwrap
import typing
from typing import Any, Literal, Mapping, Optional, Tuple, Union
from typing import Any, Literal, Mapping, Optional, Sequence, Tuple, Union

import bigframes_vendored.pandas.core.series as vendored_pandas_series
import google.cloud.bigquery as bigquery
Expand Down Expand Up @@ -130,6 +130,11 @@ def ndim(self) -> int:
def empty(self) -> bool:
return self.shape[0] == 0

@property
def hasnans(self) -> bool:
    """Report whether this series contains at least one null value."""
    # Despite the name this is a null check: for nullable float data,
    # NaN values do not count.
    null_mask = self.isnull()
    return null_mask.any()

@property
def values(self) -> numpy.ndarray:
    """Expose the series data as an array by delegating to ``to_numpy``."""
    as_array = self.to_numpy()
    return as_array
Expand Down Expand Up @@ -753,6 +758,20 @@ def __matmul__(self, other):

dot = __matmul__

def combine_first(self, other: Series) -> Series:
    """Combine this series with ``other`` using ``ops.coalesce_op``.

    Args:
        other (Series):
            Series supplying the second operand of the coalesce.

    Returns:
        Series: The coalesced result, carrying this series' name.
    """
    combined = self._apply_binary_op(other, ops.coalesce_op)
    # The result is explicitly given this series' name.
    combined.name = self.name
    return combined

def update(self, other: Union[Series, Sequence, Mapping]) -> None:
    """Replace this series' data in place using values taken from ``other``.

    ``other`` is converted with ``to_bf_series`` (no default index), coalesced
    against this series, and the resulting block becomes this series' block.

    Args:
        other (Series, Sequence or Mapping):
            Object to take updated values from.
    """
    # Function-scope import, as before; presumably this sidesteps a circular
    # import between the two modules -- TODO confirm.
    import bigframes.core.convert as convert

    other_series = convert.to_bf_series(other, default_index=None)
    # NOTE(review): reverse=True presumably makes ``other`` the first coalesce
    # operand, and alignment="left" presumably keeps this series' index --
    # confirm against _apply_binary_op.
    updated = self._apply_binary_op(
        other_series, ops.coalesce_op, reverse=True, alignment="left"
    )
    self._set_block(updated._get_block())

def abs(self) -> Series:
    """Return the result of applying ``ops.abs_op`` to this series."""
    absolute = self._apply_unary_op(ops.abs_op)
    return absolute

Expand Down
51 changes: 51 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1261,6 +1261,39 @@ def test_binop_right_filtered(scalars_dfs):
)


@skip_legacy_pandas
def test_series_combine_first(scalars_dfs):
    """combine_first on head/tail slices of two columns must match pandas."""
    bf_df, pd_df = scalars_dfs

    # BigQuery DataFrames side: first 7 ints combined with last 7 floats.
    bf_head = bf_df["int64_col"].head(7)
    bf_tail = bf_df["float64_col"].tail(7)
    actual = bf_head.combine_first(bf_tail).to_pandas()

    # pandas side: the same slices and the same operation.
    pd_head = pd_df["int64_col"].head(7)
    pd_tail = pd_df["float64_col"].tail(7)
    expected = pd_head.combine_first(pd_tail)

    assert_series_equal(actual, expected)


def test_series_update(scalars_dfs):
    """In-place Series.update must leave the same values as pandas' update."""
    bf_df, pd_df = scalars_dfs

    # Update a copy of the last 7 floats with the first 7 ints.
    bf_source = bf_df["int64_col"].head(7)
    bf_target = bf_df["float64_col"].tail(7).copy()
    bf_target.update(bf_source)

    # Same steps in pandas to produce the expected values.
    pd_source = pd_df["int64_col"].head(7)
    pd_target = pd_df["float64_col"].tail(7).copy()
    pd_target.update(pd_source)

    assert_series_equal(bf_target.to_pandas(), pd_target)


def test_mean(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_col"
Expand Down Expand Up @@ -1649,6 +1682,24 @@ def test_size(scalars_dfs):
assert pd_result == bf_result


def test_series_hasnans_true(scalars_dfs):
    """hasnans on the unfiltered string column must agree with pandas."""
    bf_df, pd_df = scalars_dfs
    column = "string_col"

    actual = bf_df[column].hasnans
    expected = pd_df[column].hasnans

    assert expected == actual


def test_series_hasnans_false(scalars_dfs):
    """hasnans after dropna() on the string column must agree with pandas."""
    bf_df, pd_df = scalars_dfs
    column = "string_col"

    actual = bf_df[column].dropna().hasnans
    expected = pd_df[column].dropna().hasnans

    assert expected == actual


def test_empty_false(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

Expand Down
138 changes: 138 additions & 0 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,31 @@ def name(self) -> Hashable:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def hasnans(self) -> bool:
    """
    Indicate whether the Series contains any NaNs.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series([1, 2, 3, None])
        >>> s
        0     1.0
        1     2.0
        2     3.0
        3    <NA>
        dtype: Float64
        >>> s.hasnans
        True

    Returns:
        bool: True if there are any NaNs.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def T(self) -> Series:
"""Return the transpose, which is by definition self.
Expand Down Expand Up @@ -2343,6 +2368,119 @@ def rdivmod(self, other) -> Series:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def combine_first(self, other) -> Series:
    """
    Fill null elements with the value at the same location in ``other``.

    The two Series are combined by filling the null values of this Series
    with the non-null values of ``other``. The index of the result is the
    union of the two indexes.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s1 = bpd.Series([1, np.nan])
        >>> s2 = bpd.Series([3, 4, 5])
        >>> s1.combine_first(s2)
        0    1.0
        1    4.0
        2    5.0
        dtype: Float64

    A null value persists when its location does not exist in ``other``:

        >>> s1 = bpd.Series({'falcon': np.nan, 'eagle': 160.0})
        >>> s2 = bpd.Series({'eagle': 200.0, 'duck': 30.0})
        >>> s1.combine_first(s2)
        falcon     <NA>
        eagle     160.0
        duck       30.0
        dtype: Float64

    Args:
        other (Series):
            The value(s) to be used for filling null values.

    Returns:
        Series: The result of combining the provided Series with the other object.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def update(self, other) -> None:
    """
    Modify this Series in place using values from the passed Series.

    Only the non-NA values of the passed Series are used for the update.
    The two objects are aligned on index.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import pandas as pd
        >>> import numpy as np
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series([1, 2, 3])
        >>> s.update(bpd.Series([4, 5, 6]))
        >>> s
        0    4
        1    5
        2    6
        dtype: Int64

        >>> s = bpd.Series(['a', 'b', 'c'])
        >>> s.update(bpd.Series(['d', 'e'], index=[0, 2]))
        >>> s
        0    d
        1    b
        2    e
        dtype: string

        >>> s = bpd.Series([1, 2, 3])
        >>> s.update(bpd.Series([4, 5, 6, 7, 8]))
        >>> s
        0    4
        1    5
        2    6
        dtype: Int64

    Where ``other`` contains NaNs, the corresponding values in the original
    Series are left unchanged.

        >>> s = bpd.Series([1, 2, 3])
        >>> s.update(bpd.Series([4, np.nan, 6], dtype=pd.Int64Dtype()))
        >>> s
        0    4
        1    2
        2    6
        dtype: Int64

    ``other`` may also be a non-Series object that is coercible into a
    Series.

        >>> s = bpd.Series([1, 2, 3])
        >>> s.update([4, np.nan, 6])
        >>> s
        0    4.0
        1    2.0
        2    6.0
        dtype: Float64

        >>> s = bpd.Series([1, 2, 3])
        >>> s.update({1: 9})
        >>> s
        0    1
        1    9
        2    3
        dtype: Int64

    Args:
        other (Series, or object coercible into Series):
            The object whose non-NA values are used to update this Series.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def all(
self,
):
Expand Down