Skip to content

feat: add equals methods to series/dataframe #76

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,39 @@
import bigframes.operations.aggregations as agg_ops


def equals(block1: blocks.Block, block2: blocks.Block) -> bool:
    """Return True if the two blocks have equal labels, dtypes, and values.

    Nulls in the same position are considered equal to each other, matching
    pandas ``DataFrame.equals`` semantics.

    Args:
        block1: First block to compare.
        block2: Second block to compare. Must be dtype-compatible with
            ``block1`` for the comparison to return True.

    Returns:
        bool: True if labels, dtypes, and all cell values (including index
        values) match; False otherwise.
    """
    if not block1.column_labels.equals(block2.column_labels):
        return False
    if block1.dtypes != block2.dtypes:
        return False
    # TODO: More advanced expression tree traversals to short circuit actually querying data

    # Fold the index into the value columns so index values participate in
    # the cell-wise equality check as well.
    block1 = block1.reset_index(drop=False)
    block2 = block2.reset_index(drop=False)

    joined, (lmap, rmap) = block1.index.join(block2.index, how="outer")
    joined_block = joined._block

    equality_ids = []
    for lcol, rcol in zip(block1.value_columns, block2.value_columns):
        lcolmapped = lmap(lcol)
        rcolmapped = rmap(rcol)
        joined_block, result_id = joined_block.apply_binary_op(
            lcolmapped, rcolmapped, ops.eq_nulls_match_op
        )
        # Rows present in only one block yield NULL comparison results after
        # the outer join; count those as unequal.
        joined_block, result_id = joined_block.apply_unary_op(
            result_id, ops.partial_right(ops.fillna_op, False)
        )
        equality_ids.append(result_id)

    # Collapse all per-column equality indicators into a single boolean by
    # stacking them into one column and reducing with "all".
    joined_block = joined_block.select_columns(equality_ids).with_column_labels(
        list(range(len(equality_ids)))
    )
    stacked_block = joined_block.stack(dropna=False, sort=False)
    result = stacked_block.get_stat(stacked_block.value_columns[0], agg_ops.all_op)
    return typing.cast(bool, result)


def indicate_duplicates(
block: blocks.Block, columns: typing.Sequence[str], keep: str = "first"
) -> typing.Tuple[blocks.Block, str]:
Expand Down
6 changes: 6 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,12 @@ def rename_axis(
labels = [mapper]
return DataFrame(self._block.with_index_labels(labels))

def equals(self, other: typing.Union[bigframes.series.Series, DataFrame]) -> bool:
    """Return True if ``other`` is a DataFrame with the same labels, dtypes and values."""
    # Anything that is not a DataFrame (e.g. a Series) can never be equal.
    if isinstance(other, DataFrame):
        return block_ops.equals(self._block, other._block)
    return False

def assign(self, **kwargs) -> DataFrame:
# TODO(garrettwu) Support list-like values. Requires ordering.
# TODO(garrettwu) Support callable values.
Expand Down
10 changes: 10 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,16 @@ def eq_op(
return x == y


def eq_nulls_match_op(
    x: ibis_types.Value,
    y: ibis_types.Value,
):
    """Variant of eq_op where nulls match each other. Only use where dtypes are known to be same."""
    # Replace NULL on both sides with a shared sentinel string so that
    # NULL == NULL evaluates to True rather than NULL.
    sentinel = ibis_types.literal("$NULL_SENTINEL$")
    left = x.cast(ibis_dtypes.str).fillna(sentinel)
    right = y.cast(ibis_dtypes.str).fillna(sentinel)
    return left == right


def ne_op(
x: ibis_types.Value,
y: ibis_types.Value,
Expand Down
8 changes: 8 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,14 @@ def rename_axis(
labels = [mapper]
return Series(self._block.with_index_labels(labels))

def equals(
    self, other: typing.Union[Series, bigframes.dataframe.DataFrame]
) -> bool:
    """Return True if ``other`` is a Series with the same labels, dtype and values."""
    # Only another Series can be equal; a DataFrame never matches a Series.
    if isinstance(other, Series):
        return block_ops.equals(self._block, other._block)
    return False

def reset_index(
self,
*,
Expand Down
68 changes: 68 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2423,6 +2423,74 @@ def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index):
)


def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index):
    """A DataFrame compared against itself matches pandas (always equal)."""
    # geography_col is not supported by the equality comparison; exclude it.
    unsupported = ["geography_col"]
    bf_df = scalars_df_index.drop(columns=unsupported)
    pd_df = scalars_pandas_df_index.drop(columns=unsupported)

    assert bf_df.equals(bf_df) == pd_df.equals(pd_df)


def test_df_equals_series(scalars_df_index, scalars_pandas_df_index):
    """Comparing a one-column DataFrame to a Series matches pandas (never equal)."""
    bf_frame = scalars_df_index[["int64_col"]]
    pd_frame = scalars_pandas_df_index[["int64_col"]]

    bf_outcome = bf_frame.equals(scalars_df_index["int64_col"])
    pd_outcome = pd_frame.equals(scalars_pandas_df_index["int64_col"])

    assert bf_outcome == pd_outcome


def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
    """Identical values with a different dtype match pandas (not equal)."""
    cols = ["int64_col", "int64_too"]
    bf_df = scalars_df_index[cols]
    pd_df = scalars_pandas_df_index[cols]

    # Cast to Float64 so values agree but dtypes differ.
    bf_as_float = bf_df.copy().astype("Float64")
    pd_as_float = pd_df.copy().astype("Float64")

    assert bf_df.equals(bf_as_float) == pd_df.equals(pd_as_float)


def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index):
    """Same shape/dtypes but differing cell values match pandas (not equal)."""
    cols = ["int64_col", "int64_too"]
    bf_df = scalars_df_index[cols]
    pd_df = scalars_pandas_df_index[cols]

    # Perturb one column so values differ while labels and dtypes match.
    bf_changed = bf_df.copy()
    bf_changed["int64_col"] = bf_changed.int64_col + 1
    pd_changed = pd_df.copy()
    pd_changed["int64_col"] = pd_changed.int64_col + 1

    assert bf_df.equals(bf_changed) == pd_df.equals(pd_changed)


def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index):
    """DataFrames with different column sets match pandas (not equal)."""
    narrow = ["int64_col", "int64_too"]
    wide = ["int64_col", "int64_too", "float64_col"]

    bf_outcome = scalars_df_index[narrow].equals(scalars_df_index[wide])
    pd_outcome = scalars_pandas_df_index[narrow].equals(
        scalars_pandas_df_index[wide]
    )

    assert bf_outcome == pd_outcome


def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index):
reindex_target_bf = scalars_df_index.reindex(
columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
Expand Down
38 changes: 38 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,44 @@ def test_series_get_column_default(scalars_dfs):
assert result == "default_val"


def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index):
    """A Series compared against itself matches pandas (always equal)."""
    bf_series = scalars_df_index.int64_col
    pd_series = scalars_pandas_df_index.int64_col

    assert bf_series.equals(bf_series) == pd_series.equals(pd_series)


def test_series_equals_df(scalars_df_index, scalars_pandas_df_index):
    """Comparing a Series to a one-column DataFrame matches pandas (never equal)."""
    bf_outcome = scalars_df_index["int64_col"].equals(
        scalars_df_index[["int64_col"]]
    )
    pd_outcome = scalars_pandas_df_index["int64_col"].equals(
        scalars_pandas_df_index[["int64_col"]]
    )

    assert bf_outcome == pd_outcome


def test_series_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
    """Identical values with a different dtype match pandas (not equal)."""
    bf_ints = scalars_df_index["int64_col"]
    pd_ints = scalars_pandas_df_index["int64_col"]

    bf_outcome = bf_ints.equals(bf_ints.astype("Float64"))
    pd_outcome = pd_ints.equals(pd_ints.astype("Float64"))

    assert bf_outcome == pd_outcome


def test_series_equals_different_values(scalars_df_index, scalars_pandas_df_index):
    """Same dtype/labels but differing values match pandas (not equal)."""
    bf_ints = scalars_df_index["int64_col"]
    pd_ints = scalars_pandas_df_index["int64_col"]

    bf_outcome = bf_ints.equals(bf_ints + 1)
    pd_outcome = pd_ints.equals(pd_ints + 1)

    assert bf_outcome == pd_outcome


def test_series_get_with_default_index(scalars_dfs):
col_name = "float64_col"
key = 2
Expand Down
22 changes: 22 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,28 @@ def to_orc(self, path=None, **kwargs) -> bytes | None:
# ----------------------------------------------------------------------
# Unsorted

def equals(self, other) -> bool:
    """
    Test whether two objects contain the same elements.

    This function allows two Series or DataFrames to be compared against
    each other to see if they have the same shape and elements. NaNs in
    the same location are considered equal.

    The row/column indexes do not need to have the same type, as long
    as the values are considered equal. Corresponding columns must be of
    the same dtype.

    Args:
        other (Series or DataFrame):
            The other Series or DataFrame to be compared with the first.

    Returns:
        bool: True if all elements are the same in both objects, False
        otherwise.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def assign(self, **kwargs) -> DataFrame:
r"""
Assign new columns to a DataFrame.
Expand Down