diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 866be9c400..9426df54f0 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -417,6 +417,7 @@ def join( "left", "outer", "right", + "cross", ], allow_row_identity_join: bool = True, ): diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e831b42752..bf3b9321c3 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1531,6 +1531,7 @@ def merge( "left", "outer", "right", + "cross", ], left_join_ids: typing.Sequence[str], right_join_ids: typing.Sequence[str], diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index b992aa1d1d..93ba3f16f1 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -40,6 +40,7 @@ def join_by_column( "left", "outer", "right", + "cross", ], allow_row_identity_join: bool = True, ) -> compiled.CompiledArrayValue: @@ -107,7 +108,7 @@ def join_by_column( left_table, right_table, predicates=join_conditions, - how=how, + how=how, # type: ignore ) # Preserve ordering accross joins. 
diff --git a/bigframes/core/joins/merge.py b/bigframes/core/joins/merge.py index fac16b3607..c65e1bdd54 100644 --- a/bigframes/core/joins/merge.py +++ b/bigframes/core/joins/merge.py @@ -32,6 +32,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", on: Optional[str] = None, *, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 7b252b164f..8f1e2e5e73 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -86,6 +86,7 @@ class JoinNode(BigFrameNode): "left", "outer", "right", + "cross", ] allow_row_identity_join: bool = True diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c5b48822fb..701c26f42f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1933,6 +1933,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", # TODO(garrettwu): Currently can take inner, outer, left and right. To support # cross joins @@ -1943,6 +1944,19 @@ def merge( sort: bool = False, suffixes: tuple[str, str] = ("_x", "_y"), ) -> DataFrame: + if how == "cross": + if on is not None: + raise ValueError("'on' is not supported for cross join.") + result_block = self._block.merge( + right._block, + left_join_ids=[], + right_join_ids=[], + suffixes=suffixes, + how=how, + sort=True, + ) + return DataFrame(result_block) + if on is None: if left_on is None or right_on is None: raise ValueError("Must specify `on` or `left_on` + `right_on`.") @@ -1996,6 +2010,18 @@ def join( raise NotImplementedError( f"Deduping column names is not implemented. 
{constants.FEEDBACK_LINK}" ) + if how == "cross": + if on is not None: + raise ValueError("'on' is not supported for cross join.") + result_block = left._block.merge( + right._block, + left_join_ids=[], + right_join_ids=[], + suffixes=("", ""), + how="cross", + sort=True, + ) + return DataFrame(result_block) # Join left columns with right index if on is not None: diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 1c52b103fb..d35f838366 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -332,6 +332,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", on: Optional[str] = None, *, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3d7ba867ea..605d4abc1d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -914,6 +914,26 @@ def test_df_isin_dict(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) +def test_df_cross_merge(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset rowindex_2 on the right so the two sides of the cross join carry different values. 
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + bf_result = left.merge(right, "cross").to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "cross", + ) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + @pytest.mark.parametrize( ("merge_how",), [ @@ -1745,12 +1765,7 @@ def test_series_binop_add_different_table( all_joins = pytest.mark.parametrize( ("how",), - ( - ("outer",), - ("left",), - ("right",), - ("inner",), - ), + (("outer",), ("left",), ("right",), ("inner",), ("cross",)), ) @@ -1795,13 +1810,18 @@ def test_join_param_on(scalars_dfs, how): bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) bf_df_b = bf_df[["float64_col"]] - bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() - pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] - pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) - pd_df_b = pd_df[["float64_col"]] - pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_df_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) @pytest.mark.parametrize( diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 0292ebd206..b88901f3bc 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -289,6 +289,27 @@ def 
test_merge_left_on_right_on(scalars_dfs, merge_how): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +def test_pd_merge_cross(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "int64_too"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + right = scalars_df[right_columns] + + df = bpd.merge(left, right, "cross", sort=True) + bf_result = df.to_pandas() + + pd_result = pd.merge( + scalars_pandas_df[left_columns], + scalars_pandas_df[right_columns], + "cross", + sort=True, + ) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + @pytest.mark.parametrize( ("merge_how",), [ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6f4f6be35d..3e4cec284c 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2194,6 +2194,8 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: and sort it lexicographically. ``inner``: form intersection of calling frame's index (or column if on is specified) with `other`'s index, preserving the order of the calling's one. + ``cross``: creates the cartesian product from both frames, preserves + the order of the left keys. Returns: bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`. @@ -2208,6 +2210,7 @@ def merge( "left", "outer", "right", + "cross", ] = "inner", on: Optional[str] = None, *, @@ -2243,6 +2246,8 @@ def merge( join; sort keys lexicographically. ``inner``: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + ``cross``: creates the cartesian product from both frames, preserves the order + of the left keys. on (label or list of labels): Columns to join on. It must be found in both DataFrames. 
Either on or left_on + right_on diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py index b03f366fca..704e50f516 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/merge.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -49,6 +49,8 @@ def merge( join; sort keys lexicographically. ``inner``: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + ``cross``: creates the cartesian product from both frames, preserves the order + of the left keys. on (label or list of labels): Columns to join on. It must be found in both DataFrames. Either on or left_on + right_on