Skip to content

Commit c9fa85c

Browse files
feat: add multi-column dataframe merge (#73)
1 parent 971d091 commit c9fa85c

File tree

7 files changed

+206
-146
lines changed

7 files changed

+206
-146
lines changed

bigframes/core/blocks.py

+73
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import bigframes.core as core
3939
import bigframes.core.guid as guid
4040
import bigframes.core.indexes as indexes
41+
import bigframes.core.joins as joins
4142
import bigframes.core.ordering as ordering
4243
import bigframes.core.utils
4344
import bigframes.core.utils as utils
@@ -1403,6 +1404,78 @@ def concat(
14031404
result_block = result_block.reset_index()
14041405
return result_block
14051406

1407+
def merge(
    self,
    other: Block,
    how: typing.Literal[
        "inner",
        "left",
        "outer",
        "right",
    ],
    left_col_ids: typing.Sequence[str],
    right_col_ids: typing.Sequence[str],
    sort: bool,
    suffixes: tuple[str, str] = ("_x", "_y"),
) -> Block:
    """Join this block with ``other`` on one or more pairs of key columns.

    Args:
        other: The right-hand block of the join.
        how: The type of join to perform.
        left_col_ids: Column IDs (not labels) of the join keys in this block.
        right_col_ids: Column IDs (not labels) of the join keys in ``other``,
            paired positionally with ``left_col_ids``.
        sort: Forwarded unchanged to ``joins.join_by_column``.
        suffixes: Suffixes appended to overlapping, non-coalesced labels from
            the left and right block respectively.

    Returns:
        A new Block holding the selected left columns followed by the
        selected right columns, indexed by a freshly promoted offsets column.

    Raises:
        ValueError: If ``left_col_ids`` and ``right_col_ids`` differ in length.
    """
    # The key sequences are paired with zip() below, which would silently
    # drop the unmatched tail; reject mismatched input before joining.
    if len(left_col_ids) != len(right_col_ids):
        raise ValueError(
            "left_col_ids and right_col_ids must have the same length, "
            f"got {len(left_col_ids)} and {len(right_col_ids)}."
        )

    (
        joined_expr,
        coalesced_join_cols,
        (get_column_left, get_column_right),
    ) = joins.join_by_column(
        self.expr,
        left_col_ids,
        other.expr,
        right_col_ids,
        how=how,
        sort=sort,
    )

    left_key_labels = [self.col_id_to_label[col_id] for col_id in left_col_ids]
    right_key_labels = [other.col_id_to_label[col_id] for col_id in right_col_ids]

    # Which join key parts should be coalesced: only key pairs that carry the
    # same label on both sides collapse into a single output column.
    # NOTE(review): labels are compared through str() here, whereas
    # utils.merge_column_labels matches labels by membership. Labels that
    # differ only by type (e.g. 1 vs "1") are treated as equal here but not
    # there -- confirm this is intended.
    merge_join_key_mask = [
        str(left_label) == str(right_label)
        for left_label, right_label in zip(left_key_labels, right_key_labels)
    ]
    labels_to_coalesce = [
        label
        for label, coalesce in zip(left_key_labels, merge_join_key_mask)
        if coalesce
    ]

    # Position of each key column within its key list. setdefault keeps the
    # first occurrence, matching what Sequence.index() would return.
    left_key_position: dict[str, int] = {}
    for position, col_id in enumerate(left_col_ids):
        left_key_position.setdefault(col_id, position)
    right_key_position: dict[str, int] = {}
    for position, col_id in enumerate(right_col_ids):
        right_key_position.setdefault(col_id, position)

    # Left side: a coalesced key is replaced by the coalesced join column;
    # every other column is mapped into the joined expression.
    left_columns = []
    for col_id in self.value_columns:
        position = left_key_position.get(col_id)
        if position is not None and merge_join_key_mask[position]:
            left_columns.append(coalesced_join_cols[position])
        else:
            left_columns.append(get_column_left(col_id))

    # Right side: a coalesced key is already represented by the left side,
    # so it is dropped; every other column is mapped into the joined
    # expression.
    right_columns = []
    for col_id in other.value_columns:
        position = right_key_position.get(col_id)
        if position is not None and merge_join_key_mask[position]:
            continue
        right_columns.append(get_column_right(col_id))

    expr = joined_expr.select_columns([*left_columns, *right_columns])
    labels = utils.merge_column_labels(
        self.column_labels,
        other.column_labels,
        coalesce_labels=labels_to_coalesce,
        suffixes=suffixes,
    )

    # Constructs default index
    expr, offset_index_id = expr.promote_offsets()
    return Block(expr, index_columns=[offset_index_id], column_labels=labels)
1478+
14061479
def _force_reproject(self) -> Block:
14071480
"""Forces a reprojection of the underlying tables expression. Used to force predicate/order application before subsequent operations."""
14081481
return Block(

bigframes/core/joins/single_column.py

+20-40
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ def join_by_column(
4444
"right",
4545
],
4646
sort: bool = False,
47-
coalesce_join_keys: bool = True,
4847
allow_row_identity_join: bool = True,
4948
) -> Tuple[
5049
core.ArrayValue,
@@ -59,8 +58,6 @@ def join_by_column(
5958
right: Expression for right table to join.
6059
right_column_ids: Column IDs (not label) to join by.
6160
how: The type of join to perform.
62-
coalesce_join_keys: if set to False, returned column ids will contain
63-
both left and right join key columns.
6461
allow_row_identity_join (bool):
6562
If True, allow matching by row identity. Set to False to always
6663
perform a true JOIN in generated SQL.
@@ -71,8 +68,6 @@ def join_by_column(
7168
* Sequence[str]: Column IDs of the coalesced join columns. Sometimes either the
7269
left/right table will have missing rows. This column pulls the
7370
non-NULL value from either left/right.
74-
If coalesce_join_keys is False, will return uncombined left and
75-
right key columns.
7671
* Tuple[Callable, Callable]: For a given column ID from left or right,
7772
respectively, return the new column id from the combined expression.
7873
"""
@@ -100,9 +95,7 @@ def join_by_column(
10095
right_join_keys = [
10196
combined_expr.get_column(get_column_right(col)) for col in right_column_ids
10297
]
103-
join_key_cols = get_join_cols(
104-
left_join_keys, right_join_keys, how, coalesce_join_keys
105-
)
98+
join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how)
10699
join_key_ids = [col.get_name() for col in join_key_cols]
107100
combined_expr = combined_expr.projection(
108101
[*join_key_cols, *combined_expr.columns]
@@ -182,9 +175,7 @@ def get_column_right(col_id):
182175
right_join_keys = [
183176
combined_table[get_column_right(col)] for col in right_column_ids
184177
]
185-
join_key_cols = get_join_cols(
186-
left_join_keys, right_join_keys, how, coalesce_join_keys
187-
)
178+
join_key_cols = get_coalesced_join_cols(left_join_keys, right_join_keys, how)
188179
# We could filter out the original join columns, but predicates/ordering
189180
# might still reference them in implicit joins.
190181
columns = (
@@ -226,46 +217,35 @@ def get_column_right(col_id):
226217
)
227218

228219

229-
def get_join_cols(
220+
def get_coalesced_join_cols(
    left_join_cols: typing.Iterable[ibis_types.Value],
    right_join_cols: typing.Iterable[ibis_types.Value],
    how: str,
) -> typing.List[ibis_types.Value]:
    """Build one output key column per (left, right) join key pair.

    Args:
        left_join_cols: Join key columns taken from the left side.
        right_join_cols: Join key columns taken from the right side, paired
            positionally with ``left_join_cols``.
        how: Join type; one of "left", "inner", "right" or "outer".

    Returns:
        One column per key pair, each renamed to a freshly generated
        ``index_``-prefixed id.

    Raises:
        ValueError: If ``how`` is not a recognised join type. The check runs
            once per key pair, so empty inputs return an empty list without
            validating ``how``.
    """
    coalesced: list[ibis_types.Value] = []
    for left_key, right_key in zip(left_join_cols, right_join_cols):
        if how in ("left", "inner"):
            source = left_key
        elif how == "right":
            source = right_key
        elif how == "outer":
            # The left index and the right index might contain null values,
            # for example due to an outer join with different numbers of
            # rows. Coalesce these to take the index value from either
            # column. Don't need to coalesce if they are exactly the same
            # column.
            if left_key.name("index").equals(right_key.name("index")):
                source = left_key
            else:
                source = ibis.coalesce(left_key, right_key)
        else:
            raise ValueError(f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}")
        # Use a random name in case the left index and the right index have
        # the same name. In such a case, _x and _y suffixes will already be
        # used.
        coalesced.append(source.name(guid.generate_guid(prefix="index_")))
    return coalesced
270250

271251

bigframes/core/utils.py

+33
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,36 @@ def get_standardized_ids(
104104
idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
105105

106106
return col_ids, idx_ids
107+
108+
109+
def merge_column_labels(
    left_labels: pd.Index,
    right_labels: pd.Index,
    coalesce_labels: typing.Sequence,
    suffixes: tuple[str, str] = ("_x", "_y"),
) -> pd.Index:
    """Compute the column labels of a merge result.

    Left labels come first, followed by right labels. A label present on both
    sides is kept once (from the left) when it is listed in
    ``coalesce_labels``; otherwise it is converted to ``str`` and given the
    left or right suffix. Labels unique to one side are passed through
    unchanged.

    Args:
        left_labels: Column labels of the left frame.
        right_labels: Column labels of the right frame.
        coalesce_labels: Labels of join keys that collapse into one column.
        suffixes: Suffixes for overlapping left and right labels.

    Returns:
        The merged labels as a ``pd.Index``.
    """
    merged = []

    for label in left_labels:
        # Overlapping labels get the left suffix unless they are a coalesced
        # join key, in which case the left key column keeps its label.
        if label in right_labels and label not in coalesce_labels:
            merged.append(str(label) + suffixes[0])
        else:
            merged.append(label)

    for label in right_labels:
        if label not in left_labels:
            merged.append(label)
        elif label not in coalesce_labels:
            merged.append(str(label) + suffixes[1])
        # Otherwise this is a coalesced join key: the right key column is
        # skipped because the left side already contributed it.

    return pd.Index(merged)

bigframes/dataframe.py

+34-93
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
import bigframes.core.indexers as indexers
4747
import bigframes.core.indexes as indexes
4848
import bigframes.core.io
49-
import bigframes.core.joins as joins
5049
import bigframes.core.ordering as order
5150
import bigframes.core.utils as utils
5251
import bigframes.core.window
@@ -1779,12 +1778,10 @@ def merge(
17791778
] = "inner",
17801779
# TODO(garrettwu): Currently can take inner, outer, left and right. To support
17811780
# cross joins
1782-
# TODO(garrettwu): Support "on" list of columns and None. Currently a single
1783-
# column must be provided
1784-
on: Optional[str] = None,
1781+
on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
17851782
*,
1786-
left_on: Optional[str] = None,
1787-
right_on: Optional[str] = None,
1783+
left_on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
1784+
right_on: Union[blocks.Label, Sequence[blocks.Label], None] = None,
17881785
sort: bool = False,
17891786
suffixes: tuple[str, str] = ("_x", "_y"),
17901787
) -> DataFrame:
@@ -1798,97 +1795,41 @@ def merge(
17981795
)
17991796
left_on, right_on = on, on
18001797

1801-
left = self
1802-
left_on_sql = self._sql_names(left_on)
1803-
# 0 elements already throws an exception
1804-
if len(left_on_sql) > 1:
1805-
raise ValueError(f"The column label {left_on} is not unique.")
1806-
left_on_sql = left_on_sql[0]
1807-
1808-
right_on_sql = right._sql_names(right_on)
1809-
if len(right_on_sql) > 1:
1810-
raise ValueError(f"The column label {right_on} is not unique.")
1811-
right_on_sql = right_on_sql[0]
1812-
1813-
(
1814-
joined_expr,
1815-
join_key_ids,
1816-
(get_column_left, get_column_right),
1817-
) = joins.join_by_column(
1818-
left._block.expr,
1819-
[left_on_sql],
1820-
right._block.expr,
1821-
[right_on_sql],
1822-
how=how,
1823-
sort=sort,
1824-
# In merging on the same column, it only returns 1 key column from coalesced both.
1825-
# While if 2 different columns, both will be presented in the result.
1826-
coalesce_join_keys=(left_on == right_on),
1827-
)
1828-
# TODO(swast): Add suffixes to the column labels instead of reusing the
1829-
# column IDs as the new labels.
1830-
# Drop the index column(s) to be consistent with pandas.
1831-
left_columns = [
1832-
join_key_ids[0] if (col_id == left_on_sql) else get_column_left(col_id)
1833-
for col_id in left._block.value_columns
1834-
]
1835-
1836-
right_columns = []
1837-
for col_id in right._block.value_columns:
1838-
if col_id == right_on_sql:
1839-
# When left_on == right_on
1840-
if len(join_key_ids) > 1:
1841-
right_columns.append(join_key_ids[1])
1842-
else:
1843-
right_columns.append(get_column_right(col_id))
1844-
1845-
expr = joined_expr.select_columns([*left_columns, *right_columns])
1846-
labels = self._get_merged_col_labels(
1847-
right, left_on=left_on, right_on=right_on, suffixes=suffixes
1848-
)
1798+
if utils.is_list_like(left_on):
1799+
left_on = list(left_on) # type: ignore
1800+
else:
1801+
left_on = [left_on]
18491802

1850-
# Constructs default index
1851-
expr, offset_index_id = expr.promote_offsets()
1852-
block = blocks.Block(
1853-
expr, index_columns=[offset_index_id], column_labels=labels
1803+
if utils.is_list_like(right_on):
1804+
right_on = list(right_on) # type: ignore
1805+
else:
1806+
right_on = [right_on]
1807+
1808+
left_join_ids = []
1809+
for label in left_on: # type: ignore
1810+
left_col_id = self._resolve_label_exact(label)
1811+
# 0 elements already throws an exception
1812+
if not left_col_id:
1813+
raise ValueError(f"No column {label} found in self.")
1814+
left_join_ids.append(left_col_id)
1815+
1816+
right_join_ids = []
1817+
for label in right_on: # type: ignore
1818+
right_col_id = right._resolve_label_exact(label)
1819+
if not right_col_id:
1820+
raise ValueError(f"No column {label} found in other.")
1821+
right_join_ids.append(right_col_id)
1822+
1823+
block = self._block.merge(
1824+
right._block,
1825+
how,
1826+
left_join_ids,
1827+
right_join_ids,
1828+
sort=sort,
1829+
suffixes=suffixes,
18541830
)
18551831
return DataFrame(block)
18561832

1857-
def _get_merged_col_labels(
    self,
    right: DataFrame,
    left_on: str,
    right_on: str,
    suffixes: tuple[str, str] = ("_x", "_y"),
) -> List[blocks.Label]:
    """Compute the column labels for merging this frame with ``right``.

    Left labels come first, followed by right labels. A label present in
    both frames is suffixed (after conversion to ``str``), except for the
    join key when ``left_on == right_on``: that key is kept once, unsuffixed,
    from the left side and omitted from the right side.

    Args:
        right: The right-hand frame of the merge.
        left_on: Join key label in this frame.
        right_on: Join key label in ``right``.
        suffixes: Suffixes for overlapping left and right labels.

    Returns:
        The merged list of column labels.
    """
    own_labels = self._block.column_labels
    other_labels = right._block.column_labels
    same_key = left_on == right_on

    merged: list[blocks.Label] = []

    for label in own_labels:
        # Merging on the same column only returns 1 key column from
        # coalescing both; the left key column keeps its label.
        if label in other_labels and not (same_key and label == left_on):
            merged.append(str(label) + suffixes[0])
        else:
            merged.append(label)

    for label in other_labels:
        if label not in own_labels:
            merged.append(label)
        elif not (same_key and label == left_on):
            merged.append(str(label) + suffixes[1])
        # Otherwise this is the shared join key: the right key column is
        # skipped.

    return merged
1891-
18921833
def join(
18931834
self, other: DataFrame, *, on: Optional[str] = None, how: str = "left"
18941835
) -> DataFrame:

0 commit comments

Comments
 (0)