Skip to content

Commit 923da03

Browse files
perf: Prune unused operations from sql (#1365)
1 parent 0c55b07 commit 923da03

File tree

12 files changed

+489
-252
lines changed

12 files changed

+489
-252
lines changed

bigframes/core/__init__.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -304,18 +304,25 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue:
304304
if destination_id in self.column_ids: # Mutate case
305305
exprs = [
306306
(
307-
ex.deref(source_id if (col_id == destination_id) else col_id),
308-
ids.ColumnId(col_id),
307+
bigframes.core.nodes.AliasedRef(
308+
ex.deref(source_id if (col_id == destination_id) else col_id),
309+
ids.ColumnId(col_id),
310+
)
309311
)
310312
for col_id in self.column_ids
311313
]
312314
else: # append case
313315
self_projection = (
314-
(ex.deref(col_id), ids.ColumnId(col_id)) for col_id in self.column_ids
316+
bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id))
317+
for col_id in self.column_ids
315318
)
316319
exprs = [
317320
*self_projection,
318-
(ex.deref(source_id), ids.ColumnId(destination_id)),
321+
(
322+
bigframes.core.nodes.AliasedRef(
323+
ex.deref(source_id), ids.ColumnId(destination_id)
324+
)
325+
),
319326
]
320327
return ArrayValue(
321328
nodes.SelectionNode(
@@ -337,7 +344,10 @@ def create_constant(
337344

338345
def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
339346
# This basically just drops and reorders columns - logically a no-op except as a final step
340-
selections = ((ex.deref(col_id), ids.ColumnId(col_id)) for col_id in column_ids)
347+
selections = (
348+
bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id))
349+
for col_id in column_ids
350+
)
341351
return ArrayValue(
342352
nodes.SelectionNode(
343353
child=self.node,
@@ -488,7 +498,9 @@ def prepare_join_names(
488498
nodes.SelectionNode(
489499
other.node,
490500
tuple(
491-
(ex.deref(old_id), ids.ColumnId(new_id))
501+
bigframes.core.nodes.AliasedRef(
502+
ex.deref(old_id), ids.ColumnId(new_id)
503+
)
492504
for old_id, new_id in r_mapping.items()
493505
),
494506
),

bigframes/core/compile/compiled.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def _to_ibis_expr(
184184
# Special case for empty tables, since we can't create an empty
185185
# projection.
186186
if not self._columns:
187-
return bigframes_vendored.ibis.memtable([])
187+
return self._table.select([bigframes_vendored.ibis.literal(1)])
188188

189189
table = self._table.select(self._columns)
190190
if fraction is not None:

bigframes/core/compile/compiler.py

+15-7
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@ def compile_sql(
6262
if ordered:
6363
node, limit = rewrites.pullup_limit_from_slice(node)
6464
node = nodes.bottom_up(node, rewrites.rewrite_slice)
65+
# TODO: Extract out CTEs
6566
node, ordering = rewrites.pull_up_order(
6667
node, order_root=True, ordered_joins=self.strict
6768
)
69+
node = rewrites.column_pruning(node)
6870
ir = self.compile_node(node)
6971
return ir.to_sql(
7072
order_by=ordering.all_ordering_columns,
@@ -76,6 +78,7 @@ def compile_sql(
7678
node, _ = rewrites.pull_up_order(
7779
node, order_root=False, ordered_joins=self.strict
7880
)
81+
node = rewrites.column_pruning(node)
7982
ir = self.compile_node(node)
8083
return ir.to_sql(selections=output_ids)
8184

@@ -86,6 +89,7 @@ def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str:
8689
node, _ = rewrites.pull_up_order(
8790
node, order_root=False, ordered_joins=self.strict
8891
)
92+
node = rewrites.column_pruning(node)
8993
return self.compile_node(node).to_sql(limit=n_rows, selections=ids)
9094

9195
def compile_raw(
@@ -97,6 +101,7 @@ def compile_raw(
97101
node = nodes.bottom_up(node, rewrites.rewrite_slice)
98102
node = nodes.top_down(node, rewrites.rewrite_timedelta_ops)
99103
node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict)
104+
node = rewrites.column_pruning(node)
100105
ir = self.compile_node(node)
101106
sql = ir.to_sql()
102107
return sql, node.schema.to_bigquery(), ordering
@@ -192,31 +197,34 @@ def compile_readtable(self, node: nodes.ReadTableNode):
192197
return self.compile_read_table_unordered(node.source, node.scan_list)
193198

194199
def read_table_as_unordered_ibis(
195-
self, source: nodes.BigqueryDataSource
200+
self,
201+
source: nodes.BigqueryDataSource,
202+
scan_cols: typing.Sequence[str],
196203
) -> ibis_types.Table:
197204
full_table_name = f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}"
198-
used_columns = tuple(col.name for col in source.table.physical_schema)
199205
# Physical schema might include unused columns, unsupported datatypes like JSON
200206
physical_schema = ibis_bigquery.BigQuerySchema.to_ibis(
201-
list(i for i in source.table.physical_schema if i.name in used_columns)
207+
list(source.table.physical_schema)
202208
)
203209
if source.at_time is not None or source.sql_predicate is not None:
204210
import bigframes.session._io.bigquery
205211

206212
sql = bigframes.session._io.bigquery.to_query(
207213
full_table_name,
208-
columns=used_columns,
214+
columns=scan_cols,
209215
sql_predicate=source.sql_predicate,
210216
time_travel_timestamp=source.at_time,
211217
)
212218
return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql)
213219
else:
214-
return ibis_api.table(physical_schema, full_table_name)
220+
return ibis_api.table(physical_schema, full_table_name).select(scan_cols)
215221

216222
def compile_read_table_unordered(
217223
self, source: nodes.BigqueryDataSource, scan: nodes.ScanList
218224
):
219-
ibis_table = self.read_table_as_unordered_ibis(source)
225+
ibis_table = self.read_table_as_unordered_ibis(
226+
source, scan_cols=[col.source_id for col in scan.items]
227+
)
220228
return compiled.UnorderedIR(
221229
ibis_table,
222230
tuple(
@@ -291,7 +299,7 @@ def set_output_names(
291299
return nodes.SelectionNode(
292300
node,
293301
tuple(
294-
(ex.DerefOp(old_id), ids.ColumnId(out_id))
302+
bigframes.core.nodes.AliasedRef(ex.DerefOp(old_id), ids.ColumnId(out_id))
295303
for old_id, out_id in zip(node.ids, output_ids)
296304
),
297305
)

0 commit comments

Comments
 (0)