Skip to content

Commit 014bd33

Browse files
fix: Local data always has sequential index (#1514)
1 parent 5b7c3af commit 014bd33

File tree

3 files changed

+63
-50
lines changed

3 files changed

+63
-50
lines changed

bigframes/core/indexes/base.py

+5 -4
Original file line number | Diff line number | Diff line change
@@ -70,9 +70,7 @@ def __new__(
7070
elif isinstance(data, series.Series) or isinstance(data, Index):
7171
if isinstance(data, series.Series):
7272
block = data._block
73-
block = block.set_index(
74-
col_ids=[data._value_column],
75-
)
73+
block = block.set_index(col_ids=[data._value_column])
7674
elif isinstance(data, Index):
7775
block = data._block
7876
index = Index(data=block)
@@ -508,7 +506,10 @@ def to_pandas(
508506
...
509507

510508
def to_pandas(
511-
self, *, allow_large_results: Optional[bool] = None, dry_run: bool = False
509+
self,
510+
*,
511+
allow_large_results: Optional[bool] = None,
512+
dry_run: bool = False,
512513
) -> pandas.Index | pandas.Series:
513514
"""Gets the Index as a pandas Index.
514515

bigframes/operations/base.py

+39 -45
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import bigframes.core.identifiers as ids
2828
import bigframes.core.indexes as indexes
2929
import bigframes.core.scalar as scalars
30+
import bigframes.core.utils as bf_utils
3031
import bigframes.dtypes
3132
import bigframes.operations as ops
3233
import bigframes.operations.aggregations as agg_ops
@@ -69,40 +70,21 @@ def __init__(
6970
raise ValueError(
7071
f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}"
7172
)
73+
7274
if isinstance(data, blocks.Block):
73-
# Constructing from block is for internal use only - shouldn't use parameters, block encompasses all state
74-
assert len(data.value_columns) == 1
75-
assert len(data.column_labels) == 1
76-
assert index is None
77-
assert name is None
78-
assert dtype is None
7975
block = data
80-
81-
# interpret these cases as both index and data
82-
elif isinstance(data, bigframes.pandas.Series) or pd.api.types.is_dict_like(
83-
data
84-
): # includes pd.Series
85-
if isinstance(data, bigframes.pandas.Series):
86-
data = data.copy()
87-
if name is not None:
88-
data.name = name
89-
if dtype is not None:
90-
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
91-
data = data.astype(bf_dtype)
92-
else: # local dict-like data
93-
data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore
94-
data_block = data._block
95-
if index is not None:
96-
# reindex
97-
bf_index = indexes.Index(index, session=session)
98-
idx_block = bf_index._block
99-
idx_cols = idx_block.value_columns
100-
block_idx, _ = idx_block.join(data_block, how="left")
101-
data_block = block_idx.with_index_labels(bf_index.names)
102-
block = data_block
103-
104-
# list-like data that will get default index
105-
elif isinstance(data, indexes.Index) or pd.api.types.is_list_like(data):
76+
elif isinstance(data, SeriesMethods):
77+
block = data._get_block()
78+
# special case where data is local scalar, but index is bigframes index (maybe very big)
79+
elif (
80+
not bf_utils.is_list_like(data) and not isinstance(data, indexes.Index)
81+
) and isinstance(index, indexes.Index):
82+
block = index._block
83+
block, _ = block.create_constant(data)
84+
block = block.with_column_labels([None])
85+
# prevents no-op reindex later
86+
index = None
87+
elif isinstance(data, indexes.Index) or isinstance(index, indexes.Index):
10688
data = indexes.Index(data, dtype=dtype, name=name, session=session)
10789
# set to none as it has already been applied, avoid re-cast later
10890
if data.nlevels != 1:
@@ -111,8 +93,7 @@ def __init__(
11193
data_block = data._block.reset_index(drop=False).with_column_labels(
11294
data.names
11395
)
114-
if index is not None:
115-
# Align by offset
96+
if index is not None: # Align data and index by offset
11697
bf_index = indexes.Index(index, session=session)
11798
idx_block = bf_index._block.reset_index(
11899
drop=False
@@ -121,19 +102,32 @@ def __init__(
121102
data_block, (l_mapping, _) = idx_block.join(data_block, how="left")
122103
data_block = data_block.set_index([l_mapping[col] for col in idx_cols])
123104
data_block = data_block.with_index_labels(bf_index.names)
105+
# prevents no-op reindex later
106+
index = None
124107
block = data_block
125108

126-
else: # Scalar case
127-
if index is not None:
128-
bf_index = indexes.Index(index, session=session)
129-
else:
130-
bf_index = indexes.Index(
131-
[] if (data is None) else [0],
132-
session=session,
133-
dtype=bigframes.dtypes.INT_DTYPE,
134-
)
135-
block, _ = bf_index._block.create_constant(data, dtype)
136-
block = block.with_column_labels([name])
109+
if block:
110+
assert len(block.value_columns) == 1
111+
assert len(block.column_labels) == 1
112+
if index is not None: # reindexing operation
113+
bf_index = indexes.Index(index)
114+
idx_block = bf_index._block
115+
idx_cols = idx_block.index_columns
116+
block, _ = idx_block.join(block, how="left")
117+
block = block.with_index_labels(bf_index.names)
118+
if name:
119+
block = block.with_column_labels([name])
120+
if dtype:
121+
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
122+
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
123+
else:
124+
pd_series = pd.Series(
125+
data=data,
126+
index=index, # type:ignore
127+
dtype=dtype, # type:ignore
128+
name=name,
129+
)
130+
block = read_pandas_func(pd_series)._get_block() # type:ignore
137131

138132
assert block is not None
139133
self._block: blocks.Block = block

tests/system/small/test_series.py

+19 -1
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,6 @@ def test_series_construct_reindex():
140140

141141
# BigQuery DataFrame default indices use nullable Int64 always
142142
pd_result.index = pd_result.index.astype("Int64")
143-
144143
pd.testing.assert_series_equal(bf_result, pd_result)
145144

146145

@@ -201,6 +200,17 @@ def test_series_construct_nan():
201200
pd.testing.assert_series_equal(bf_result, pd_result)
202201

203202

203+
def test_series_construct_scalar_w_bf_index():
204+
bf_result = series.Series(
205+
"hello", index=bigframes.pandas.Index([1, 2, 3])
206+
).to_pandas()
207+
pd_result = pd.Series("hello", index=pd.Index([1, 2, 3], dtype="Int64"))
208+
209+
pd_result = pd_result.astype("string[pyarrow]")
210+
211+
pd.testing.assert_series_equal(bf_result, pd_result)
212+
213+
204214
def test_series_construct_from_list_escaped_strings():
205215
"""Check that special characters are supported."""
206216
strings = [
@@ -303,6 +313,14 @@ def test_series_construct_w_dtype_for_array_struct():
303313
)
304314

305315

316+
def test_series_construct_local_unordered_has_sequential_index(unordered_session):
317+
series = bigframes.pandas.Series(
318+
["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"], session=unordered_session
319+
)
320+
expected: pd.Index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype())
321+
pd.testing.assert_index_equal(series.index.to_pandas(), expected)
322+
323+
306324
def test_series_construct_w_dtype_for_json():
307325
# Until b/401630655 is resolved, json, not compatible with allow_large_results=False
308326
with bigframes.option_context("bigquery.allow_large_results", True):

0 commit comments

Comments
 (0)