fix: Local data always has sequential index (#1514)

TrevorBergeron · web-flow · commit 014bd3331796 · 2025-03-26T13:39:27.000-07:00
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
@@ -70,9 +70,7 @@ def __new__(
         elif isinstance(data, series.Series) or isinstance(data, Index):
             if isinstance(data, series.Series):
                 block = data._block
-                block = block.set_index(
-                    col_ids=[data._value_column],
-                )
+                block = block.set_index(col_ids=[data._value_column])
             elif isinstance(data, Index):
                 block = data._block
             index = Index(data=block)
@@ -508,7 +506,10 @@ def to_pandas(
         ...
 
     def to_pandas(
-        self, *, allow_large_results: Optional[bool] = None, dry_run: bool = False
+        self,
+        *,
+        allow_large_results: Optional[bool] = None,
+        dry_run: bool = False,
     ) -> pandas.Index | pandas.Series:
         """Gets the Index as a pandas Index.
 
diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
@@ -27,6 +27,7 @@
 import bigframes.core.identifiers as ids
 import bigframes.core.indexes as indexes
 import bigframes.core.scalar as scalars
+import bigframes.core.utils as bf_utils
 import bigframes.dtypes
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
@@ -69,40 +70,21 @@ def __init__(
             raise ValueError(
                 f"Series constructor only supports copy=True. {constants.FEEDBACK_LINK}"
             )
+
         if isinstance(data, blocks.Block):
-            # Constructing from block is for internal use only - shouldn't use parameters, block encompasses all state
-            assert len(data.value_columns) == 1
-            assert len(data.column_labels) == 1
-            assert index is None
-            assert name is None
-            assert dtype is None
             block = data
-
-        # interpret these cases as both index and data
-        elif isinstance(data, bigframes.pandas.Series) or pd.api.types.is_dict_like(
-            data
-        ):  # includes pd.Series
-            if isinstance(data, bigframes.pandas.Series):
-                data = data.copy()
-                if name is not None:
-                    data.name = name
-                if dtype is not None:
-                    bf_dtype = bigframes.dtypes.bigframes_type(dtype)
-                    data = data.astype(bf_dtype)
-            else:  # local dict-like data
-                data = read_pandas_func(pd.Series(data, name=name, dtype=dtype))  # type: ignore
-            data_block = data._block
-            if index is not None:
-                # reindex
-                bf_index = indexes.Index(index, session=session)
-                idx_block = bf_index._block
-                idx_cols = idx_block.value_columns
-                block_idx, _ = idx_block.join(data_block, how="left")
-                data_block = block_idx.with_index_labels(bf_index.names)
-            block = data_block
-
-        # list-like data that will get default index
-        elif isinstance(data, indexes.Index) or pd.api.types.is_list_like(data):
+        elif isinstance(data, SeriesMethods):
+            block = data._get_block()
+        # special case where data is local scalar, but index is bigframes index (maybe very big)
+        elif (
+            not bf_utils.is_list_like(data) and not isinstance(data, indexes.Index)
+        ) and isinstance(index, indexes.Index):
+            block = index._block
+            block, _ = block.create_constant(data)
+            block = block.with_column_labels([None])
+            # prevents no-op reindex later
+            index = None
+        elif isinstance(data, indexes.Index) or isinstance(index, indexes.Index):
             data = indexes.Index(data, dtype=dtype, name=name, session=session)
             # set to none as it has already been applied, avoid re-cast later
             if data.nlevels != 1:
@@ -111,8 +93,7 @@ def __init__(
             data_block = data._block.reset_index(drop=False).with_column_labels(
                 data.names
             )
-            if index is not None:
-                # Align by offset
+            if index is not None:  # Align data and index by offset
                 bf_index = indexes.Index(index, session=session)
                 idx_block = bf_index._block.reset_index(
                     drop=False
@@ -121,19 +102,32 @@ def __init__(
                 data_block, (l_mapping, _) = idx_block.join(data_block, how="left")
                 data_block = data_block.set_index([l_mapping[col] for col in idx_cols])
                 data_block = data_block.with_index_labels(bf_index.names)
+                # prevents no-op reindex later
+                index = None
             block = data_block
 
-        else:  # Scalar case
-            if index is not None:
-                bf_index = indexes.Index(index, session=session)
-            else:
-                bf_index = indexes.Index(
-                    [] if (data is None) else [0],
-                    session=session,
-                    dtype=bigframes.dtypes.INT_DTYPE,
-                )
-            block, _ = bf_index._block.create_constant(data, dtype)
-            block = block.with_column_labels([name])
+        if block:
+            assert len(block.value_columns) == 1
+            assert len(block.column_labels) == 1
+            if index is not None:  # reindexing operation
+                bf_index = indexes.Index(index)
+                idx_block = bf_index._block
+                idx_cols = idx_block.index_columns
+                block, _ = idx_block.join(block, how="left")
+                block = block.with_index_labels(bf_index.names)
+            if name:
+                block = block.with_column_labels([name])
+            if dtype:
+                bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+                block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
+        else:
+            pd_series = pd.Series(
+                data=data,
+                index=index,  # type:ignore
+                dtype=dtype,  # type:ignore
+                name=name,
+            )
+            block = read_pandas_func(pd_series)._get_block()  # type:ignore
 
         assert block is not None
         self._block: blocks.Block = block
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -140,7 +140,6 @@ def test_series_construct_reindex():
 
     # BigQuery DataFrame default indices use nullable Int64 always
     pd_result.index = pd_result.index.astype("Int64")
-
     pd.testing.assert_series_equal(bf_result, pd_result)
 
 
@@ -201,6 +200,17 @@ def test_series_construct_nan():
     pd.testing.assert_series_equal(bf_result, pd_result)
 
 
+def test_series_construct_scalar_w_bf_index():
+    """Build a Series from a scalar plus a BigFrames Index and compare to pandas."""
+    labels = [1, 2, 3]
+    bf_index = bigframes.pandas.Index(labels)
+    actual = series.Series("hello", index=bf_index).to_pandas()
+
+    pd_index = pd.Index(labels, dtype="Int64")
+    expected = pd.Series("hello", index=pd_index).astype("string[pyarrow]")
+    pd.testing.assert_series_equal(actual, expected)
+
+
 def test_series_construct_from_list_escaped_strings():
     """Check that special characters are supported."""
     strings = [
@@ -303,6 +313,14 @@ def test_series_construct_w_dtype_for_array_struct():
     )
 
 
+def test_series_construct_local_unordered_has_sequential_index(unordered_session):
+    """Check that a Series built from a local list has index 0..6 in this session."""
+    weekdays = ["Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat"]
+    bf_series = bigframes.pandas.Series(weekdays, session=unordered_session)
+    expected_index = pd.Index([0, 1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype())
+    pd.testing.assert_index_equal(bf_series.index.to_pandas(), expected_index)
+
+
 def test_series_construct_w_dtype_for_json():
     # Until b/401630655 is resolved, json, not compatible with allow_large_results=False
     with bigframes.option_context("bigquery.allow_large_results", True):