fix: Fix bug converting non-string labels to sql ids (#296)

TrevorBergeron · web-flow · commit a61c5fef1e3b · 2024-01-04T21:14:15.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
@@ -27,6 +27,7 @@
 import bigframes.core.nodes as nodes
 from bigframes.core.ordering import OrderingColumnReference
 import bigframes.core.ordering as orderings
+import bigframes.core.utils
 from bigframes.core.window_spec import WindowSpec
 import bigframes.dtypes
 import bigframes.operations as ops
@@ -69,10 +70,14 @@ def from_ibis(
     @classmethod
     def from_pandas(cls, pd_df: pandas.DataFrame):
         iobytes = io.BytesIO()
-        # Discard row labels and use simple string ids for columns
-        column_ids = tuple(str(label) for label in pd_df.columns)
-        pd_df.reset_index(drop=True).set_axis(column_ids, axis=1).to_feather(iobytes)
-        node = nodes.ReadLocalNode(iobytes.getvalue(), column_ids=column_ids)
+        # Use alphanumeric identifiers, to avoid downstream problems with escaping.
+        as_ids = [
+            bigframes.core.utils.label_to_identifier(label, strict=True)
+            for label in pd_df.columns
+        ]
+        unique_ids = tuple(bigframes.core.utils.disambiguate_ids(as_ids))
+        pd_df.reset_index(drop=True).set_axis(unique_ids, axis=1).to_feather(iobytes)
+        node = nodes.ReadLocalNode(iobytes.getvalue())
         return cls(node)
 
     @property
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
@@ -122,7 +122,6 @@ def __hash__(self):
 @dataclass(frozen=True)
 class ReadLocalNode(BigFrameNode):
     feather_bytes: bytes
-    column_ids: typing.Tuple[str, ...]
 
     def __hash__(self):
         return self._node_hash
diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 import typing
 from typing import Hashable, Iterable, List
 
@@ -84,26 +85,42 @@ def get_standardized_ids(
         Tuple of (standardized_column_ids, standardized_index_ids)
     """
     col_ids = [
-        UNNAMED_COLUMN_ID if col_label is None else str(col_label)
+        UNNAMED_COLUMN_ID if col_label is None else label_to_identifier(col_label)
         for col_label in col_labels
     ]
     idx_ids = [
-        UNNAMED_INDEX_ID if idx_label is None else str(idx_label)
+        UNNAMED_INDEX_ID if idx_label is None else label_to_identifier(idx_label)
         for idx_label in idx_labels
     ]
 
-    ids = idx_ids + col_ids
+    ids = disambiguate_ids(idx_ids + col_ids)
+
+    idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
+
+    return col_ids, idx_ids
+
+
+def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
+    """
+    Convert pandas label to make legal bigquery identifier. May create collisions (should deduplicate after).
+    Strict mode might not be necessary, but ibis seems to escape non-alphanumeric characters inconsistently.
+    """
     # Column values will be loaded as null if the column name has spaces.
     # https://github.com/googleapis/python-bigquery/issues/1566
-    ids = [id.replace(" ", "_") for id in ids]
+    identifier = str(label).replace(" ", "_")
+    if strict:
+        identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier)
+        if not identifier:
+            identifier = "id"
+    return identifier
+
 
-    ids = typing.cast(
+def disambiguate_ids(ids: typing.Sequence[str]) -> typing.List[str]:
+    """Disambiguate list of ids by adding suffixes where needed. If inputs are legal sql ids, outputs should be as well."""
+    return typing.cast(
         List[str],
         vendored_pandas_io_common.dedup_names(ids, is_potential_multiindex=False),
     )
-    idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
-
-    return col_ids, idx_ids
 
 
 def merge_column_labels(
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
@@ -16,7 +16,37 @@
 import pytest
 
 import bigframes.pandas as bpd
-from tests.system.utils import assert_pandas_df_equal
+from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas
+
+
+@skip_legacy_pandas
+def test_read_pandas_multi_index_axes():
+    index = pandas.MultiIndex.from_arrays(
+        [
+            pandas.Index([4, 99], dtype=pandas.Int64Dtype()),
+            pandas.Index(
+                [" Hello, World!", "_some_new_string"],
+                dtype=pandas.StringDtype(storage="pyarrow"),
+            ),
+        ],
+        names=[" 1index 1", "_1index 2"],
+    )
+    columns = pandas.MultiIndex.from_arrays(
+        [
+            pandas.Index([6, 87], dtype=pandas.Int64Dtype()),
+            pandas.Index(
+                [" Bonjour le monde!", "_une_chaîne_de_caractères"],
+                dtype=pandas.StringDtype(storage="pyarrow"),
+            ),
+        ],
+        names=[" 1columns 1", "_1new_index 2"],
+    )
+    pandas_df = pandas.DataFrame(
+        [[1, 2], [3, 4]], index=index, columns=columns, dtype=pandas.Int64Dtype()
+    )
+    bf_df = bpd.DataFrame(pandas_df)
+
+    pandas.testing.assert_frame_equal(bf_df.to_pandas(), pandas_df)
 
 
 # Row Multi-index tests