Skip to content

Commit a61c5fe

Browse files
fix: Fix bug converting non-string labels to sql ids (#296)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:

- [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea.
- [ ] Ensure the tests and linter pass.
- [ ] Code coverage does not decrease (if any source code was changed).
- [ ] Appropriate docs were updated (if necessary).

Fixes #<issue_number_goes_here> 🦕
1 parent e3a056a commit a61c5fe

File tree

4 files changed

+65
-14
lines changed

4 files changed

+65
-14
lines changed

bigframes/core/__init__.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import bigframes.core.nodes as nodes
2828
from bigframes.core.ordering import OrderingColumnReference
2929
import bigframes.core.ordering as orderings
30+
import bigframes.core.utils
3031
from bigframes.core.window_spec import WindowSpec
3132
import bigframes.dtypes
3233
import bigframes.operations as ops
@@ -69,10 +70,14 @@ def from_ibis(
6970
@classmethod
def from_pandas(cls, pd_df: pandas.DataFrame):
    """Build an instance backed by an in-memory copy of a pandas DataFrame.

    Row labels are discarded (the index is reset with ``drop=True``) and the
    columns are renamed to unique identifiers before the frame is serialized
    to feather bytes and wrapped in a ``ReadLocalNode``.

    Args:
        pd_df: The local pandas DataFrame to load.

    Returns:
        A new instance of ``cls`` wrapping the local-read node.
    """
    # Use alphanumeric identifiers, to avoid downstream problems with escaping.
    candidate_ids = []
    for label in pd_df.columns:
        candidate_ids.append(
            bigframes.core.utils.label_to_identifier(label, strict=True)
        )
    # Different labels may collapse to the same identifier, so deduplicate.
    unique_ids = tuple(bigframes.core.utils.disambiguate_ids(candidate_ids))

    renamed_df = pd_df.reset_index(drop=True).set_axis(unique_ids, axis=1)
    buffer = io.BytesIO()
    renamed_df.to_feather(buffer)

    return cls(nodes.ReadLocalNode(buffer.getvalue()))
7782

7883
@property

bigframes/core/nodes.py

-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,6 @@ def __hash__(self):
122122
@dataclass(frozen=True)
123123
class ReadLocalNode(BigFrameNode):
124124
feather_bytes: bytes
125-
column_ids: typing.Tuple[str, ...]
126125

127126
def __hash__(self):
128127
return self._node_hash

bigframes/core/utils.py

+25-8
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import re
1415
import typing
1516
from typing import Hashable, Iterable, List
1617

@@ -84,26 +85,42 @@ def get_standardized_ids(
8485
Tuple of (standardized_column_ids, standardized_index_ids)
8586
"""
8687
col_ids = [
87-
UNNAMED_COLUMN_ID if col_label is None else str(col_label)
88+
UNNAMED_COLUMN_ID if col_label is None else label_to_identifier(col_label)
8889
for col_label in col_labels
8990
]
9091
idx_ids = [
91-
UNNAMED_INDEX_ID if idx_label is None else str(idx_label)
92+
UNNAMED_INDEX_ID if idx_label is None else label_to_identifier(idx_label)
9293
for idx_label in idx_labels
9394
]
9495

95-
ids = idx_ids + col_ids
96+
ids = disambiguate_ids(idx_ids + col_ids)
97+
98+
idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
99+
100+
return col_ids, idx_ids
101+
102+
103+
def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str:
    """Convert a pandas label into a string usable as a BigQuery identifier.

    Distinct labels may map to the same identifier, so callers should
    deduplicate the results afterwards.

    Args:
        label: Any hashable pandas label; it is stringified with ``str``.
        strict: When true, every character other than an ASCII letter, digit
            or underscore is removed, and an underscore is prepended if the
            result would otherwise start with a digit. Strict mode might not
            be necessary, but ibis seems to escape non-alphanumeric
            characters inconsistently.

    Returns:
        The identifier. It is never empty: ``"id"`` is substituted when
        nothing is left of the label.
    """
    # Column values will be loaded as null if the column name has spaces.
    # https://github.com/googleapis/python-bigquery/issues/1566
    identifier = str(label).replace(" ", "_")
    if strict:
        identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier)
        # An unquoted identifier must begin with a letter or underscore, so
        # numeric labels such as ``6`` need a prefix. Slicing keeps this safe
        # when the identifier is empty.
        if identifier[:1].isdigit():
            identifier = "_" + identifier
    if not identifier:
        # An empty column name is unusable in either mode.
        identifier = "id"
    return identifier
116+
99117

100-
ids = typing.cast(
118+
def disambiguate_ids(ids: typing.Sequence[str]) -> typing.List[str]:
    """Make a sequence of ids unique by adding suffixes where needed.

    The work is delegated to the vendored pandas ``dedup_names`` helper with
    multi-index handling disabled. If the inputs are legal SQL ids, the
    outputs should be as well.

    Args:
        ids: Candidate identifiers, possibly containing duplicates.

    Returns:
        The deduplicated identifiers.
    """
    deduplicated = vendored_pandas_io_common.dedup_names(
        ids, is_potential_multiindex=False
    )
    # The cast only narrows the static type; nothing is converted at runtime.
    return typing.cast(List[str], deduplicated)
104-
idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :]
105-
106-
return col_ids, idx_ids
107124

108125

109126
def merge_column_labels(

tests/system/small/test_multiindex.py

+31-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,37 @@
1616
import pytest
1717

1818
import bigframes.pandas as bpd
19-
from tests.system.utils import assert_pandas_df_equal
19+
from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas
20+
21+
22+
@skip_legacy_pandas
def test_read_pandas_multi_index_axes():
    """Round-trip a frame whose rows and columns are both MultiIndexes.

    The level names and string values contain spaces, punctuation, leading
    digits and non-ASCII characters. The frame read back from BigQuery
    DataFrames must equal the original pandas frame.
    """
    int64 = pandas.Int64Dtype()
    arrow_string = pandas.StringDtype(storage="pyarrow")

    row_levels = [
        pandas.Index([4, 99], dtype=int64),
        pandas.Index([" Hello, World!", "_some_new_string"], dtype=arrow_string),
    ]
    column_levels = [
        pandas.Index([6, 87], dtype=int64),
        pandas.Index(
            [" Bonjour le monde!", "_une_chaîne_de_caractères"],
            dtype=arrow_string,
        ),
    ]

    expected = pandas.DataFrame(
        [[1, 2], [3, 4]],
        index=pandas.MultiIndex.from_arrays(
            row_levels, names=[" 1index 1", "_1index 2"]
        ),
        columns=pandas.MultiIndex.from_arrays(
            column_levels, names=[" 1columns 1", "_1new_index 2"]
        ),
        dtype=int64,
    )

    actual = bpd.DataFrame(expected).to_pandas()

    pandas.testing.assert_frame_equal(actual, expected)
2050

2151

2252
# Row Multi-index tests

0 commit comments

Comments
 (0)