Skip to content

Commit ffb0d15

Browse files
authored
feat: Support bigframes.pandas.to_datetime for scalars, iterables and series. (#372)
* feat: Support pd.to_datetime for scalars, iterables and series. * update test and docstring * update types * format update * remove import. * update docstring * update arg conversion * update examples * update format * update code examples, and working logic. * docstring update. * type update. * format update. * Update docstring format * remove import * remove empty line * Remove extra code * remove prints. * Code logic updates. * Add constants. * Update comments * Move datetime helpers to the end of file. * Update helper * update format * String process logic updated. * update import * remove print * update docstring * update docstring * update docstring * update note * update docstring * Update code examples
1 parent de1e0a4 commit ffb0d15

File tree

8 files changed

+322
-0
lines changed

8 files changed

+322
-0
lines changed

bigframes/core/compile/scalar_op_compiler.py

+43
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@
4040
# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow.
4141
_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78))
4242

43+
# Datetime constants
# Multiplier that turns a numeric value expressed in the given unit into
# microseconds. The "ns" entry is fractional, so multiplying by it produces a
# floating point value.
UNIT_TO_US_CONVERSION_FACTORS = {
    "D": 86_400_000_000,  # 24 * 60 * 60 * 1000 * 1000
    "h": 3_600_000_000,  # 60 * 60 * 1000 * 1000
    "m": 60_000_000,  # 60 * 1000 * 1000
    "s": 1_000_000,  # 1000 * 1000
    "ms": 1_000,
    "us": 1,
    "ns": 1e-3,
}
53+
4354

4455
class ScalarOpCompiler:
4556
# Mapping of operation name to implementations
@@ -656,6 +667,33 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp):
656667
return x.isin(matchable_ibis_values)
657668

658669

670+
@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True)
def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
    """Compile ``ToDatetimeOp`` into an ibis timestamp expression.

    Args:
        x: The ibis value to convert.
        op: The operation carrying the ``utc``, ``format`` and ``unit`` options.

    Returns:
        ``x`` unchanged when it is already a UTC timestamp; otherwise a
        timestamp expression that is timezone-aware (UTC) when ``op.utc`` is
        set and timezone-naive when it is not.

    Raises:
        ValueError: If a non-string, non-timestamp input is given a unit that
            has no entry in ``UNIT_TO_US_CONVERSION_FACTORS``.
    """
    utc_timestamp = ibis_dtypes.Timestamp(timezone="UTC")
    input_type = x.type()

    # Values that are already UTC timestamps are passed through untouched.
    if input_type == utc_timestamp:
        return x

    if input_type == ibis_dtypes.str:
        # Strings are parsed with the explicit format when one was supplied,
        # otherwise through the builtin ``timestamp`` function.
        if op.format:
            parsed = x.to_timestamp(op.format)
        else:
            parsed = timestamp(x)
    elif input_type != ibis_dtypes.timestamp:
        # The default unit is set to "ns" (nanoseconds) for consistency
        # with pandas, where "ns" is the default unit for datetime operations.
        unit = op.unit or "ns"
        factor = UNIT_TO_US_CONVERSION_FACTORS.get(unit)
        if factor is None:
            raise ValueError(f"Cannot convert input with unit '{unit}'.")

        # Scale to microseconds, then drop any fractional part.
        microseconds = (x * factor).cast(ibis_dtypes.int64)

        # Note: Due to an issue where casting directly to a timestamp
        # without a timezone does not work, we first cast to UTC. This
        # approach appears to bypass a potential bug in Ibis's cast function,
        # allowing for subsequent casting to a timestamp type without timezone
        # information. Further investigation is needed to confirm this behavior.
        parsed = microseconds.to_timestamp(unit="us").cast(utc_timestamp)
    else:
        # Timezone-naive timestamps only need the final cast below.
        parsed = x

    target_timezone = "UTC" if op.utc else None
    return parsed.cast(ibis_dtypes.Timestamp(timezone=target_timezone))
695+
696+
659697
@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True)
660698
def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp):
661699
if not hasattr(op.func, "bigframes_remote_function"):
@@ -1141,3 +1179,8 @@ def is_null(value) -> bool:
11411179

11421180
def _ibis_num(number: float):
    """Build an ibis literal from ``number``, typed for checkers as a NumericValue."""
    literal = ibis_types.literal(number)
    return typing.cast(ibis_types.NumericValue, literal)
1182+
1183+
1184+
@ibis.udf.scalar.builtin
def timestamp(a: str) -> ibis_dtypes.timestamp:
    """Convert string to timestamp.

    The function is registered through ``ibis.udf.scalar.builtin`` and has no
    Python body: only the name and the signature are declared here.
    NOTE(review): presumably this resolves to the backend's own ``timestamp``
    function when the expression is compiled - confirm against the ibis docs.
    """

bigframes/core/tools/__init__.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from bigframes.core.tools.datetimes import to_datetime
16+
17+
# Public API of the bigframes.core.tools package.
__all__ = ["to_datetime"]

bigframes/core/tools/datetimes.py

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from collections.abc import Mapping
16+
from datetime import datetime
17+
from typing import Optional, Union
18+
19+
import pandas as pd
20+
21+
import bigframes.constants as constants
22+
import bigframes.core.global_session as global_session
23+
import bigframes.dataframe
24+
import bigframes.operations as ops
25+
import bigframes.series
26+
import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes
27+
28+
29+
def to_datetime(
    arg: Union[
        vendored_pandas_datetimes.local_scalars,
        vendored_pandas_datetimes.local_iterables,
        bigframes.series.Series,
        bigframes.dataframe.DataFrame,
    ],
    *,
    utc: bool = False,
    format: Optional[str] = None,
    unit: Optional[str] = None,
) -> Union[pd.Timestamp, datetime, bigframes.series.Series]:
    """Convert a scalar, a local iterable or a BigFrames Series to datetime.

    Scalars (int, float, str, datetime) are handed straight to
    ``pandas.to_datetime``. Any other local input is first loaded into the
    default session and then converted with ``ToDatetimeOp``.

    Args:
        arg: The value(s) to convert.
        utc: Whether the result should be a timezone-aware UTC timestamp.
        format: Format string used to parse string input.
        unit: Unit of numeric input.

    Returns:
        The pandas result for scalar input, otherwise a BigFrames Series.

    Raises:
        NotImplementedError: If ``arg`` is a Mapping, a pandas DataFrame or a
            BigFrames DataFrame, or if ``utc`` is false and the data is not of
            dtype Int64 or Float64.
        ValueError: If local non-scalar input yields more than one column.
    """
    # Scalars never touch BigQuery; pandas handles them locally.
    if isinstance(arg, (int, float, str, datetime)):
        return pd.to_datetime(arg, utc=utc, format=format, unit=unit)

    unsupported_types = (Mapping, pd.DataFrame, bigframes.dataframe.DataFrame)
    if isinstance(arg, unsupported_types):
        raise NotImplementedError(
            "Conversion of Mapping, pandas.DataFrame, or bigframes.dataframe.DataFrame "
            f"to datetime is not implemented. {constants.FEEDBACK_LINK}"
        )

    if isinstance(arg, bigframes.series.Series):
        series = arg
    else:
        # This block ensures compatibility with local data formats, including
        # iterables and pandas.Series
        # TODO: Currently, data upload is performed using pandas DataFrames
        # combined with the `read_pandas` method due to the BigFrames DataFrame
        # constructor's limitations in handling various data types. Plan to update
        # the upload process to utilize the BigFrames DataFrame constructor directly
        # once it is enhanced for more related datatypes.
        local_frame = pd.DataFrame(arg)
        uploaded = global_session.with_default_session(
            bigframes.session.Session.read_pandas, local_frame
        )
        if len(uploaded.columns) != 1:
            raise ValueError("Input must be 1-dimensional.")

        series = uploaded[uploaded.columns[0]]

    # Only numeric input may be converted without utc=True.
    if not (utc or series.dtype in ("Int64", "Float64")):  # type: ignore
        raise NotImplementedError(
            f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}"
        )

    conversion = ops.ToDatetimeOp(utc=utc, format=format, unit=unit)
    return series._apply_unary_op(conversion)  # type: ignore

bigframes/operations/__init__.py

+11
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,17 @@ def output_type(self, *input_types):
404404
return input_types[0]
405405

406406

407+
@dataclasses.dataclass(frozen=True)
class ToDatetimeOp(UnaryOp):
    """Unary operation that converts a column to datetime.

    The fields carry the keyword options of ``to_datetime`` through to the
    compiler.
    """

    name: typing.ClassVar[str] = "to_datetime"
    # Whether the result should be a timezone-aware UTC timestamp.
    utc: bool = False
    # Format string used when the input is a string; None means no format.
    format: typing.Optional[str] = None
    # Unit of numeric input; None lets the compiler choose its default.
    unit: typing.Optional[str] = None

    def output_type(self, *input_types):
        # NOTE(review): this reports the input dtype unchanged, whereas the
        # compiled expression is cast to a timestamp - confirm whether a
        # timestamp dtype should be returned here instead.
        return input_types[0]
416+
417+
407418
# Binary Ops
408419
fillna_op = create_binary_op(name="fillna")
409420
cliplower_op = create_binary_op(name="clip_lower")

bigframes/pandas/__init__.py

+28
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from __future__ import annotations
1818

1919
from collections import namedtuple
20+
from datetime import datetime
2021
import inspect
2122
import sys
2223
import typing
@@ -52,6 +53,7 @@
5253
import bigframes.core.global_session as global_session
5354
import bigframes.core.indexes
5455
import bigframes.core.reshape
56+
import bigframes.core.tools
5557
import bigframes.dataframe
5658
import bigframes.operations as ops
5759
import bigframes.series
@@ -61,6 +63,7 @@
6163
import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding
6264
import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge
6365
import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
66+
import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes
6467
import third_party.bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq
6568

6669

@@ -635,6 +638,30 @@ def read_gbq_function(function_name: str):
635638

636639
read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function)
637640

641+
642+
def to_datetime(
    arg: Union[
        vendored_pandas_datetimes.local_scalars,
        vendored_pandas_datetimes.local_iterables,
        bigframes.series.Series,
        bigframes.dataframe.DataFrame,
    ],
    *,
    utc: bool = False,
    format: Optional[str] = None,
    unit: Optional[str] = None,
) -> Union[pandas.Timestamp, datetime, bigframes.series.Series]:
    # Thin public entry point: every argument is forwarded unchanged to the
    # implementation in bigframes.core.tools.
    return bigframes.core.tools.to_datetime(
        arg,
        utc=utc,
        format=format,
        unit=unit,
    )


# The user-facing documentation is taken from the vendored pandas stub.
to_datetime.__doc__ = vendored_pandas_datetimes.to_datetime.__doc__
663+
664+
638665
# pandas dtype attributes
639666
NA = pandas.NA
640667
BooleanDtype = pandas.BooleanDtype
@@ -680,6 +707,7 @@ def read_gbq_function(function_name: str):
680707
"read_pandas",
681708
"read_pickle",
682709
"remote_function",
710+
"to_datetime",
683711
# pandas dtype attributes
684712
"NA",
685713
"BooleanDtype",

tests/system/small/test_pandas.py

+62
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from datetime import datetime
16+
1517
import pandas as pd
1618
import pytest
19+
import pytz
1720

1821
import bigframes.pandas as bpd
1922
from tests.system.utils import assert_pandas_df_equal
@@ -477,3 +480,62 @@ def test_qcut(scalars_dfs, q):
477480
pd_result = pd_result.astype("Int64")
478481

479482
pd.testing.assert_series_equal(bf_result, pd_result)
483+
484+
485+
@pytest.mark.parametrize(
    ("arg", "utc", "unit", "format"),
    [
        (173872738, False, None, None),
        (32787983.23, True, "s", None),
        ("2023-01-01", False, None, "%Y-%m-%d"),
        (datetime(2023, 1, 1, 12, 0), False, None, None),
    ],
)
def test_to_datetime_scalar(arg, utc, unit, format):
    """Scalar input must give exactly what pandas.to_datetime gives."""
    options = {"utc": utc, "unit": unit, "format": format}

    expected = pd.to_datetime(arg, **options)
    actual = bpd.to_datetime(arg, **options)

    assert actual == expected
499+
500+
501+
@pytest.mark.parametrize(
    ("arg", "utc", "unit", "format"),
    [
        ([173872738], False, None, None),
        ([32787983.23], True, "s", None),
        (
            # pytz zones have to be attached with localize(): passing one as
            # ``tzinfo=`` to the datetime constructor selects the zone's LMT
            # offset instead of the offset in effect on that date.
            [pytz.timezone("America/New_York").localize(datetime(2023, 1, 1, 12, 0))],
            True,
            None,
            None,
        ),
        (["2023-01-01"], True, None, "%Y-%m-%d"),
        (["2023-02-01T15:00:00+07:22"], True, None, None),
        (["01-31-2023 14:30 -0800"], True, None, "%m-%d-%Y %H:%M %z"),
        (["01-31-2023 14:00", "02-01-2023 15:00"], True, None, "%m-%d-%Y %H:%M"),
    ],
)
def test_to_datetime_iterable(arg, utc, unit, format):
    """Local iterables must convert to the same values pandas produces."""
    bf_result = (
        bpd.to_datetime(arg, utc=utc, unit=unit, format=format)
        .to_pandas()
        .astype("datetime64[ns, UTC]" if utc else "datetime64[ns]")
    )
    # The pandas result is floored to microseconds before comparison.
    pd_result = pd.Series(
        pd.to_datetime(arg, utc=utc, unit=unit, format=format)
    ).dt.floor("us")
    pd.testing.assert_series_equal(
        bf_result, pd_result, check_index_type=False, check_names=False
    )
530+
531+
532+
def test_to_datetime_series(scalars_dfs):
    """An integer Series read as seconds must match the pandas conversion."""
    bf_df, pd_df = scalars_dfs
    column = "int64_too"

    expected = pd.Series(pd.to_datetime(pd_df[column], unit="s"))

    converted = bpd.to_datetime(bf_df[column], unit="s").to_pandas()
    actual = converted.astype("datetime64[s]")

    pd.testing.assert_series_equal(
        actual, expected, check_index_type=False, check_names=False
    )

third_party/bigframes_vendored/pandas/core/tools/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/tools/datetimes.py
2+
3+
from datetime import datetime
4+
from typing import Iterable, Mapping, Union
5+
6+
import pandas as pd
7+
8+
from bigframes import constants, series
9+
10+
# Scalar Python values accepted by to_datetime.
local_scalars = Union[int, float, str, datetime]
# Local (non-BigFrames) containers that appear in to_datetime's argument type.
local_iterables = Union[Iterable, pd.Series, pd.DataFrame, Mapping]
12+
13+
14+
def to_datetime(
    arg,
    *,
    utc=False,
    format=None,
    unit=None,
) -> Union[pd.Timestamp, datetime, series.Series]:
    """
    This function converts a scalar, array-like or Series to a datetime object.

    .. note::
        BigQuery only supports precision up to microseconds (us). Therefore, when working
        with timestamps that have a finer granularity than microseconds, be aware that
        the additional precision will not be represented in BigQuery.

    .. note::
        The format strings for specifying datetime representations in BigQuery and pandas
        are not completely identical. Ensure that the format string provided is compatible
        with BigQuery.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> bpd.options.display.progress_bar = None

    Converting a Scalar to datetime:

        >>> scalar = 123456.789
        >>> bpd.to_datetime(scalar, unit = 's')
        Timestamp('1970-01-02 10:17:36.789000')

    Converting a List of Strings without Timezone Information:

        >>> list_str = ["01-31-2021 14:30", "02-28-2021 15:45"]
        >>> bpd.to_datetime(list_str, format="%m-%d-%Y %H:%M", utc=True)
        0    2021-01-31 14:30:00+00:00
        1    2021-02-28 15:45:00+00:00
        Name: 0, dtype: timestamp[us, tz=UTC][pyarrow]

    Converting a Series of Strings with Timezone Information:

        >>> series_str = bpd.Series(["01-31-2021 14:30+08:00", "02-28-2021 15:45+00:00"])
        >>> bpd.to_datetime(series_str, format="%m-%d-%Y %H:%M%Z", utc=True)
        0    2021-01-31 06:30:00+00:00
        1    2021-02-28 15:45:00+00:00
        dtype: timestamp[us, tz=UTC][pyarrow]

    Args:
        arg (int, float, str, datetime, list, tuple, 1-d array, Series):
            The object to convert to a datetime.
        utc (bool, default False):
            Control timezone-related parsing, localization and conversion. If True, the
            function always returns a timezone-aware UTC-localized timestamp or series.
            If False (default), inputs will not be coerced to UTC.
        format (str, default None):
            The strftime-style format string used to parse the input,
            e.g. "%d/%m/%Y".
        unit (str, default 'ns'):
            The unit of ``arg`` when it is an integer or float number. One of
            'D', 'h', 'm', 's', 'ms', 'us' or 'ns'.

    Returns:
        Timestamp, datetime.datetime or bigframes.series.Series: Return type depends on input.

    Raises:
        NotImplementedError:
            If ``arg`` is a Mapping or a DataFrame, or if ``utc`` is False and
            ``arg`` is an iterable or Series of strings or timestamps.
        ValueError:
            If an iterable ``arg`` is not 1-dimensional.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
 (0)