Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance and move ISO-8601 parser to coding.times #9899

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ Internal Changes
~~~~~~~~~~~~~~~~
- Move non-CF related ``ensure_dtype_not_object`` from conventions to backends (:pull:`9828`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import) (:pull:`9899`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

.. _whats-new.2024.11.0:

Expand Down
9 changes: 9 additions & 0 deletions properties/test_encode_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
# isort: split

import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import numpy as np
from hypothesis import given

import xarray as xr
from xarray.coding.times import _parse_iso8601_without_reso
from xarray.testing.strategies import variables


Expand Down Expand Up @@ -43,3 +45,10 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None:
coder = xr.coding.variables.CFScaleOffsetCoder()
roundtripped = coder.decode(coder.encode(original))
xr.testing.assert_identical(original, roundtripped)


# TODO: add cftime.datetime
@given(dt=st.datetimes())
def test_iso8601_decode(dt):
    """Check that parsing ``dt.isoformat()`` reproduces ``dt`` exactly."""
    date_type = type(dt)
    parsed = _parse_iso8601_without_reso(date_type, dt.isoformat())
    assert dt == parsed
3 changes: 2 additions & 1 deletion xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@
import pandas as pd
from packaging.version import Version

from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso
from xarray.coding.cftimeindex import CFTimeIndex
from xarray.coding.times import (
_is_standard_calendar,
_parse_iso8601_with_reso,
_should_cftime_be_used,
convert_time_or_go_back,
format_cftime_datetime,
Expand Down
73 changes: 2 additions & 71 deletions xarray/coding/cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from __future__ import annotations

import math
import re
import warnings
from datetime import timedelta
from typing import TYPE_CHECKING, Any
Expand All @@ -53,6 +52,8 @@

from xarray.coding.times import (
_STANDARD_CALENDARS,
_parse_iso8601_with_reso,
_parse_iso8601_without_reso,
cftime_to_nptime,
infer_calendar_name,
)
Expand All @@ -78,71 +79,6 @@
OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,)


def named(name, pattern):
    """Wrap the regex fragment ``pattern`` in a capture group called ``name``."""
    return "".join(("(?P<", name, ">", pattern, ")"))


def optional(x):
    """Make the regex fragment ``x`` optional using a non-capturing group."""
    return "".join(("(?:", x, ")?"))


def trailing_optional(xs):
    """Chain regex fragments so that every fragment after the first is optional.

    Each fragment may only appear if all fragments before it are present:
    ``[a, b, c]`` becomes ``a(?:b(?:c(?:)?)?)?``.  An empty sequence yields
    the empty string.
    """
    if not xs:
        return ""
    # Build from the innermost (last) fragment outwards.
    nested = ""
    for fragment in reversed(xs):
        nested = fragment + "(?:" + nested + ")?"
    return nested


def build_pattern(
    date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"\."
):
    """Build an anchored regex for ISO-8601-like datetime strings.

    The pattern captures the named groups ``year``, ``month``, ``day``,
    ``hour``, ``minute``, ``second`` and ``microsecond``.  Only the year is
    mandatory; each later component is optional but can only appear when all
    components before it are present.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments (not literal text) placed before the month/day, the
        hour, the minute/second and the fractional second respectively.
        ``micro_sep`` defaults to an escaped dot so that only a literal ``.``
        introduces the fraction; an unescaped ``.`` would accept any
        character there.

    Returns
    -------
    str
        The regex source, anchored with ``^`` and ``$``.
    """
    pieces = [
        (None, "year", r"\d{4}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    # Prefix every named group with its separator (the year has none).
    pattern_list = [
        (sep if sep else "") + named(name, sub_pattern)
        for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


# Basic format: no separators inside the date or inside the time,
# e.g. "20000101T120000".
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
# Extended format: build_pattern's default separators,
# e.g. "2000-01-01T12:00:00".
_EXTENDED_PATTERN = build_pattern()
# Like the extended format, but a space separates date and time,
# e.g. "2000-01-01 12:00:00".
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
# Candidate patterns, in the order parse_iso8601_like tries them.
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string):
    """Split a datetime string into its named components.

    The string is tried against each entry of ``_PATTERNS`` in order and the
    group dictionary of the first match is returned; components missing from
    the string map to ``None``.

    Raises
    ------
    ValueError
        If none of the patterns match.
    """
    # Lazy generator: matching stops at the first pattern that succeeds.
    attempts = (re.match(pattern, datetime_string) for pattern in _PATTERNS)
    found = next((m for m in attempts if m), None)
    if found is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return found.groupdict()


def _parse_iso8601_with_reso(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Parameters
    ----------
    date_type : callable
        Called as ``date_type(1, 1, 1)`` to build a default date whose
        ``replace`` method is then given the parsed components.
    timestr : str
        String handed to ``parse_iso8601_like``.

    Returns
    -------
    tuple
        ``(date, resolution)`` where ``resolution`` is the name of the last
        (finest) component present in ``timestr``.
    """
    attempt_import("cftime")

    default = date_type(1, 1, 1)
    parsed = parse_iso8601_like(timestr)
    fields = {}

    for name in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = parsed.get(name)
        if text is None:
            continue
        if name == "microsecond":
            # Right-pad the fractional digits to six places, e.g. "5" -> 500000.
            fields[name] = int(text) * 10 ** (6 - len(text))
        else:
            fields[name] = int(text)
        resolution = name
    return default.replace(**fields), resolution


def _parsed_string_to_bounds(date_type, resolution, parsed):
"""Generalization of
pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds
Expand Down Expand Up @@ -811,11 +747,6 @@ def is_leap_year(self):
return func(self.year, calendar=self.calendar)


def _parse_iso8601_without_reso(date_type, datetime_str):
    """Parse ``datetime_str`` like ``_parse_iso8601_with_reso``, dropping the resolution."""
    parsed_date = _parse_iso8601_with_reso(date_type, datetime_str)[0]
    return parsed_date


def _parse_array_of_cftime_strings(strings, date_type):
"""Create a numpy array from an array of strings.

Expand Down
70 changes: 70 additions & 0 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,76 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def named(name, pattern):
    """Wrap the regex fragment ``pattern`` in a capture group called ``name``."""
    return "".join(("(?P<", name, ">", pattern, ")"))


def optional(x):
    """Make the regex fragment ``x`` optional using a non-capturing group."""
    return "".join(("(?:", x, ")?"))


def trailing_optional(xs):
    """Chain regex fragments so that every fragment after the first is optional.

    Each fragment may only appear if all fragments before it are present:
    ``[a, b, c]`` becomes ``a(?:b(?:c(?:)?)?)?``.  An empty sequence yields
    the empty string.
    """
    if not xs:
        return ""
    # Build from the innermost (last) fragment outwards.
    nested = ""
    for fragment in reversed(xs):
        nested = fragment + "(?:" + nested + ")?"
    return nested


def build_pattern(
    date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"\."
):
    """Build an anchored regex for ISO-8601-like datetime strings.

    The pattern captures the named groups ``year``, ``month``, ``day``,
    ``hour``, ``minute``, ``second`` and ``microsecond``.  Only the year
    (four or five digits with an optional sign) is mandatory; each later
    component is optional but can only appear when all components before it
    are present.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments (not literal text) placed before the month/day, the
        hour, the minute/second and the fractional second respectively.
        ``micro_sep`` defaults to an escaped dot so that only a literal ``.``
        introduces the fraction; an unescaped ``.`` would accept any
        character there.

    Returns
    -------
    str
        The regex source, anchored with ``^`` and ``$``.
    """
    pieces = [
        (None, "year", r"[+-]?\d{4,5}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    # Prefix every named group with its separator (the year has none).
    pattern_list = [
        (sep if sep else "") + named(name, sub_pattern)
        for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


# Basic format: no separators inside the date or inside the time,
# e.g. "20000101T120000".
_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
# Extended format: build_pattern's default separators,
# e.g. "2000-01-01T12:00:00".
_EXTENDED_PATTERN = build_pattern()
# Like the extended format, but a space separates date and time,
# e.g. "2000-01-01 12:00:00".
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
# Candidate patterns, in the order parse_iso8601_like tries them.
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string):
    """Split a datetime string into its named components.

    The string is tried against each entry of ``_PATTERNS`` in order and the
    group dictionary of the first match is returned; components missing from
    the string map to ``None``.

    Raises
    ------
    ValueError
        If none of the patterns match.
    """
    # Lazy generator: matching stops at the first pattern that succeeds.
    attempts = (re.match(pattern, datetime_string) for pattern in _PATTERNS)
    found = next((m for m in attempts if m), None)
    if found is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return found.groupdict()


def _parse_iso8601_with_reso(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Parameters
    ----------
    date_type : callable
        Called as ``date_type(1, 1, 1)`` to build a default date whose
        ``replace`` method is then given the parsed components.
    timestr : str
        String handed to ``parse_iso8601_like``.

    Returns
    -------
    tuple
        ``(date, resolution)``.  ``resolution`` is the name of the last
        (finest) component present in ``timestr``, except that a fractional
        second written with at most three digits is reported as
        ``"millisecond"`` rather than ``"microsecond"``.
    """
    default = date_type(1, 1, 1)
    parsed = parse_iso8601_like(timestr)
    fields = {}

    for name in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = parsed.get(name)
        if text is None:
            continue
        resolution = name
        if name == "microsecond":
            n_digits = len(text)
            if n_digits <= 3:
                resolution = "millisecond"
            # Right-pad the fractional digits to six places, e.g. "5" -> 500000.
            fields[name] = int(text) * 10 ** (6 - n_digits)
        else:
            fields[name] = int(text)
    return default.replace(**fields), resolution


def _parse_iso8601_without_reso(date_type, datetime_str):
    """Parse ``datetime_str`` like ``_parse_iso8601_with_reso``, dropping the resolution."""
    parsed_date = _parse_iso8601_with_reso(date_type, datetime_str)[0]
    return parsed_date


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
# same as _unpack_netcdf_time_units but finalizes ref_date for
# processing in encode_cf_datetime
Expand Down
26 changes: 19 additions & 7 deletions xarray/tests/test_cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
from xarray.coding.cftimeindex import (
CFTimeIndex,
_parse_array_of_cftime_strings,
_parse_iso8601_with_reso,
_parsed_string_to_bounds,
assert_all_valid_date_type,
)
from xarray.coding.times import (
_parse_iso8601_with_reso,
parse_iso8601_like,
)
from xarray.tests import (
Expand Down Expand Up @@ -132,16 +134,26 @@ def date_dict(
list(ISO8601_LIKE_STRING_TESTS.values()),
ids=list(ISO8601_LIKE_STRING_TESTS.keys()),
)
def test_parse_iso8601_like(string, expected):
result = parse_iso8601_like(string)
@pytest.mark.parametrize("five", [False, True], ids=["4Y", "5Y"])
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"])
def test_parse_iso8601_like(five, sign, string, expected):
pre = "1" if five else ""
datestring = sign + pre + string
result = parse_iso8601_like(datestring)
expected = expected.copy()
expected.update(year=sign + pre + expected["year"])
assert result == expected

if result["microsecond"] is None:
# check malformed single digit addendum
# tests for year/month/day excluded as year can be 4 or 5 digits
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
kmuehlbauer marked this conversation as resolved.
Show resolved Hide resolved
if result["microsecond"] is None and result["hour"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + "3")
if result["second"] is None:
parse_iso8601_like(datestring + "3")

# check malformed floating point addendum
if result["second"] is None or result["microsecond"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + ".3")
parse_iso8601_like(datestring + ".3")


_CFTIME_CALENDARS = [
Expand Down
Loading