"""Provides the :class:`Arrow ` class, a better way to parse datetime strings.""" import re import sys from datetime import datetime, timedelta from datetime import tzinfo as dt_tzinfo from functools import lru_cache from typing import ( Any, ClassVar, Dict, Iterable, List, Match, Optional, Pattern, SupportsFloat, SupportsInt, Tuple, Union, cast, overload, ) from dateutil import tz from arrow import locales from arrow.constants import DEFAULT_LOCALE from arrow.util import next_weekday, normalize_timestamp if sys.version_info < (3, 8): # pragma: no cover from typing_extensions import Literal, TypedDict else: from typing import Literal, TypedDict # pragma: no cover class ParserError(ValueError): pass # Allows for ParserErrors to be propagated from _build_datetime() # when day_of_year errors occur. # Before this, the ParserErrors were caught by the try/except in # _parse_multiformat() and the appropriate error message was not # transmitted to the user. class ParserMatchError(ParserError): pass _WEEKDATE_ELEMENT = Union[str, bytes, SupportsInt, bytearray] _FORMAT_TYPE = Literal[ "YYYY", "YY", "MM", "M", "DDDD", "DDD", "DD", "D", "HH", "H", "hh", "h", "mm", "m", "ss", "s", "X", "x", "ZZZ", "ZZ", "Z", "S", "W", "MMMM", "MMM", "Do", "dddd", "ddd", "d", "a", "A", ] class _Parts(TypedDict, total=False): year: int month: int day_of_year: int day: int hour: int minute: int second: int microsecond: int timestamp: float expanded_timestamp: int tzinfo: dt_tzinfo am_pm: Literal["am", "pm"] day_of_week: int weekdate: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]] class DateTimeParser: _FORMAT_RE: ClassVar[Pattern[str]] = re.compile( r"(YYY?Y?|MM?M?M?|Do|DD?D?D?|d?d?d?d|HH?|hh?|mm?|ss?|S+|ZZ?Z?|a|A|x|X|W)" ) _ESCAPE_RE: ClassVar[Pattern[str]] = re.compile(r"\[[^\[\]]*\]") _ONE_OR_TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,2}") _ONE_OR_TWO_OR_THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{1,3}") _ONE_OR_MORE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d+") _TWO_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{2}") _THREE_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{3}") _FOUR_DIGIT_RE: ClassVar[Pattern[str]] = re.compile(r"\d{4}") _TZ_Z_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:(\d{2}))?|Z") _TZ_ZZ_RE: ClassVar[Pattern[str]] = re.compile(r"([\+\-])(\d{2})(?:\:(\d{2}))?|Z") _TZ_NAME_RE: ClassVar[Pattern[str]] = re.compile(r"\w[\w+\-/]+") # NOTE: timestamps cannot be parsed from natural language strings (by removing the ^...$) because it will # break cases like "15 Jul 2000" and a format list (see issue #447) _TIMESTAMP_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+\.?\d+$") _TIMESTAMP_EXPANDED_RE: ClassVar[Pattern[str]] = re.compile(r"^\-?\d+$") _TIME_RE: ClassVar[Pattern[str]] = re.compile( r"^(\d{2})(?:\:?(\d{2}))?(?:\:?(\d{2}))?(?:([\.\,])(\d+))?$" ) _WEEK_DATE_RE: ClassVar[Pattern[str]] = re.compile( r"(?P\d{4})[\-]?W(?P\d{2})[\-]?(?P\d)?" ) _BASE_INPUT_RE_MAP: ClassVar[Dict[_FORMAT_TYPE, Pattern[str]]] = { "YYYY": _FOUR_DIGIT_RE, "YY": _TWO_DIGIT_RE, "MM": _TWO_DIGIT_RE, "M": _ONE_OR_TWO_DIGIT_RE, "DDDD": _THREE_DIGIT_RE, "DDD": _ONE_OR_TWO_OR_THREE_DIGIT_RE, "DD": _TWO_DIGIT_RE, "D": _ONE_OR_TWO_DIGIT_RE, "HH": _TWO_DIGIT_RE, "H": _ONE_OR_TWO_DIGIT_RE, "hh": _TWO_DIGIT_RE, "h": _ONE_OR_TWO_DIGIT_RE, "mm": _TWO_DIGIT_RE, "m": _ONE_OR_TWO_DIGIT_RE, "ss": _TWO_DIGIT_RE, "s": _ONE_OR_TWO_DIGIT_RE, "X": _TIMESTAMP_RE, "x": _TIMESTAMP_EXPANDED_RE, "ZZZ": _TZ_NAME_RE, "ZZ": _TZ_ZZ_RE, "Z": _TZ_Z_RE, "S": _ONE_OR_MORE_DIGIT_RE, "W": _WEEK_DATE_RE, } SEPARATORS: ClassVar[List[str]] = ["-", "/", "."] locale: locales.Locale _input_re_map: Dict[_FORMAT_TYPE, Pattern[str]] def __init__(self, locale: str = DEFAULT_LOCALE, cache_size: int = 0) -> None: self.locale = locales.get_locale(locale) self._input_re_map = self._BASE_INPUT_RE_MAP.copy() self._input_re_map.update( { "MMMM": self._generate_choice_re( self.locale.month_names[1:], re.IGNORECASE ), "MMM": self._generate_choice_re( self.locale.month_abbreviations[1:], re.IGNORECASE ), "Do": re.compile(self.locale.ordinal_day_re), "dddd": self._generate_choice_re( self.locale.day_names[1:], re.IGNORECASE ), "ddd": self._generate_choice_re( self.locale.day_abbreviations[1:], re.IGNORECASE ), "d": re.compile(r"[1-7]"), "a": self._generate_choice_re( (self.locale.meridians["am"], self.locale.meridians["pm"]) ), # note: 'A' token accepts both 'am/pm' and 'AM/PM' formats to # ensure backwards compatibility of this token "A": self._generate_choice_re(self.locale.meridians.values()), } ) if cache_size > 0: self._generate_pattern_re = lru_cache(maxsize=cache_size)( # type: ignore self._generate_pattern_re ) # TODO: since we support more than ISO 8601, we should rename this function # IDEA: break into multiple functions def parse_iso( self, datetime_string: str, normalize_whitespace: bool = False ) -> datetime: if normalize_whitespace: datetime_string = re.sub(r"\s+", " ", datetime_string.strip()) has_space_divider = " " in datetime_string has_t_divider = "T" in datetime_string num_spaces = datetime_string.count(" ") if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0: raise ParserError( f"Expected an ISO 8601-like string, but was given {datetime_string!r}. " "Try passing in a format string to resolve this." ) has_time = has_space_divider or has_t_divider has_tz = False # date formats (ISO 8601 and others) to test against # NOTE: YYYYMM is omitted to avoid confusion with YYMMDD (no longer part of ISO 8601, but is still often used) formats = [ "YYYY-MM-DD", "YYYY-M-DD", "YYYY-M-D", "YYYY/MM/DD", "YYYY/M/DD", "YYYY/M/D", "YYYY.MM.DD", "YYYY.M.DD", "YYYY.M.D", "YYYYMMDD", "YYYY-DDDD", "YYYYDDDD", "YYYY-MM", "YYYY/MM", "YYYY.MM", "YYYY", "W", ] if has_time: if has_space_divider: date_string, time_string = datetime_string.split(" ", 1) else: date_string, time_string = datetime_string.split("T", 1) time_parts = re.split(r"[\+\-Z]", time_string, 1, re.IGNORECASE) time_components: Optional[Match[str]] = self._TIME_RE.match(time_parts[0]) if time_components is None: raise ParserError( "Invalid time component provided. " "Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format." ) ( hours, minutes, seconds, subseconds_sep, subseconds, ) = time_components.groups() has_tz = len(time_parts) == 2 has_minutes = minutes is not None has_seconds = seconds is not None has_subseconds = subseconds is not None is_basic_time_format = ":" not in time_parts[0] tz_format = "Z" # use 'ZZ' token instead since tz offset is present in non-basic format if has_tz and ":" in time_parts[1]: tz_format = "ZZ" time_sep = "" if is_basic_time_format else ":" if has_subseconds: time_string = "HH{time_sep}mm{time_sep}ss{subseconds_sep}S".format( time_sep=time_sep, subseconds_sep=subseconds_sep ) elif has_seconds: time_string = "HH{time_sep}mm{time_sep}ss".format(time_sep=time_sep) elif has_minutes: time_string = f"HH{time_sep}mm" else: time_string = "HH" if has_space_divider: formats = [f"{f} {time_string}" for f in formats] else: formats = [f"{f}T{time_string}" for f in formats] if has_time and has_tz: # Add "Z" or "ZZ" to the format strings to indicate to # _parse_token() that a timezone needs to be parsed formats = [f"{f}{tz_format}" for f in formats] return self._parse_multiformat(datetime_string, formats) def parse( self, datetime_string: str, fmt: Union[List[str], str], normalize_whitespace: bool = False, ) -> datetime: if normalize_whitespace: datetime_string = re.sub(r"\s+", " ", datetime_string) if isinstance(fmt, list): return self._parse_multiformat(datetime_string, fmt) try: fmt_tokens: List[_FORMAT_TYPE] fmt_pattern_re: Pattern[str] fmt_tokens, fmt_pattern_re = self._generate_pattern_re(fmt) except re.error as e: raise ParserMatchError( f"Failed to generate regular expression pattern: {e}." ) match = fmt_pattern_re.search(datetime_string) if match is None: raise ParserMatchError( f"Failed to match {fmt!r} when parsing {datetime_string!r}." ) parts: _Parts = {} for token in fmt_tokens: value: Union[Tuple[str, str, str], str] if token == "Do": value = match.group("value") elif token == "W": value = (match.group("year"), match.group("week"), match.group("day")) else: value = match.group(token) if value is None: raise ParserMatchError( f"Unable to find a match group for the specified token {token!r}." ) self._parse_token(token, value, parts) # type: ignore return self._build_datetime(parts) def _generate_pattern_re(self, fmt: str) -> Tuple[List[_FORMAT_TYPE], Pattern[str]]: # fmt is a string of tokens like 'YYYY-MM-DD' # we construct a new string by replacing each # token by its pattern: # 'YYYY-MM-DD' -> '(?P\d{4})-(?P\d{2})-(?P

\d{2})' tokens: List[_FORMAT_TYPE] = [] offset = 0 # Escape all special RegEx chars escaped_fmt = re.escape(fmt) # Extract the bracketed expressions to be reinserted later. escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt) # Any number of S is the same as one. # TODO: allow users to specify the number of digits to parse escaped_fmt = re.sub(r"S+", "S", escaped_fmt) escaped_data = re.findall(self._ESCAPE_RE, fmt) fmt_pattern = escaped_fmt for m in self._FORMAT_RE.finditer(escaped_fmt): token: _FORMAT_TYPE = cast(_FORMAT_TYPE, m.group(0)) try: input_re = self._input_re_map[token] except KeyError: raise ParserError(f"Unrecognized token {token!r}.") input_pattern = f"(?P<{token}>{input_re.pattern})" tokens.append(token) # a pattern doesn't have the same length as the token # it replaces! We keep the difference in the offset variable. # This works because the string is scanned left-to-right and matches # are returned in the order found by finditer. fmt_pattern = ( fmt_pattern[: m.start() + offset] + input_pattern + fmt_pattern[m.end() + offset :] ) offset += len(input_pattern) - (m.end() - m.start()) final_fmt_pattern = "" split_fmt = fmt_pattern.split(r"\#") # Due to the way Python splits, 'split_fmt' will always be longer for i in range(len(split_fmt)): final_fmt_pattern += split_fmt[i] if i < len(escaped_data): final_fmt_pattern += escaped_data[i][1:-1] # Wrap final_fmt_pattern in a custom word boundary to strictly # match the formatting pattern and filter out date and time formats # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah, # blah1998-09-12blah. The custom word boundary matches every character # that is not a whitespace character to allow for searching for a date # and time string in a natural language sentence. Therefore, searching # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will # work properly. # Certain punctuation before or after the target pattern such as # "1998-09-12," is permitted. For the full list of valid punctuation, # see the documentation. starting_word_boundary = ( r"(?\s])" # This is the list of punctuation that is ok before the # pattern (i.e. "It can't not be these characters before the pattern") r"(\b|^)" # The \b is to block cases like 1201912 but allow 201912 for pattern YYYYMM. The ^ was necessary to allow a # negative number through i.e. before epoch numbers ) ending_word_boundary = ( r"(?=[\,\.\;\:\?\!\"\'\`\[\]\{\}\<\>]?" # Positive lookahead stating that these punctuation marks # can appear after the pattern at most 1 time r"(?!\S))" # Don't allow any non-whitespace character after the punctuation ) bounded_fmt_pattern = r"{}{}{}".format( starting_word_boundary, final_fmt_pattern, ending_word_boundary ) return tokens, re.compile(bounded_fmt_pattern, flags=re.IGNORECASE) @overload def _parse_token( self, token: Literal[ "YYYY", "YY", "MM", "M", "DDDD", "DDD", "DD", "D", "Do", "HH", "hh", "h", "H", "mm", "m", "ss", "s", "x", ], value: Union[str, bytes, SupportsInt, bytearray], parts: _Parts, ) -> None: ... # pragma: no cover @overload def _parse_token( self, token: Literal["X"], value: Union[str, bytes, SupportsFloat, bytearray], parts: _Parts, ) -> None: ... # pragma: no cover @overload def _parse_token( self, token: Literal["MMMM", "MMM", "dddd", "ddd", "S"], value: Union[str, bytes, bytearray], parts: _Parts, ) -> None: ... # pragma: no cover @overload def _parse_token( self, token: Literal["a", "A", "ZZZ", "ZZ", "Z"], value: Union[str, bytes], parts: _Parts, ) -> None: ... # pragma: no cover @overload def _parse_token( self, token: Literal["W"], value: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]], parts: _Parts, ) -> None: ... # pragma: no cover def _parse_token( self, token: Any, value: Any, parts: _Parts, ) -> None: if token == "YYYY": parts["year"] = int(value) elif token == "YY": value = int(value) parts["year"] = 1900 + value if value > 68 else 2000 + value elif token in ["MMMM", "MMM"]: # FIXME: month_number() is nullable parts["month"] = self.locale.month_number(value.lower()) # type: ignore elif token in ["MM", "M"]: parts["month"] = int(value) elif token in ["DDDD", "DDD"]: parts["day_of_year"] = int(value) elif token in ["DD", "D"]: parts["day"] = int(value) elif token == "Do": parts["day"] = int(value) elif token == "dddd": # locale day names are 1-indexed day_of_week = [x.lower() for x in self.locale.day_names].index( value.lower() ) parts["day_of_week"] = day_of_week - 1 elif token == "ddd": # locale day abbreviations are 1-indexed day_of_week = [x.lower() for x in self.locale.day_abbreviations].index( value.lower() ) parts["day_of_week"] = day_of_week - 1 elif token.upper() in ["HH", "H"]: parts["hour"] = int(value) elif token in ["mm", "m"]: parts["minute"] = int(value) elif token in ["ss", "s"]: parts["second"] = int(value) elif token == "S": # We have the *most significant* digits of an arbitrary-precision integer. # We want the six most significant digits as an integer, rounded. # IDEA: add nanosecond support somehow? Need datetime support for it first. value = value.ljust(7, "0") # floating-point (IEEE-754) defaults to half-to-even rounding seventh_digit = int(value[6]) if seventh_digit == 5: rounding = int(value[5]) % 2 elif seventh_digit > 5: rounding = 1 else: rounding = 0 parts["microsecond"] = int(value[:6]) + rounding elif token == "X": parts["timestamp"] = float(value) elif token == "x": parts["expanded_timestamp"] = int(value) elif token in ["ZZZ", "ZZ", "Z"]: parts["tzinfo"] = TzinfoParser.parse(value) elif token in ["a", "A"]: if value in (self.locale.meridians["am"], self.locale.meridians["AM"]): parts["am_pm"] = "am" if "hour" in parts and not 0 <= parts["hour"] <= 12: raise ParserMatchError( f"Hour token value must be between 0 and 12 inclusive for token {token!r}." ) elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]): parts["am_pm"] = "pm" elif token == "W": parts["weekdate"] = value @staticmethod def _build_datetime(parts: _Parts) -> datetime: weekdate = parts.get("weekdate") if weekdate is not None: year, week = int(weekdate[0]), int(weekdate[1]) if weekdate[2] is not None: _day = int(weekdate[2]) else: # day not given, default to 1 _day = 1 date_string = f"{year}-{week}-{_day}" # tokens for ISO 8601 weekdates dt = datetime.strptime(date_string, "%G-%V-%u") parts["year"] = dt.year parts["month"] = dt.month parts["day"] = dt.day timestamp = parts.get("timestamp") if timestamp is not None: return datetime.fromtimestamp(timestamp, tz=tz.tzutc()) expanded_timestamp = parts.get("expanded_timestamp") if expanded_timestamp is not None: return datetime.fromtimestamp( normalize_timestamp(expanded_timestamp), tz=tz.tzutc(), ) day_of_year = parts.get("day_of_year") if day_of_year is not None: _year = parts.get("year") month = parts.get("month") if _year is None: raise ParserError( "Year component is required with the DDD and DDDD tokens." ) if month is not None: raise ParserError( "Month component is not allowed with the DDD and DDDD tokens." ) date_string = f"{_year}-{day_of_year}" try: dt = datetime.strptime(date_string, "%Y-%j") except ValueError: raise ParserError( f"The provided day of year {day_of_year!r} is invalid." ) parts["year"] = dt.year parts["month"] = dt.month parts["day"] = dt.day day_of_week: Optional[int] = parts.get("day_of_week") day = parts.get("day") # If day is passed, ignore day of week if day_of_week is not None and day is None: year = parts.get("year", 1970) month = parts.get("month", 1) day = 1 # dddd => first day of week after epoch # dddd YYYY => first day of week in specified year # dddd MM YYYY => first day of week in specified year and month # dddd MM => first day after epoch in specified month next_weekday_dt = next_weekday(datetime(year, month, day), day_of_week) parts["year"] = next_weekday_dt.year parts["month"] = next_weekday_dt.month parts["day"] = next_weekday_dt.day am_pm = parts.get("am_pm") hour = parts.get("hour", 0) if am_pm == "pm" and hour < 12: hour += 12 elif am_pm == "am" and hour == 12: hour = 0 # Support for midnight at the end of day if hour == 24: if parts.get("minute", 0) != 0: raise ParserError("Midnight at the end of day must not contain minutes") if parts.get("second", 0) != 0: raise ParserError("Midnight at the end of day must not contain seconds") if parts.get("microsecond", 0) != 0: raise ParserError( "Midnight at the end of day must not contain microseconds" ) hour = 0 day_increment = 1 else: day_increment = 0 # account for rounding up to 1000000 microsecond = parts.get("microsecond", 0) if microsecond == 1000000: microsecond = 0 second_increment = 1 else: second_increment = 0 increment = timedelta(days=day_increment, seconds=second_increment) return ( datetime( year=parts.get("year", 1), month=parts.get("month", 1), day=parts.get("day", 1), hour=hour, minute=parts.get("minute", 0), second=parts.get("second", 0), microsecond=microsecond, tzinfo=parts.get("tzinfo"), ) + increment ) def _parse_multiformat(self, string: str, formats: Iterable[str]) -> datetime: _datetime: Optional[datetime] = None for fmt in formats: try: _datetime = self.parse(string, fmt) break except ParserMatchError: pass if _datetime is None: supported_formats = ", ".join(formats) raise ParserError( f"Could not match input {string!r} to any of the following formats: {supported_formats}." ) return _datetime # generates a capture group of choices separated by an OR operator @staticmethod def _generate_choice_re( choices: Iterable[str], flags: Union[int, re.RegexFlag] = 0 ) -> Pattern[str]: return re.compile(r"({})".format("|".join(choices)), flags=flags) class TzinfoParser: _TZINFO_RE: ClassVar[Pattern[str]] = re.compile( r"^([\+\-])?(\d{2})(?:\:?(\d{2}))?$" ) @classmethod def parse(cls, tzinfo_string: str) -> dt_tzinfo: tzinfo: Optional[dt_tzinfo] = None if tzinfo_string == "local": tzinfo = tz.tzlocal() elif tzinfo_string in ["utc", "UTC", "Z"]: tzinfo = tz.tzutc() else: iso_match = cls._TZINFO_RE.match(tzinfo_string) if iso_match: sign: Optional[str] hours: str minutes: Union[str, int, None] sign, hours, minutes = iso_match.groups() seconds = int(hours) * 3600 + int(minutes or 0) * 60 if sign == "-": seconds *= -1 tzinfo = tz.tzoffset(None, seconds) else: tzinfo = tz.gettz(tzinfo_string) if tzinfo is None: raise ParserError(f"Could not parse timezone expression {tzinfo_string!r}.") return tzinfo