Skip to content

gh-53203: Fix strptime() for %c and %x formats on many locales #124946

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 106 additions & 21 deletions Lib/_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ def _getlang():
# Figure out what the current language is set to.
return locale.getlocale(locale.LC_TIME)

def _findall(haystack, needle):
# Find all positions of needle in haystack.
if not needle:
return
i = 0
while True:
i = haystack.find(needle, i)
if i < 0:
break
yield i
i += len(needle)

class LocaleTime(object):
"""Stores and handles locale-specific information related to time.

Expand Down Expand Up @@ -102,7 +114,8 @@ def __calc_am_pm(self):
am_pm = []
for hour in (1, 22):
time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0))
am_pm.append(time.strftime("%p", time_tuple).lower())
# br_FR has AM/PM info (' ',' ').
am_pm.append(time.strftime("%p", time_tuple).lower().strip())
self.am_pm = am_pm

def __calc_date_time(self):
Expand All @@ -114,42 +127,114 @@ def __calc_date_time(self):
# values within the format string is very important; it eliminates
# possible ambiguity for what something represents.
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
date_time = [None, None, None]
date_time[0] = time.strftime("%c", time_tuple).lower()
date_time[1] = time.strftime("%x", time_tuple).lower()
date_time[2] = time.strftime("%X", time_tuple).lower()
replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'),
(self.f_month[3], '%B'), (self.a_weekday[2], '%a'),
(self.a_month[3], '%b'), (self.am_pm[1], '%p'),
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
replacement_pairs = [
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
('44', '%M'), ('55', '%S'), ('76', '%j'),
('17', '%d'), ('03', '%m'), ('3', '%m'),
# '3' needed for when no leading zero.
('2', '%w'), ('10', '%I')]
replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone
for tz in tz_values])
for offset,directive in ((0,'%c'), (1,'%x'), (2,'%X')):
current_format = date_time[offset]
for old, new in replacement_pairs:
date_time = []
for directive in ('%c', '%x', '%X'):
current_format = time.strftime(directive, time_tuple).lower()
current_format = current_format.replace('%', '%%')
# The month and the day of the week formats are treated specially
# because of a possible ambiguity in some locales where the full
# and abbreviated names are equal or names of different types
# are equal. See doc of __find_month_format for more details.
lst, fmt = self.__find_weekday_format(directive)
if lst:
current_format = current_format.replace(lst[2], fmt, 1)
lst, fmt = self.__find_month_format(directive)
if lst:
current_format = current_format.replace(lst[3], fmt, 1)
if self.am_pm[1]:
# Must deal with possible lack of locale info
# manifesting itself as the empty string (e.g., Swedish's
# lack of AM/PM info) or a platform returning a tuple of empty
# strings (e.g., MacOS 9 having timezone as ('','')).
if old:
current_format = current_format.replace(old, new)
current_format = current_format.replace(self.am_pm[1], '%p')
for tz_values in self.timezone:
for tz in tz_values:
if tz:
current_format = current_format.replace(tz, "%Z")
for old, new in replacement_pairs:
current_format = current_format.replace(old, new)
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
# 2005-01-03 occurs before the first Monday of the year. Otherwise
# %U is used.
time_tuple = time.struct_time((1999,1,3,1,1,1,6,3,0))
if '00' in time.strftime(directive, time_tuple):
if '00' in time.strftime(directive, time_tuple2):
U_W = '%W'
else:
U_W = '%U'
date_time[offset] = current_format.replace('11', U_W)
current_format = current_format.replace('11', U_W)
date_time.append(current_format)
self.LC_date_time = date_time[0]
self.LC_date = date_time[1]
self.LC_time = date_time[2]

def __find_month_format(self, directive):
"""Find the month format appropriate for the current locale.

In some locales (for example French and Hebrew), the default month
used in __calc_date_time has the same name in full and abbreviated
form. Also, the month name can by accident match other part of the
representation: the day of the week name (for example in Morisyen)
or the month number (for example in Japanese). Thus, cycle months
of the year and find all positions that match the month name for
each month, If no common positions are found, the representation
does not use the month name.
"""
full_indices = abbr_indices = None
for m in range(1, 13):
time_tuple = time.struct_time((1999, m, 17, 22, 44, 55, 2, 76, 0))
datetime = time.strftime(directive, time_tuple).lower()
indices = set(_findall(datetime, self.f_month[m]))
if full_indices is None:
full_indices = indices
else:
full_indices &= indices
indices = set(_findall(datetime, self.a_month[m]))
if abbr_indices is None:
abbr_indices = indices
else:
abbr_indices &= indices
if not full_indices and not abbr_indices:
return None, None
if full_indices:
return self.f_month, '%B'
if abbr_indices:
return self.a_month, '%b'
return None, None

def __find_weekday_format(self, directive):
"""Find the day of the week format appropriate for the current locale.

Similar to __find_month_format().
"""
full_indices = abbr_indices = None
for wd in range(7):
time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, wd, 76, 0))
datetime = time.strftime(directive, time_tuple).lower()
indices = set(_findall(datetime, self.f_weekday[wd]))
if full_indices is None:
full_indices = indices
else:
full_indices &= indices
if self.f_weekday[wd] != self.a_weekday[wd]:
indices = set(_findall(datetime, self.a_weekday[wd]))
if abbr_indices is None:
abbr_indices = indices
else:
abbr_indices &= indices
if not full_indices and not abbr_indices:
return None, None
if full_indices:
return self.f_weekday, '%A'
if abbr_indices:
return self.a_weekday, '%a'
return None, None

def __calc_timezone(self):
# Set self.timezone by using time.tzname.
# Do not worry about possibility of time.tzname[0] == time.tzname[1]
Expand Down Expand Up @@ -187,7 +272,7 @@ def __init__(self, locale_time=None):
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P<f>[0-9]{1,6})",
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9]| [1-9])",
'G': r"(?P<G>\d\d\d\d)",
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
'm': r"(?P<m>1[0-2]|0[1-9]|[1-9])",
Expand Down Expand Up @@ -349,8 +434,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
_regex_cache[format] = format_regex
found = format_regex.match(data_string)
if not found:
raise ValueError("time data %r does not match format %r" %
(data_string, format))
raise ValueError("time data %r does not match format %r :: /%s/" %
(data_string, format, format_regex.pattern))
if len(data_string) != found.end():
raise ValueError("unconverted data remains: %s" %
data_string[found.end():])
Expand Down
36 changes: 23 additions & 13 deletions Lib/test/test_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import locale
import re
import os
import platform
import sys
from test import support
from test.support import warnings_helper
Expand All @@ -13,6 +14,13 @@

import _strptime

libc_ver = platform.libc_ver()
if libc_ver[0] == 'glibc':
glibc_ver = tuple(map(int, libc_ver[1].split('.')))
else:
glibc_ver = None


class getlang_Tests(unittest.TestCase):
"""Test _getlang"""
def test_basic(self):
Expand Down Expand Up @@ -478,16 +486,16 @@ def test_bad_timezone(self):
# * Year is not included: ha_NG.
# * Use non-Gregorian calendar: lo_LA, thai, th_TH.
#
# BUG: Generates invalid regexp for br_FR, csb_PL, Arabic.
# BUG: Generates regexp that does not match the current date and time
# for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG.
# BUG: Generates regexp that does not match the current date and time
# for fa_IR, gez_ER, gez_ET, lzh_TW, my_MM, or_IN, shn_MM, yo_NG,
# fr_FR, ja_JP, he_IL, ko_KR, zh_CN, etc.
@run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE',
'eu_ES', 'mfe_MU')
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
def test_date_time_locale(self):
# Test %c directive
loc = locale.getlocale(locale.LC_TIME)[0]
if glibc_ver and glibc_ver < (2, 31) and loc == 'br_FR':
self.skipTest('%c in locale br_FR does not include time')
now = time.time()
self.roundtrip('%c', slice(0, 6), time.localtime(now))
# 1 hour 20 minutes 30 seconds ago
Expand All @@ -505,18 +513,19 @@ def test_date_time_locale(self):

# NB: Dates before 1969 do not roundtrip on some locales:
# bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
@run_with_locales('LC_TIME', 'C', 'en_US', 'de_DE', 'ja_JP')
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
def test_date_time_locale2(self):
# Test %c directive
self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))

# NB: Does not roundtrip because use non-Gregorian calendar:
# lo_LA, thai, th_TH.
# BUG: Generates regexp that does not match the current date
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM,
# Arabic, ja_JP, ko_KR, zh_CN, etc.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE',
'he_IL', 'eu_ES')
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE')
def test_date_locale(self):
# Test %x directive
now = time.time()
Expand All @@ -535,7 +544,8 @@ def test_date_locale(self):
support.is_emscripten or support.is_wasi,
"musl libc issue on Emscripten, bpo-46390"
)
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'eu_ES', 'ar_AE')
def test_date_locale2(self):
# Test %x directive
self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fix :func:`time.strptime` for ``%c`` and ``%x`` formats in many locales:
Arabic, Bislama, Breton, Bodo, Kashubian, Chuvash, Estonian, French, Irish,
Ge'ez, Gurajati, Manx Gaelic, Hebrew, Hindi, Chhattisgarhi, Haitian Kreyol,
Japanese, Kannada, Korean, Marathi, Malay, Norwegian, Nynorsk, Punjabi,
Rajasthani, Tok Pisin, Yoruba, Yue Chinese, Yau/Nungon and Chinese.
Loading