diff --git a/Lib/test/test_tools/i18n_data/docstrings.pot b/Lib/test/test_tools/i18n_data/docstrings.pot index 5af1d41422ff62..387db2413a575f 100644 --- a/Lib/test/test_tools/i18n_data/docstrings.pot +++ b/Lib/test/test_tools/i18n_data/docstrings.pot @@ -15,26 +15,40 @@ msgstr "" "Generated-By: pygettext.py 1.5\n" -#: docstrings.py:7 +#: docstrings.py:1 +#, docstring +msgid "Module docstring" +msgstr "" + +#: docstrings.py:9 #, docstring msgid "" msgstr "" -#: docstrings.py:18 +#: docstrings.py:15 +#, docstring +msgid "docstring" +msgstr "" + +#: docstrings.py:20 #, docstring msgid "" "multiline\n" -" docstring\n" -" " +"docstring" msgstr "" -#: docstrings.py:25 +#: docstrings.py:27 #, docstring msgid "docstring1" msgstr "" -#: docstrings.py:30 +#: docstrings.py:38 +#, docstring +msgid "nested docstring" +msgstr "" + +#: docstrings.py:43 #, docstring -msgid "Hello, {}!" +msgid "nested class docstring" msgstr "" diff --git a/Lib/test/test_tools/i18n_data/docstrings.py b/Lib/test/test_tools/i18n_data/docstrings.py index 85d7f159d37775..151a55a4b56ba6 100644 --- a/Lib/test/test_tools/i18n_data/docstrings.py +++ b/Lib/test/test_tools/i18n_data/docstrings.py @@ -1,3 +1,5 @@ +"""Module docstring""" + # Test docstring extraction from gettext import gettext as _ @@ -10,10 +12,10 @@ def test(x): # Leading empty line def test2(x): - """docstring""" # XXX This should be extracted but isn't. + """docstring""" -# XXX Multiline docstrings should be cleaned with `inspect.cleandoc`. +# Multiline docstrings are cleaned with `inspect.cleandoc`. def test3(x): """multiline docstring @@ -27,15 +29,15 @@ def test4(x): def test5(x): - """Hello, {}!""".format("world!") # XXX This should not be extracted. + """Hello, {}!""".format("world!") # This should not be extracted. # Nested docstrings def test6(x): def inner(y): - """nested docstring""" # XXX This should be extracted but isn't. 
+ """nested docstring""" class Outer: class Inner: - "nested class docstring" # XXX This should be extracted but isn't. + "nested class docstring" diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index 8d66fbc4f3a937..e8167acfc0742b 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -19,22 +19,22 @@ msgstr "" msgid "" msgstr "" -#: messages.py:19 messages.py:20 +#: messages.py:19 messages.py:20 messages.py:21 msgid "parentheses" msgstr "" -#: messages.py:23 +#: messages.py:24 msgid "Hello, world!" msgstr "" -#: messages.py:26 +#: messages.py:27 msgid "" "Hello,\n" " multiline!\n" msgstr "" #: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94 -#: messages.py:99 +#: messages.py:99 messages.py:100 messages.py:101 msgid "foo" msgid_plural "foos" msgstr[0] "" @@ -68,7 +68,7 @@ msgstr "" msgid "set" msgstr "" -#: messages.py:63 +#: messages.py:62 messages.py:63 msgid "nested string" msgstr "" @@ -76,6 +76,10 @@ msgstr "" msgid "baz" msgstr "" +#: messages.py:71 messages.py:75 +msgid "default value" +msgstr "" + #: messages.py:91 messages.py:92 messages.py:95 messages.py:96 msgctxt "context" msgid "foo" @@ -83,7 +87,13 @@ msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:100 +#: messages.py:102 msgid "domain foo" msgstr "" +#: messages.py:118 messages.py:119 +msgid "world" +msgid_plural "worlds" +msgstr[0] "" +msgstr[1] "" + diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index 1e03f4e556830d..9457bcb8611020 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -18,6 +18,7 @@ # Extra parentheses (_("parentheses")) ((_("parentheses"))) +_(("parentheses")) # Multiline strings _("Hello, " @@ -32,7 +33,6 @@ _(None) _(1) _(False) -_(("invalid")) _(["invalid"]) _({"invalid"}) _("string"[3]) @@ -40,7 +40,7 @@ _({"string": "foo"}) # 
pygettext does not allow keyword arguments, but both xgettext and pybabel do -_(x="kwargs work!") +_(x="kwargs are not allowed!") # Unusual, but valid arguments _("foo", "bar") @@ -48,7 +48,7 @@ # .format() _("Hello, {}!").format("world") # valid -_("Hello, {}!".format("world")) # invalid, but xgettext and pybabel extract the first string +_("Hello, {}!".format("world")) # invalid, but xgettext extracts the first string # Nested structures _("1"), _("2") @@ -59,7 +59,7 @@ # Nested functions and classes def test(): - _("nested string") # XXX This should be extracted but isn't. + _("nested string") [_("nested string")] @@ -68,11 +68,11 @@ def bar(self): return _("baz") -def bar(x=_('default value')): # XXX This should be extracted but isn't. +def bar(x=_('default value')): pass -def baz(x=[_('default value')]): # XXX This should be extracted but isn't. +def baz(x=[_('default value')]): pass @@ -97,6 +97,8 @@ def _(x="don't extract me"): # Complex arguments ngettext("foo", "foos", 42 + (10 - 20)) +ngettext("foo", "foos", *args) +ngettext("foo", "foos", **kwargs) dgettext(["some", {"complex"}, ("argument",)], "domain foo") # Invalid calls which are not extracted @@ -108,3 +110,10 @@ def _(x="don't extract me"): dngettext('domain', 'foo') dpgettext('domain', 'context') dnpgettext('domain', 'context', 'foo') +dgettext(*args, 'foo') +dpgettext(*args, 'context', 'foo') +dnpgettext(*args, 'context', 'foo', 'foos') + +# f-strings +f"Hello, {_('world')}!" +f"Hello, {ngettext('world', 'worlds', 3)}!" 
diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index 29c3423e234d20..d23479104d438d 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -87,7 +87,7 @@ def assert_POT_equal(self, expected, actual): self.maxDiff = None self.assertEqual(normalize_POT_file(expected), normalize_POT_file(actual)) - def extract_from_str(self, module_content, *, args=(), strict=True): + def extract_from_str(self, module_content, *, args=(), strict=True, with_stderr=False): """Return all msgids extracted from module_content.""" filename = 'test.py' with temp_cwd(None): @@ -98,12 +98,18 @@ def extract_from_str(self, module_content, *, args=(), strict=True): self.assertEqual(res.err, b'') with open('messages.pot', encoding='utf-8') as fp: data = fp.read() - return self.get_msgids(data) + msgids = self.get_msgids(data) + if not with_stderr: + return msgids + return msgids, res.err def extract_docstrings_from_str(self, module_content): """Return all docstrings extracted from module_content.""" return self.extract_from_str(module_content, args=('--docstrings',), strict=False) + def get_stderr(self, module_content): + return self.extract_from_str(module_content, strict=False, with_stderr=True)[1] + def test_header(self): """Make sure the required fields are in the header, according to: http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry @@ -407,6 +413,24 @@ def test_files_list(self): self.assertIn(f'msgid "{text2}"', data) self.assertNotIn(text3, data) + def test_error_messages(self): + """Test that pygettext outputs error messages to stderr.""" + stderr = self.get_stderr(dedent('''\ + _(1+2) + ngettext('foo') + dgettext(*args, 'foo') + ''')) + + # Normalize line endings on Windows + stderr = stderr.decode('utf-8').replace('\r', '') + + self.assertEqual( + stderr, + "*** test.py:1: Expected a string constant for argument 1, got 1 + 2\n" + "*** test.py:2: Expected at least 2 positional argument(s) in gettext 
call, got 1\n" + "*** test.py:3: Variable positional arguments are not allowed in gettext calls\n" + ) + def update_POT_snapshots(): for input_file in DATA_DIR.glob('*.py'): diff --git a/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst b/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst new file mode 100644 index 00000000000000..82954e170ec3e2 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2023-05-11-23-32-25.gh-issue-104400.23vxm7.rst @@ -0,0 +1 @@ +Fix several bugs in extraction by switching to an AST parser in :program:`pygettext`. diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index d8a0e379ab82cb..4720aecbdc515a 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -140,8 +140,6 @@ import os import sys import time -import tokenize -from collections import defaultdict from dataclasses import dataclass, field from operator import itemgetter @@ -206,15 +204,6 @@ def escape_nonascii(s, encoding): return ''.join(escapes[b] for b in s.encode(encoding)) -def is_literal_string(s): - return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"') - - -def safe_eval(s): - # unwrap quotes, safely - return eval(s, {'__builtins__':{}}, {}) - - def normalize(s, encoding): # This converts the various Python string types into a format that is # appropriate for .po files, namely much closer to C style. 
@@ -296,11 +285,6 @@ def getFilesForName(name): } -def matches_spec(message, spec): - """Check if a message has all the keys defined by the keyword spec.""" - return all(key in message for key in spec.values()) - - @dataclass(frozen=True) class Location: filename: str @@ -325,203 +309,91 @@ def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=Fals self.is_docstring |= is_docstring -class TokenEater: +class GettextVisitor(ast.NodeVisitor): def __init__(self, options): - self.__options = options - self.__messages = {} - self.__state = self.__waiting - self.__data = defaultdict(str) - self.__curr_arg = 0 - self.__curr_keyword = None - self.__lineno = -1 - self.__freshmodule = 1 - self.__curfile = None - self.__enclosurecount = 0 - - def __call__(self, ttype, tstring, stup, etup, line): - # dispatch -## import token -## print('ttype:', token.tok_name[ttype], 'tstring:', tstring, -## file=sys.stderr) - self.__state(ttype, tstring, stup[0]) - - @property - def messages(self): - return self.__messages - - def __waiting(self, ttype, tstring, lineno): - opts = self.__options - # Do docstring extractions, if enabled - if opts.docstrings and not opts.nodocstrings.get(self.__curfile): - # module docstring? - if self.__freshmodule: - if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True) - self.__freshmodule = 0 - return - if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING): - return - self.__freshmodule = 0 - # class or func/method docstring? 
- if ttype == tokenize.NAME and tstring in ('class', 'def'): - self.__state = self.__suiteseen - return - if ttype == tokenize.NAME and tstring in ('class', 'def'): - self.__state = self.__ignorenext + super().__init__() + self.options = options + self.filename = None + self.messages = {} + + def visit_file(self, node, filename): + self.filename = filename + self.visit(node) + + def visit_Module(self, node): + self._extract_docstring(node) + self.generic_visit(node) + + visit_ClassDef = visit_FunctionDef = visit_AsyncFunctionDef = visit_Module + + def visit_Call(self, node): + self._extract_message(node) + self.generic_visit(node) + + def _extract_docstring(self, node): + if (not self.options.docstrings or + self.options.nodocstrings.get(self.filename)): return - if ttype == tokenize.NAME and tstring in opts.keywords: - self.__state = self.__keywordseen - self.__curr_keyword = tstring + + docstring = ast.get_docstring(node) + if docstring is not None: + lineno = node.body[0].lineno # The first statement is the docstring + self._add_message(lineno, docstring, is_docstring=True) + + def _extract_message(self, node): + func_name = self._get_func_name(node) + spec = self.options.keywords.get(func_name) + if spec is None: + return + + max_index = max(spec) + has_var_positional = any(isinstance(arg, ast.Starred) for + arg in node.args[:max_index+1]) + if has_var_positional: + print(f'*** {self.filename}:{node.lineno}: Variable positional ' + f'arguments are not allowed in gettext calls', file=sys.stderr) return - if ttype == tokenize.STRING: - maybe_fstring = ast.parse(tstring, mode='eval').body - if not isinstance(maybe_fstring, ast.JoinedStr): - return - for value in filter(lambda node: isinstance(node, ast.FormattedValue), - maybe_fstring.values): - for call in filter(lambda node: isinstance(node, ast.Call), - ast.walk(value)): - func = call.func - if isinstance(func, ast.Name): - func_name = func.id - elif isinstance(func, ast.Attribute): - func_name = func.attr - 
else: - continue - - if func_name not in opts.keywords: - continue - if len(call.args) != 1: - print(( - '*** %(file)s:%(lineno)s: Seen unexpected amount of' - ' positional arguments in gettext call: %(source_segment)s' - ) % { - 'source_segment': ast.get_source_segment(tstring, call) or tstring, - 'file': self.__curfile, - 'lineno': lineno - }, file=sys.stderr) - continue - if call.keywords: - print(( - '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments' - ' in gettext call: %(source_segment)s' - ) % { - 'source_segment': ast.get_source_segment(tstring, call) or tstring, - 'file': self.__curfile, - 'lineno': lineno - }, file=sys.stderr) - continue - arg = call.args[0] - if not isinstance(arg, ast.Constant): - print(( - '*** %(file)s:%(lineno)s: Seen unexpected argument type' - ' in gettext call: %(source_segment)s' - ) % { - 'source_segment': ast.get_source_segment(tstring, call) or tstring, - 'file': self.__curfile, - 'lineno': lineno - }, file=sys.stderr) - continue - if isinstance(arg.value, str): - self.__curr_keyword = func_name - self.__addentry({'msgid': arg.value}, lineno) - - def __suiteseen(self, ttype, tstring, lineno): - # skip over any enclosure pairs until we see the colon - if ttype == tokenize.OP: - if tstring == ':' and self.__enclosurecount == 0: - # we see a colon and we're not in an enclosure: end of def - self.__state = self.__suitedocstring - elif tstring in '([{': - self.__enclosurecount += 1 - elif tstring in ')]}': - self.__enclosurecount -= 1 - - def __suitedocstring(self, ttype, tstring, lineno): - # ignore any intervening noise - if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True) - self.__state = self.__waiting - elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, - tokenize.COMMENT): - # there was no class docstring - self.__state = self.__waiting - - def __keywordseen(self, ttype, tstring, lineno): - if ttype == tokenize.OP and tstring == 
'(': - self.__data.clear() - self.__curr_arg = 0 - self.__enclosurecount = 0 - self.__lineno = lineno - self.__state = self.__openseen - else: - self.__state = self.__waiting - - def __openseen(self, ttype, tstring, lineno): - spec = self.__options.keywords[self.__curr_keyword] - arg_type = spec.get(self.__curr_arg) - expect_string_literal = arg_type is not None - - if ttype == tokenize.OP and self.__enclosurecount == 0: - if tstring == ')': - # We've seen the last of the translatable strings. Record the - # line number of the first line of the strings and update the list - # of messages seen. Reset state for the next batch. If there - # were no strings inside _(), then just ignore this entry. - if self.__data: - self.__addentry(self.__data) - self.__state = self.__waiting - return - elif tstring == ',': - # Advance to the next argument - self.__curr_arg += 1 - return - if expect_string_literal: - if ttype == tokenize.STRING and is_literal_string(tstring): - self.__data[arg_type] += safe_eval(tstring) - elif ttype not in (tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT, - tokenize.NEWLINE, tokenize.NL): - # We are inside an argument which is a translatable string and - # we encountered a token that is not a string. This is an error. 
- self.warn_unexpected_token(tstring) - self.__enclosurecount = 0 - self.__state = self.__waiting - elif ttype == tokenize.OP: - if tstring in '([{': - self.__enclosurecount += 1 - elif tstring in ')]}': - self.__enclosurecount -= 1 - - def __ignorenext(self, ttype, tstring, lineno): - self.__state = self.__waiting - - def __addentry(self, msg, lineno=None, *, is_docstring=False): - msgid = msg.get('msgid') - if msgid in self.__options.toexclude: + if max_index >= len(node.args): + print(f'*** {self.filename}:{node.lineno}: Expected at least ' + f'{max(spec) + 1} positional argument(s) in gettext call, ' + f'got {len(node.args)}', file=sys.stderr) return - if not is_docstring: - spec = self.__options.keywords[self.__curr_keyword] - if not matches_spec(msg, spec): + + msg_data = {} + for position, arg_type in spec.items(): + arg = node.args[position] + if not self._is_string_const(arg): + print(f'*** {self.filename}:{arg.lineno}: Expected a string ' + f'constant for argument {position + 1}, ' + f'got {ast.unparse(arg)}', file=sys.stderr) return - if lineno is None: - lineno = self.__lineno - msgctxt = msg.get('msgctxt') - msgid_plural = msg.get('msgid_plural') + msg_data[arg_type] = arg.value + + lineno = node.lineno + self._add_message(lineno, **msg_data) + + def _add_message( + self, lineno, msgid, msgid_plural=None, msgctxt=None, *, + is_docstring=False): + if msgid in self.options.toexclude: + return + key = self._key_for(msgid, msgctxt) - if key in self.__messages: - self.__messages[key].add_location( - self.__curfile, + message = self.messages.get(key) + if message: + message.add_location( + self.filename, lineno, msgid_plural, is_docstring=is_docstring, ) else: - self.__messages[key] = Message( + self.messages[key] = Message( msgid=msgid, msgid_plural=msgid_plural, msgctxt=msgctxt, - locations={Location(self.__curfile, lineno)}, + locations={Location(self.filename, lineno)}, is_docstring=is_docstring, ) @@ -531,19 +403,17 @@ def _key_for(msgid, msgctxt=None): 
 return (msgctxt, msgid) return msgid - def warn_unexpected_token(self, token): - print(( - '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' - ) % { - 'token': token, - 'file': self.__curfile, - 'lineno': self.__lineno - }, file=sys.stderr) - - def set_filename(self, filename): - self.__curfile = filename - self.__freshmodule = 1 + def _get_func_name(self, node): + match node.func: + case ast.Name(id=id): + return id + case ast.Attribute(attr=attr): + return attr + case _: + return None + def _is_string_const(self, node): + return isinstance(node, ast.Constant) and isinstance(node.value, str) def write_pot_file(messages, options, fp): timestamp = time.strftime('%Y-%m-%d %H:%M%z') @@ -717,31 +587,24 @@ class Options: args = expanded # slurp through all the files - eater = TokenEater(options) + visitor = GettextVisitor(options) for filename in args: if filename == '-': if options.verbose: print('Reading standard input') - fp = sys.stdin.buffer - closep = 0 + source = sys.stdin.buffer.read() else: if options.verbose: print(f'Working on {filename}') - fp = open(filename, 'rb') - closep = 1 + with open(filename, 'rb') as fp: + source = fp.read() + try: - eater.set_filename(filename) - try: - tokens = tokenize.tokenize(fp.readline) - for _token in tokens: - eater(*_token) - except tokenize.TokenError as e: - print('%s: %s, line %d, column %d' % ( - e.args[0], filename, e.args[1][0], e.args[1][1]), - file=sys.stderr) - finally: - if closep: - fp.close() + module_tree = ast.parse(source) + except SyntaxError: + continue + + visitor.visit_file(module_tree, filename) # write the output if options.outfile == '-': @@ -753,7 +616,7 @@ class Options: fp = open(options.outfile, 'w') closep = 1 try: - write_pot_file(eater.messages, options, fp) + write_pot_file(visitor.messages, options, fp) finally: if closep: fp.close()