Skip to content

Commit ca043c3

Browse files
authored
gettext: Ignore trailing backslashes when extracting messages (#13686)
Trailing backslashes are now ignored by the `gettext` builder, along with any whitespace immediately preceding or following them.
1 parent b9641ae commit ca043c3

File tree

5 files changed

+99
-1
lines changed

5 files changed

+99
-1
lines changed

CHANGES.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ Bugs fixed
7777
* #13635: LaTeX: if a cell contains a table, row coloring is turned off for
7878
the next table cells.
7979
Patch by Jean-François B.
80+
* #13685: gettext: Correctly ignore trailing backslashes.
81+
Patch by Bénédikt Tran.
8082

8183
Testing
8284
-------

sphinx/util/nodes.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import contextlib
66
import re
77
import unicodedata
8+
from io import StringIO
89
from typing import TYPE_CHECKING, Any, Generic, TypeVar, cast
910

1011
from docutils import nodes
@@ -289,6 +290,35 @@ def is_translatable(node: Node) -> bool:
289290
) # fmt: skip
290291

291292

293+
def _clean_extracted_message(text: str) -> str:
294+
"""Remove trailing backslashes from each line of *text*."""
295+
if '\\' in text:
296+
# TODO(picnixz): if possible, find a regex alternative
297+
# that is not vulnerable to a ReDOS (the code below is
298+
# equivalent to re.sub(r'[ \t]*\\[ \t]*$', '', text, flags=re.MULTILINE)).
299+
buffer = StringIO()
300+
for line in text.splitlines(keepends=True):
301+
split = line.rsplit('\\', maxsplit=1)
302+
if len(split) == 2:
303+
prefix, suffix = split
304+
if re.match(r'^[ \t]*\s$', suffix):
305+
# The line ends with some NL character, preceded by
306+
# zero or more whitespaces (to be dropped), the backslash,
307+
# and possibly other whitespaces on its left.
308+
buffer.write(prefix.rstrip(' \t'))
309+
buffer.write(suffix.lstrip(' \t'))
310+
elif not suffix:
311+
# backslash is at the end of the LAST line
312+
buffer.write(prefix.rstrip(' \t'))
313+
else:
314+
# backslash is in the middle of the line
315+
buffer.write(line)
316+
else:
317+
buffer.write(line)
318+
text = buffer.getvalue()
319+
return text.replace('\n', ' ').strip()
320+
321+
292322
def extract_messages(doctree: Element) -> Iterable[tuple[Element, str]]:
293323
"""Extract translatable messages from a document tree."""
294324
for node in doctree.findall(is_translatable):
@@ -311,7 +341,8 @@ def extract_messages(doctree: Element) -> Iterable[tuple[Element, str]]:
311341
elif isinstance(node, nodes.meta):
312342
msg = node['content']
313343
else:
314-
msg = node.rawsource.replace('\n', ' ').strip() # type: ignore[attr-defined]
344+
text = node.rawsource # type: ignore[attr-defined]
345+
msg = _clean_extracted_message(text)
315346

316347
# XXX nodes rendering empty are likely a bug in sphinx.addnodes
317348
if msg:

tests/roots/test-intl/backslashes.txt

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
:tocdepth: 2
2+
3+
i18n with backslashes
4+
=====================
5+
6+
line 1\
7+
line 2 \
8+
line 3 \
9+
line 4a \ and 4b \
10+
line with spaces after backslash \
11+
last line with spaces \
12+
and done 1
13+
14+
.. gettext parses the following lines as "a<space>b<space>c",
15+
while a C pre-processor would have produced "a<space>bc".
16+
17+
a \
18+
b\
19+
c \
20+
21+
last trailing \ \ \
22+
is ignored
23+
24+
25+
See [#]_
26+
27+
.. [#] footnote with backslashes \
28+
and done 2
29+
30+
31+
.. note:: directive with \
32+
backslashes
33+
34+
35+
.. function:: foo(a, \
36+
b, \
37+
c, d, e, f)
38+
the foo

tests/roots/test-intl/index.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ CONTENTS
3232
translation_progress
3333
topic
3434
markup
35+
backslashes
3536

3637
.. toctree::
3738
:maxdepth: 2

tests/test_builders/test_build_gettext.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,3 +323,29 @@ def test_gettext_literalblock_additional(app: SphinxTestApp) -> None:
323323
"stdout object\\n>>>\\n>>> if __name__ == '__main__': # if run this py "
324324
'file as python script\\n... main() # call main',
325325
]
326+
327+
328+
@pytest.mark.sphinx('gettext', testroot='intl', srcdir='gettext')
329+
def test_gettext_trailing_backslashes(app: SphinxTestApp) -> None:
330+
app.build(force_all=True)
331+
332+
assert (app.outdir / 'backslashes.pot').is_file()
333+
pot = (app.outdir / 'backslashes.pot').read_text(encoding='utf8')
334+
msg_ids = get_msgids(pot)
335+
assert msg_ids == [
336+
'i18n with backslashes',
337+
(
338+
'line 1 line 2 line 3 '
339+
# middle backslashes are escaped normally
340+
'line 4a \\\\ and 4b '
341+
# whitespaces after backslashes are dropped
342+
'line with spaces after backslash '
343+
'last line with spaces '
344+
'and done 1'
345+
),
346+
'a b c',
347+
'last trailing \\\\ \\\\ is ignored',
348+
'See [#]_',
349+
'footnote with backslashes and done 2',
350+
'directive with backslashes',
351+
]

0 commit comments

Comments
 (0)