Skip to content

Commit e31cb7c

Browse files
jensmaurer authored and tkoeppe committed
[lex] Introduce \unicode macro for Unicode character names
and format them using \textsc
1 parent 1567c48 commit e31cb7c

File tree

5 files changed

+95
-88
lines changed

5 files changed

+95
-88
lines changed

source/lex.tex

Lines changed: 74 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -243,45 +243,45 @@
243243
\begin{floattable}{Basic character set}{lex.charset.basic}{lll}
244244
\topline
245245
\lhdrx{2}{character} & \rhdr{glyph} \\ \capsep
246-
U+0009 & CHARACTER TABULATION & \\
247-
U+000B & LINE TABULATION & \\
248-
U+000C & FORM FEED (FF) & \\
249-
U+0020 & SPACE & \\
250-
U+000A & LINE FEED (LF) & new-line \\
251-
U+0021 & EXCLAMATION MARK & \tcode{!} \\
252-
U+0022 & QUOTATION MARK & \tcode{"} \\
253-
U+0023 & NUMBER SIGN & \tcode{\#} \\
254-
U+0025 & PERCENT SIGN & \tcode{\%} \\
255-
U+0026 & AMPERSAND & \tcode{\&} \\
256-
U+0027 & APOSTROPHE & \tcode{'} \\
257-
U+0028 & LEFT PARENTHESIS & \tcode{(} \\
258-
U+0029 & RIGHT PARENTHESIS & \tcode{)} \\
259-
U+002A & ASTERISK & \tcode{*} \\
260-
U+002B & PLUS SIGN & \tcode{+} \\
261-
U+002C & COMMA & \tcode{,} \\
262-
U+002D & HYPHEN-MINUS & \tcode{-} \\
263-
U+002E & FULL STOP & \tcode{.} \\
264-
U+002F & SOLIDUS & \tcode{/} \\
265-
U+0030 .. U+0039 & DIGIT ZERO .. NINE & \tcode{0 1 2 3 4 5 6 7 8 9} \\
266-
U+003A & COLON & \tcode{:} \\
267-
U+003B & SEMICOLON & \tcode{;} \\
268-
U+003C & LESS-THAN SIGN & \tcode{<} \\
269-
U+003D & EQUALS SIGN & \tcode{=} \\
270-
U+003E & GREATER-THAN SIGN & \tcode{>} \\
271-
U+003F & QUESTION MARK & \tcode{?} \\
272-
U+0041 .. U+005A & LATIN CAPITAL LETTER A .. Z & \tcode{A B C D E F G H I J K L M} \\
273-
& & \tcode{N O P Q R S T U V W X Y Z} \\
274-
U+005B & LEFT SQUARE BRACKET & \tcode{[} \\
275-
U+005C & REVERSE SOLIDUS & \tcode{\textbackslash} \\
276-
U+005D & RIGHT SQUARE BRACKET & \tcode{]} \\
277-
U+005E & CIRCUMFLEX ACCENT & \tcode{\caret} \\
278-
U+005F & LOW LINE & \tcode{_} \\
279-
U+0061 .. U+007A & LATIN SMALL LETTER A .. Z & \tcode{a b c d e f g h i j k l m} \\
246+
\ucode{0009} & \uname{character tabulation} & \\
247+
\ucode{000b} & \uname{line tabulation} & \\
248+
\ucode{000c} & \uname{form feed (ff)} & \\
249+
\ucode{0020} & \uname{space} & \\
250+
\ucode{000a} & \uname{line feed (lf)} & new-line \\
251+
\ucode{0021} & \uname{exclamation mark} & \tcode{!} \\
252+
\ucode{0022} & \uname{quotation mark} & \tcode{"} \\
253+
\ucode{0023} & \uname{number sign} & \tcode{\#} \\
254+
\ucode{0025} & \uname{percent sign} & \tcode{\%} \\
255+
\ucode{0026} & \uname{ampersand} & \tcode{\&} \\
256+
\ucode{0027} & \uname{apostrophe} & \tcode{'} \\
257+
\ucode{0028} & \uname{left parenthesis} & \tcode{(} \\
258+
\ucode{0029} & \uname{right parenthesis} & \tcode{)} \\
259+
\ucode{002a} & \uname{asterisk} & \tcode{*} \\
260+
\ucode{002b} & \uname{plus sign} & \tcode{+} \\
261+
\ucode{002c} & \uname{comma} & \tcode{,} \\
262+
\ucode{002d} & \uname{hyphen-minus} & \tcode{-} \\
263+
\ucode{002e} & \uname{full stop} & \tcode{.} \\
264+
\ucode{002f} & \uname{solidus} & \tcode{/} \\
265+
\ucode{0030} .. \ucode{0039} & \uname{digit zero .. nine} & \tcode{0 1 2 3 4 5 6 7 8 9} \\
266+
\ucode{003a} & \uname{colon} & \tcode{:} \\
267+
\ucode{003b} & \uname{semicolon} & \tcode{;} \\
268+
\ucode{003c} & \uname{less-than sign} & \tcode{<} \\
269+
\ucode{003d} & \uname{equals sign} & \tcode{=} \\
270+
\ucode{003e} & \uname{greater-than sign} & \tcode{>} \\
271+
\ucode{003f} & \uname{question mark} & \tcode{?} \\
272+
\ucode{0041} .. \ucode{005a} & \uname{latin capital letter a .. z} & \tcode{A B C D E F G H I J K L M} \\
273+
& & \tcode{N O P Q R S T U V W X Y Z} \\
274+
\ucode{005b} & \uname{left square bracket} & \tcode{[} \\
275+
\ucode{005c} & \uname{reverse solidus} & \tcode{\textbackslash} \\
276+
\ucode{005d} & \uname{right square bracket} & \tcode{]} \\
277+
\ucode{005e} & \uname{circumflex accent} & \tcode{\caret} \\
278+
\ucode{005f} & \uname{low line} & \tcode{_} \\
279+
\ucode{0061} .. \ucode{007a} & \uname{latin small letter a .. z} & \tcode{a b c d e f g h i j k l m} \\
280280
& & \tcode{n o p q r s t u v w x y z} \\
281-
U+007B & LEFT CURLY BRACKET & \tcode{\{} \\
282-
U+007C & VERTICAL LINE & \tcode{|} \\
283-
U+007D & RIGHT CURLY BRACKET & \tcode{\}} \\
284-
U+007E & TILDE & \tcode{\textasciitilde} \\
281+
\ucode{007b} & \uname{left curly bracket} & \tcode{\{} \\
282+
\ucode{007c} & \uname{vertical line} & \tcode{|} \\
283+
\ucode{007d} & \uname{right curly bracket} & \tcode{\}} \\
284+
\ucode{007e} & \uname{tilde} & \tcode{\textasciitilde} \\
285285
\end{floattable}
286286

287287
\pnum
@@ -325,10 +325,10 @@
325325
\begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll}
326326
\topline
327327
\ohdrx{2}{character} \\ \capsep
328-
U+0000 & NULL \\
329-
U+0007 & BELL \\
330-
U+0008 & BACKSPACE \\
331-
U+000D & CARRIAGE RETURN (CR) \\
328+
\ucode{0000} & \uname{null} \\
329+
\ucode{0007} & \uname{bell} \\
330+
\ucode{0008} & \uname{backspace} \\
331+
\ucode{000d} & \uname{carriage return (cr)} \\
332332
\end{floattable}
333333

334334
\pnum
@@ -359,10 +359,10 @@
359359
\end{note}
360360
\indextext{character!null}%
361361
\indextext{wide-character!null}%
362-
The U+0000 NULL character is encoded as the value \tcode{0}.
362+
The \unicode{0000}{null} character is encoded as the value \tcode{0}.
363363
No other element of the translation character set
364364
is encoded with a code unit of value \tcode{0}.
365-
The code unit value of each decimal digit character after the digit \tcode{0} (U+0030)
365+
The code unit value of each decimal digit character after the digit \tcode{0} (\ucode{0030})
366366
shall be one greater than the value of the previous.
367367
The ordinary and wide literal encodings are otherwise
368368
\impldef{ordinary and wide literal encodings}.
@@ -412,16 +412,21 @@
412412
literals), string literals (including user-defined string literals), preprocessing
413413
operators and punctuators, and single non-whitespace characters that do not lexically
414414
match the other preprocessing token categories.
415-
If a U+0027 APOSTROPHE or a U+0022 QUOTATION MARK character
415+
If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character
416416
matches the last category, the behavior is undefined.
417417
If any character not in the basic character set matches the last category,
418418
the program is ill-formed.
419419
Preprocessing tokens can be separated by
420420
\indextext{whitespace}%
421421
whitespace;
422422
\indextext{comment}%
423-
this consists of comments\iref{lex.comment}, or whitespace
424-
characters (U+0020 SPACE, U+0009 CHARACTER TABULATION, new-line, U+000B LINE TABULATION, and U+000C FORM FEED), or both. As described in \ref{cpp}, in certain
423+
this consists of comments\iref{lex.comment}, or whitespace characters
424+
(\unicode{0020}{space},
425+
\unicode{0009}{character tabulation},
426+
new-line,
427+
\unicode{000b}{line tabulation}, and
428+
\unicode{000c}{form feed}), or both.
429+
As described in \ref{cpp}, in certain
425430
circumstances during translation phase 4, whitespace (or the absence
426431
thereof) serves as more than preprocessing token separation. Whitespace
427432
can appear within a preprocessing token only as part of a header name or
@@ -625,7 +630,7 @@
625630

626631
\begin{bnf}
627632
\nontermdef{h-char}\br
628-
\textnormal{any member of the translation character set except new-line and \terminal{U+003E GREATER-THAN SIGN}}
633+
\textnormal{any member of the translation character set except new-line and \unicode{003e}{greater-than sign}}
629634
\end{bnf}
630635

631636
\begin{bnf}
@@ -636,7 +641,7 @@
636641

637642
\begin{bnf}
638643
\nontermdef{q-char}\br
639-
\textnormal{any member of the translation character set except new-line and \terminal{U+0022 QUOTATION MARK}}
644+
\textnormal{any member of the translation character set except new-line and \unicode{0022}{quotation mark}}
640645
\end{bnf}
641646

642647
\pnum
@@ -1273,8 +1278,8 @@
12731278

12741279
\begin{bnf}
12751280
\nontermdef{basic-c-char}\br
1276-
\textnormal{any member of the translation character set except the U+0027 APOSTROPHE,}\br
1277-
\bnfindent\textnormal{U+005C REVERSE SOLIDUS, or new-line character}
1281+
\textnormal{any member of the translation character set except the \unicode{0027}{apostrophe},}\br
1282+
\bnfindent\textnormal{\unicode{005c}{reverse solidus}, or new-line character}
12781283
\end{bnf}
12791284

12801285
\begin{bnf}
@@ -1492,17 +1497,17 @@
14921497
{lll}
14931498
\topline
14941499
\lhdrx{2}{character} & \rhdr{\grammarterm{simple-escape-sequence}} \\ \capsep
1495-
U+000A & LINE FEED (LF) & \tcode{\textbackslash n} \\
1496-
U+0009 & CHARACTER TABULATION & \tcode{\textbackslash t} \\
1497-
U+000B & LINE TABULATION & \tcode{\textbackslash v} \\
1498-
U+0008 & BACKSPACE & \tcode{\textbackslash b} \\
1499-
U+000D & CARRIAGE RETURN (CR) & \tcode{\textbackslash r} \\
1500-
U+000C & FORM FEED (FF) & \tcode{\textbackslash f} \\
1501-
U+0007 & BELL & \tcode{\textbackslash a} \\
1502-
U+005C & REVERSE SOLIDUS & \tcode{\textbackslash\textbackslash} \\
1503-
U+003F & QUESTION MARK & \tcode{\textbackslash ?} \\
1504-
U+0027 & APOSTROPHE & \tcode{\textbackslash '} \\
1505-
U+0022 & QUOTATION MARK & \tcode{\textbackslash "} \\
1500+
\ucode{000a} & \uname{line feed} & \tcode{\textbackslash n} \\
1501+
\ucode{0009} & \uname{character tabulation} & \tcode{\textbackslash t} \\
1502+
\ucode{000b} & \uname{line tabulation} & \tcode{\textbackslash v} \\
1503+
\ucode{0008} & \uname{backspace} & \tcode{\textbackslash b} \\
1504+
\ucode{000d} & \uname{carriage return} & \tcode{\textbackslash r} \\
1505+
\ucode{000c} & \uname{form feed} & \tcode{\textbackslash f} \\
1506+
\ucode{0007} & \uname{bell} & \tcode{\textbackslash a} \\
1507+
\ucode{005c} & \uname{reverse solidus} & \tcode{\textbackslash\textbackslash} \\
1508+
\ucode{003f} & \uname{question mark} & \tcode{\textbackslash ?} \\
1509+
\ucode{0027} & \uname{apostrophe} & \tcode{\textbackslash '} \\
1510+
\ucode{0022} & \uname{quotation mark} & \tcode{\textbackslash "} \\
15061511
\end{floattable}
15071512

15081513
\rSec2[lex.fcon]{Floating-point literals}
@@ -1654,8 +1659,8 @@
16541659

16551660
\begin{bnf}
16561661
\nontermdef{basic-s-char}\br
1657-
\textnormal{any member of the translation character set except the U+0022 QUOTATION MARK,}\br
1658-
\bnfindent\textnormal{U+005C REVERSE SOLIDUS, or new-line character}
1662+
\textnormal{any member of the translation character set except the \unicode{0022}{quotation mark},}\br
1663+
\bnfindent\textnormal{\unicode{005c}{reverse solidus}, or new-line character}
16591664
\end{bnf}
16601665

16611666
\begin{bnf}
@@ -1671,8 +1676,8 @@
16711676

16721677
\begin{bnf}
16731678
\nontermdef{r-char}\br
1674-
\textnormal{any member of the translation character set, except a U+0029 RIGHT PARENTHESIS followed by}\br
1675-
\bnfindent\textnormal{the initial \grammarterm{d-char-sequence} (which may be empty) followed by a U+0022 QUOTATION MARK}
1679+
\textnormal{any member of the translation character set, except a \unicode{0029}{right parenthesis} followed by}\br
1680+
\bnfindent\textnormal{the initial \grammarterm{d-char-sequence} (which may be empty) followed by a \unicode{0022}{quotation mark}}
16761681
\end{bnf}
16771682

16781683
\begin{bnf}
@@ -1684,9 +1689,8 @@
16841689
\begin{bnf}
16851690
\nontermdef{d-char}\br
16861691
\textnormal{any member of the basic character set except:}\br
1687-
\bnfindent\textnormal{U+0020 SPACE, U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS,}\br
1688-
\bnfindent\textnormal{U+005C REVERSE SOLIDUS, U+0009 CHARACTER TABULATION,}\br
1689-
\bnfindent\textnormal{U+000B LINE TABULATION, U+000C FORM FEED (FF), and new-line}
1692+
\bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br
1693+
\bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line}
16901694
\end{bnf}
16911695

16921696
\pnum
@@ -1882,7 +1886,7 @@
18821886
corresponding to the \grammarterm{string-literal}'s sequence of
18831887
\grammarterm{s-char}s (originally from non-raw string literals) and
18841888
\grammarterm{r-char}s (originally from raw string literals),
1885-
plus a terminating U+0000 NULL character,
1889+
plus a terminating \unicode{0000}{null} character,
18861890
in order as follows:
18871891
\begin{itemize}
18881892
\item

source/macros.tex

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,9 @@
358358
\renewcommand{\fref}[1]{\hyperref[fig:#1]{\figurerefname \nolinebreak[3] \ref*{fig:#1}}}
359359

360360
%% NTBS, etc.
361+
% \ucode{<hex digits>}: typesets a Unicode code point designation.
% Output is a small-caps "u", then (one size smaller, kerned left by 0.05em)
% a raised, further-reduced "+" followed by the argument passed through
% \uppercase, so callers may write the hex digits in lowercase (e.g. \ucode{000a}).
\newcommand{\ucode}[1]{\textsc{u}\textsmaller[1]{\kern-0.05em\protect\raisebox{.25ex}{\textsmaller[1]{+}}\uppercase{#1}}}
362+
% \uname{<character name>}: typesets a Unicode character name in small caps;
% the call sites in this change pass the name in lowercase (e.g. \uname{low line}).
\newcommand{\uname}[1]{\textsc{#1}}
363+
% \unicode{<hex digits>}{<character name>}: code point followed by its name,
% i.e. \ucode{#1}, an inter-word space, then \uname{#2}.
\newcommand{\unicode}[2]{\ucode{#1} \uname{#2}}
361364
\newcommand{\NTS}[1]{\textsc{#1}}
362365
\newcommand{\ntbs}{\NTS{ntbs}}
363366
\newcommand{\ntmbs}{\NTS{ntmbs}}

source/strings.tex

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5772,7 +5772,7 @@
57725772
Subsequent calls will store successive UTF-8 code units
57735773
without consuming any additional input
57745774
until all the code units have been stored.
5775-
If the corresponding Unicode character is U+0000,
5775+
If the corresponding Unicode character is \unicode{0000}{null},
57765776
the resulting state described is the initial conversion state.
57775777

57785778
\pnum
@@ -5781,7 +5781,7 @@
57815781
\begin{itemize}
57825782
\item \tcode{0}, if the next \tcode{n} or fewer bytes complete
57835783
the multibyte character
5784-
that corresponds to the U+0000 Unicode character
5784+
that corresponds to the \unicode{0000}{null} Unicode character
57855785
(which is the value stored).
57865786
\item between \tcode{1} and \tcode{n} (inclusive),
57875787
if the next n or fewer bytes complete a valid multibyte character

source/uax31.tex

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,14 @@
2929
where \tcode{<Start>} has the XID_Start property,
3030
\tcode{<Continue>} has the XID_Continue property, and
3131
\tcode{<Medial>} is a list of characters permitted between continue characters.
32-
For \Cpp{} we add the character U+005F, LOW LINE, or \tcode{_},
32+
For \Cpp{} we add the character \unicode{005f}{low line}, or \tcode{_},
3333
to the set of permitted \tcode{<Start>} characters,
3434
the \tcode{<Medial>} set is empty, and
3535
the \tcode{<Continue>} characters are unmodified.
3636
In the grammar used in UAX \#31, this is
3737
\begin{codeblock}
3838
<Identifier> := <Start> <Continue>*
39-
<Start> := XID_Start + U+005F
39+
<Start> := XID_Start + @\textrm{\ucode{005f}}@
4040
<Continue> := <Start> + XID_Continue
4141
\end{codeblock}
4242

source/utilities.tex

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21046,20 +21046,20 @@
2104621046
The extended grapheme clusters of a string are defined by UAX \#29.
2104721047
The estimated width of the following code points is 2:
2104821048
\begin{itemize}
21049-
\item \tcode{U+1100-U+115F}
21050-
\item \tcode{U+2329-U+232A}
21051-
\item \tcode{U+2E80-U+303E}
21052-
\item \tcode{U+3040-U+A4CF}
21053-
\item \tcode{U+AC00-U+D7A3}
21054-
\item \tcode{U+F900-U+FAFF}
21055-
\item \tcode{U+FE10-U+FE19}
21056-
\item \tcode{U+FE30-U+FE6F}
21057-
\item \tcode{U+FF00-U+FF60}
21058-
\item \tcode{U+FFE0-U+FFE6}
21059-
\item \tcode{U+1F300-U+1F64F}
21060-
\item \tcode{U+1F900-U+1F9FF}
21061-
\item \tcode{U+20000-U+2FFFD}
21062-
\item \tcode{U+30000-U+3FFFD}
21049+
\item \ucode{1100} -- \ucode{115f}
21050+
\item \ucode{2329} -- \ucode{232a}
21051+
\item \ucode{2e80} -- \ucode{303e}
21052+
\item \ucode{3040} -- \ucode{a4cf}
21053+
\item \ucode{ac00} -- \ucode{d7a3}
21054+
\item \ucode{f900} -- \ucode{faff}
21055+
\item \ucode{fe10} -- \ucode{fe19}
21056+
\item \ucode{fe30} -- \ucode{fe6f}
21057+
\item \ucode{ff00} -- \ucode{ff60}
21058+
\item \ucode{ffe0} -- \ucode{ffe6}
21059+
\item \ucode{1f300} -- \ucode{1f64f}
21060+
\item \ucode{1f900} -- \ucode{1f9ff}
21061+
\item \ucode{20000} -- \ucode{2fffd}
21062+
\item \ucode{30000} -- \ucode{3fffd}
2106321063
\end{itemize}
2106421064
The estimated width of other code points is 1.
2106521065

0 commit comments

Comments
 (0)