[lex] Introduce \unicode macro for Unicode character names

jensmaurer · tkoeppe · commit e31cb7ca1a0a · 2021-11-19T21:42:12.000Z
and format them using \textsc
diff --git a/source/lex.tex b/source/lex.tex
@@ -243,45 +243,45 @@
 \begin{floattable}{Basic character set}{lex.charset.basic}{lll}
 \topline
 \lhdrx{2}{character} & \rhdr{glyph} \\ \capsep
-U+0009 & CHARACTER TABULATION & \\
-U+000B & LINE TABULATION & \\
-U+000C & FORM FEED (FF) & \\
-U+0020 & SPACE & \\
-U+000A & LINE FEED (LF) & new-line \\
-U+0021 & EXCLAMATION MARK & \tcode{!} \\
-U+0022 & QUOTATION MARK & \tcode{"} \\
-U+0023 & NUMBER SIGN & \tcode{\#} \\
-U+0025 & PERCENT SIGN & \tcode{\%} \\
-U+0026 & AMPERSAND  & \tcode{\&} \\
-U+0027 & APOSTROPHE & \tcode{'} \\
-U+0028 & LEFT PARENTHESIS & \tcode{(} \\
-U+0029 & RIGHT PARENTHESIS & \tcode{)} \\
-U+002A & ASTERISK & \tcode{*} \\
-U+002B & PLUS SIGN & \tcode{+} \\
-U+002C & COMMA & \tcode{,} \\
-U+002D & HYPHEN-MINUS & \tcode{-} \\
-U+002E & FULL STOP & \tcode{.} \\
-U+002F & SOLIDUS & \tcode{/} \\
-U+0030 .. U+0039 & DIGIT ZERO .. NINE & \tcode{0 1 2 3 4 5 6 7 8 9} \\
-U+003A & COLON & \tcode{:} \\
-U+003B & SEMICOLON & \tcode{;} \\
-U+003C & LESS-THAN SIGN & \tcode{<} \\
-U+003D & EQUALS SIGN & \tcode{=} \\
-U+003E & GREATER-THAN SIGN & \tcode{>} \\
-U+003F & QUESTION MARK & \tcode{?} \\
-U+0041 .. U+005A & LATIN CAPITAL LETTER A .. Z & \tcode{A B C D E F G H I J K L M} \\
-& & \tcode{N O P Q R S T U V W X Y Z} \\
-U+005B & LEFT SQUARE BRACKET & \tcode{[} \\
-U+005C & REVERSE SOLIDUS & \tcode{\textbackslash} \\
-U+005D & RIGHT SQUARE BRACKET & \tcode{]} \\
-U+005E & CIRCUMFLEX ACCENT & \tcode{\caret} \\
-U+005F & LOW LINE & \tcode{_} \\
-U+0061 .. U+007A & LATIN SMALL LETTER A .. Z & \tcode{a b c d e f g h i j k l m} \\
+\ucode{0009} & \uname{character tabulation} & \\
+\ucode{000b} & \uname{line tabulation} & \\
+\ucode{000c} & \uname{form feed (ff)} & \\
+\ucode{0020} & \uname{space} & \\
+\ucode{000a} & \uname{line feed (lf)} & new-line \\
+\ucode{0021} & \uname{exclamation mark} & \tcode{!} \\
+\ucode{0022} & \uname{quotation mark} & \tcode{"} \\
+\ucode{0023} & \uname{number sign} & \tcode{\#} \\
+\ucode{0025} & \uname{percent sign} & \tcode{\%} \\
+\ucode{0026} & \uname{ampersand}  & \tcode{\&} \\
+\ucode{0027} & \uname{apostrophe} & \tcode{'} \\
+\ucode{0028} & \uname{left parenthesis} & \tcode{(} \\
+\ucode{0029} & \uname{right parenthesis} & \tcode{)} \\
+\ucode{002a} & \uname{asterisk} & \tcode{*} \\
+\ucode{002b} & \uname{plus sign} & \tcode{+} \\
+\ucode{002c} & \uname{comma} & \tcode{,} \\
+\ucode{002d} & \uname{hyphen-minus} & \tcode{-} \\
+\ucode{002e} & \uname{full stop} & \tcode{.} \\
+\ucode{002f} & \uname{solidus} & \tcode{/} \\
+\ucode{0030} .. \ucode{0039} & \uname{digit zero .. nine} & \tcode{0 1 2 3 4 5 6 7 8 9} \\
+\ucode{003a} & \uname{colon} & \tcode{:} \\
+\ucode{003b} & \uname{semicolon} & \tcode{;} \\
+\ucode{003c} & \uname{less-than sign} & \tcode{<} \\
+\ucode{003d} & \uname{equals sign} & \tcode{=} \\
+\ucode{003e} & \uname{greater-than sign} & \tcode{>} \\
+\ucode{003f} & \uname{question mark} & \tcode{?} \\
+\ucode{0041} .. \ucode{005a} & \uname{latin capital letter a .. z} & \tcode{A B C D E F G H I J K L M} \\
+ & & \tcode{N O P Q R S T U V W X Y Z} \\
+\ucode{005b} & \uname{left square bracket} & \tcode{[} \\
+\ucode{005c} & \uname{reverse solidus} & \tcode{\textbackslash} \\
+\ucode{005d} & \uname{right square bracket} & \tcode{]} \\
+\ucode{005e} & \uname{circumflex accent} & \tcode{\caret} \\
+\ucode{005f} & \uname{low line} & \tcode{_} \\
+\ucode{0061} .. \ucode{007a} & \uname{latin small letter a .. z} & \tcode{a b c d e f g h i j k l m} \\
  & & \tcode{n o p q r s t u v w x y z} \\
-U+007B & LEFT CURLY BRACKET & \tcode{\{} \\
-U+007C & VERTICAL LINE & \tcode{|} \\
-U+007D & RIGHT CURLY BRACKET & \tcode{\}} \\
-U+007E & TILDE & \tcode{\textasciitilde} \\
+\ucode{007b} & \uname{left curly bracket} & \tcode{\{} \\
+\ucode{007c} & \uname{vertical line} & \tcode{|} \\
+\ucode{007d} & \uname{right curly bracket} & \tcode{\}} \\
+\ucode{007e} & \uname{tilde} & \tcode{\textasciitilde} \\
 \end{floattable}
 
 \pnum
@@ -325,10 +325,10 @@
 \begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll}
 \topline
 \ohdrx{2}{character} \\ \capsep
-U+0000 & NULL \\
-U+0007 & BELL \\
-U+0008 & BACKSPACE \\
-U+000D & CARRIAGE RETURN (CR) \\
+\ucode{0000} & \uname{null} \\
+\ucode{0007} & \uname{bell} \\
+\ucode{0008} & \uname{backspace} \\
+\ucode{000d} & \uname{carriage return (cr)} \\
 \end{floattable}
 
 \pnum
@@ -359,10 +359,10 @@
 \end{note}
 \indextext{character!null}%
 \indextext{wide-character!null}%
-The U+0000 NULL character is encoded as the value \tcode{0}.
+The \unicode{0000}{null} character is encoded as the value \tcode{0}.
 No other element of the translation character set
 is encoded with a code unit of value \tcode{0}.
-The code unit value of each decimal digit character after the digit \tcode{0} (U+0030)
+The code unit value of each decimal digit character after the digit \tcode{0} (\ucode{0030})
 shall be one greater than the value of the previous.
 The ordinary and wide literal encodings are otherwise
 \impldef{ordinary and wide literal encodings}.
@@ -412,16 +412,21 @@
 literals), string literals (including user-defined string literals), preprocessing
 operators and punctuators, and single non-whitespace characters that do not lexically
 match the other preprocessing token categories.
-If a U+0027 APOSTROPHE or a U+0022 QUOTATION MARK character
+If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character
 matches the last category, the behavior is undefined.
 If any character not in the basic character set matches the last category,
 the program is ill-formed.
 Preprocessing tokens can be separated by
 \indextext{whitespace}%
 whitespace;
 \indextext{comment}%
-this consists of comments\iref{lex.comment}, or whitespace
-characters (U+0020 SPACE, U+0009 CHARACTER TABULATION, new-line, U+000B LINE TABULATION, and U+000C FORM FEED), or both. As described in \ref{cpp}, in certain
+this consists of comments\iref{lex.comment}, or whitespace characters
+(\unicode{0020}{space},
+\unicode{0009}{character tabulation},
+new-line,
+\unicode{000b}{line tabulation}, and
+\unicode{000c}{form feed}), or both.
+As described in \ref{cpp}, in certain
 circumstances during translation phase 4, whitespace (or the absence
 thereof) serves as more than preprocessing token separation. Whitespace
 can appear within a preprocessing token only as part of a header name or
@@ -625,7 +630,7 @@
 
 \begin{bnf}
 \nontermdef{h-char}\br
-    \textnormal{any member of the translation character set except new-line and \terminal{U+003E GREATER-THAN SIGN}}
+    \textnormal{any member of the translation character set except new-line and \unicode{003e}{greater-than sign}}
 \end{bnf}
 
 \begin{bnf}
@@ -636,7 +641,7 @@
 
 \begin{bnf}
 \nontermdef{q-char}\br
-    \textnormal{any member of the translation character set except new-line and \terminal{U+0022 QUOTATION MARK}}
+    \textnormal{any member of the translation character set except new-line and \unicode{0022}{quotation mark}}
 \end{bnf}
 
 \pnum
@@ -1273,8 +1278,8 @@
 
 \begin{bnf}
 \nontermdef{basic-c-char}\br
-    \textnormal{any member of the translation character set except the U+0027 APOSTROPHE,}\br
-    \bnfindent\textnormal{U+005C REVERSE SOLIDUS, or new-line character}
+    \textnormal{any member of the translation character set except the \unicode{0027}{apostrophe},}\br
+    \bnfindent\textnormal{\unicode{005c}{reverse solidus}, or new-line character}
 \end{bnf}
 
 \begin{bnf}
@@ -1492,17 +1497,17 @@
 {lll}
 \topline
 \lhdrx{2}{character} &  \rhdr{\grammarterm{simple-escape-sequence}} \\ \capsep
-U+000A & LINE FEED (LF)       & \tcode{\textbackslash n} \\
-U+0009 & CHARACTER TABULATION & \tcode{\textbackslash t} \\
-U+000B & LINE TABULATION      & \tcode{\textbackslash v} \\
-U+0008 & BACKSPACE            & \tcode{\textbackslash b} \\
-U+000D & CARRIAGE RETURN (CR) & \tcode{\textbackslash r} \\
-U+000C & FORM FEED (FF)       & \tcode{\textbackslash f} \\
-U+0007 & BELL                 & \tcode{\textbackslash a} \\
-U+005C & REVERSE SOLIDUS      & \tcode{\textbackslash\textbackslash} \\
-U+003F & QUESTION MARK        & \tcode{\textbackslash ?} \\
-U+0027 & APOSTROPHE           & \tcode{\textbackslash '} \\
-U+0022 & QUOTATION MARK       & \tcode{\textbackslash "} \\
+\ucode{000a} & \uname{line feed}            & \tcode{\textbackslash n} \\
+\ucode{0009} & \uname{character tabulation} & \tcode{\textbackslash t} \\
+\ucode{000b} & \uname{line tabulation}      & \tcode{\textbackslash v} \\
+\ucode{0008} & \uname{backspace}            & \tcode{\textbackslash b} \\
+\ucode{000d} & \uname{carriage return}      & \tcode{\textbackslash r} \\
+\ucode{000c} & \uname{form feed}            & \tcode{\textbackslash f} \\
+\ucode{0007} & \uname{bell}                 & \tcode{\textbackslash a} \\
+\ucode{005c} & \uname{reverse solidus}      & \tcode{\textbackslash\textbackslash} \\
+\ucode{003f} & \uname{question mark}        & \tcode{\textbackslash ?} \\
+\ucode{0027} & \uname{apostrophe}           & \tcode{\textbackslash '} \\
+\ucode{0022} & \uname{quotation mark}       & \tcode{\textbackslash "} \\
 \end{floattable}
 
 \rSec2[lex.fcon]{Floating-point literals}
@@ -1654,8 +1659,8 @@
 
 \begin{bnf}
 \nontermdef{basic-s-char}\br
-    \textnormal{any member of the translation character set except the U+0022 QUOTATION MARK,}\br
-    \bnfindent\textnormal{U+005C REVERSE SOLIDUS, or new-line character}
+    \textnormal{any member of the translation character set except the \unicode{0022}{quotation mark},}\br
+    \bnfindent\textnormal{\unicode{005c}{reverse solidus}, or new-line character}
 \end{bnf}
 
 \begin{bnf}
@@ -1671,8 +1676,8 @@
 
 \begin{bnf}
 \nontermdef{r-char}\br
-    \textnormal{any member of the translation character set, except a U+0029 RIGHT PARENTHESIS followed by}\br
-    \bnfindent\textnormal{the initial \grammarterm{d-char-sequence} (which may be empty) followed by a U+0022 QUOTATION MARK}
+    \textnormal{any member of the translation character set, except a \unicode{0029}{right parenthesis} followed by}\br
+    \bnfindent\textnormal{the initial \grammarterm{d-char-sequence} (which may be empty) followed by a \unicode{0022}{quotation mark}}
 \end{bnf}
 
 \begin{bnf}
@@ -1684,9 +1689,8 @@
 \begin{bnf}
 \nontermdef{d-char}\br
     \textnormal{any member of the basic character set except:}\br
-    \bnfindent\textnormal{U+0020 SPACE, U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS,}\br
-    \bnfindent\textnormal{U+005C REVERSE SOLIDUS, U+0009 CHARACTER TABULATION,}\br
-    \bnfindent\textnormal{U+000B LINE TABULATION, U+000C FORM FEED (FF), and new-line}
+    \bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br
+    \bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line}
 \end{bnf}
 
 \pnum
@@ -1882,7 +1886,7 @@
 corresponding to the \grammarterm{string-literal}'s sequence of
 \grammarterm{s-char}s (originally from non-raw string literals) and
 \grammarterm{r-char}s (originally from raw string literals),
-plus a terminating U+0000 NULL character,
+plus a terminating \unicode{0000}{null} character,
 in order as follows:
 \begin{itemize}
 \item
diff --git a/source/macros.tex b/source/macros.tex
@@ -358,6 +358,9 @@
 \renewcommand{\fref}[1]{\hyperref[fig:#1]{\figurerefname \nolinebreak[3] \ref*{fig:#1}}}
 
 %% NTBS, etc.
+\newcommand{\ucode}[1]{\textsc{u}\textsmaller[1]{\kern-0.05em\protect\raisebox{.25ex}{\textsmaller[1]{+}}\uppercase{#1}}} % code point: small-caps "u", then (one step smaller) a "+" raised by .25ex and the digits #1 passed through \uppercase
+\newcommand{\uname}[1]{\textsc{#1}} % character name #1 set in small caps (callers pass it in lowercase)
+\newcommand{\unicode}[2]{\ucode{#1} \uname{#2}} % code point #1, a space, then character name #2
 \newcommand{\NTS}[1]{\textsc{#1}}
 \newcommand{\ntbs}{\NTS{ntbs}}
 \newcommand{\ntmbs}{\NTS{ntmbs}}
diff --git a/source/strings.tex b/source/strings.tex
@@ -5772,7 +5772,7 @@
 Subsequent calls will store successive UTF-8 code units
 without consuming any additional input
 until all the code units have been stored.
-If the corresponding Unicode character is U+0000,
+If the corresponding Unicode character is \unicode{0000}{null},
 the resulting state described is the initial conversion state.
 
 \pnum
@@ -5781,7 +5781,7 @@
 \begin{itemize}
 \item \tcode{0}, if the next \tcode{n} or fewer bytes complete
 the multibyte character
-that corresponds to the U+0000 Unicode character
+that corresponds to the \unicode{0000}{null} Unicode character
 (which is the value stored).
 \item between \tcode{1} and \tcode{n} (inclusive),
 if the next n or fewer bytes complete a valid multibyte character
diff --git a/source/uax31.tex b/source/uax31.tex
@@ -29,14 +29,14 @@
 where \tcode{<Start>} has the XID_Start property,
 \tcode{<Continue>} has the XID_Continue property, and
 \tcode{<Medial>} is a list of characters permitted between continue characters.
-For \Cpp{} we add the character U+005F, LOW LINE, or \tcode{_},
+For \Cpp{} we add the character \unicode{005f}{low line}, or \tcode{_},
 to the set of permitted \tcode{<Start>} characters,
 the \tcode{<Medial>} set is empty, and
 the \tcode{<Continue>} characters are unmodified.
 In the grammar used in UAX \#31, this is
 \begin{codeblock}
 <Identifier> := <Start> <Continue>*
-<Start> := XID_Start + U+005F
+<Start> := XID_Start + @\textrm{\ucode{005f}}@
 <Continue> := <Start> + XID_Continue
 \end{codeblock}
 
diff --git a/source/utilities.tex b/source/utilities.tex
@@ -21046,20 +21046,20 @@
 The extended grapheme clusters of a string are defined by UAX \#29.
 The estimated width of the following code points is 2:
 \begin{itemize}
-\item \tcode{U+1100-U+115F}
-\item \tcode{U+2329-U+232A}
-\item \tcode{U+2E80-U+303E}
-\item \tcode{U+3040-U+A4CF}
-\item \tcode{U+AC00-U+D7A3}
-\item \tcode{U+F900-U+FAFF}
-\item \tcode{U+FE10-U+FE19}
-\item \tcode{U+FE30-U+FE6F}
-\item \tcode{U+FF00-U+FF60}
-\item \tcode{U+FFE0-U+FFE6}
-\item \tcode{U+1F300-U+1F64F}
-\item \tcode{U+1F900-U+1F9FF}
-\item \tcode{U+20000-U+2FFFD}
-\item \tcode{U+30000-U+3FFFD}
+\item \ucode{1100} -- \ucode{115f}
+\item \ucode{2329} -- \ucode{232a}
+\item \ucode{2e80} -- \ucode{303e}
+\item \ucode{3040} -- \ucode{a4cf}
+\item \ucode{ac00} -- \ucode{d7a3}
+\item \ucode{f900} -- \ucode{faff}
+\item \ucode{fe10} -- \ucode{fe19}
+\item \ucode{fe30} -- \ucode{fe6f}
+\item \ucode{ff00} -- \ucode{ff60}
+\item \ucode{ffe0} -- \ucode{ffe6}
+\item \ucode{1f300} -- \ucode{1f64f}
+\item \ucode{1f900} -- \ucode{1f9ff}
+\item \ucode{20000} -- \ucode{2fffd}
+\item \ucode{30000} -- \ucode{3fffd}
 \end{itemize}
 The estimated width of other code points is 1.