
Commit 4d7e81e

Add Stream-Safe Text Process (for interop with golang's unicode/norm package)
1 parent 6a7e38e
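
For orientation before the diff: this commit adds a stream_safe() adapter to the UnicodeNormalization trait (see the src/lib.rs hunks below), mirroring the process Go's unicode/norm package applies. A minimal usage sketch, assuming only what the diff shows plus the UAX15-D4 rule that a run of more than 30 non-starters gets a U+034F COMBINING GRAPHEME JOINER inserted; the exact insertion point is decided by src/stream_safe.rs, which this excerpt does not show:

extern crate unicode_normalization;

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 31 consecutive combining grave accents exceed the stream-safe limit
    // of 30 non-starters, so per UAX15-D4 exactly one CGJ (U+034F) should
    // be inserted into the run.
    let marks: String = std::iter::repeat('\u{0300}').take(31).collect();
    let s = format!("bo{}om", marks);
    let safe: String = s.as_str().stream_safe().collect();
    assert_eq!(safe.chars().count(), s.chars().count() + 1);
    assert!(safe.contains('\u{034F}'));
}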

File tree: 9 files changed, +5290 -3916 lines


Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 [package]
 
 name = "unicode-normalization"
-version = "0.1.5"
+version = "0.1.6"
 authors = ["kwantam <kwantam@gmail.com>"]
 
 homepage = "https://github.com/unicode-rs/unicode-normalization"

benches/bench.rs

Lines changed: 12 additions & 0 deletions
@@ -47,3 +47,15 @@ fn bench_nfd_ascii(b: &mut Bencher) {
     let s = "decompose me entirely";
     b.iter(|| s.nfd().count());
 }
+
+#[bench]
+fn bench_streamsafe_ascii(b: &mut Bencher) {
+    let s = "quite nonthreatening";
+    b.iter(|| s.stream_safe().count());
+}
+
+#[bench]
+fn bench_streamsafe_adversarial(b: &mut Bencher) {
+    let s = "bo\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}oom";
+    b.iter(|| s.stream_safe().count());
+}

scripts/unicode.py

Lines changed: 104 additions & 41 deletions
@@ -76,6 +76,8 @@ def stats(name, table):
         stats("Canonical fully decomp", self.canon_fully_decomp)
         stats("Compatible fully decomp", self.compat_fully_decomp)
 
+        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
+
     def _fetch(self, filename):
         resp = requests.get(UCD_URL + filename)
         return resp.text
@@ -91,17 +93,18 @@ def _load_unicode_data(self):
             pieces = line.split(';')
             assert len(pieces) == 15
             char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
+            char_int = int(char, 16)
 
             if cc != '0':
-                self.combining_classes[char] = cc
+                self.combining_classes[char_int] = cc
 
             if decomp.startswith('<'):
-                self.compat_decomp[char] = decomp.split()[1:]
+                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
             elif decomp != '':
-                self.canon_decomp[char] = decomp.split()
+                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]
 
             if category == 'M' or 'M' in expanded_categories.get(category, []):
-                self.general_category_mark.append(char)
+                self.general_category_mark.append(char_int)
 
     def _load_norm_props(self):
         props = collections.defaultdict(list)
@@ -146,14 +149,13 @@ def _compute_canonical_comp(self):
             (int(low, 16), int(high or low, 16))
             for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
         ]
-        for char, decomp in self.canon_decomp.items():
-            char_int = int(char, 16)
+        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                 continue
 
             assert len(decomp) == 2
             assert (decomp[0], decomp[1]) not in canon_comp
-            canon_comp[(decomp[0], decomp[1])] = char
+            canon_comp[(decomp[0], decomp[1])] = char_int
 
         return canon_comp
 
@@ -181,15 +183,6 @@ def _compute_fully_decomposed(self):
         S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
         S_COUNT = L_COUNT * V_COUNT * T_COUNT
 
-        canon_decomp = {
-            int(k, 16): [int(c, 16) for c in v]
-            for k, v in self.canon_decomp.items()
-        }
-        compat_decomp = {
-            int(k, 16): [int(c, 16) for c in v]
-            for k, v in self.compat_decomp.items()
-        }
-
         def _decompose(char_int, compatible):
             # 7-bit ASCII never decomposes
             if char_int <= 0x7f:
@@ -199,15 +192,15 @@ def _decompose(char_int, compatible):
             # Assert that we're handling Hangul separately.
             assert not (S_BASE <= char_int < S_BASE + S_COUNT)
 
-            decomp = canon_decomp.get(char_int)
+            decomp = self.canon_decomp.get(char_int)
             if decomp is not None:
                 for decomposed_ch in decomp:
                     for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                         yield fully_decomposed_ch
                 return
 
-            if compatible and char_int in compat_decomp:
-                for decomposed_ch in compat_decomp[char_int]:
+            if compatible and char_int in self.compat_decomp:
+                for decomposed_ch in self.compat_decomp[char_int]:
                     for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                         yield fully_decomposed_ch
                 return
@@ -216,12 +209,13 @@ def _decompose(char_int, compatible):
             return
 
         end_codepoint = max(
-            max(canon_decomp.keys()),
-            max(compat_decomp.keys()),
+            max(self.canon_decomp.keys()),
+            max(self.compat_decomp.keys()),
         )
 
-        canon_fully_decomposed = {}
-        compat_fully_decomposed = {}
+        canon_fully_decomp = {}
+        compat_fully_decomp = {}
+
         for char_int in range(0, end_codepoint + 1):
             # Always skip Hangul, since it's more efficient to represent its
             # decomposition programmatically.
@@ -230,31 +224,75 @@ def _decompose(char_int, compatible):
 
             canon = list(_decompose(char_int, False))
             if not (len(canon) == 1 and canon[0] == char_int):
-                canon_fully_decomposed[char_int] = canon
+                canon_fully_decomp[char_int] = canon
 
             compat = list(_decompose(char_int, True))
             if not (len(compat) == 1 and compat[0] == char_int):
-                compat_fully_decomposed[char_int] = compat
+                compat_fully_decomp[char_int] = compat
 
-        # Since canon_decomp is a subset of compat_decomp, we don't need to
-        # store their overlap when they agree. When they don't agree, store the
-        # decomposition in the compatibility table since we'll check that first
-        # when normalizing to NFKD.
-        assert canon_fully_decomposed <= compat_fully_decomposed
+        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
+        # need to store their overlap when they agree. When they don't agree,
+        # store the decomposition in the compatibility table since we'll check
+        # that first when normalizing to NFKD.
+        assert canon_fully_decomp <= compat_fully_decomp
 
-        for ch in set(canon_fully_decomposed) & set(compat_fully_decomposed):
-            if canon_fully_decomposed[ch] == compat_fully_decomposed[ch]:
-                del compat_fully_decomposed[ch]
+        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
+            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
+                del compat_fully_decomp[ch]
 
-        return canon_fully_decomposed, compat_fully_decomposed
+        return canon_fully_decomp, compat_fully_decomp
+
+    def _compute_stream_safe_tables(self):
+        """
+        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
+        we need to be able to know the number of contiguous non-starters *after*
+        applying compatibility decomposition to each character.
+
+        We can do this incrementally by computing the number of leading and
+        trailing non-starters for each character's compatibility decomposition
+        with the following rules:
+
+        1) If a character is not affected by compatibility decomposition, look
+           up its canonical combining class to find out if it's a non-starter.
+        2) All Hangul characters are starters, even under decomposition.
+        3) Otherwise, very few decomposing characters have a nonzero count
+           of leading or trailing non-starters, so store these characters
+           with their associated counts in a separate table.
+        """
+        leading_nonstarters = {}
+        trailing_nonstarters = {}
+
+        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
+            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]
+
+            num_leading = 0
+            for d in decomposed:
+                if d not in self.combining_classes:
+                    break
+                num_leading += 1
+
+            num_trailing = 0
+            for d in reversed(decomposed):
+                if d not in self.combining_classes:
+                    break
+                num_trailing += 1
+
+            if num_leading > 0:
+                leading_nonstarters[c] = num_leading
+            if num_trailing > 0:
+                trailing_nonstarters[c] = num_trailing
+
+        return leading_nonstarters, trailing_nonstarters
+
+hexify = lambda c: hex(c)[2:].upper().rjust(4, '0')
 
 def gen_combining_class(combining_classes, out):
     out.write("#[inline]\n")
     out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
     out.write("    match c {\n")
 
-    for char, combining_class in sorted(combining_classes.items(), key=lambda (k, _): int(k, 16)):
-        out.write("        '\u{%s}' => %s,\n" % (char, combining_class))
+    for char, combining_class in sorted(combining_classes.items()):
+        out.write("        '\u{%s}' => %s,\n" % (hexify(char), combining_class))
 
     out.write("        _ => 0,\n")
     out.write("    }\n")
@@ -265,8 +303,8 @@ def gen_composition_table(canon_comp, out):
     out.write("pub fn composition_table(c1: char, c2: char) -> Option<char> {\n")
     out.write("    match (c1, c2) {\n")
 
-    for (c1, c2), c3 in sorted(canon_comp.items(), key=lambda ((c1, c2), _): (int(c1, 16), int(c2, 16))):
-        out.write("        ('\u{%s}', '\u{%s}') => Some('\u{%s}'),\n" % (c1, c2, c3))
+    for (c1, c2), c3 in sorted(canon_comp.items()):
+        out.write("        ('\u{%s}', '\u{%s}') => Some('\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
 
     out.write("        _ => None,\n")
     out.write("    }\n")
@@ -279,8 +317,6 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, out):
         out.write("pub fn %s_fully_decomposed(c: char) -> Option<&'static [char]> {\n" % name)
         out.write("    match c {\n")
 
-        hexify = lambda c: hex(c)[2:].upper()
-
         for char, chars in sorted(table.items()):
             d = ", ".join("'\u{%s}'" % hexify(c) for c in chars)
             out.write("        '\u{%s}' => Some(&[%s]),\n" % (hexify(char), d))
@@ -323,12 +359,36 @@ def gen_combining_mark(general_category_mark, out):
     out.write("    match c {\n")
 
     for char in general_category_mark:
-        out.write("        '\u{%s}' => true,\n" % char)
+        out.write("        '\u{%s}' => true,\n" % hexify(char))
 
     out.write("        _ => false,\n")
     out.write("    }\n")
     out.write("}\n")
 
+def gen_stream_safe(leading, trailing, out):
+    out.write("#[inline]\n")
+    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
+    out.write("    match c {\n")
+
+    for char, num_leading in leading.items():
+        out.write("        '\u{%s}' => %d,\n" % (hexify(char), num_leading))
+
+    out.write("        _ => 0,\n")
+    out.write("    }\n")
+    out.write("}\n")
+    out.write("\n")
+
+    out.write("#[inline]\n")
+    out.write("pub fn stream_safe_trailing_nonstarters(c: char) -> usize {\n")
+    out.write("    match c {\n")
+
+    for char, num_trailing in trailing.items():
+        out.write("        '\u{%s}' => %d,\n" % (hexify(char), num_trailing))
+
+    out.write("        _ => 0,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
 def gen_tests(tests, out):
     out.write("""#[derive(Debug)]
 pub struct NormalizationTest {
@@ -384,6 +444,9 @@ def gen_tests(tests, out):
         gen_nfd_qc(data.norm_props, out)
         out.write("\n")
 
+        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
+        out.write("\n")
+
     with open("normalization_tests.rs", "w") as out:
         out.write(PREAMBLE)
         gen_tests(data.norm_tests, out)
src/lib.rs

Lines changed: 18 additions & 1 deletion
@@ -37,7 +37,7 @@
 //! unicode-normalization = "0.1.3"
 //! ```
 
-// #![deny(missing_docs, unsafe_code)]
+#![deny(missing_docs, unsafe_code)]
 #![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
        html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
 
@@ -51,12 +51,14 @@ pub use quick_check::{
     is_nfd_quick,
 };
 pub use recompose::Recompositions;
+pub use stream_safe::StreamSafe;
 use std::str::Chars;
 
 mod decompose;
 mod normalize;
 mod recompose;
 mod quick_check;
+mod stream_safe;
 mod tables;
 
 #[cfg(test)]
@@ -99,6 +101,11 @@ pub trait UnicodeNormalization<I: Iterator<Item=char>> {
     /// (compatibility decomposition followed by canonical composition).
     #[inline]
     fn nfkc(self) -> Recompositions<I>;
+
+    /// An Iterator over the string with Combining Grapheme Joiner characters
+    /// inserted according to the Stream-Safe Text Process (UAX15-D4)
+    #[inline]
+    fn stream_safe(self) -> StreamSafe<I>;
 }
 
 impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
@@ -121,6 +128,11 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
     fn nfkc(self) -> Recompositions<Chars<'a>> {
         recompose::new_compatible(self.chars())
     }
+
+    #[inline]
+    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
+        StreamSafe::new(self.chars())
+    }
 }
 
 impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
@@ -143,4 +155,9 @@ impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
     fn nfkc(self) -> Recompositions<I> {
         recompose::new_compatible(self)
     }
+
+    #[inline]
+    fn stream_safe(self) -> StreamSafe<I> {
+        StreamSafe::new(self)
+    }
 }
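
Because the last hunk implements the trait for any Iterator<Item = char>, the new adapter composes with the existing ones. A hypothetical pipeline (the ordering is this example's choice; the stream-safe format is designed to be preserved under normalization):

extern crate unicode_normalization;

use unicode_normalization::UnicodeNormalization;

fn main() {
    let s = "E\u{0301}le\u{0300}ve";
    // Make the stream stream-safe, then canonically compose it.
    let out: String = s.chars().stream_safe().nfc().collect();
    println!("{}", out); // "Élève", with the accents composed
}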
