@@ -76,6 +76,8 @@ def stats(name, table):
         stats("Canonical fully decomp", self.canon_fully_decomp)
         stats("Compatible fully decomp", self.compat_fully_decomp)
 
+        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
+
     def _fetch(self, filename):
         resp = requests.get(UCD_URL + filename)
         return resp.text
@@ -91,17 +93,18 @@ def _load_unicode_data(self):
             pieces = line.split(';')
             assert len(pieces) == 15
             char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
+            char_int = int(char, 16)
 
             if cc != '0':
-                self.combining_classes[char] = cc
+                self.combining_classes[char_int] = cc
 
             if decomp.startswith('<'):
-                self.compat_decomp[char] = decomp.split()[1:]
+                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
             elif decomp != '':
-                self.canon_decomp[char] = decomp.split()
+                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]
 
             if category == 'M' or 'M' in expanded_categories.get(category, []):
-                self.general_category_mark.append(char)
+                self.general_category_mark.append(char_int)
 
     def _load_norm_props(self):
         props = collections.defaultdict(list)
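Note: each UnicodeData.txt record is one semicolon-separated line of 15 fields; this hunk uses field 0 (code point), 2 (general category), 3 (canonical combining class), and 5 (decomposition). A minimal sketch of what the new int-keyed parse yields, using the real record for U+00C0 but standalone names rather than the script's self.* attributes:

    line = "00C0;LATIN CAPITAL LETTER A WITH GRAVE;Lu;0;L;0041 0300;;;;N;LATIN CAPITAL LETTER A GRAVE;;;00E0;"
    pieces = line.split(';')
    assert len(pieces) == 15
    char_int = int(pieces[0], 16)     # 0xC0
    decomp = pieces[5]                # "0041 0300": no '<tag>' prefix, so canonical
    canon_decomp = {char_int: [int(c, 16) for c in decomp.split()]}
    assert canon_decomp == {0xC0: [0x41, 0x300]}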
@@ -146,14 +149,13 @@ def _compute_canonical_comp(self):
             (int(low, 16), int(high or low, 16))
             for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
         ]
-        for char, decomp in self.canon_decomp.items():
-            char_int = int(char, 16)
+        for char_int, decomp in self.canon_decomp.items():
             if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                 continue
 
             assert len(decomp) == 2
             assert (decomp[0], decomp[1]) not in canon_comp
-            canon_comp[(decomp[0], decomp[1])] = char
+            canon_comp[(decomp[0], decomp[1])] = char_int
 
         return canon_comp
 
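With canon_decomp int-keyed, the composed-pair table now maps a (first, second) code point pair straight to its primary composite. A quick shape-check, reusing the U+00C0 record above (a plain dict stands in for the method's local):

    # U+00C0 decomposes canonically to [0x41, 0x300] and is not excluded,
    # so the loop records the pair -> composite mapping:
    canon_comp = {(0x41, 0x300): 0xC0}
    assert canon_comp[(0x41, 0x300)] == 0xC0   # A + COMBINING GRAVE ACCENT -> À

Characters listed under Full_Composition_Exclusion (singletons, non-starter decompositions, script-specific precomposed forms) are skipped by the `continue`, so they never gain an entry.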
@@ -181,15 +183,6 @@ def _compute_fully_decomposed(self):
         S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
         S_COUNT = L_COUNT * V_COUNT * T_COUNT
 
-        canon_decomp = {
-            int(k, 16): [int(c, 16) for c in v]
-            for k, v in self.canon_decomp.items()
-        }
-        compat_decomp = {
-            int(k, 16): [int(c, 16) for c in v]
-            for k, v in self.compat_decomp.items()
-        }
-
         def _decompose(char_int, compatible):
             # 7-bit ASCII never decomposes
             if char_int <= 0x7f:
@@ -199,15 +192,15 @@ def _decompose(char_int, compatible):
             # Assert that we're handling Hangul separately.
             assert not (S_BASE <= char_int < S_BASE + S_COUNT)
 
-            decomp = canon_decomp.get(char_int)
+            decomp = self.canon_decomp.get(char_int)
             if decomp is not None:
                 for decomposed_ch in decomp:
                     for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                         yield fully_decomposed_ch
                 return
 
-            if compatible and char_int in compat_decomp:
-                for decomposed_ch in compat_decomp[char_int]:
+            if compatible and char_int in self.compat_decomp:
+                for decomposed_ch in self.compat_decomp[char_int]:
                     for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                         yield fully_decomposed_ch
                 return
@@ -216,12 +209,13 @@ def _decompose(char_int, compatible):
             return
 
         end_codepoint = max(
-            max(canon_decomp.keys()),
-            max(compat_decomp.keys()),
+            max(self.canon_decomp.keys()),
+            max(self.compat_decomp.keys()),
         )
 
-        canon_fully_decomposed = {}
-        compat_fully_decomposed = {}
+        canon_fully_decomp = {}
+        compat_fully_decomp = {}
+
         for char_int in range(0, end_codepoint + 1):
             # Always skip Hangul, since it's more efficient to represent its
             # decomposition programmatically.
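Both passes skip Hangul because a precomposed syllable's decomposition is pure arithmetic on its code point (Unicode core spec, chapter 3.12), so tabulating all 11,172 syllables would only bloat the generated tables. A minimal sketch of that arithmetic, independent of this script; L_BASE, V_BASE and T_BASE are the standard jamo bases, the rest match S_BASE, L_COUNT, V_COUNT and T_COUNT above:

    def decompose_hangul(s):
        S_BASE, L_BASE, V_BASE, T_BASE = 0xAC00, 0x1100, 0x1161, 0x11A7
        L_COUNT, V_COUNT, T_COUNT = 19, 21, 28
        N_COUNT = V_COUNT * T_COUNT          # 588 syllables per leading consonant
        s_index = s - S_BASE
        l = L_BASE + s_index // N_COUNT              # leading consonant (choseong)
        v = V_BASE + (s_index % N_COUNT) // T_COUNT  # vowel (jungseong)
        t = T_BASE + s_index % T_COUNT               # trailing consonant, if any
        return [l, v] if t == T_BASE else [l, v, t]

    assert decompose_hangul(0xD55C) == [0x1112, 0x1161, 0x11AB]  # U+D55C '한'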
@@ -230,31 +224,75 @@ def _decompose(char_int, compatible):
 
             canon = list(_decompose(char_int, False))
             if not (len(canon) == 1 and canon[0] == char_int):
-                canon_fully_decomposed[char_int] = canon
+                canon_fully_decomp[char_int] = canon
 
             compat = list(_decompose(char_int, True))
             if not (len(compat) == 1 and compat[0] == char_int):
-                compat_fully_decomposed[char_int] = compat
+                compat_fully_decomp[char_int] = compat
 
-        # Since canon_decomp is a subset of compat_decomp, we don't need to
-        # store their overlap when they agree. When they don't agree, store the
-        # decomposition in the compatibility table since we'll check that first
-        # when normalizing to NFKD.
-        assert canon_fully_decomposed <= compat_fully_decomposed
+        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
+        # need to store their overlap when they agree. When they don't agree,
+        # store the decomposition in the compatibility table since we'll check
+        # that first when normalizing to NFKD.
+        assert canon_fully_decomp <= compat_fully_decomp
 
-        for ch in set(canon_fully_decomposed) & set(compat_fully_decomposed):
-            if canon_fully_decomposed[ch] == compat_fully_decomposed[ch]:
-                del compat_fully_decomposed[ch]
+        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
+            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
+                del compat_fully_decomp[ch]
+
+        return canon_fully_decomp, compat_fully_decomp
+
+    def _compute_stream_safe_tables(self):
+        """
+        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
+        we need to be able to know the number of contiguous non-starters *after*
+        applying compatibility decomposition to each character.
+
+        We can do this incrementally by computing the number of leading and
+        trailing non-starters for each character's compatibility decomposition
+        with the following rules:
+
+        1) If a character is not affected by compatibility decomposition, look
+           up its canonical combining class to find out if it's a non-starter.
+        2) All Hangul characters are starters, even under decomposition.
+        3) Otherwise, very few decomposing characters have a nonzero count
+           of leading or trailing non-starters, so store these characters
+           with their associated counts in a separate table.
+        """
+        leading_nonstarters = {}
+        trailing_nonstarters = {}
 
-        return canon_fully_decomposed, compat_fully_decomposed
+        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
+            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]
+
+            num_leading = 0
+            for d in decomposed:
+                if d not in self.combining_classes:
+                    break
+                num_leading += 1
+
+            num_trailing = 0
+            for d in reversed(decomposed):
+                if d not in self.combining_classes:
+                    break
+                num_trailing += 1
+
+            if num_leading > 0:
+                leading_nonstarters[c] = num_leading
+            if num_trailing > 0:
+                trailing_nonstarters[c] = num_trailing
+
+        return leading_nonstarters, trailing_nonstarters
+
+hexify = lambda c: hex(c)[2:].upper().rjust(4, '0')
 
 def gen_combining_class(combining_classes, out):
     out.write("#[inline]\n")
     out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
     out.write("    match c {\n")
 
-    for char, combining_class in sorted(combining_classes.items(), key=lambda (k, _): int(k, 16)):
-        out.write("        '\u{%s}' => %s,\n" % (char, combining_class))
+    for char, combining_class in sorted(combining_classes.items()):
+        out.write("        '\u{%s}' => %s,\n" % (hexify(char), combining_class))
 
     out.write("        _ => 0,\n")
     out.write("    }\n")
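The docstring's three rules amount to a small fallback chain when the stream-safe pass later asks about a character. A hypothetical consumer-side helper, assuming ss_leading/ss_trailing are the two dicts returned above and combining_classes is the int-keyed dict from _load_unicode_data (nonstarter_counts itself is illustrative, not part of this script):

    def nonstarter_counts(c, ss_leading, ss_trailing, combining_classes):
        # Rule 2: Hangul syllables decompose to jamo, which are all starters.
        if 0xAC00 <= c < 0xAC00 + 19 * 21 * 28:
            return 0, 0
        # Rule 3: only decompositions that begin or end with a non-starter
        # made it into the tables.
        if c in ss_leading or c in ss_trailing:
            return ss_leading.get(c, 0), ss_trailing.get(c, 0)
        # Rule 1: everything else stands for itself, so both counts are 1
        # exactly when the character is a non-starter (ccc != 0).
        n = 1 if c in combining_classes else 0
        return n, n

The Stream-Safe Text Process (UAX15-D4) then inserts U+034F COMBINING GRAPHEME JOINER whenever appending a character's leading non-starters would extend a run of contiguous non-starters past 30.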
@@ -265,8 +303,8 @@ def gen_composition_table(canon_comp, out):
     out.write("pub fn composition_table(c1: char, c2: char) -> Option<char> {\n")
     out.write("    match (c1, c2) {\n")
 
-    for (c1, c2), c3 in sorted(canon_comp.items(), key=lambda ((c1, c2), _): (int(c1, 16), int(c2, 16))):
-        out.write("        ('\u{%s}', '\u{%s}') => Some('\u{%s}'),\n" % (c1, c2, c3))
+    for (c1, c2), c3 in sorted(canon_comp.items()):
+        out.write("        ('\u{%s}', '\u{%s}') => Some('\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
 
     out.write("        _ => None,\n")
     out.write("    }\n")
@@ -279,8 +317,6 @@ def gen_decomposition_tables(canon_decomp, compat_decomp, out):
         out.write("pub fn %s_fully_decomposed(c: char) -> Option<&'static [char]> {\n" % name)
         out.write("    match c {\n")
 
-        hexify = lambda c: hex(c)[2:].upper()
-
         for char, chars in sorted(table.items()):
             d = ", ".join("'\u{%s}'" % hexify(c) for c in chars)
             out.write("        '\u{%s}' => Some(&[%s]),\n" % (hexify(char), d))
@@ -323,12 +359,36 @@ def gen_combining_mark(general_category_mark, out):
     out.write("    match c {\n")
 
     for char in general_category_mark:
-        out.write("        '\u{%s}' => true,\n" % char)
+        out.write("        '\u{%s}' => true,\n" % hexify(char))
 
     out.write("        _ => false,\n")
     out.write("    }\n")
     out.write("}\n")
 
+def gen_stream_safe(leading, trailing, out):
+    out.write("#[inline]\n")
+    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
+    out.write("    match c {\n")
+
+    for char, num_leading in leading.items():
+        out.write("        '\u{%s}' => %d,\n" % (hexify(char), num_leading))
+
+    out.write("        _ => 0,\n")
+    out.write("    }\n")
+    out.write("}\n")
+    out.write("\n")
+
+    out.write("#[inline]\n")
+    out.write("pub fn stream_safe_trailing_nonstarters(c: char) -> usize {\n")
+    out.write("    match c {\n")
+
+    for char, num_trailing in trailing.items():
+        out.write("        '\u{%s}' => %d,\n" % (hexify(char), num_trailing))
+
+    out.write("        _ => 0,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
 def gen_tests(tests, out):
     out.write("""#[derive(Debug)]
 pub struct NormalizationTest {
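For a sense of the output, the new functions follow the same match-table shape as the other generated tables. For example, U+0344 COMBINING GREEK DIALYTIKA TONOS canonically decomposes to U+0308 U+0301, two non-starters, so (assuming that entry in ss_leading) the emitted Rust would contain an arm like:

    #[inline]
    pub fn stream_safe_leading_nonstarters(c: char) -> usize {
        match c {
            '\u{0344}' => 2,
            _ => 0,
        }
    }

with the real table carrying one arm per entry in ss_leading, and a matching stream_safe_trailing_nonstarters beside it.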
@@ -384,6 +444,9 @@ def gen_tests(tests, out):
         gen_nfd_qc(data.norm_props, out)
         out.write("\n")
 
+        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
+        out.write("\n")
+
     with open("normalization_tests.rs", "w") as out:
         out.write(PREAMBLE)
         gen_tests(data.norm_tests, out)