@@ -148,9 +148,11 @@ def load_confusables(f):
148
148
return confusables
149
149
150
150
def aliases ():
151
- """
152
- Fetch the shorthand aliases for each longhand Script name
153
- """
151
+ # This function is taken from the `unicode-script` crate. If significant
152
+ # changes are introduced, update accordingly.
153
+
154
+ # Note that this file is in the UCD directly, not in the security directory.
155
+ # We use the `fetch_unidata` function to download it.
154
156
fetch_unidata ("PropertyValueAliases.txt" )
155
157
longforms = {}
156
158
shortforms = {}
@@ -170,6 +172,9 @@ def aliases():
170
172
return (longforms , shortforms )
171
173
172
174
def load_scripts (f ):
175
+ # This function is taken from the `unicode-script` crate. If significant
176
+ # changes are introduced, update accordingly.
177
+
173
178
(longforms , shortforms ) = aliases ()
174
179
scripts = load_script_properties (f , [])
175
180
@@ -235,31 +240,52 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
235
240
return False
236
241
237
242
def load_rustc_mixedscript_confusables (f , identifier_allowed , scripts ):
243
+ # First, load all confusables data from confusables.txt
238
244
confusables = load_confusables (f )
245
+
246
+ # The confusables.txt is reductive, meaning that it is intended to be used in
247
+ # on-the-fly substitutions. The code points that didn't occur in the file can be
248
+ # seen as substitutes for themselves. So if the confusables.txt says A -> C, B -> C,
249
+ # and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.
250
+
251
+ # Here we first make a dict that contains all As and Bs whose corresponding C is a single code point.
239
252
seekup_map = {}
240
253
for item in confusables :
241
254
d_proto_list = item [1 ]
242
255
d_source = item [0 ]
243
256
assert (len (d_proto_list ) > 0 )
244
257
if len (d_proto_list ) == 1 :
245
258
seekup_map [escape_char (d_source )] = d_proto_list
246
- # collect prototypes
259
+
260
+ # Here we're dividing all confusable lhs and rhs(prototype) operands of the substitution into equivalence classes.
261
+ # Principally we'll be using the rhs operands as the representative element of each equivalence class.
262
+ # However some rhs operands are single code point, while some others are not.
263
+ # Here we collect them separately into `codepoint_map` and `multicodepoint_map`.
247
264
codepoint_map = {}
248
265
multicodepoint_map = {}
249
266
for item in confusables :
250
267
d_source = item [0 ]
268
+ # According to the RFC, we'll skip those code points that are restricted from identifier usage.
251
269
if not is_codepoint_identifier_allowed (d_source , identifier_allowed ):
252
270
continue
253
271
d_proto_list = item [1 ]
254
272
if len (d_proto_list ) == 1 :
255
273
d_proto = escape_char (d_proto_list [0 ])
274
+ # we use the escaped representation of rhs as key to the dict when creating new equivalence class.
256
275
if d_proto not in codepoint_map :
257
276
codepoint_map [d_proto ] = []
277
+ # when we create new equivalence class, we'll check whether the representative element should be collected.
278
+ # i.e. if it is not subject to substitution, and not restricted from identifier usage,
279
+ # we collect it into the equivalence class.
258
280
if d_proto not in seekup_map and is_codepoint_identifier_allowed (d_proto_list [0 ], identifier_allowed ):
259
281
codepoint_map [d_proto ].append (d_proto_list [0 ])
282
+ # we collect the original code point to be substituted into this list.
260
283
codepoint_map [d_proto ].append (d_source )
261
284
else :
262
285
d_protos = escape_char_list (d_proto_list )
286
+ # difference in multi code point case: the rhs part is not directly usable, however we store it in
287
+ # dict for further special examination between each lhs and this multi code point rhs.
288
+ # and there's an extra level of tuple here.
263
289
if d_protos not in multicodepoint_map :
264
290
multicodepoint_map [d_protos ] = (d_proto_list , [])
265
291
multicodepoint_map [d_protos ][1 ].append (d_source )
@@ -274,24 +300,33 @@ def confusable_entry_item(confusable, script, item_text, item):
274
300
script_entry [item_text ] = (item , [])
275
301
return script_entry [item_text ][1 ]
276
302
277
- # between single charpoint that has single charpoint prototype
303
+ # First let's examine the case where each code point has a single code point prototype.
278
304
for _ , source in codepoint_map .items ():
279
305
source_len = len (source )
306
+ # Examine each pair in the equivalence class
280
307
for i in range (0 , source_len - 1 ):
281
308
for j in range (i + 1 , source_len ):
282
309
item_i , item_j = source [i ], source [j ]
283
310
script_i , script_j = codepoint_script (item_i , scripts ), codepoint_script (item_j , scripts )
311
+ # If they're in the same script, just skip this pair.
284
312
if script_i == script_j :
285
313
continue
314
+ # If `item_i` (the first) is in a non-ignored script, and `item_j` (the second) is in a different one (maybe ignored),
315
+ # this means that this usage of the `item_i` can be suspicious, when it occurs in a document that is written in `script_j`.
316
+ # We'll consider it a mixed_script_confusable code point.
286
317
if not is_script_ignored_in_mixedscript (script_i ):
318
+ # store it within the map, saving as much information as possible, for further investigation on the final results.
287
319
confusable_entry_item (mixedscript_confusable , script_i , escape_char (item_i ), item_i ).append (item_j )
320
+ # Do the same in reverse from `item_j` to `item_i`
288
321
if not is_script_ignored_in_mixedscript (script_j ):
289
322
confusable_entry_item (mixedscript_confusable , script_j , escape_char (item_j ), item_j ).append (item_i )
290
323
291
- # between single charpoint that has multi charpoint prototype
324
+ # Then let's examine the case where each code point has a multiple code point prototype.
325
+ # We'll check between the code points that share the same prototype
292
326
for _ , proto_lst_and_source in multicodepoint_map .items ():
293
327
source = proto_lst_and_source [1 ]
294
328
source_len = len (source )
329
+ # This is basically the same as the single code point case.
295
330
for i in range (0 , source_len - 1 ):
296
331
for j in range (i + 1 , source_len ):
297
332
item_i , item_j = source [i ], source [j ]
@@ -304,10 +339,11 @@ def confusable_entry_item(confusable, script, item_text, item):
304
339
confusable_entry_item (mixedscript_confusable , script_j , escape_char (item_j ), item_j ).append (item_i )
305
340
306
341
mixedscript_confusable_unresolved = {}
307
- # single charpoint that has multi charpoint prototype and its prototype
342
+ # We'll also check between each code point and its multiple code point prototype
308
343
for _ , proto_lst_and_source in multicodepoint_map .items ():
309
344
proto_lst = proto_lst_and_source [0 ]
310
345
proto_lst_can_be_part_of_identifier = True
346
+ # If the prototype contains one or more restricted code points, then we skip it.
311
347
for c in proto_lst :
312
348
if not is_codepoint_identifier_allowed (c , identifier_allowed ):
313
349
proto_lst_can_be_part_of_identifier = False
@@ -318,15 +354,25 @@ def confusable_entry_item(confusable, script, item_text, item):
318
354
source_len = len (source )
319
355
for i in range (0 , source_len ):
320
356
item_i = source [i ]
357
+ # So here we're just checking whether the single code point should be considered confusable.
321
358
script_i = codepoint_script (item_i , scripts )
359
+ # If it's in ignored script, we don't need to do anything here.
322
360
if is_script_ignored_in_mixedscript (script_i ):
323
361
continue
362
+ # Here're some rules on examining whether the single code point should be considered confusable.
363
+ # The principle is that, when substitution happens, no new non-ignored script is introduced, and its
364
+ # own script is not lost.
324
365
processed , should_add = process_mixedscript_single_to_multi (item_i , script_i , proto_lst , scripts )
325
366
if should_add :
326
367
assert (processed )
368
+ # Mark the single code point as confusable.
327
369
confusable_entry_item (mixedscript_confusable , script_i , escape_char (item_i ), item_i ).append ('multi' )
328
370
if processed :
371
+ # Finished dealing with this code point.
329
372
continue
373
+ # If it's not processed we must be dealing with a newer version of the Unicode data, which introduced some significant
374
+ # changes. We don't throw an exception here, instead we collect it into a table for debugging purpose, and throw
375
+ # an exception after we returned and printed the table out.
330
376
proto_lst_text = escape_char_list (proto_lst )
331
377
if not proto_lst_text in mixedscript_confusable_unresolved :
332
378
mixedscript_confusable_unresolved [proto_lst_text ] = (proto_lst , [])
0 commit comments