Skip to content

Commit 741303d

Browse files
committed
Add more comments.
1 parent 72cefff commit 741303d

File tree

1 file changed

+53
-7
lines changed

1 file changed

+53
-7
lines changed

scripts/unicode.py

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,11 @@ def load_confusables(f):
148148
return confusables
149149

150150
def aliases():
151-
"""
152-
Fetch the shorthand aliases for each longhand Script name
153-
"""
151+
# This function is taken from the `unicode-script` crate. If significant
152+
# changes are introduced, update accordingly.
153+
154+
# Note that this file is in the UCD directly, not in the security directory.
155+
# We use the `fetch_unidata` function to download it.
154156
fetch_unidata("PropertyValueAliases.txt")
155157
longforms = {}
156158
shortforms = {}
@@ -170,6 +172,9 @@ def aliases():
170172
return (longforms, shortforms)
171173

172174
def load_scripts(f):
175+
# This function is taken from the `unicode-script` crate. If significant
176+
# changes are introduced, update accordingly.
177+
173178
(longforms, shortforms) = aliases()
174179
scripts = load_script_properties(f, [])
175180

@@ -235,31 +240,52 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
235240
return False
236241

237242
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
243+
# First, load all confusables data from confusables.txt
238244
confusables = load_confusables(f)
245+
246+
# The confusables.txt is reductive, meaning that it is intended to be used in
247+
# on the fly substitutions. The code points that didn't occur in the file can be
248+
# seen as substitutes for themselves. So if the confusables.txt says A -> C, B -> C,
249+
# and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.
250+
251+
# Here we first make a dict that contains all As and Bs whose corresponding C is a single code point.
239252
seekup_map = {}
240253
for item in confusables:
241254
d_proto_list = item[1]
242255
d_source = item[0]
243256
assert(len(d_proto_list) > 0)
244257
if len(d_proto_list) == 1:
245258
seekup_map[escape_char(d_source)] = d_proto_list
246-
# collect prototypes
259+
260+
# Here we're dividing all confusable lhs and rhs(prototype) operands of the substitution into equivalence classes.
261+
# Principally we'll be using the rhs operand as the representative element of each equivalence class.
262+
# However, some rhs operands are a single code point, while others are not.
263+
# Here we collect them separately into `codepoint_map` and `multicodepoint_map`.
247264
codepoint_map = {}
248265
multicodepoint_map = {}
249266
for item in confusables:
250267
d_source = item[0]
268+
# According to the RFC, we'll skip those code points that are restricted from identifier usage.
251269
if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
252270
continue
253271
d_proto_list = item[1]
254272
if len(d_proto_list) == 1:
255273
d_proto = escape_char(d_proto_list[0])
274+
# we use the escaped representation of rhs as key to the dict when creating new equivalence class.
256275
if d_proto not in codepoint_map:
257276
codepoint_map[d_proto] = []
277+
# when we create new equivalence class, we'll check whether the representative element should be collected.
278+
# i.e. if it is not subject to substitution, and not restricted from identifier usage,
279+
# we collect it into the equivalence class.
258280
if d_proto not in seekup_map and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
259281
codepoint_map[d_proto].append(d_proto_list[0])
282+
# we collect the original code point to be substituted into this list.
260283
codepoint_map[d_proto].append(d_source)
261284
else:
262285
d_protos = escape_char_list(d_proto_list)
286+
# difference in multi code point case: the rhs part is not directly usable, however we store it in
287+
# dict for further special examination between each lhs and this multi code point rhs.
288+
# and there's an extra level of tuple here.
263289
if d_protos not in multicodepoint_map:
264290
multicodepoint_map[d_protos] = (d_proto_list, [])
265291
multicodepoint_map[d_protos][1].append(d_source)
@@ -274,24 +300,33 @@ def confusable_entry_item(confusable, script, item_text, item):
274300
script_entry[item_text] = (item, [])
275301
return script_entry[item_text][1]
276302

277-
# between single charpoint that has single charpoint prototype
303+
# First, let's examine the case where each code point has a single code point prototype.
278304
for _, source in codepoint_map.items():
279305
source_len = len(source)
306+
# Examine each pair in the equivalence class
280307
for i in range(0, source_len - 1):
281308
for j in range(i + 1, source_len):
282309
item_i, item_j = source[i], source[j]
283310
script_i, script_j = codepoint_script(item_i, scripts), codepoint_script(item_j, scripts)
311+
# If they're in the same script, just skip this pair.
284312
if script_i == script_j:
285313
continue
314+
# If `item_i` (the first) is in a non-ignored script, and `item_j` (the second) is in a different one (maybe ignored),
315+
# this means that this usage of the `item_i` can be suspicious, when it occurs in a document that is written in `script_j`.
316+
# We'll consider it a mixed_script_confusable code point.
286317
if not is_script_ignored_in_mixedscript(script_i):
318+
# store it within the map, saving as much information as possible, for further investigation on the final results.
287319
confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j)
320+
# Do the same in reverse from `item_j` to `item_i`
288321
if not is_script_ignored_in_mixedscript(script_j):
289322
confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)
290323

291-
# between single charpoint that has multi charpoint prototype
324+
# Then let's examine the case where each code point has a multiple code point prototype.
325+
# We'll check between the code points that share the same prototype.
292326
for _, proto_lst_and_source in multicodepoint_map.items():
293327
source = proto_lst_and_source[1]
294328
source_len = len(source)
329+
# This is basically the same as the single code point case.
295330
for i in range(0, source_len - 1):
296331
for j in range(i + 1, source_len):
297332
item_i, item_j = source[i], source[j]
@@ -304,10 +339,11 @@ def confusable_entry_item(confusable, script, item_text, item):
304339
confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)
305340

306341
mixedscript_confusable_unresolved = {}
307-
# single charpoint that has multi charpoint prototype and its prototype
342+
# We'll also check between each code point and its multiple code point prototype.
308343
for _, proto_lst_and_source in multicodepoint_map.items():
309344
proto_lst = proto_lst_and_source[0]
310345
proto_lst_can_be_part_of_identifier = True
346+
# If the prototype contains one or more restricted code points, then we skip it.
311347
for c in proto_lst:
312348
if not is_codepoint_identifier_allowed(c, identifier_allowed):
313349
proto_lst_can_be_part_of_identifier = False
@@ -318,15 +354,25 @@ def confusable_entry_item(confusable, script, item_text, item):
318354
source_len = len(source)
319355
for i in range(0, source_len):
320356
item_i = source[i]
357+
# So here we're just checking whether the single code point should be considered confusable.
321358
script_i = codepoint_script(item_i, scripts)
359+
# If it's in ignored script, we don't need to do anything here.
322360
if is_script_ignored_in_mixedscript(script_i):
323361
continue
362+
# Here are some rules for examining whether the single code point should be considered confusable.
363+
# The principle is that, when substitution happens, no new non-ignored scripts are introduced, and its
364+
# own script is not lost.
324365
processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
325366
if should_add:
326367
assert(processed)
368+
# Mark the single code point as confusable.
327369
confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi')
328370
if processed:
371+
# Finished dealing with this code point.
329372
continue
373+
# If it's not processed we must be dealing with a newer version of the Unicode data, which introduced some significant
374+
# changes. We don't throw an exception here, instead we collect it into a table for debugging purpose, and throw
375+
# an exception after we returned and printed the table out.
330376
proto_lst_text = escape_char_list(proto_lst)
331377
if not proto_lst_text in mixedscript_confusable_unresolved:
332378
mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])

0 commit comments

Comments
 (0)