12
12
13
13
# This script uses the following Unicode security tables:
14
14
# - IdentifierStatus.txt
15
+ # - IdentifierType.txt
16
+ # - PropertyValueAliases.txt
17
+ # - confusables.txt
15
18
# - ReadMe.txt
19
+ # This script also uses the following Unicode UCD data:
20
+ # - Scripts.txt
16
21
#
17
22
# Since this should not require frequent updates, we just store this
18
- # out-of-line and check the unicode .rs file into git.
23
+ # out-of-line and check the tables .rs file into git.
19
24
20
25
import fileinput , re , os , sys , operator
21
26
38
43
39
44
UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
40
45
46
+ # Download a Unicode security table file
41
47
def fetch (f ):
42
48
if not os .path .exists (os .path .basename (f )):
43
49
os .system ("curl -O http://www.unicode.org/Public/security/%s/%s"
@@ -47,6 +53,7 @@ def fetch(f):
47
53
sys .stderr .write ("cannot load %s\n " % f )
48
54
exit (1 )
49
55
56
+ # Download a UCD table file
50
57
def fetch_unidata (f ):
51
58
if not os .path .exists (os .path .basename (f )):
52
59
os .system ("curl -O http://www.unicode.org/Public/%s/ucd/%s"
@@ -56,6 +63,8 @@ def fetch_unidata(f):
56
63
sys .stderr .write ("cannot load %s" % f )
57
64
exit (1 )
58
65
66
+ # Loads code point data from IdentifierStatus.txt and
67
+ # IdentifierType.txt
59
68
# Implementation from unicode-segmentation
60
69
def load_properties (f , interestingprops = None ):
61
70
fetch (f )
@@ -90,6 +99,7 @@ def load_properties(f, interestingprops = None):
90
99
91
100
return props
92
101
102
+ # Loads script data from Scripts.txt
93
103
def load_script_properties (f , interestingprops ):
94
104
fetch_unidata (f )
95
105
props = {}
@@ -125,6 +135,7 @@ def load_script_properties(f, interestingprops):
125
135
126
136
return props
127
137
138
+ # Loads confusables data from confusables.txt
128
139
def load_confusables (f ):
129
140
fetch (f )
130
141
confusables = []
@@ -147,6 +158,7 @@ def load_confusables(f):
147
158
148
159
return confusables
149
160
161
+ # Loads Unicode script name correspondence from PropertyValueAliases.txt
150
162
def aliases ():
151
163
# This function is taken from the `unicode-script` crate. If significant
152
164
# changes are introduced, update accordingly.
@@ -171,6 +183,7 @@ def aliases():
171
183
172
184
return (longforms , shortforms )
173
185
186
+ # Loads Unicode script name list and correspondence mapping
174
187
def load_scripts (f ):
175
188
# This function is taken from the `unicode-script` crate. If significant
176
189
# changes are introduced, update accordingly.
@@ -192,6 +205,16 @@ def load_scripts(f):
192
205
def is_script_ignored_in_mixedscript(source):
    """Return True if `source` is a script code ignored for mixed-script purposes.

    The ignored codes are exactly 'Zinh', 'Zyyy' and 'Zzzz' (the short
    aliases of the Inherited, Common and Unknown scripts in
    PropertyValueAliases.txt). Any other value yields False.
    """
    return source in ('Zinh', 'Zyyy', 'Zzzz')
194
207
208
+ # When a codepoint's prototype consists of multiple codepoints,
209
+ # the situation is more complex. Here we make up a few rules
210
+ # to cover all the cases in confusables.txt.
211
+ # The principle is that, when replacing the original codepoint with its prototype,
212
+ # no "non-ignored script" either appears or disappears.
213
+ #
214
+ # We make up several rules to cover the cases that occur in confusables.txt.
215
+ # Return True, True when we want to consider it confusable,
216
+ # return True, False when we want to consider it non-confusable,
217
+ # and return False, _ when new not-yet-processed cases are added in future Unicode versions.
195
218
def process_mixedscript_single_to_multi (item_i , script_i , proto_lst , scripts ):
196
219
script_lst = script_list (proto_lst , scripts )
197
220
script_lst .sort ()
@@ -239,6 +262,21 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
239
262
return True
240
263
return False
241
264
265
+ # This function loads and generates a table of all the confusable characters.
266
+ # It returns a pair consisting of a `mixedscript_confusable` table and a
267
+ # `mixedscript_confusable_unresolved` table.
268
+ # The `mixedscript_confusable` table is a dict; its keys are Unicode script names, and each
269
+ # entry's value is an inner dict. The inner dict's keys are confusable code points
270
+ # converted to strings with the `escape_char` function, and its values are pairs.
271
+ # pair[0] keeps a copy of the confusable code point itself, but as an integer.
272
+ # pair[1] keeps a list of all the code points that are mixed-script confusable with it,
273
+ # which is only used for debugging purposes.
274
+ # Note that the string 'multi' will occur in the list when pair[0] is considered
275
+ # confusable with its multiple code point prototype.
276
+ # Usually the `mixedscript_confusable_unresolved` table is empty, but it's possible
277
+ # that a future Unicode version update may cause that table to become nonempty, in which
278
+ # case more rules need to be added to the `process_mixedscript_single_to_multi` function
279
+ # above to cover those new cases.
242
280
def load_potential_mixedscript_confusables (f , identifier_allowed , scripts ):
243
281
# First, load all confusables data from confusables.txt
244
282
confusables = load_confusables (f )
@@ -375,6 +413,7 @@ def codepoint_script(c, scripts):
375
413
return script
376
414
raise Exception ("Not in scripts: " + escape_char (c ))
377
415
416
+ # Emit some useful information for debugging when further updates happen.
378
417
def debug_emit_mixedscript_confusable (f , mixedscript_confusable , text , scripts ):
379
418
f .write ("/* " + text + "\n " )
380
419
for script , lst in mixedscript_confusable .items ():
0 commit comments