Skip to content

Commit 31bd869

Browse files
committed
Add more comments to document the details.
1 parent a18eb9c commit 31bd869

File tree

1 file changed

+40
-1
lines changed

1 file changed

+40
-1
lines changed

scripts/unicode.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,15 @@
1212

1313
# This script uses the following Unicode security tables:
1414
# - IdentifierStatus.txt
15+
# - IdentifierType.txt
16+
# - PropertyValueAliases.txt
17+
# - confusables.txt
1518
# - ReadMe.txt
19+
# This script also uses the following Unicode UCD data:
20+
# - Scripts.txt
1621
#
1722
# Since this should not require frequent updates, we just store this
18-
# out-of-line and check the unicode.rs file into git.
23+
# out-of-line and check the tables.rs file into git.
1924

2025
import fileinput, re, os, sys, operator
2126

@@ -38,6 +43,7 @@
3843

3944
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
4045

46+
# Download a Unicode security table file
4147
def fetch(f):
4248
if not os.path.exists(os.path.basename(f)):
4349
os.system("curl -O http://www.unicode.org/Public/security/%s/%s"
@@ -47,6 +53,7 @@ def fetch(f):
4753
sys.stderr.write("cannot load %s\n" % f)
4854
exit(1)
4955

56+
# Download a UCD table file
5057
def fetch_unidata(f):
5158
if not os.path.exists(os.path.basename(f)):
5259
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
@@ -56,6 +63,8 @@ def fetch_unidata(f):
5663
sys.stderr.write("cannot load %s" % f)
5764
exit(1)
5865

66+
# Loads code point data from IdentifierStatus.txt and
67+
# IdentifierType.txt
5968
# Implementation from unicode-segmentation
6069
def load_properties(f, interestingprops = None):
6170
fetch(f)
@@ -90,6 +99,7 @@ def load_properties(f, interestingprops = None):
9099

91100
return props
92101

102+
# Loads script data from Scripts.txt
93103
def load_script_properties(f, interestingprops):
94104
fetch_unidata(f)
95105
props = {}
@@ -125,6 +135,7 @@ def load_script_properties(f, interestingprops):
125135

126136
return props
127137

138+
# Loads confusables data from confusables.txt
128139
def load_confusables(f):
129140
fetch(f)
130141
confusables = []
@@ -147,6 +158,7 @@ def load_confusables(f):
147158

148159
return confusables
149160

161+
# Loads Unicode script name correspondence from PropertyValueAliases.txt
150162
def aliases():
151163
# This function is taken from the `unicode-script` crate. If significant
152164
# changes are introduced, update accordingly.
@@ -171,6 +183,7 @@ def aliases():
171183

172184
return (longforms, shortforms)
173185

186+
# Loads Unicode script name list and correspondence mapping
174187
def load_scripts(f):
175188
# This function is taken from the `unicode-script` crate. If significant
176189
# changes are introduced, update accordingly.
@@ -192,6 +205,16 @@ def load_scripts(f):
192205
def is_script_ignored_in_mixedscript(source):
193206
return source == 'Zinh' or source == 'Zyyy' or source == 'Zzzz'
194207

208+
# When a codepoint's prototype consists of multiple codepoints,
209+
# the situation is more complex. Here we make up a few rules
210+
# to cover all the cases in confusables.txt.
211+
# The principle is that, when replacing the original codepoint with its prototype,
212+
# no "non-ignored script" should either appear or disappear.
213+
#
214+
# We make up several rules to cover the cases that occur within confusables.txt.
215+
# Return True, True when we want to consider it confusable,
216+
# return True, False when we want to consider it non-confusable,
217+
# and return False, _ when new not-yet-processed cases are added in future Unicode versions.
195218
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
196219
script_lst = script_list(proto_lst, scripts)
197220
script_lst.sort()
@@ -239,6 +262,21 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
239262
return True
240263
return False
241264

265+
# This function loads and generates a table of all the confusable characters.
266+
# It returns a pair consisting of a `mixedscript_confusable` table and a
267+
# `mixedscript_confusable_unresolved` table.
268+
# The `mixedscript_confusable` table is a dict; its keys are Unicode script names, and each
269+
# entry has an inner dict as its value. The inner dict's keys are confusable code points
270+
# converted to strings with the `escape_char` function, and its values are pairs.
271+
# pair[0] keeps a copy of the confusable code point itself, but as an integer.
272+
# pair[1] keeps a list of all the code points that are mixed-script confusable with it,
273+
# which is only used for debugging purposes.
274+
# Note that the string 'multi' will occur in the list when pair[0] is considered
275+
# confusable with its multiple-code-point prototype.
276+
# Usually the `mixedscript_confusable_unresolved` table is empty, but it's possible
277+
# that a future Unicode version update may cause that table to become nonempty, in which
278+
# case more rules need to be added to the `process_mixedscript_single_to_multi` function
279+
# above to cover those new cases.
242280
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
243281
# First, load all confusables data from confusables.txt
244282
confusables = load_confusables(f)
@@ -375,6 +413,7 @@ def codepoint_script(c, scripts):
375413
return script
376414
raise Exception("Not in scripts: " + escape_char(c))
377415

416+
# Emit some useful information for debugging when further updates happen.
378417
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
379418
f.write("/* " + text + "\n")
380419
for script, lst in mixedscript_confusable.items():

0 commit comments

Comments
 (0)