Skip to content

Commit a44257d

Browse files
committed
merge identically-mapped adjacent ranges
There's no reason to store adjacent ranges that map to identical results separately; we can store them as a single entry and save ~10% space.
1 parent 8f0c9e7 commit a44257d

File tree

2 files changed

+265
-968
lines changed

2 files changed

+265
-968
lines changed

idna/src/make_uts46_mapping_table.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,48 @@ def rust_slice(s):
7676
unicode_str = u''
7777
ranges.append((first, last, mapping, unicode_str))
7878

79-
for (first, last, mapping, unicode_str) in ranges:
79+
def mergeable_key(r):
    """Return the ``itertools.groupby`` key that decides range merging.

    ``r`` is one ``(first, last, mapping, unicode_str)`` tuple from
    ``ranges``.  Adjacent ranges that produce equal keys end up in the same
    group and are later collapsed into a single range.

    Returns the whole tuple ``r`` for mapping types that carry associated
    data, and just the mapping name for the data-free types.

    Raises AssertionError if the mapping type is not one of the known ones.
    """
    mapping = r[2]

    # These types have associated data, so we should not merge them.  The
    # whole tuple (including its first/last bounds) is used as the key, so
    # such a range only groups with a neighbour that is an identical tuple.
    if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        return r
    # Raise explicitly instead of using a bare `assert`: under `python -O`
    # an assert is stripped and an unknown type would silently become a
    # merge key.
    if mapping not in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid'):
        raise AssertionError('unexpected mapping type: %r' % (mapping,))
    return mapping
86+
87+
# Collapse runs of adjacent ranges that share a data-free mapping type into
# one range each.  groupby() only groups *consecutive* items with equal keys,
# so the order of `ranges` is preserved.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        # Nothing to merge; keep the range exactly as it was.
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string.
    # unicode_str may be None (the output loop tests `is not None`), so guard
    # before calling len() on it.
    for member in group:
        if member[3] is not None and len(member[3]) > 2:
            assert not member[3][2].strip()
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.  Built-in zip() is used rather than
    # itertools.izip so the script runs on both Python 2 and Python 3.
    for (g1, g2) in zip(group, group[1:]):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    # The merged range spans from the start of the first member to the end
    # of the last; mapping and unicode_str are taken from the first member.
    first = group[0][0]
    last = group[-1][1]
    mapping = group[0][2]
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))
119+
120+
for (first, last, mapping, unicode_str) in optimized_ranges:
80121
if unicode_str is not None:
81122
mapping += rust_slice(strtab_slice(unicode_str))
82123
print(" Range { from: '%s', to: '%s', mapping: %s }," % (escape_char(char(first)),

0 commit comments

Comments
 (0)