@@ -76,7 +76,48 @@ def rust_slice(s):
76
76
unicode_str = u''
77
77
ranges .append ((first , last , mapping , unicode_str ))
78
78
79
- for (first , last , mapping , unicode_str ) in ranges :
79
def mergeable_key(r):
    """Return the grouping key for one range record.

    ``r`` is indexed positionally; ``r[2]`` is the record's mapping type.

    Records whose mapping type carries associated data ('Mapped',
    'Deviation', 'DisallowedStd3Mapped') are keyed by the whole record, so
    they should not be merged with their neighbours. Every other record is
    keyed by its mapping type alone.
    """
    kind = r[2]
    if kind not in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        # Data-free types: anything outside this known set is unexpected.
        assert kind in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
        return kind
    # These types have associated data, so the record is its own key.
    return r
86
+
87
# Merge each run of consecutive records that share a grouping key (see
# mergeable_key) into a single record covering the whole run. Records that
# form a group of one are passed through unchanged.
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for _, members in grouped_ranges:
    group = list(members)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Sanity-check the string attached to each record being merged: when it
    # is longer than two characters, its third character must be whitespace.
    # A record may carry no string at all (None); skip those, since calling
    # len() on None would raise TypeError.
    for member in group:
        member_str = member[3]
        if member_str is not None and len(member_str) > 2:
            assert not member_str[2].strip()
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space. Pairing the list with its own tail yields each
    # (previous, next) neighbour pair and works on Python 2 and 3 alike.
    for prev_range, next_range in zip(group, group[1:]):
        last_char = int(prev_range[1], 16)
        next_char = int(next_range[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    # The merged record spans from the first member's start to the last
    # member's end and keeps the first member's mapping and string.
    first = group[0][0]
    last = group[-1][1]
    mapping = group[0][2]
    unicode_str = group[0][3]
    optimized_ranges.append((first, last, mapping, unicode_str))
119
+
120
+ for (first , last , mapping , unicode_str ) in optimized_ranges :
80
121
if unicode_str is not None :
81
122
mapping += rust_slice (strtab_slice (unicode_str ))
82
123
print (" Range { from: '%s', to: '%s', mapping: %s }," % (escape_char (char (first )),
0 commit comments