Skip to content

Commit 3898e77

Browse files
authored
Merge pull request #14 from froydnj/trim-unicode-tables
reduce space required by decomposition and composition tables
2 parents 6dbfd72 + 8d01bc5 commit 3898e77

File tree

3 files changed

+11011
-2828
lines changed

3 files changed

+11011
-2828
lines changed

scripts/unicode.py

Lines changed: 64 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# Since this should not require frequent updates, we just store this
1919
# out-of-line and check the unicode.rs file into git.
2020

21-
import fileinput, re, os, sys
21+
import fileinput, re, os, sys, collections
2222

2323
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2424
// file at the top-level directory of this distribution and at
@@ -160,19 +160,9 @@ def to_combines(combs):
160160
return combs_out
161161

162162
def format_table_content(f, content, indent):
163-
line = " "*indent
164-
first = True
165-
for chunk in content.split(","):
166-
if len(line) + len(chunk) < 98:
167-
if first:
168-
line += chunk
169-
else:
170-
line += ", " + chunk
171-
first = False
172-
else:
173-
f.write(line + ",\n")
174-
line = " "*indent + chunk
175-
f.write(line)
163+
indent = " "*indent
164+
for c in content:
165+
f.write("%s%s,\n" % (indent, c))
176166

177167
def load_properties(f, interestingprops):
178168
fetch(f)
@@ -220,14 +210,44 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
220210
if is_pub:
221211
pub_string = "pub "
222212
f.write(" %sconst %s: %s = &[\n" % (pub_string, name, t_type))
223-
data = ""
224-
first = True
225-
for dat in t_data:
226-
if not first:
227-
data += ","
228-
first = False
229-
data += pfun(dat)
230-
format_table_content(f, data, 8)
213+
format_table_content(f, [pfun(d) for d in t_data], 8)
214+
f.write("\n ];\n\n")
215+
216+
def emit_strtab_table(f, name, keys, vfun, is_pub=True,
217+
tab_entry_type='char', slice_element_sfun=escape_char):
218+
pub_string = ""
219+
if is_pub:
220+
pub_string = "pub "
221+
f.write(" %s const %s: &'static [(char, Slice)] = &[\n"
222+
% (pub_string, name))
223+
224+
strtab = collections.OrderedDict()
225+
strtab_offset = 0
226+
227+
# TODO: a more sophisticated algorithm would check not only whether v
228+
# already exists as a whole strtab entry, but also whether v occurs as a
229+
# contiguous run inside the concatenated strtab, and reuse that run if so.
230+
for k in keys:
231+
v = tuple(vfun(k))
232+
if v in strtab:
233+
item_slice = strtab[v]
234+
else:
235+
value_len = len(v)
236+
item_slice = (strtab_offset, value_len)
237+
strtab[v] = item_slice
238+
strtab_offset += value_len
239+
240+
f.write("%s(%s, Slice { offset: %d, length: %d }),\n"
241+
% (" "*8, escape_char(k), item_slice[0], item_slice[1]))
242+
243+
f.write("\n ];\n\n")
244+
245+
f.write(" %s const %s_STRTAB: &'static [%s] = &[\n"
246+
% (pub_string, name, tab_entry_type))
247+
248+
for (v, _) in strtab.iteritems():
249+
f.write("%s%s,\n" % (" "*8, ', '.join(slice_element_sfun(c) for c in v)))
250+
231251
f.write("\n ];\n\n")
232252

233253
def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mark):
@@ -251,43 +271,38 @@ def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mar
251271
canon_comp_keys.sort()
252272

253273
f.write("pub mod normalization {\n")
274+
f.write("""
275+
pub struct Slice {
276+
pub offset: u16,
277+
pub length: u16,
278+
}
279+
""")
254280

255281
def mkdata_fun(table):
256282
def f(char):
257-
data = "(%s,&[" % escape_char(char)
258-
first = True
259-
for d in table[char]:
260-
if not first:
261-
data += ","
262-
first = False
263-
data += escape_char(d)
264-
data += "])"
265-
return data
283+
return table[char]
266284
return f
267285

286+
# TODO: should the strtab of these two tables be of type &'static str, for
287+
# smaller data?
268288
f.write(" // Canonical decompositions\n")
269-
emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
270-
pfun=mkdata_fun(canon))
289+
emit_strtab_table(f, "canonical_table", canon_keys,
290+
vfun=mkdata_fun(canon))
271291

272292
f.write(" // Compatibility decompositions\n")
273-
emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
274-
pfun=mkdata_fun(compat))
275-
276-
def comp_pfun(char):
277-
data = "(%s,&[" % escape_char(char)
278-
canon_comp[char].sort(lambda x, y: x[0] - y[0])
279-
first = True
280-
for pair in canon_comp[char]:
281-
if not first:
282-
data += ","
283-
first = False
284-
data += "(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
285-
data += "])"
286-
return data
293+
emit_strtab_table(f, "compatibility_table", compat_keys,
294+
vfun=mkdata_fun(compat))
295+
296+
def comp_vfun(char):
297+
return sorted(canon_comp[char], lambda x, y: x[0] - y[0])
287298

288299
f.write(" // Canonical compositions\n")
289-
emit_table(f, "composition_table", canon_comp_keys,
290-
"&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)
300+
# Replaces the old emit_table call that used "&'static [(char, &'static [(char, char)])]" with pfun=comp_pfun.
301+
emit_strtab_table(f, "composition_table", canon_comp_keys,
302+
vfun=comp_vfun,
303+
tab_entry_type="(char, char)",
304+
slice_element_sfun=lambda pair: "(%s,%s)" % (escape_char(pair[0]),
305+
escape_char(pair[1])))
291306

292307
f.write("""
293308
fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {

src/normalize.rs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,22 @@
1212
1313
use std::cmp::Ordering::{Equal, Less, Greater};
1414
use std::ops::FnMut;
15-
use tables::normalization::{canonical_table, compatibility_table, composition_table};
15+
use tables::normalization::{canonical_table, canonical_table_STRTAB};
16+
use tables::normalization::{compatibility_table, compatibility_table_STRTAB};
17+
use tables::normalization::{composition_table, composition_table_STRTAB};
18+
use tables::normalization::Slice;
1619

17-
fn bsearch_table<T>(c: char, r: &'static [(char, &'static [T])]) -> Option<&'static [T]> {
20+
fn bsearch_table<T>(c: char, r: &'static [(char, Slice)], strtab: &'static [T]) -> Option<&'static [T]> {
1821
match r.binary_search_by(|&(val, _)| {
1922
if c == val { Equal }
2023
else if val < c { Less }
2124
else { Greater }
2225
}) {
2326
Ok(idx) => {
24-
let (_, result) = r[idx];
25-
Some(result)
27+
let ref slice = r[idx].1;
28+
let offset = slice.offset as usize;
29+
let length = slice.length as usize;
30+
Some(&strtab[offset..(offset + length)])
2631
}
2732
Err(_) => None
2833
}
@@ -50,7 +55,7 @@ fn d<F>(c: char, i: &mut F, k: bool) where F: FnMut(char) {
5055
}
5156

5257
// First check the canonical decompositions
53-
match bsearch_table(c, canonical_table) {
58+
match bsearch_table(c, canonical_table, canonical_table_STRTAB) {
5459
Some(canon) => {
5560
for x in canon {
5661
d(*x, i, k);
@@ -64,7 +69,7 @@ fn d<F>(c: char, i: &mut F, k: bool) where F: FnMut(char) {
6469
if !k { (*i)(c); return; }
6570

6671
// Then check the compatibility decompositions
67-
match bsearch_table(c, compatibility_table) {
72+
match bsearch_table(c, compatibility_table, compatibility_table_STRTAB) {
6873
Some(compat) => {
6974
for x in compat {
7075
d(*x, i, k);
@@ -83,7 +88,7 @@ fn d<F>(c: char, i: &mut F, k: bool) where F: FnMut(char) {
8388
/// for more information.
8489
pub fn compose(a: char, b: char) -> Option<char> {
8590
compose_hangul(a, b).or_else(|| {
86-
match bsearch_table(a, composition_table) {
91+
match bsearch_table(a, composition_table, composition_table_STRTAB) {
8792
None => None,
8893
Some(candidates) => {
8994
match candidates.binary_search_by(|&(val, _)| {

0 commit comments

Comments
 (0)