
Commit 8d01bc5

store smaller slices in unicode data tables
Rust's default slices are convenient, but for tables like:

    const f: &'static [(char, &'static [char])]

they take up far too much space. An element of the above array consumes 24 bytes on 64-bit platforms, and unicode-normalization contains about 6000 such array elements.

A better approach is to manually store a smaller slice type:

    struct Slice {
        offset: u16,
        length: u16,
    }

    const f: &'static [(char, Slice)]

and store the actual character data in a separate array on the side. The `Slice` structures then point into this separate array, but at a much smaller space cost: elements of the modified `f` take up only 8 bytes on 64-bit platforms, which implies a space savings of ~96K.

On some systems, this strategy also eliminates the need for run-time relocations, which can be a further, significant savings in binary size and runtime cost.

This change is strictly local to the library; it does not affect the public API.
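For concreteness, here is a minimal standalone Rust sketch (not part of the commit) that reproduces the per-element sizes quoted above; the `Slice` struct mirrors the one the commit introduces:

    // Standalone size check for the two table layouts described in the
    // commit message. Numbers hold on typical 64-bit targets; exact tuple
    // layout is not guaranteed by the language.
    struct Slice {
        offset: u16,
        length: u16,
    }

    fn main() {
        // Old layout: each entry carries a fat pointer (data pointer + length).
        assert_eq!(std::mem::size_of::<(char, &'static [char])>(), 24);
        // New layout: each entry carries a 4-byte Slice into a shared side array.
        assert_eq!(std::mem::size_of::<(char, Slice)>(), 8);
    }

With roughly 6000 entries, the 16-byte difference per element accounts for the ~96K savings mentioned above.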
1 parent 7552e6c commit 8d01bc5

File tree

3 files changed: +10335 -6127 lines changed


scripts/unicode.py

Lines changed: 59 additions & 19 deletions
@@ -18,7 +18,7 @@
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the unicode.rs file into git.
 
-import fileinput, re, os, sys
+import fileinput, re, os, sys, collections
 
 preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
@@ -213,6 +213,43 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
     format_table_content(f, [pfun(d) for d in t_data], 8)
     f.write("\n    ];\n\n")
 
+def emit_strtab_table(f, name, keys, vfun, is_pub=True,
+                      tab_entry_type='char', slice_element_sfun=escape_char):
+    pub_string = ""
+    if is_pub:
+        pub_string = "pub "
+    f.write("    %s const %s: &'static [(char, Slice)] = &[\n"
+            % (pub_string, name))
+
+    strtab = collections.OrderedDict()
+    strtab_offset = 0
+
+    # TODO: a more sophisticated algorithm here would not only check for the
+    # existence of v in the strtab, but also v in contiguous substrings of
+    # strtab, if that's possible.
+    for k in keys:
+        v = tuple(vfun(k))
+        if v in strtab:
+            item_slice = strtab[v]
+        else:
+            value_len = len(v)
+            item_slice = (strtab_offset, value_len)
+            strtab[v] = item_slice
+            strtab_offset += value_len
+
+        f.write("%s(%s, Slice { offset: %d, length: %d }),\n"
+                % (" "*8, escape_char(k), item_slice[0], item_slice[1]))
+
+    f.write("\n    ];\n\n")
+
+    f.write("    %s const %s_STRTAB: &'static [%s] = &[\n"
+            % (pub_string, name, tab_entry_type))
+
+    for (v, _) in strtab.iteritems():
+        f.write("%s%s,\n" % (" "*8, ', '.join(slice_element_sfun(c) for c in v)))
+
+    f.write("\n    ];\n\n")
+
 def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mark):
     canon_keys = canon.keys()
     canon_keys.sort()
@@ -234,35 +271,38 @@ def emit_norm_module(f, canon, compat, combine, norm_props, general_category_mar
     canon_comp_keys.sort()
 
     f.write("pub mod normalization {\n")
+    f.write("""
+pub struct Slice {
+    pub offset: u16,
+    pub length: u16,
+}
+""")
 
     def mkdata_fun(table):
         def f(char):
-            data = "(%s,&[" % escape_char(char)
-            chars = [escape_char(d) for d in table[char]]
-            data += ','.join(chars)
-            data += "])"
-            return data
+            return table[char]
         return f
 
+    # TODO: should the strtab of these two tables be of type &'static str, for
+    # smaller data?
     f.write("    // Canonical decompositions\n")
-    emit_table(f, "canonical_table", canon_keys, "&'static [(char, &'static [char])]",
-        pfun=mkdata_fun(canon))
+    emit_strtab_table(f, "canonical_table", canon_keys,
+        vfun=mkdata_fun(canon))
 
     f.write("    // Compatibility decompositions\n")
-    emit_table(f, "compatibility_table", compat_keys, "&'static [(char, &'static [char])]",
-        pfun=mkdata_fun(compat))
+    emit_strtab_table(f, "compatibility_table", compat_keys,
+        vfun=mkdata_fun(compat))
 
-    def comp_pfun(char):
-        data = "(%s,&[" % escape_char(char)
-        canon_comp[char].sort(lambda x, y: x[0] - y[0])
-        data += ','.join("(%s,%s)" % (escape_char(pair[0]), escape_char(pair[1]))
-                         for pair in canon_comp[char])
-        data += "])"
-        return data
+    def comp_vfun(char):
+        return sorted(canon_comp[char], lambda x, y: x[0] - y[0])
 
     f.write("    // Canonical compositions\n")
-    emit_table(f, "composition_table", canon_comp_keys,
-        "&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)
+    # "&'static [(char, &'static [(char, char)])]", pfun=comp_pfun)
+    emit_strtab_table(f, "composition_table", canon_comp_keys,
+        vfun=comp_vfun,
+        tab_entry_type="(char, char)",
+        slice_element_sfun=lambda pair: "(%s,%s)" % (escape_char(pair[0]),
+                                                     escape_char(pair[1])))
 
     f.write("""
 fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
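To make the generator change concrete, the Rust that emit_strtab_table writes for the canonical table has roughly the following shape. The entries below are illustrative, hand-picked decompositions, not an excerpt of the generated file; the const names and the Slice struct come from the generator code above:

    pub struct Slice {
        pub offset: u16,
        pub length: u16,
    }

    // Main table: one small Slice per character, pointing into the shared
    // strtab below. Characters with identical decompositions share one range.
    pub const canonical_table: &'static [(char, Slice)] = &[
        ('\u{c0}', Slice { offset: 0, length: 2 }),   // À
        ('\u{c1}', Slice { offset: 2, length: 2 }),   // Á
    ];

    // Side array holding the actual decomposition data.
    pub const canonical_table_STRTAB: &'static [char] = &[
        '\u{41}', '\u{300}',   // 'A' + combining grave accent
        '\u{41}', '\u{301}',   // 'A' + combining acute accent
    ];

Because the strtab is built with an OrderedDict keyed on the decomposition tuple, repeated decompositions are emitted only once and their entries reuse the same offset/length.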

src/normalize.rs

Lines changed: 12 additions & 7 deletions
@@ -12,17 +12,22 @@
 
 use std::cmp::Ordering::{Equal, Less, Greater};
 use std::ops::FnMut;
-use tables::normalization::{canonical_table, compatibility_table, composition_table};
+use tables::normalization::{canonical_table, canonical_table_STRTAB};
+use tables::normalization::{compatibility_table, compatibility_table_STRTAB};
+use tables::normalization::{composition_table, composition_table_STRTAB};
+use tables::normalization::Slice;
 
-fn bsearch_table<T>(c: char, r: &'static [(char, &'static [T])]) -> Option<&'static [T]> {
+fn bsearch_table<T>(c: char, r: &'static [(char, Slice)], strtab: &'static [T]) -> Option<&'static [T]> {
     match r.binary_search_by(|&(val, _)| {
         if c == val { Equal }
         else if val < c { Less }
         else { Greater }
     }) {
         Ok(idx) => {
-            let (_, result) = r[idx];
-            Some(result)
+            let ref slice = r[idx].1;
+            let offset = slice.offset as usize;
+            let length = slice.length as usize;
+            Some(&strtab[offset..(offset + length)])
         }
         Err(_) => None
     }
@@ -50,7 +55,7 @@ fn d<F>(c: char, i: &mut F, k: bool) where F: FnMut(char) {
     }
 
     // First check the canonical decompositions
-    match bsearch_table(c, canonical_table) {
+    match bsearch_table(c, canonical_table, canonical_table_STRTAB) {
         Some(canon) => {
             for x in canon {
                 d(*x, i, k);
@@ -64,7 +69,7 @@ fn d<F>(c: char, i: &mut F, k: bool) where F: FnMut(char) {
     if !k { (*i)(c); return; }
 
     // Then check the compatibility decompositions
-    match bsearch_table(c, compatibility_table) {
+    match bsearch_table(c, compatibility_table, compatibility_table_STRTAB) {
         Some(compat) => {
             for x in compat {
                 d(*x, i, k);
@@ -83,7 +88,7 @@ fn d<F>(c: char, i: &mut F, k: bool) where F: FnMut(char) {
 /// for more information.
 pub fn compose(a: char, b: char) -> Option<char> {
     compose_hangul(a, b).or_else(|| {
-        match bsearch_table(a, composition_table) {
+        match bsearch_table(a, composition_table, composition_table_STRTAB) {
             None => None,
             Some(candidates) => {
                 match candidates.binary_search_by(|&(val, _)| {
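As a sanity check of the two-step lookup the new bsearch_table performs (binary-search the main table by character, then slice the shared strtab by the stored offset and length), here is a standalone sketch with toy data. It is not code from the commit: the table contents are invented, and it uses char::cmp for brevity instead of the explicit Equal/Less/Greater match in the diff:

    pub struct Slice {
        pub offset: u16,
        pub length: u16,
    }

    // Toy tables with the same shape as the generated ones.
    const TABLE: &'static [(char, Slice)] = &[
        ('a', Slice { offset: 0, length: 2 }),
        ('b', Slice { offset: 2, length: 1 }),
    ];
    const STRTAB: &'static [char] = &['x', 'y', 'z'];

    fn lookup(c: char) -> Option<&'static [char]> {
        // Binary search over the sorted main table, then resolve the Slice
        // against the side array, mirroring the new bsearch_table.
        TABLE.binary_search_by(|&(val, _)| val.cmp(&c)).ok().map(|idx| {
            let slice = &TABLE[idx].1;
            let offset = slice.offset as usize;
            let length = slice.length as usize;
            &STRTAB[offset..(offset + length)]
        })
    }

    fn main() {
        assert_eq!(lookup('a'), Some(&['x', 'y'][..]));
        assert_eq!(lookup('c'), None);
    }

Callers only ever see a plain &[T], which is why the change stays internal to the library.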
