Skip to content

Commit 2e432d2

Browse files
committed
Move code out of tables
The code has been moved out of the tables module into perfect_hash, and there is a bit more explanation in comments.
1 parent f64d47c commit 2e432d2

File tree

7 files changed

+15749
-15760
lines changed

7 files changed

+15749
-15760
lines changed

scripts/unicode.py

Lines changed: 42 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -286,67 +286,46 @@ def _compute_stream_safe_tables(self):
286286

287287
hexify = lambda c: '{:04X}'.format(c)
288288

289-
def gen_combining_class(combining_classes, out):
290-
(salt, keys) = minimal_perfect_hash(combining_classes)
291-
out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
292-
out.write(" mph_lookup(c.into(), &[\n")
289+
def gen_mph_data(name, d, kv_type, kv_callback):
290+
(salt, keys) = minimal_perfect_hash(d)
291+
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
293292
for s in salt:
294-
out.write(" 0x{:x},\n".format(s))
295-
out.write(" ],\n")
296-
out.write(" &[\n")
293+
out.write(" 0x{:x},\n".format(s))
294+
out.write("];\n")
295+
out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
297296
for k in keys:
298-
kv = int(combining_classes[k]) | (k << 8)
299-
out.write(" 0x{:x},\n".format(kv))
300-
out.write(" ],\n")
301-
out.write(" u8_lookup_fk, u8_lookup_fv, 0)\n")
302-
out.write("}\n")
297+
out.write(" {},\n".format(kv_callback(k)))
298+
out.write("];\n\n")
299+
300+
def gen_combining_class(combining_classes, out):
301+
gen_mph_data('canonical_combining_class', combining_classes, 'u32',
302+
lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
303303

304304
def gen_composition_table(canon_comp, out):
305305
table = {}
306306
for (c1, c2), c3 in canon_comp.items():
307307
if c1 < 0x10000 and c2 < 0x10000:
308308
table[(c1 << 16) | c2] = c3
309309
(salt, keys) = minimal_perfect_hash(table)
310-
out.write("pub fn composition_table(c1: char, c2: char) -> Option<char> {\n")
311-
out.write(" if c1 < '\\u{10000}' && c2 < '\\u{10000}' {\n")
312-
out.write(" mph_lookup((c1 as u32) << 16 | (c2 as u32), &[\n")
313-
for s in salt:
314-
out.write(" 0x{:x},\n".format(s))
315-
out.write(" ],\n")
316-
out.write(" &[\n")
317-
for k in keys:
318-
out.write(" (0x%s, '\\u{%s}'),\n" % (hexify(k), hexify(table[k])))
319-
out.write(" ],\n")
320-
out.write(" pair_lookup_fk, pair_lookup_fv_opt, None)\n")
321-
out.write(" } else {\n")
322-
out.write(" match (c1, c2) {\n")
310+
gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
311+
lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
323312

313+
out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
314+
out.write(" match (c1, c2) {\n")
324315
for (c1, c2), c3 in sorted(canon_comp.items()):
325316
if c1 >= 0x10000 and c2 >= 0x10000:
326-
out.write(" ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
317+
out.write(" ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
327318

328-
out.write(" _ => None,\n")
329-
out.write(" }\n")
319+
out.write(" _ => None,\n")
330320
out.write(" }\n")
331321
out.write("}\n")
332322

333323
def gen_decomposition_tables(canon_decomp, compat_decomp, out):
334324
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
335325
for table, name in tables:
336-
(salt, keys) = minimal_perfect_hash(table)
337-
out.write("const {}_DECOMPOSED_KV: &[(u32, &'static [char])] = &[\n".format(name.upper()))
338-
for char in keys:
339-
d = ", ".join("'\\u{%s}'" % hexify(c) for c in table[char])
340-
out.write(" (0x{:x}, &[{}]),\n".format(char, d))
341-
out.write("];\n")
342-
out.write("pub fn %s_fully_decomposed(c: char) -> Option<&'static [char]> {\n" % name)
343-
out.write(" mph_lookup(c.into(), &[\n")
344-
for s in salt:
345-
out.write(" 0x{:x},\n".format(s))
346-
out.write(" ],\n")
347-
out.write(" {}_DECOMPOSED_KV,\n".format(name.upper()))
348-
out.write(" pair_lookup_fk, pair_lookup_fv_opt, None)\n")
349-
out.write("}\n")
326+
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
327+
lambda k: "(0x{:x}, &[{}])".format(k,
328+
", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))
350329

351330
def gen_qc_match(prop_table, out):
352331
out.write(" match c {\n")
@@ -388,18 +367,8 @@ def gen_nfkd_qc(prop_tables, out):
388367
out.write("}\n")
389368

390369
def gen_combining_mark(general_category_mark, out):
391-
(salt, keys) = minimal_perfect_hash(general_category_mark)
392-
out.write("pub fn is_combining_mark(c: char) -> bool {\n")
393-
out.write(" mph_lookup(c.into(), &[\n")
394-
for s in salt:
395-
out.write(" 0x{:x},\n".format(s))
396-
out.write(" ],\n")
397-
out.write(" &[\n")
398-
for k in keys:
399-
out.write(" 0x{:x},\n".format(k))
400-
out.write(" ],\n")
401-
out.write(" bool_lookup_fk, bool_lookup_fv, false)\n")
402-
out.write("}\n")
370+
gen_mph_data('combining_mark', general_category_mark, 'u32',
371+
lambda k: '0x{:04x}'.format(k))
403372

404373
def gen_stream_safe(leading, trailing, out):
405374
# This could be done as a hash but the table is very small.
@@ -415,19 +384,8 @@ def gen_stream_safe(leading, trailing, out):
415384
out.write("}\n")
416385
out.write("\n")
417386

418-
(salt, keys) = minimal_perfect_hash(trailing)
419-
out.write("pub fn stream_safe_trailing_nonstarters(c: char) -> usize {\n")
420-
out.write(" mph_lookup(c.into(), &[\n")
421-
for s in salt:
422-
out.write(" 0x{:x},\n".format(s))
423-
out.write(" ],\n")
424-
out.write(" &[\n")
425-
for k in keys:
426-
kv = int(trailing[k]) | (k << 8)
427-
out.write(" 0x{:x},\n".format(kv))
428-
out.write(" ],\n")
429-
out.write(" u8_lookup_fk, u8_lookup_fv, 0) as usize\n")
430-
out.write("}\n")
387+
gen_mph_data('trailing_nonstarters', trailing, 'u32',
388+
lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
431389

432390
def gen_tests(tests, out):
433391
out.write("""#[derive(Debug)]
@@ -463,7 +421,7 @@ def my_hash(x, salt, n):
463421
return (y * n) >> 32
464422

465423
# Compute minimal perfect hash function, d can be either a dict or list of keys.
466-
def minimal_perfect_hash(d, singleton_buckets = False):
424+
def minimal_perfect_hash(d):
467425
n = len(d)
468426
buckets = dict((h, []) for h in range(n))
469427
for key in d:
@@ -475,18 +433,16 @@ def minimal_perfect_hash(d, singleton_buckets = False):
475433
salts = [0] * n
476434
keys = [0] * n
477435
for (bucket_size, h) in bsorted:
436+
# Note: the traditional perfect hashing approach would also special-case
437+
# bucket_size == 1 here and assign any empty slot, rather than iterating
438+
# until rehash finds an empty slot. But we're not doing that so we can
439+
# avoid the branch.
478440
if bucket_size == 0:
479441
break
480-
elif singleton_buckets and bucket_size == 1:
481-
for i in range(n):
482-
if not claimed[i]:
483-
salts[h] = -(i + 1)
484-
claimed[i] = True
485-
keys[i] = buckets[h][0]
486-
break
487442
else:
488443
for salt in range(1, 32768):
489444
rehashes = [my_hash(key, salt, n) for key in buckets[h]]
445+
# Make sure there are no rehash collisions within this bucket.
490446
if all(not claimed[hash] for hash in rehashes):
491447
if len(set(rehashes)) < bucket_size:
492448
continue
@@ -498,6 +454,17 @@ def minimal_perfect_hash(d, singleton_buckets = False):
498454
break
499455
if salts[h] == 0:
500456
print("minimal perfect hashing failed")
457+
# Note: if this happens (because of unfortunate data), then there are
458+
# a few things that could be done. First, the hash function could be
459+
# tweaked. Second, the bucket order could be scrambled (especially the
460+
# singletons). Right now, the buckets are sorted, which has the advantage
461+
# of being deterministic.
462+
#
463+
# As a more extreme approach, the singleton bucket optimization could be
464+
# applied (give the direct address for singleton buckets, rather than
465+
# relying on a rehash). That is definitely the more standard approach in
466+
# the minimal perfect hashing literature, but in testing the branch was a
467+
# significant slowdown.
501468
exit(1)
502469
return (salts, keys)
503470

@@ -508,8 +475,6 @@ def minimal_perfect_hash(d, singleton_buckets = False):
508475
out.write("use quick_check::IsNormalized;\n")
509476
out.write("use quick_check::IsNormalized::*;\n")
510477
out.write("\n")
511-
out.write("use perfect_hash::*;\n")
512-
out.write("\n")
513478

514479
version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
515480
out.write("#[allow(unused)]\n")

src/lib.rs

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -81,11 +81,7 @@ mod normalization_tests;
8181
pub mod char {
8282
pub use normalize::{decompose_canonical, decompose_compatible, compose};
8383

84-
/// Look up the canonical combining class of a character.
85-
pub use tables::canonical_combining_class;
86-
87-
/// Return whether the given character is a combining mark (`General_Category=Mark`)
88-
pub use tables::is_combining_mark;
84+
pub use perfect_hash::{canonical_combining_class, is_combining_mark};
8985
}
9086

9187

src/normalize.rs

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -11,23 +11,23 @@
1111
//! Functions for computing canonical and compatible decompositions for Unicode characters.
1212
use std::char;
1313
use std::ops::FnMut;
14-
use tables;
14+
use perfect_hash::{canonical_fully_decomposed, composition_table, compatibility_fully_decomposed};
1515

1616
/// Compute canonical Unicode decomposition for character.
1717
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
1818
/// for more information.
1919
#[inline]
2020
pub fn decompose_canonical<F>(c: char, emit_char: F) where F: FnMut(char) {
21-
decompose(c, tables::canonical_fully_decomposed, emit_char)
21+
decompose(c, canonical_fully_decomposed, emit_char)
2222
}
2323

2424
/// Compute canonical or compatible Unicode decomposition for character.
2525
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
2626
/// for more information.
2727
#[inline]
2828
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
29-
let decompose_char = |c| tables::compatibility_fully_decomposed(c)
30-
.or_else(|| tables::canonical_fully_decomposed(c));
29+
let decompose_char = |c| compatibility_fully_decomposed(c)
30+
.or_else(|| canonical_fully_decomposed(c));
3131
decompose(c, decompose_char, emit_char)
3232
}
3333

@@ -62,7 +62,7 @@ fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
6262
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
6363
/// for more information.
6464
pub fn compose(a: char, b: char) -> Option<char> {
65-
compose_hangul(a, b).or_else(|| tables::composition_table(a, b))
65+
compose_hangul(a, b).or_else(|| composition_table(a, b))
6666
}
6767

6868
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior

src/perfect_hash.rs

Lines changed: 47 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,8 @@
1010

1111
//! Support for lookups based on minimal perfect hashing.
1212
13+
use tables::*;
14+
1315
// This function is based on multiplication being fast and is "good enough". Also
1416
// it can share some work between the unsalted and salted versions.
1517
#[inline]
@@ -27,7 +29,7 @@ fn my_hash(key: u32, salt: u32, n: usize) -> usize {
2729
/// The hash function doesn't have to be very good, just good enough that the
2830
/// resulting map is unique.
2931
#[inline]
30-
pub(crate) fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
32+
fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
3133
default: V) -> V
3234
where KV: Copy, FK: Fn(KV) -> u32, FV: Fn(KV) -> V
3335
{
@@ -42,37 +44,75 @@ pub(crate) fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK,
4244

4345
/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
4446
#[inline]
45-
pub(crate) fn u8_lookup_fk(kv: u32) -> u32 {
47+
fn u8_lookup_fk(kv: u32) -> u32 {
4648
kv >> 8
4749
}
4850

4951
/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
5052
#[inline]
51-
pub(crate) fn u8_lookup_fv(kv: u32) -> u8 {
53+
fn u8_lookup_fv(kv: u32) -> u8 {
5254
(kv & 0xff) as u8
5355
}
5456

5557
/// Extract the key for a boolean lookup.
5658
#[inline]
57-
pub(crate) fn bool_lookup_fk(kv: u32) -> u32 {
59+
fn bool_lookup_fk(kv: u32) -> u32 {
5860
kv
5961
}
6062

6163
/// Extract the value for a boolean lookup.
6264
#[inline]
63-
pub(crate) fn bool_lookup_fv(_kv: u32) -> bool {
65+
fn bool_lookup_fv(_kv: u32) -> bool {
6466
true
6567
}
6668

6769
/// Extract the key in a pair.
6870
#[inline]
69-
pub(crate) fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
71+
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
7072
kv.0
7173
}
7274

7375
/// Extract the value in a pair, returning an option.
7476
#[inline]
75-
pub(crate) fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
77+
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
7678
Some(kv.1)
7779
}
7880

81+
/// Look up the canonical combining class for a codepoint.
82+
///
83+
/// The value returned is as defined in the Unicode Character Database.
84+
pub fn canonical_combining_class(c: char) -> u8 {
85+
mph_lookup(c.into(), CANONICAL_COMBINING_CLASS_SALT, CANONICAL_COMBINING_CLASS_KV,
86+
u8_lookup_fk, u8_lookup_fv, 0)
87+
}
88+
89+
pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
90+
if c1 < '\u{10000}' && c2 < '\u{10000}' {
91+
mph_lookup((c1 as u32) << 16 | (c2 as u32),
92+
COMPOSITION_TABLE_SALT, COMPOSITION_TABLE_KV,
93+
pair_lookup_fk, pair_lookup_fv_opt, None)
94+
} else {
95+
composition_table_astral(c1, c2)
96+
}
97+
}
98+
99+
pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
100+
mph_lookup(c.into(), CANONICAL_DECOMPOSED_SALT, CANONICAL_DECOMPOSED_KV,
101+
pair_lookup_fk, pair_lookup_fv_opt, None)
102+
}
103+
104+
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
105+
mph_lookup(c.into(), COMPATIBILITY_DECOMPOSED_SALT, COMPATIBILITY_DECOMPOSED_KV,
106+
pair_lookup_fk, pair_lookup_fv_opt, None)
107+
}
108+
109+
/// Return whether the given character is a combining mark (`General_Category=Mark`)
110+
pub fn is_combining_mark(c: char) -> bool {
111+
mph_lookup(c.into(), COMBINING_MARK_SALT, COMBINING_MARK_KV,
112+
bool_lookup_fk, bool_lookup_fv, false)
113+
}
114+
115+
pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
116+
mph_lookup(c.into(), TRAILING_NONSTARTERS_SALT, TRAILING_NONSTARTERS_KV,
117+
u8_lookup_fk, u8_lookup_fv, 0) as usize
118+
}

src/quick_check.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
use UnicodeNormalization;
2+
use perfect_hash::canonical_combining_class;
23
use stream_safe;
34
use tables;
45

@@ -32,7 +33,7 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
3233
}
3334

3435
// Otherwise, lookup the combining class and QC property
35-
let cc = tables::canonical_combining_class(ch);
36+
let cc = canonical_combining_class(ch);
3637
if last_cc > cc && cc != 0 {
3738
return IsNormalized::No;
3839
}

0 commit comments

Comments (0)