Skip to content

Commit 3a4a8f6

Browse files
committed
Use minimal perfect hashing for combining class lookup
1 parent aa84f63 commit 3a4a8f6

File tree

4 files changed

+2027
-1113
lines changed

4 files changed

+2027
-1113
lines changed

scripts/unicode.py

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -288,14 +288,18 @@ def _compute_stream_safe_tables(self):
288288

289289
def gen_combining_class(combining_classes, out):
290290
out.write("#[inline]\n")
291+
(salt, keys) = minimal_perfect_hash(combining_classes)
291292
out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
292-
out.write(" match c {\n")
293-
294-
for char, combining_class in sorted(combining_classes.items()):
295-
out.write(" '\\u{%s}' => %s,\n" % (hexify(char), combining_class))
296-
297-
out.write(" _ => 0,\n")
298-
out.write(" }\n")
293+
out.write(" mph_lookup(c.into(), &[\n")
294+
for s in salt:
295+
out.write(" 0x{:x},\n".format(s))
296+
out.write(" ],\n")
297+
out.write(" &[\n")
298+
for k in keys:
299+
kv = int(combining_classes[k]) | (k << 8)
300+
out.write(" 0x{:x},\n".format(kv))
301+
out.write(" ],\n")
302+
out.write(" u8_lookup_fk, u8_lookup_fv, 0) as u8\n")
299303
out.write("}\n")
300304

301305
def gen_composition_table(canon_comp, out):
@@ -432,13 +436,61 @@ def gen_tests(tests, out):
432436

433437
out.write("];\n")
434438

439+
def my_hash(x, salt, n):
    """Hash the integer `x` with `salt` into a slot index in `range(n)`.

    The arithmetic is masked to 32 bits so that it matches the wrapping
    `u32` arithmetic of the Rust `my_hash` in `src/perfect_hash.rs`, which
    must reproduce these values at lookup time.

    x    -- integer key to hash
    salt -- integer added to the key before the first multiplication
    n    -- number of slots; the result is always less than `n`
    """
    word = 0xffffffff
    # Multiplicative mixing of the salted key ...
    salted = ((x + salt) * 2654435769) & word
    # ... combined with a second, salt-independent multiplication of the key.
    unsalted = (x * 0x31415926) & word
    mixed = salted ^ unsalted
    # `mixed` is below 2**32, so scaling by n and dropping the low 32 bits
    # yields a value in 0..n-1 without a modulo.
    return (mixed * n) >> 32
445+
446+
def minimal_perfect_hash(d, singleton_buckets = False):
    """Compute a minimal perfect hash function for the keys of `d`.

    `d` can be either a dict or a list of keys; only its length and the keys
    obtained by iterating over it are used.

    The construction is two-level. Every key is first placed in bucket
    `my_hash(key, 0, n)`. Then, for each bucket, a salt is searched such that
    `my_hash(key, salt, n)` sends all keys of the bucket to distinct slots
    that no earlier bucket has claimed.

    Returns a tuple `(salts, keys)` of two lists of length `n = len(d)`:
      salts[h] -- salt chosen for first-level bucket `h` (0 for an empty
                  bucket). When `singleton_buckets` is true, a one-key bucket
                  instead gets the negative value `-(i + 1)`, where `i` is
                  the slot assigned to its key.
      keys[i]  -- the key stored in final slot `i`.

    Terminates the process with exit status 1 if no salt in 1..32767 works
    for some bucket.
    """
    import sys

    n = len(d)
    # First level: distribute the keys with the unsalted hash.
    buckets = [[] for _ in range(n)]
    for key in d:
        buckets[my_hash(key, 0, n)].append(key)
    # Place the largest buckets first, while the slot table is still mostly
    # free; ties are ordered by descending bucket index, as before.
    order = sorted(((len(bucket), h) for h, bucket in enumerate(buckets)),
                   reverse = True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in order:
        if bucket_size == 0:
            # Sorted by size, so every remaining bucket is empty as well.
            break
        bucket = buckets[h]
        if singleton_buckets and bucket_size == 1:
            # A lone key can take the first free slot directly; the slot is
            # encoded as a negative salt.
            for i in range(n):
                if not claimed[i]:
                    salts[h] = -(i + 1)
                    claimed[i] = True
                    keys[i] = bucket[0]
                    break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in bucket]
                if any(claimed[slot] for slot in rehashes):
                    continue
                if len(set(rehashes)) < bucket_size:
                    # Two keys of this bucket collide under this salt.
                    continue
                salts[h] = salt
                # Reuse the hashes computed above instead of re-hashing.
                for key, slot in zip(bucket, rehashes):
                    claimed[slot] = True
                    keys[slot] = key
                break
            else:
                # No salt worked. sys.exit with a string writes it to stderr
                # and exits with status 1.
                sys.exit("minimal perfect hashing failed")
    return (salts, keys)
484+
435485
if __name__ == '__main__':
436486
data = UnicodeData()
437487
with open("tables.rs", "w", newline = "\n") as out:
438488
out.write(PREAMBLE)
439489
out.write("use quick_check::IsNormalized;\n")
440490
out.write("use quick_check::IsNormalized::*;\n")
441491
out.write("\n")
492+
out.write("use perfect_hash::*;\n")
493+
out.write("\n")
442494

443495
version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
444496
out.write("#[allow(unused)]\n")

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ use std::str::Chars;
6666

6767
mod decompose;
6868
mod normalize;
69+
mod perfect_hash;
6970
mod recompose;
7071
mod quick_check;
7172
mod stream_safe;

src/perfect_hash.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
//! Support for lookups based on minimal perfect hashing.
12+
13+
/// Hashes `key` with `salt` into an index in `0..n` (returns 0 when `n` is 0).
///
/// The function relies on multiplication being fast and is "good enough" for
/// table construction. The second multiplication does not involve the salt,
/// so it can be shared between the unsalted and salted evaluations.
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    const SALTED_MULTIPLIER: u32 = 2654435769;
    const KEY_MULTIPLIER: u32 = 0x31415926;

    let salted = key.wrapping_add(salt).wrapping_mul(SALTED_MULTIPLIER);
    let mixed = salted ^ key.wrapping_mul(KEY_MULTIPLIER);
    // `mixed` is below 2^32, so the high half of the 64-bit product with `n`
    // is a value below `n`; no modulo is needed.
    let scaled = u64::from(mixed) * (n as u64);
    (scaled >> 32) as usize
}
20+
21+
#[inline]
22+
pub(crate) fn mph_lookup<KV, V, FK, FV>(x: u32, salt: &[u16], kv: &[KV], fk: FK, fv: FV,
23+
default: V) -> V
24+
where KV: Copy, FK: Fn(KV) -> u32, FV: Fn(KV) -> V
25+
{
26+
let s = salt[my_hash(x, 0, salt.len())] as u32;
27+
let key_val = kv[my_hash(x, s, salt.len())];
28+
if x == fk(key_val) {
29+
fv(key_val)
30+
} else {
31+
default
32+
}
33+
}
34+
35+
/// Returns the key of an entry that packs a 24-bit key (upper bits) and an
/// 8-bit value (lowest byte) into a `u32`.
pub(crate) fn u8_lookup_fk(kv: u32) -> u32 {
    // The value occupies the lowest byte; shifting it out leaves the key.
    const VALUE_BITS: u32 = 8;
    kv >> VALUE_BITS
}
39+
40+
/// Returns the value of an entry that packs a 24-bit key (upper bits) and an
/// 8-bit value (lowest byte) into a `u32`.
///
/// The value is returned widened to `u32`; it is always in `0..=255`.
pub(crate) fn u8_lookup_fv(kv: u32) -> u32 {
    // Keep only the lowest byte, where the value is stored.
    let value_mask = u32::from(u8::MAX);
    kv & value_mask
}

0 commit comments

Comments
 (0)