@@ -288,14 +288,18 @@ def _compute_stream_safe_tables(self):
288
288
289
289
def gen_combining_class(combining_classes, out):
    """Write the Rust `canonical_combining_class(c: char) -> u8` function.

    The lookup is backed by a minimal perfect hash table (salts + packed
    key/value entries) instead of a large `match` over code points.
    """
    out.write("#[inline]\n")
    (salt, keys) = minimal_perfect_hash(combining_classes)
    out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
    out.write("    mph_lookup(c.into(), &[\n")
    out.write("".join("        0x{:x},\n".format(salt_value) for salt_value in salt))
    out.write("    ],\n")
    out.write("    &[\n")
    for key in keys:
        # Each table entry packs the code point into the high bits and
        # its combining class into the low byte: (key << 8) | class.
        packed = int(combining_classes[key]) | (key << 8)
        out.write("        0x{:x},\n".format(packed))
    out.write("    ],\n")
    out.write("    u8_lookup_fk, u8_lookup_fv, 0) as u8\n")
    out.write("}\n")
300
304
301
305
def gen_composition_table (canon_comp , out ):
@@ -432,13 +436,61 @@ def gen_tests(tests, out):
432
436
433
437
out .write ("];\n " )
434
438
439
def my_hash(x, salt, n):
    """Hash integer key *x* into a bucket index in ``[0, n)``.

    Multiplicative hashing: the first multiplier 2654435769 is Knuth's
    constant (2**32 / golden ratio), mixed with a second multiplier by
    XOR, truncated to 32 bits, then scaled into range with a
    fixed-point multiply-and-shift instead of a modulo.
    """
    MASK_32 = (1 << 32) - 1
    # (a & m) ^ (b & m) == (a ^ b) & m, so one final mask suffices.
    mixed = (((x + salt) * 2654435769) ^ (x * 0x31415926)) & MASK_32
    return (mixed * n) >> 32
446
def minimal_perfect_hash(d, singleton_buckets=False):
    """Compute a minimal perfect hash function over the keys of *d*.

    *d* may be a dict or any sized iterable of integer keys.  Returns a
    pair ``(salts, keys)`` with ``n = len(d)`` entries each.  For a key
    ``k``, first-level hashing gives ``h = my_hash(k, 0, n)``; then
    ``salts[h] > 0`` means the slot is ``my_hash(k, salts[h], n)``, while
    (only when *singleton_buckets* is true) a negative salt ``-(slot+1)``
    encodes the slot of a one-element bucket directly.  ``keys[slot]``
    holds the key stored at each slot.

    Exits the process (status 1) if no salt below 32768 resolves a bucket.
    """
    n = len(d)
    # First level: partition the keys into n buckets using salt 0.
    buckets = {h: [] for h in range(n)}
    for key in d:
        buckets[my_hash(key, 0, n)].append(key)
    # Place the largest buckets first, while free slots are plentiful.
    bsorted = sorted(((len(buckets[h]), h) for h in range(n)), reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for bucket_size, h in bsorted:
        if bucket_size == 0:
            break  # buckets are sorted by size, so the rest are empty too
        if singleton_buckets and bucket_size == 1:
            # One-element bucket: claim the first free slot and encode it
            # directly as a negative salt, skipping the salt search.
            for slot in range(n):
                if not claimed[slot]:
                    salts[h] = -(slot + 1)
                    claimed[slot] = True
                    keys[slot] = buckets[h][0]
                    break
            continue
        # Search for a salt that maps every key of this bucket onto a
        # distinct, still-unclaimed slot.  (`slot`, not `hash`: the
        # original shadowed the `hash` builtin here.)
        for salt in range(1, 32768):
            rehashes = [my_hash(key, salt, n) for key in buckets[h]]
            if all(not claimed[slot] for slot in rehashes):
                if len(set(rehashes)) < bucket_size:
                    continue  # keys collided with each other; try next salt
                salts[h] = salt
                for key in buckets[h]:
                    slot = my_hash(key, salt, n)
                    claimed[slot] = True
                    keys[slot] = key
                break
        if salts[h] == 0:
            # No workable salt found; abort the generator script.
            print("minimal perfect hashing failed")
            exit(1)
    return (salts, keys)
484
+
435
485
if __name__ == '__main__' :
436
486
data = UnicodeData ()
437
487
with open ("tables.rs" , "w" , newline = "\n " ) as out :
438
488
out .write (PREAMBLE )
439
489
out .write ("use quick_check::IsNormalized;\n " )
440
490
out .write ("use quick_check::IsNormalized::*;\n " )
441
491
out .write ("\n " )
492
+ out .write ("use perfect_hash::*;\n " )
493
+ out .write ("\n " )
442
494
443
495
version = "(%s, %s, %s)" % tuple (UNICODE_VERSION .split ("." ))
444
496
out .write ("#[allow(unused)]\n " )
0 commit comments