@@ -286,67 +286,46 @@ def _compute_stream_safe_tables(self):
 
 hexify = lambda c: '{:04X}'.format(c)
 
-def gen_combining_class(combining_classes, out):
-    (salt, keys) = minimal_perfect_hash(combining_classes)
-    out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
-    out.write("    mph_lookup(c.into(), &[\n")
+def gen_mph_data(name, d, kv_type, kv_callback):
+    (salt, keys) = minimal_perfect_hash(d)
+    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
     for s in salt:
-        out.write("    0x{:x},\n".format(s))
-    out.write("    ],\n")
-    out.write("    &[\n")
+        out.write("    0x{:x},\n".format(s))
+    out.write("];\n")
+    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
     for k in keys:
-        kv = int(combining_classes[k]) | (k << 8)
-        out.write("    0x{:x},\n".format(kv))
-    out.write("    ],\n")
-    out.write("    u8_lookup_fk, u8_lookup_fv, 0)\n")
-    out.write("}\n")
+        out.write("    {},\n".format(kv_callback(k)))
+    out.write("];\n\n")
+
+def gen_combining_class(combining_classes, out):
+    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
+        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
 
 def gen_composition_table(canon_comp, out):
     table = {}
     for (c1, c2), c3 in canon_comp.items():
         if c1 < 0x10000 and c2 < 0x10000:
             table[(c1 << 16) | c2] = c3
     (salt, keys) = minimal_perfect_hash(table)
-    out.write("pub fn composition_table(c1: char, c2: char) -> Option<char> {\n")
-    out.write("    if c1 < '\\u{10000}' && c2 < '\\u{10000}' {\n")
-    out.write("        mph_lookup((c1 as u32) << 16 | (c2 as u32), &[\n")
-    for s in salt:
-        out.write("    0x{:x},\n".format(s))
-    out.write("    ],\n")
-    out.write("    &[\n")
-    for k in keys:
-        out.write("    (0x%s, '\\u{%s}'),\n" % (hexify(k), hexify(table[k])))
-    out.write("    ],\n")
-    out.write("    pair_lookup_fk, pair_lookup_fv_opt, None)\n")
-    out.write("    } else {\n")
-    out.write("        match (c1, c2) {\n")
+    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
+        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
 
+    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
+    out.write("    match (c1, c2) {\n")
     for (c1, c2), c3 in sorted(canon_comp.items()):
         if c1 >= 0x10000 and c2 >= 0x10000:
-            out.write("            ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
+            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
 
-    out.write("            _ => None,\n")
-    out.write("        }\n")
+    out.write("        _ => None,\n")
     out.write("    }\n")
     out.write("}\n")
 
 def gen_decomposition_tables(canon_decomp, compat_decomp, out):
     tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
     for table, name in tables:
-        (salt, keys) = minimal_perfect_hash(table)
-        out.write("const {}_DECOMPOSED_KV: &[(u32, &'static [char])] = &[\n".format(name.upper()))
-        for char in keys:
-            d = ", ".join("'\\u{%s}'" % hexify(c) for c in table[char])
-            out.write("    (0x{:x}, &[{}]),\n".format(char, d))
-        out.write("];\n")
-        out.write("pub fn %s_fully_decomposed(c: char) -> Option<&'static [char]> {\n" % name)
-        out.write("    mph_lookup(c.into(), &[\n")
-        for s in salt:
-            out.write("    0x{:x},\n".format(s))
-        out.write("    ],\n")
-        out.write("    {}_DECOMPOSED_KV,\n".format(name.upper()))
-        out.write("    pair_lookup_fk, pair_lookup_fv_opt, None)\n")
-        out.write("}\n")
+        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
+            lambda k: "(0x{:x}, &[{}])".format(k,
+                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))
 
 def gen_qc_match(prop_table, out):
     out.write("    match c {\n")
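Aside (not part of the patch): for the byte-valued tables such as canonical_combining_class and trailing_nonstarters, each generated u32 entry packs the one-byte value into the low byte and the code point into the remaining bits, so one flat array carries both keys and values. The Rust-side helpers named in the removed generator code (u8_lookup_fk, u8_lookup_fv) presumably split an entry back apart. A minimal Python sketch of that packing, assuming exactly the value | (key << 8) layout used by the lambdas above:

def pack(codepoint, value):
    # low byte: the u8 value; upper bits: the code point (same as the lambdas above)
    assert 0 <= value <= 0xFF
    return (codepoint << 8) | value

def unpack(kv):
    # inverse split, as the generated Rust lookup presumably performs it
    return kv >> 8, kv & 0xFF

# U+0300 COMBINING GRAVE ACCENT has canonical combining class 230
assert unpack(pack(0x300, 230)) == (0x300, 230)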
@@ -388,18 +367,8 @@ def gen_nfkd_qc(prop_tables, out):
     out.write("}\n")
 
 def gen_combining_mark(general_category_mark, out):
-    (salt, keys) = minimal_perfect_hash(general_category_mark)
-    out.write("pub fn is_combining_mark(c: char) -> bool {\n")
-    out.write("    mph_lookup(c.into(), &[\n")
-    for s in salt:
-        out.write("    0x{:x},\n".format(s))
-    out.write("    ],\n")
-    out.write("    &[\n")
-    for k in keys:
-        out.write("    0x{:x},\n".format(k))
-    out.write("    ],\n")
-    out.write("    bool_lookup_fk, bool_lookup_fv, false)\n")
-    out.write("}\n")
+    gen_mph_data('combining_mark', general_category_mark, 'u32',
+        lambda k: '0x{:04x}'.format(k))
 
 def gen_stream_safe(leading, trailing, out):
     # This could be done as a hash but the table is very small.
@@ -415,19 +384,8 @@ def gen_stream_safe(leading, trailing, out):
     out.write("}\n")
     out.write("\n")
 
-    (salt, keys) = minimal_perfect_hash(trailing)
-    out.write("pub fn stream_safe_trailing_nonstarters(c: char) -> usize {\n")
-    out.write("    mph_lookup(c.into(), &[\n")
-    for s in salt:
-        out.write("    0x{:x},\n".format(s))
-    out.write("    ],\n")
-    out.write("    &[\n")
-    for k in keys:
-        kv = int(trailing[k]) | (k << 8)
-        out.write("    0x{:x},\n".format(kv))
-    out.write("    ],\n")
-    out.write("    u8_lookup_fk, u8_lookup_fv, 0) as usize\n")
-    out.write("}\n")
+    gen_mph_data('trailing_nonstarters', trailing, 'u32',
+        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
 
 def gen_tests(tests, out):
     out.write("""#[derive(Debug)]
@@ -463,7 +421,7 @@ def my_hash(x, salt, n):
     return (y * n) >> 32
 
 # Compute minimal perfect hash function, d can be either a dict or list of keys.
-def minimal_perfect_hash(d, singleton_buckets = False):
+def minimal_perfect_hash(d):
     n = len(d)
     buckets = dict((h, []) for h in range(n))
     for key in d:
@@ -475,18 +433,16 @@ def minimal_perfect_hash(d, singleton_buckets = False):
     salts = [0] * n
     keys = [0] * n
     for (bucket_size, h) in bsorted:
+        # Note: the traditional perfect hashing approach would also special-case
+        # bucket_size == 1 here and assign any empty slot, rather than iterating
+        # until rehash finds an empty slot. But we're not doing that so we can
+        # avoid the branch.
         if bucket_size == 0:
             break
-        elif singleton_buckets and bucket_size == 1:
-            for i in range(n):
-                if not claimed[i]:
-                    salts[h] = -(i + 1)
-                    claimed[i] = True
-                    keys[i] = buckets[h][0]
-                    break
         else:
             for salt in range(1, 32768):
                 rehashes = [my_hash(key, salt, n) for key in buckets[h]]
+                # Make sure there are no rehash collisions within this bucket.
                 if all(not claimed[hash] for hash in rehashes):
                     if len(set(rehashes)) < bucket_size:
                         continue
@@ -498,6 +454,17 @@ def minimal_perfect_hash(d, singleton_buckets = False):
                     break
         if salts[h] == 0:
             print("minimal perfect hashing failed")
+            # Note: if this happens (because of unfortunate data), then there are
+            # a few things that could be done. First, the hash function could be
+            # tweaked. Second, the bucket order could be scrambled (especially the
+            # singletons). Right now, the buckets are sorted, which has the advantage
+            # of being deterministic.
+            #
+            # As a more extreme approach, the singleton bucket optimization could be
+            # applied (give the direct address for singleton buckets, rather than
+            # relying on a rehash). That is definitely the more standard approach in
+            # the minimal perfect hashing literature, but in testing the branch was a
+            # significant slowdown.
             exit(1)
     return (salts, keys)
 
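For orientation, a rough sketch of how the (salts, keys) pair produced here would be consumed. This mirrors what the mph_lookup helper in the previously generated Rust presumably does, and it is an assumption rather than part of this change: hash with salt 0 to pick a bucket, read that bucket's salt, rehash with it to find the slot, then compare the stored key. With the singleton optimization removed, salts are always positive, so the lookup side needs no sign check; that is the branch the new comment reports as a slowdown.

def mph_lookup_py(x, salts, keys):
    # my_hash is the function defined earlier in this script
    n = len(salts)
    salt = salts[my_hash(x, 0, n)]    # first-level hash (salt 0) picks x's bucket
    idx = my_hash(x, salt, n)         # that bucket's salt rehashes x to its final slot
    return idx if keys[idx] == x else None   # miss unless the stored key really is x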
@@ -508,8 +475,6 @@ def minimal_perfect_hash(d, singleton_buckets = False):
508
475
out .write ("use quick_check::IsNormalized;\n " )
509
476
out .write ("use quick_check::IsNormalized::*;\n " )
510
477
out .write ("\n " )
511
- out .write ("use perfect_hash::*;\n " )
512
- out .write ("\n " )
513
478
514
479
version = "(%s, %s, %s)" % tuple (UNICODE_VERSION .split ("." ))
515
480
out .write ("#[allow(unused)]\n " )