@@ -409,3 +409,92 @@ fn chapter_settings_priority() {
         );
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_tokenize_basic() {
+        assert_eq!(tokenize("hello world"), vec!["hello", "world"]);
+    }
+
+    #[test]
+    fn test_tokenize_with_hyphens() {
+        assert_eq!(
+            tokenize("hello-world test-case"),
+            vec!["hello", "world", "test", "case"]
+        );
+    }
+
+    #[test]
+    fn test_tokenize_mixed_whitespace() {
+        assert_eq!(
+            tokenize("hello\t world\n test\r \n case"),
+            vec!["hello", "world", "test", "case"]
+        );
+    }
+
+    #[test]
+    fn test_tokenize_empty_string() {
+        assert_eq!(tokenize(""), Vec::<String>::new());
+    }
+
+    #[test]
+    fn test_tokenize_only_whitespace() {
+        assert_eq!(tokenize(" \t \n "), Vec::<String>::new());
+    }
+
+    #[test]
+    fn test_tokenize_case_normalization() {
+        assert_eq!(tokenize("Hello WORLD Test"), vec!["hello", "world", "test"]);
+    }
+
+    #[test]
+    fn test_tokenize_trim_whitespace() {
+        assert_eq!(tokenize(" hello world "), vec!["hello", "world"]);
+    }
+
+    #[test]
+    fn test_tokenize_long_words_filtered() {
+        let long_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX + 1);
+        let short_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX);
+        let input = format!("{} hello {}", long_word, short_word);
+        assert_eq!(tokenize(&input), vec!["hello", &short_word]);
+    }
+
+    #[test]
+    fn test_tokenize_max_length_word() {
+        let max_word = "a".repeat(MAX_WORD_LENGTH_TO_INDEX);
+        assert_eq!(tokenize(&max_word), vec![max_word]);
+    }
+
+    #[test]
+    fn test_tokenize_special_characters() {
+        assert_eq!(
+            tokenize("hello,world.test!case?"),
+            vec!["hello,world.test!case?"]
+        );
+    }
+
+    #[test]
+    fn test_tokenize_unicode() {
+        assert_eq!(
+            tokenize("café naïve résumé"),
+            vec!["café", "naïve", "résumé"]
+        );
+    }
+
+    #[test]
+    fn test_tokenize_unicode_rtl_hebrew() {
+        assert_eq!(tokenize("שלום עולם"), vec!["שלום", "עולם"]);
+    }
+
+    #[test]
+    fn test_tokenize_numbers() {
+        assert_eq!(
+            tokenize("test123 456-789 hello"),
+            vec!["test123", "456", "789", "hello"]
+        );
+    }
+}
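
The tests reference a `tokenize` function and a `MAX_WORD_LENGTH_TO_INDEX` constant defined elsewhere in the file (not shown in this hunk). As a minimal sketch of the behavior the tests pin down: split on whitespace and hyphens, lowercase, and drop empty or over-long tokens. The constant's value of 50 is an assumption, and whether the length cap counts bytes or chars cannot be inferred from the ASCII-only tests.

```rust
// Hypothetical reconstruction, not the file's actual implementation.
// MAX_WORD_LENGTH_TO_INDEX's real value is not shown in this diff; 50 is assumed.
const MAX_WORD_LENGTH_TO_INDEX: usize = 50;

// Split on whitespace and hyphens, lowercase each token, and drop empty
// tokens and tokens longer than MAX_WORD_LENGTH_TO_INDEX. Other punctuation
// (",", ".", "!", "?") stays inside tokens, matching
// test_tokenize_special_characters.
fn tokenize(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .map(str::to_lowercase)
        // chars().count() is assumed here; the tests only use ASCII "a",
        // so a byte-based cap would pass them equally well.
        .filter(|w| !w.is_empty() && w.chars().count() <= MAX_WORD_LENGTH_TO_INDEX)
        .collect()
}
```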