1
1
use UnicodeNormalization ;
2
+ use stream_safe;
2
3
use tables;
3
4
4
5
/// The QuickCheck algorithm can quickly determine if a text is or isn't
5
6
/// normalized without any allocations in many cases, but it has to be able to
6
7
/// return `Maybe` when a full decomposition and recomposition is necessary.
8
+ #[ derive( Debug , Eq , PartialEq ) ]
7
9
pub enum IsNormalized {
8
10
/// The text is definitely normalized.
9
11
Yes ,
@@ -15,17 +17,20 @@ pub enum IsNormalized {
15
17
16
18
// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
17
19
#[ inline]
18
- fn quick_check < F , I > ( s : I , is_allowed : F ) -> IsNormalized
20
+ fn quick_check < F , I > ( s : I , is_allowed : F , stream_safe : bool ) -> IsNormalized
19
21
where I : Iterator < Item =char > , F : Fn ( char ) -> IsNormalized
20
22
{
21
23
let mut last_cc = 0u8 ;
24
+ let mut nonstarter_count = 0 ;
22
25
let mut result = IsNormalized :: Yes ;
23
26
for ch in s {
24
27
// For ASCII we know it's always allowed and a starter
25
28
if ch <= '\x7f' {
26
29
last_cc = 0 ;
30
+ nonstarter_count = 0 ;
27
31
continue ;
28
32
}
33
+
29
34
// Otherwise, lookup the combining class and QC property
30
35
let cc = tables:: canonical_combining_class ( ch) ;
31
36
if last_cc > cc && cc != 0 {
@@ -38,6 +43,20 @@ fn quick_check<F, I>(s: I, is_allowed: F) -> IsNormalized
38
43
result = IsNormalized :: Maybe ;
39
44
} ,
40
45
}
46
+ if stream_safe {
47
+ let decomp = stream_safe:: classify_nonstarters ( ch) ;
48
+
49
+ // If we're above `MAX_NONSTARTERS`, we're definitely *not*
50
+ // stream-safe normalized.
51
+ if nonstarter_count + decomp. leading_nonstarters > stream_safe:: MAX_NONSTARTERS {
52
+ return IsNormalized :: No ;
53
+ }
54
+ if decomp. leading_nonstarters == decomp. decomposition_len {
55
+ nonstarter_count += decomp. decomposition_len ;
56
+ } else {
57
+ nonstarter_count = decomp. trailing_nonstarters ;
58
+ }
59
+ }
41
60
last_cc = cc;
42
61
}
43
62
result
@@ -48,16 +67,29 @@ fn quick_check<F, I>(s: I, is_allowed: F) -> IsNormalized
48
67
/// like `s.chars().nfc().eq(s.chars())` should suffice.
49
68
#[ inline]
50
69
pub fn is_nfc_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
51
- quick_check ( s, tables:: qc_nfc)
70
+ quick_check ( s, tables:: qc_nfc, false )
52
71
}
53
72
54
73
/// Quickly check if a string is in NFD.
55
74
#[ inline]
56
75
pub fn is_nfd_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
57
- quick_check ( s, tables:: qc_nfd)
76
+ quick_check ( s, tables:: qc_nfd, false )
77
+ }
78
+
79
+ /// Quickly check if a string is Stream-Safe NFC.
80
+ #[ inline]
81
+ pub fn is_nfc_stream_safe_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
82
+ quick_check ( s, tables:: qc_nfc, true )
83
+ }
84
+
85
+ /// Quickly check if a string is Stream-Safe NFD.
86
+ #[ inline]
87
+ pub fn is_nfd_stream_safe_quick < I : Iterator < Item =char > > ( s : I ) -> IsNormalized {
88
+ quick_check ( s, tables:: qc_nfd, true )
58
89
}
59
90
60
91
/// Authoritatively check if a string is in NFC.
92
+ #[ inline]
61
93
pub fn is_nfc ( s : & str ) -> bool {
62
94
match is_nfc_quick ( s. chars ( ) ) {
63
95
IsNormalized :: Yes => true ,
@@ -67,10 +99,58 @@ pub fn is_nfc(s: &str) -> bool {
67
99
}
68
100
69
101
/// Authoritatively check if a string is in NFD.
102
+ #[ inline]
70
103
pub fn is_nfd ( s : & str ) -> bool {
71
104
match is_nfd_quick ( s. chars ( ) ) {
72
105
IsNormalized :: Yes => true ,
73
106
IsNormalized :: No => false ,
74
107
IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . nfd ( ) ) ,
75
108
}
76
109
}
110
+
111
+ /// Authoritatively check if a string is Stream-Safe NFC.
112
+ #[ inline]
113
+ pub fn is_nfc_stream_safe ( s : & str ) -> bool {
114
+ match is_nfc_stream_safe_quick ( s. chars ( ) ) {
115
+ IsNormalized :: Yes => true ,
116
+ IsNormalized :: No => false ,
117
+ IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . stream_safe ( ) . nfc ( ) ) ,
118
+ }
119
+ }
120
+
121
+ /// Authoritatively check if a string is Stream-Safe NFD.
122
+ #[ inline]
123
+ pub fn is_nfd_stream_safe ( s : & str ) -> bool {
124
+ match is_nfd_stream_safe_quick ( s. chars ( ) ) {
125
+ IsNormalized :: Yes => true ,
126
+ IsNormalized :: No => false ,
127
+ IsNormalized :: Maybe => s. chars ( ) . eq ( s. chars ( ) . stream_safe ( ) . nfd ( ) ) ,
128
+ }
129
+ }
130
+
131
#[cfg(test)]
mod tests {
    use super::{
        IsNormalized,
        is_nfc_stream_safe_quick,
        is_nfd_stream_safe_quick,
    };

    // NOTE: the combining marks in every fixture below must be contiguous.
    // `quick_check` resets its nonstarter counter on any ASCII character, so
    // a space between the escapes would keep the run length at one and the
    // `too_much` inputs would never be rejected. A trailing `\` inside a
    // string literal joins the lines without inserting any character.

    #[test]
    fn test_stream_safe_nfd() {
        // "Da" followed by one run of 30 combining marks, then ASCII text.
        let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\
                    \u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
                    \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
                    \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);

        // Same text with one extra mark (U+031E): a run of 31 combining marks.
        let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\
                        \u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
                        \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
                        \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }

    #[test]
    fn test_stream_safe_nfc() {
        // U+00E0 followed by one run of 29 combining marks. Presumably the
        // precomposed letter itself contributes one trailing nonstarter via
        // its decomposition (U+0061 U+0300) -- verify against
        // `stream_safe::classify_nonstarters`.
        let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\
                    \u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
                    \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
                    \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);

        // Same text with one extra mark (U+031E): a run of 30 combining marks
        // after the precomposed letter.
        let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\
                        \u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\
                        \u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\
                        \u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }
}
0 commit comments