Skip to content

Commit caff953

Browse files
authored
Merge pull request #24 from sujayakar/quick-ss
Implement Stream-Safe QuickCheck variant
2 parents b6a1fc8 + dab0c5e commit caff953

File tree

4 files changed

+136
-21
lines changed

4 files changed

+136
-21
lines changed

benches/bench.rs

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,52 +6,83 @@ extern crate test;
66
use test::Bencher;
77
use unicode_normalization::UnicodeNormalization;
88

9+
const ASCII: &'static str = "all types of normalized";
10+
const NFC: &'static str = "Introducci\u{00f3}n a Unicode.pdf";
11+
const NFD: &'static str = "Introduccio\u{0301}n a Unicode.pdf";
12+
913
#[bench]
1014
fn bench_is_nfc_ascii(b: &mut Bencher) {
11-
b.iter(|| unicode_normalization::is_nfc("all types of normalized"));
15+
b.iter(|| unicode_normalization::is_nfc(ASCII));
1216
}
1317

1418
#[bench]
1519
fn bench_is_nfc_normalized(b: &mut Bencher) {
16-
b.iter(|| unicode_normalization::is_nfc("Introducci\u{00f3}n a Unicode.pdf"));
20+
b.iter(|| unicode_normalization::is_nfc(NFC));
1721
}
1822

1923
#[bench]
2024
fn bench_is_nfc_not_normalized(b: &mut Bencher) {
21-
b.iter(|| unicode_normalization::is_nfc("Introduccio\u{0301}n a Unicode.pdf"));
25+
b.iter(|| unicode_normalization::is_nfc(NFD));
2226
}
2327

2428
#[bench]
2529
fn bench_is_nfd_ascii(b: &mut Bencher) {
26-
b.iter(|| unicode_normalization::is_nfd("an easy string to check"));
30+
b.iter(|| unicode_normalization::is_nfd(ASCII));
2731
}
2832

2933
#[bench]
3034
fn bench_is_nfd_normalized(b: &mut Bencher) {
31-
b.iter(|| unicode_normalization::is_nfd("Introduccio\u{0301}n a Unicode.pdf"));
35+
b.iter(|| unicode_normalization::is_nfd(NFD));
3236
}
3337

3438
#[bench]
3539
fn bench_is_nfd_not_normalized(b: &mut Bencher) {
36-
b.iter(|| unicode_normalization::is_nfd("Introducci\u{00f3}n a Unicode.pdf"));
40+
b.iter(|| unicode_normalization::is_nfd(NFC));
41+
}
42+
43+
#[bench]
44+
fn bench_is_nfc_stream_safe_ascii(b: &mut Bencher) {
45+
b.iter(|| unicode_normalization::is_nfc_stream_safe(ASCII));
46+
}
47+
48+
#[bench]
49+
fn bench_is_nfc_stream_safe_normalized(b: &mut Bencher) {
50+
b.iter(|| unicode_normalization::is_nfc_stream_safe(NFC));
51+
}
52+
53+
#[bench]
54+
fn bench_is_nfc_stream_safe_not_normalized(b: &mut Bencher) {
55+
b.iter(|| unicode_normalization::is_nfc_stream_safe(NFD));
56+
}
57+
58+
#[bench]
59+
fn bench_is_nfd_stream_safe_ascii(b: &mut Bencher) {
60+
b.iter(|| unicode_normalization::is_nfd_stream_safe(ASCII));
61+
}
62+
63+
#[bench]
64+
fn bench_is_nfd_stream_safe_normalized(b: &mut Bencher) {
65+
b.iter(|| unicode_normalization::is_nfd_stream_safe(NFD));
66+
}
67+
68+
#[bench]
69+
fn bench_is_nfd_stream_safe_not_normalized(b: &mut Bencher) {
70+
b.iter(|| unicode_normalization::is_nfd_stream_safe(NFC));
3771
}
3872

3973
#[bench]
4074
fn bench_nfc_ascii(b: &mut Bencher) {
41-
let s = "normalize me please";
42-
b.iter(|| s.nfc().count());
75+
b.iter(|| ASCII.nfc().count());
4376
}
4477

4578
#[bench]
4679
fn bench_nfd_ascii(b: &mut Bencher) {
47-
let s = "decompose me entirely";
48-
b.iter(|| s.nfd().count());
80+
b.iter(|| ASCII.nfd().count());
4981
}
5082

5183
#[bench]
5284
fn bench_streamsafe_ascii(b: &mut Bencher) {
53-
let s = "quite nonthreatening";
54-
b.iter(|| s.stream_safe().count());
85+
b.iter(|| ASCII.stream_safe().count());
5586
}
5687

5788
#[bench]

src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,12 @@ pub use quick_check::{
4747
IsNormalized,
4848
is_nfc,
4949
is_nfc_quick,
50+
is_nfc_stream_safe,
51+
is_nfc_stream_safe_quick,
5052
is_nfd,
5153
is_nfd_quick,
54+
is_nfd_stream_safe,
55+
is_nfd_stream_safe_quick,
5256
};
5357
pub use recompose::Recompositions;
5458
pub use stream_safe::StreamSafe;

src/quick_check.rs

Lines changed: 83 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
use UnicodeNormalization;
2+
use stream_safe;
23
use tables;
34

45
/// The QuickCheck algorithm can quickly determine if a text is or isn't
56
/// normalized without any allocations in many cases, but it has to be able to
67
/// return `Maybe` when a full decomposition and recomposition is necessary.
8+
#[derive(Debug, Eq, PartialEq)]
79
pub enum IsNormalized {
810
/// The text is definitely normalized.
911
Yes,
@@ -15,17 +17,20 @@ pub enum IsNormalized {
1517

1618
// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
1719
#[inline]
18-
fn quick_check<F, I>(s: I, is_allowed: F) -> IsNormalized
20+
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
1921
where I: Iterator<Item=char>, F: Fn(char) -> IsNormalized
2022
{
2123
let mut last_cc = 0u8;
24+
let mut nonstarter_count = 0;
2225
let mut result = IsNormalized::Yes;
2326
for ch in s {
2427
// For ASCII we know it's always allowed and a starter
2528
if ch <= '\x7f' {
2629
last_cc = 0;
30+
nonstarter_count = 0;
2731
continue;
2832
}
33+
2934
// Otherwise, look up the combining class and QC property
3035
let cc = tables::canonical_combining_class(ch);
3136
if last_cc > cc && cc != 0 {
@@ -38,6 +43,20 @@ fn quick_check<F, I>(s: I, is_allowed: F) -> IsNormalized
3843
result = IsNormalized::Maybe;
3944
},
4045
}
46+
if stream_safe {
47+
let decomp = stream_safe::classify_nonstarters(ch);
48+
49+
// If we're above `MAX_NONSTARTERS`, we're definitely *not*
50+
// stream-safe normalized.
51+
if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS {
52+
return IsNormalized::No;
53+
}
54+
if decomp.leading_nonstarters == decomp.decomposition_len {
55+
nonstarter_count += decomp.decomposition_len;
56+
} else {
57+
nonstarter_count = decomp.trailing_nonstarters;
58+
}
59+
}
4160
last_cc = cc;
4261
}
4362
result
@@ -48,16 +67,29 @@ fn quick_check<F, I>(s: I, is_allowed: F) -> IsNormalized
4867
/// like `s.chars().nfc().eq(s.chars())` should suffice.
4968
#[inline]
5069
pub fn is_nfc_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
51-
quick_check(s, tables::qc_nfc)
70+
quick_check(s, tables::qc_nfc, false)
5271
}
5372

5473
/// Quickly check if a string is in NFD.
5574
#[inline]
5675
pub fn is_nfd_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
57-
quick_check(s, tables::qc_nfd)
76+
quick_check(s, tables::qc_nfd, false)
77+
}
78+
79+
/// Quickly check if a string is Stream-Safe NFC.
80+
#[inline]
81+
pub fn is_nfc_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
82+
quick_check(s, tables::qc_nfc, true)
83+
}
84+
85+
/// Quickly check if a string is Stream-Safe NFD.
86+
#[inline]
87+
pub fn is_nfd_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
88+
quick_check(s, tables::qc_nfd, true)
5889
}
5990

6091
/// Authoritatively check if a string is in NFC.
92+
#[inline]
6193
pub fn is_nfc(s: &str) -> bool {
6294
match is_nfc_quick(s.chars()) {
6395
IsNormalized::Yes => true,
@@ -67,10 +99,58 @@ pub fn is_nfc(s: &str) -> bool {
6799
}
68100

69101
/// Authoritatively check if a string is in NFD.
102+
#[inline]
70103
pub fn is_nfd(s: &str) -> bool {
71104
match is_nfd_quick(s.chars()) {
72105
IsNormalized::Yes => true,
73106
IsNormalized::No => false,
74107
IsNormalized::Maybe => s.chars().eq(s.chars().nfd()),
75108
}
76109
}
110+
111+
/// Authoritatively check if a string is Stream-Safe NFC.
112+
#[inline]
113+
pub fn is_nfc_stream_safe(s: &str) -> bool {
114+
match is_nfc_stream_safe_quick(s.chars()) {
115+
IsNormalized::Yes => true,
116+
IsNormalized::No => false,
117+
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()),
118+
}
119+
}
120+
121+
/// Authoritatively check if a string is Stream-Safe NFD.
122+
#[inline]
123+
pub fn is_nfd_stream_safe(s: &str) -> bool {
124+
match is_nfd_stream_safe_quick(s.chars()) {
125+
IsNormalized::Yes => true,
126+
IsNormalized::No => false,
127+
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()),
128+
}
129+
}
130+
131+
#[cfg(test)]
132+
mod tests {
133+
use super::{
134+
IsNormalized,
135+
is_nfc_stream_safe_quick,
136+
is_nfd_stream_safe_quick,
137+
};
138+
139+
#[test]
140+
fn test_stream_safe_nfd() {
141+
let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
142+
assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);
143+
144+
let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
145+
assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
146+
}
147+
148+
#[test]
149+
fn test_stream_safe_nfc() {
150+
let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
151+
assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);
152+
153+
let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
154+
assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
155+
}
156+
}

src/stream_safe.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use normalize::{
44
};
55
use tables;
66

7-
const MAX_NONSTARTERS: usize = 30;
7+
pub(crate) const MAX_NONSTARTERS: usize = 30;
88
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
99

1010
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
@@ -54,14 +54,14 @@ impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> {
5454
}
5555

5656
#[derive(Debug)]
57-
struct Decomposition {
58-
leading_nonstarters: usize,
59-
trailing_nonstarters: usize,
60-
decomposition_len: usize,
57+
pub(crate) struct Decomposition {
58+
pub(crate) leading_nonstarters: usize,
59+
pub(crate) trailing_nonstarters: usize,
60+
pub(crate) decomposition_len: usize,
6161
}
6262

6363
#[inline]
64-
fn classify_nonstarters(c: char) -> Decomposition {
64+
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
6565
// As usual, fast path for ASCII (which is always a starter)
6666
if c <= '\x7f' {
6767
return Decomposition {

0 commit comments

Comments
 (0)