Skip to content

Commit fc7e5b2

Browse files
fix: bug in mapping whitespace-ish control characters
1 parent 17108ef commit fc7e5b2

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

src/lib.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -295,14 +295,24 @@ pub fn resourceprep(s: &str) -> Result<Cow<'_, str>, Error> {
295295
Ok(Cow::Owned(normalized))
296296
}
297297

298+
/// Determines if `c` is to be removed according to section 7.2 of
299+
/// [ITU-T Recommendation X.520 (2019)](https://www.itu.int/rec/T-REC-X.520-201910-I/en).
298300
fn x520_mapped_to_nothing(c: char) -> bool {
299-
if c.is_control() {
300-
return true;
301-
}
302301
match c {
303302
'\u{00AD}' | '\u{1806}' | '\u{034F}' | '\u{180B}'..='\u{180D}' |
304303
'\u{FE00}'..='\u{FE0F}' | '\u{FFFC}' | '\u{200B}' => true,
305-
_ => false,
304+
// Technically control characters, but X.520 maps these to SPACE rather than to nothing, so they must not be removed here.
305+
'\u{09}' | '\u{0A}'..='\u{0D}' | '\u{85}' => false,
306+
_ => c.is_control(),
307+
}
308+
}
309+
310+
/// Determines if `c` is to be replaced by SPACE (0x20) according to section 7.2 of
311+
/// [ITU-T Recommendation X.520 (2019)](https://www.itu.int/rec/T-REC-X.520-201910-I/en).
312+
fn x520_mapped_to_space(c: char) -> bool {
313+
match c {
314+
'\u{09}' | '\u{0A}'..='\u{0D}' | '\u{85}' => true,
315+
_ => c.is_separator(),
306316
}
307317
}
308318

@@ -323,7 +333,7 @@ pub fn x520prep(s: &str, case_fold: bool) -> Result<Cow<'_, str>, Error> {
323333
// 2. Map
324334
let mapped = s.chars()
325335
.filter(|&c| !x520_mapped_to_nothing(c))
326-
.map(|c| if c.is_separator() { ' ' } else { c });
336+
.map(|c| if x520_mapped_to_space(c) { ' ' } else { c });
327337

328338
// 3. Normalize
329339
let normalized = if case_fold {
@@ -399,8 +409,8 @@ mod test {
399409
fn x520prep_examples() {
400410
assert_eq!(x520prep("", true).unwrap(), "");
401411
assert_eq!(x520prep("foo@bar", true).unwrap(), "foo@bar");
402-
assert_eq!(x520prep("J.\u{FE00} \u{9}W.\u{9} \u{B}wuz h\u{0115}re", false).unwrap(), "J. W. wuz h\u{0115}re");
403-
assert_eq!(x520prep("J.\u{FE00} \u{9}W.\u{9} \u{B}wuz h\u{0115}re", true).unwrap(), "j. w. wuz h\u{0115}re");
412+
assert_eq!(x520prep("J.\u{FE00} \u{9}W. \u{B}wuz h\u{0115}re", false).unwrap(), "J. W. wuz h\u{0115}re");
413+
assert_eq!(x520prep("J.\u{FE00} \u{9}W. \u{B}wuz h\u{0115}re", true).unwrap(), "j. w. wuz h\u{0115}re");
404414
assert_eq!(x520prep("UPPERCASED", true).unwrap(), "uppercased");
405415
assert_prohibited_character(x520prep("\u{0306}hello", true));
406416
}

0 commit comments

Comments
 (0)