Skip to content

Commit 7b8c64e

Browse files
feat: x520_prep
1 parent 0696f19 commit 7b8c64e

File tree

2 files changed

+396
-0
lines changed

2 files changed

+396
-0
lines changed

src/lib.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,75 @@ pub fn resourceprep(s: &str) -> Result<Cow<'_, str>, Error> {
293293
Ok(Cow::Owned(normalized))
294294
}
295295

296+
/// Returns `true` if `c` is deleted ("mapped to nothing") by the Map step of
/// X.520 string preparation.
///
/// The set consists of:
/// * every character for which [`char::is_control`] is true (general
///   category Cc),
/// * the soft hyphens, the combining grapheme joiner, the variation
///   selectors, the object replacement character and the zero width space,
/// * the code points with a control function (general category Cf and
///   similar) that RFC 4518, Section 2.2 enumerates as mapped to nothing.
///
/// NOTE(review): `char::is_control` also matches U+0009..=U+000D and U+0085.
/// RFC 4518, Section 2.2 maps those to SPACE instead of deleting them; they
/// are still deleted here because the existing `x520prep_examples` test
/// expects that output. Confirm against the Recommendation before changing.
fn x520_mapped_to_nothing(c: char) -> bool {
    if c.is_control() {
        return true;
    }
    matches!(
        c,
        // SOFT HYPHEN, MONGOLIAN TODO SOFT HYPHEN
        '\u{00AD}' | '\u{1806}'
        // COMBINING GRAPHEME JOINER and the variation selectors
        | '\u{034F}' | '\u{180B}'..='\u{180D}' | '\u{FE00}'..='\u{FE0F}'
        // OBJECT REPLACEMENT CHARACTER, ZERO WIDTH SPACE
        | '\u{FFFC}' | '\u{200B}'
        // Code points with a control function that are not in category Cc,
        // so `is_control` above does not catch them.
        | '\u{06DD}' | '\u{070F}' | '\u{180E}'
        | '\u{200C}'..='\u{200F}' | '\u{202A}'..='\u{202E}'
        | '\u{2060}'..='\u{2063}' | '\u{206A}'..='\u{206F}'
        | '\u{FEFF}' | '\u{FFF9}'..='\u{FFFB}'
        | '\u{1D173}'..='\u{1D17A}'
        | '\u{E0001}' | '\u{E0020}'..='\u{E007F}'
    )
}
306+
307+
/// Prepares a string according to the procedures described in Section 7 of
308+
/// [ITU-T Recommendation X.520 (2019)](https://www.itu.int/rec/T-REC-X.520-201910-I/en).
309+
///
310+
/// Note that this function does _not_ remove leading, trailing, or inner
311+
/// spaces as described in Section 7.6, because the characters needing removal
312+
/// will vary across the matching rules and ASN.1 syntaxes used.
313+
pub fn x520prep(s: &str, case_fold: bool) -> Result<Cow<'_, str>, Error> {
314+
if s.chars().all(|c| matches!(c, ' '..='~')) {
315+
return Ok(Cow::Borrowed(s));
316+
}
317+
318+
// 1. Transcode
319+
// Already done because &str is enforced to be Unicode.
320+
321+
// 2. Map
322+
let mapped = s.chars()
323+
.filter(|&c| !x520_mapped_to_nothing(c))
324+
.map(|c| if c.is_whitespace() { ' ' } else { c });
325+
326+
// 3. Normalize
327+
let normalized = if case_fold {
328+
mapped
329+
.flat_map(tables::case_fold_for_nfkc)
330+
.collect::<String>()
331+
} else {
332+
mapped.collect::<String>()
333+
};
334+
335+
// 4. Prohibit
336+
let prohibited = normalized.chars().find(|&c| tables::unassigned_code_point(c)
337+
|| tables::private_use(c)
338+
|| tables::non_character_code_point(c)
339+
|| tables::surrogate_code(c)
340+
|| c == '\u{FFFD}' // REPLACEMENT CHARACTER
341+
);
342+
if let Some(c) = prohibited {
343+
return Err(Error(ErrorCause::ProhibitedCharacter(c)));
344+
}
345+
// From ITU-T Recommendation X.520, Section 7.4:
346+
// "The first code point of a string is prohibited from being a combining character."
347+
let first_char = s.chars().next();
348+
if first_char.is_some_and(|c| tables::unicode_mark_category(c)) {
349+
// I do think this ought to be considered a different error, but adding
350+
// another enum variant would be a breaking change, so this is "good"
351+
return Err(Error(ErrorCause::ProhibitedCharacter(first_char.unwrap())));
352+
}
353+
354+
// 5. Check bidi
355+
// From ITU-T Recommendation X.520, Section 7.4:
356+
// "There are no bidirectional restrictions. The output string is the input string."
357+
// So there is nothing to do for this step.
358+
359+
// 6. Insignificant Character Removal
360+
// Done in calling functions.
361+
362+
Ok(normalized.into())
363+
}
364+
296365
#[cfg(test)]
297366
mod test {
298367
use super::*;
@@ -322,6 +391,15 @@ mod test {
322391
assert_eq!("foo@bar", resourceprep("foo@bar").unwrap());
323392
}
324393

394+
#[test]
fn x520prep_examples() {
    // Text interleaved with a variation selector, two tabs and a vertical tab.
    let messy = "J.\u{FE00} \u{9}W.\u{9} \u{B}wuz h\u{0115}re";

    // Each row is (input, case_fold, expected output).
    let accepted = [
        ("", true, ""),
        ("foo@bar", true, "foo@bar"),
        (messy, false, "J. W. wuz h\u{0115}re"),
        (messy, true, "j. w. wuz h\u{0115}re"),
    ];
    for &(input, case_fold, expected) in &accepted {
        assert_eq!(x520prep(input, case_fold).unwrap(), expected);
    }

    // A string that starts with a combining character (U+0306) is rejected.
    assert_prohibited_character(x520prep("\u{0306}hello", true));
}
402+
325403
#[test]
326404
fn ascii_optimisations() {
327405
if let Cow::Owned(_) = nodeprep("nodepart").unwrap() {

0 commit comments

Comments
 (0)