sfackler
diff --git a/‎src/lib.rs
Lines changed: 78 additions & 0 deletions b/‎src/lib.rs
Lines changed: 78 additions & 0 deletions
@@ -293,6 +293,75 @@ pub fn resourceprep(s: &str) -> Result<Cow<'_, str>, Error> {
     Ok(Cow::Owned(normalized))
 }
 
+fn x520_mapped_to_nothing(c: char) -> bool {
+    if c.is_control() {
+        return true;
+    }
+    match c {
+        '\u{00AD}' | '\u{1806}' | '\u{034F}' | '\u{180B}'..='\u{180D}' |
+        '\u{FE00}'..='\u{FE0F}' | '\u{FFFC}' | '\u{200B}' => true,
+        _ => false,
+    }
+}
+
+/// Prepares a string according to the procedures described in Section 7 of
+/// [ITU-T Recommendation X.520 (2019)](https://www.itu.int/rec/T-REC-X.520-201910-I/en).
+///
+/// Note that this function does _not_ remove leading, trailing, or inner
+/// spaces as described in Section 7.6, because the characters needing removal
+/// will vary across the matching rules and ASN.1 syntaxes used.
+pub fn x520prep(s: &str, case_fold: bool) -> Result<Cow<'_, str>, Error> {
+    if s.chars().all(|c| matches!(c, ' '..='~')) {
+        return Ok(Cow::Borrowed(s));
+    }
+
+    // 1. Transcode
+    // Already done because &str is enforced to be Unicode.
+
+    // 2. Map
+    let mapped = s.chars()
+        .filter(|&c| !x520_mapped_to_nothing(c))
+        .map(|c| if c.is_whitespace() { ' ' } else { c });
+
+    // 3. Normalize
+    let normalized = if case_fold {
+        mapped
+            .flat_map(tables::case_fold_for_nfkc)
+            .collect::<String>()
+    } else {
+        mapped.collect::<String>()
+    };
+
+    // 4. Prohibit
+    let prohibited = normalized.chars().find(|&c| tables::unassigned_code_point(c)
+        || tables::private_use(c)
+        || tables::non_character_code_point(c)
+        || tables::surrogate_code(c)
+        || c == '\u{FFFD}' // REPLACEMENT CHARACTER
+    );
+    if let Some(c) = prohibited {
+        return Err(Error(ErrorCause::ProhibitedCharacter(c)));
+    }
+    // From ITU-T Recommendation X.520, Section 7.4:
+    // "The first code point of a string is prohibited from being a combining character."
+    let first_char = s.chars().next();
+    if first_char.is_some_and(|c| tables::unicode_mark_category(c)) {
+        // I do think this ought to be considered a different error, but adding
+        // another enum variant would be a breaking change, so this is "good"
+        return Err(Error(ErrorCause::ProhibitedCharacter(first_char.unwrap())));
+    }
+
+    // 5. Check bidi
+    // From ITU-T Recommendation X.520, Section 7.4:
+    // "There are no bidirectional restrictions. The output string is the input string."
+    // So there is nothing to do for this step.
+
+    // 6. Insignificant Character Removal
+    // Done in calling functions.
+
+    Ok(normalized.into())
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -322,6 +391,15 @@ mod test {
         assert_eq!("foo@bar", resourceprep("foo@bar").unwrap());
     }
 
+    #[test]
+    fn x520prep_examples() {
+        assert_eq!(x520prep("", true).unwrap(), "");
+        assert_eq!(x520prep("foo@bar", true).unwrap(), "foo@bar");
+        assert_eq!(x520prep("J.\u{FE00} \u{9}W.\u{9} \u{B}wuz h\u{0115}re", false).unwrap(), "J. W. wuz h\u{0115}re");
+        assert_eq!(x520prep("J.\u{FE00} \u{9}W.\u{9} \u{B}wuz h\u{0115}re", true).unwrap(), "j. w. wuz h\u{0115}re");
+        assert_prohibited_character(x520prep("\u{0306}hello", true));
+    }
+
     #[test]
     fn ascii_optimisations() {
         if let Cow::Owned(_) = nodeprep("nodepart").unwrap() {