Skip to content

Commit dc7f4f5

Browse files
hsivonen authored and sffc committed
Make UTS 46 normalization non-experimental (unicode-org#4712)
* Bake ignored/disallowed data into the normalization data after all.
* Make public operations available via a dedicated wrapper type instead of the main normalizer types.

Closes unicode-org#2850
1 parent 4b7473e commit dc7f4f5

File tree

3 files changed

+402
-105
lines changed

3 files changed

+402
-105
lines changed

components/normalizer/src/lib.rs

Lines changed: 91 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ extern crate alloc;
7070
mod error;
7171
pub mod properties;
7272
pub mod provider;
73+
pub mod uts46;
7374

7475
pub use crate::error::NormalizerError;
7576

@@ -79,7 +80,6 @@ pub use NormalizerError as Error;
7980
use crate::provider::CanonicalDecompositionDataV1Marker;
8081
use crate::provider::CompatibilityDecompositionSupplementV1Marker;
8182
use crate::provider::DecompositionDataV1;
82-
#[cfg(feature = "experimental")]
8383
use crate::provider::Uts46DecompositionSupplementV1Marker;
8484
use alloc::string::String;
8585
use alloc::vec::Vec;
@@ -106,20 +106,30 @@ use zerovec::{zeroslice, ZeroSlice};
106106
#[derive(Debug)]
107107
enum SupplementPayloadHolder {
108108
Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
109-
#[cfg(feature = "experimental")]
110109
Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>),
111110
}
112111

113112
impl SupplementPayloadHolder {
114113
fn get(&self) -> &DecompositionSupplementV1 {
115114
match self {
116115
SupplementPayloadHolder::Compatibility(d) => d.get(),
117-
#[cfg(feature = "experimental")]
118116
SupplementPayloadHolder::Uts46(d) => d.get(),
119117
}
120118
}
121119
}
122120

121+
/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
122+
#[derive(Debug, PartialEq, Eq)]
123+
enum IgnorableBehavior {
124+
/// 0xFFFFFFFF in data is not supported.
125+
Unsupported,
126+
/// Ignorables are ignored.
127+
Ignored,
128+
/// Ignorables are treated as singleton decompositions
129+
/// to the REPLACEMENT CHARACTER.
130+
ReplacementCharacter,
131+
}
132+
123133
/// Number of iterations allowed on the fast path before flushing.
124134
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
125135
/// character, this means two memory pages.
@@ -132,6 +142,9 @@ impl SupplementPayloadHolder {
132142
/// passes an error through from `Write`.
133143
const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = 4096;
134144

145+
/// Marker for UTS 46 ignorables.
146+
const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;
147+
135148
/// Marker for starters that decompose to themselves but may
136149
/// combine backwards under canonical composition.
137150
/// (Main trie only; not used in the supplementary trie.)
@@ -528,6 +541,7 @@ where
528541
/// 1. Decomposes to self.
529542
/// 2. Decomposition starts with a non-starter
530543
decomposition_passthrough_bound: u32, // never above 0xC0
544+
ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
531545
}
532546

533547
impl<'data, I> Decomposition<'data, I>
@@ -549,7 +563,15 @@ where
549563
decompositions: &'data DecompositionDataV1,
550564
tables: &'data DecompositionTablesV1,
551565
) -> Self {
552-
Self::new_with_supplements(delegate, decompositions, None, tables, None, 0xC0)
566+
Self::new_with_supplements(
567+
delegate,
568+
decompositions,
569+
None,
570+
tables,
571+
None,
572+
0xC0,
573+
IgnorableBehavior::Unsupported,
574+
)
553575
}
554576

555577
/// Constructs a decomposing iterator adapter from a delegate
@@ -565,6 +587,7 @@ where
565587
tables: &'data DecompositionTablesV1,
566588
supplementary_tables: Option<&'data DecompositionTablesV1>,
567589
decomposition_passthrough_bound: u8,
590+
ignorable_behavior: IgnorableBehavior,
568591
) -> Self {
569592
let half_width_voicing_marks_become_non_starters =
570593
if let Some(supplementary) = supplementary_decompositions {
@@ -595,6 +618,7 @@ where
595618
},
596619
half_width_voicing_marks_become_non_starters,
597620
decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
621+
ignorable_behavior,
598622
};
599623
let _ = ret.next(); // Remove the U+FFFF placeholder
600624
ret
@@ -721,16 +745,42 @@ where
721745

722746
fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
723747
debug_assert!(self.pending.is_none());
724-
let c = self.delegate.next()?;
748+
loop {
749+
let c = self.delegate.next()?;
725750

726-
// TODO(#2384): Measure if this check is actually an optimization even in the
727-
// non-supplementary case or if this should go inside the supplementary
728-
// `if` below.
729-
if u32::from(c) < self.decomposition_passthrough_bound {
730-
return Some(CharacterAndTrieValue::new(c, 0));
731-
}
751+
// TODO(#2384): Measure if this check is actually an optimization even in the
752+
// non-supplementary case or if this should go inside the supplementary
753+
// `if` below.
754+
if u32::from(c) < self.decomposition_passthrough_bound {
755+
return Some(CharacterAndTrieValue::new(c, 0));
756+
}
732757

733-
Some(self.attach_trie_value(c))
758+
if let Some(supplementary) = self.supplementary_trie {
759+
if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
760+
if value.trie_val == IGNORABLE_MARKER {
761+
match self.ignorable_behavior {
762+
IgnorableBehavior::Unsupported => {
763+
debug_assert!(false);
764+
}
765+
IgnorableBehavior::ReplacementCharacter => {
766+
return Some(CharacterAndTrieValue::new(
767+
c,
768+
u32::from(REPLACEMENT_CHARACTER),
769+
));
770+
}
771+
IgnorableBehavior::Ignored => {
772+
// Ignore this character by reading the next one from the delegate.
773+
continue;
774+
}
775+
}
776+
}
777+
return Some(value);
778+
}
779+
}
780+
let trie_val = self.trie.get(c);
781+
debug_assert_ne!(trie_val, IGNORABLE_MARKER);
782+
return Some(CharacterAndTrieValue::new(c, trie_val));
783+
}
734784
}
735785

736786
fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
@@ -1229,6 +1279,7 @@ macro_rules! composing_normalize_to {
12291279
) -> core::fmt::Result {
12301280
$prolog
12311281
let mut $composition = self.normalize_iter($text.chars());
1282+
debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
12321283
for cc in $composition.decomposition.buffer.drain(..) {
12331284
$sink.write_char(cc.character())?;
12341285
}
@@ -1416,6 +1467,7 @@ macro_rules! decomposing_normalize_to {
14161467
$prolog
14171468

14181469
let mut $decomposition = self.normalize_iter($text.chars());
1470+
debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
14191471

14201472
// Try to get the compiler to hoist the bound to a register.
14211473
let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
@@ -1730,8 +1782,8 @@ impl DecomposingNormalizer {
17301782
}
17311783

17321784
#[doc(hidden)]
1733-
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
1734-
pub const fn new_uts46_decomposed_without_ignored_and_disallowed() -> Self {
1785+
#[cfg(feature = "compiled_data")]
1786+
pub(crate) const fn new_uts46_decomposed() -> Self {
17351787
const _: () = assert!(
17361788
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
17371789
.scalars16
@@ -1807,8 +1859,7 @@ impl DecomposingNormalizer {
18071859
///
18081860
/// Public for testing only.
18091861
#[doc(hidden)]
1810-
#[cfg(feature = "experimental")]
1811-
pub fn try_new_uts46_decomposed_without_ignored_and_disallowed_unstable<D>(
1862+
pub(crate) fn try_new_uts46_decomposed_unstable<D>(
18121863
provider: &D,
18131864
) -> Result<Self, NormalizerError>
18141865
where
@@ -1872,6 +1923,7 @@ impl DecomposingNormalizer {
18721923
self.tables.get(),
18731924
self.supplementary_tables.as_ref().map(|s| s.get()),
18741925
self.decomposition_passthrough_bound,
1926+
IgnorableBehavior::Unsupported,
18751927
)
18761928
}
18771929

@@ -2241,52 +2293,27 @@ impl ComposingNormalizer {
22412293
})
22422294
}
22432295

2244-
/// See [`Self::try_new_uts46_without_ignored_and_disallowed_unstable`].
2245-
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
2246-
pub const fn new_uts46_without_ignored_and_disallowed() -> Self {
2247-
ComposingNormalizer {
2248-
decomposing_normalizer:
2249-
DecomposingNormalizer::new_uts46_decomposed_without_ignored_and_disallowed(),
2250-
canonical_compositions: DataPayload::from_static_ref(
2251-
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2252-
),
2253-
}
2254-
}
2255-
2256-
/// 🚧 \[Experimental\] UTS 46 constructor
2257-
///
22582296
/// This is a special building block normalization for IDNA that implements parts of the Map
2259-
/// step and the following Normalize step. The caller is responsible for performing the
2260-
/// "disallowed", "ignored", and "deviation" parts of the Map step before passing data to
2261-
/// this normalizer such that disallowed and ignored characters aren't passed to this
2262-
/// normalizer.
2263-
///
2264-
/// This is ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows
2265-
/// and ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as
2266-
/// in NFC in this normalization. Making the disallowed characters behave like this is beneficial
2267-
/// to data size, and this normalizer implementation cannot deal with a character normalizing
2268-
/// to the empty string, which doesn't happen in NFC or NFKC as of Unicode 14.
2297+
/// step and the following Normalize step.
22692298
///
22702299
/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
22712300
/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
22722301
/// U+0345 from a reordered character into a non-reordered character before reordering happens.
22732302
/// Therefore, the output of this normalization may differ for different inputs that are
22742303
/// canonically equivalents with each other if they differ by how U+0345 is ordered relative
22752304
/// to other reorderable characters.
2276-
///
2277-
/// NOTE: This method remains experimental until suitability of this feature as part of
2278-
/// IDNA processing has been demonstrated.
2279-
///
2280-
/// <div class="stab unstable">
2281-
/// 🚧 This code is experimental; it may change at any time, in breaking or non-breaking ways,
2282-
/// including in SemVer minor releases. It can be enabled with the "experimental" Cargo feature
2283-
/// of the icu meta-crate. Use with caution.
2284-
/// <a href="https://github.com/unicode-org/icu4x/issues/2614">#2614</a>
2285-
/// </div>
2286-
#[cfg(feature = "experimental")]
2287-
pub fn try_new_uts46_without_ignored_and_disallowed_unstable<D>(
2288-
provider: &D,
2289-
) -> Result<Self, NormalizerError>
2305+
#[cfg(feature = "compiled_data")]
2306+
pub(crate) const fn new_uts46() -> Self {
2307+
ComposingNormalizer {
2308+
decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
2309+
canonical_compositions: DataPayload::from_static_ref(
2310+
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
2311+
),
2312+
}
2313+
}
2314+
2315+
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
2316+
pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
22902317
where
22912318
D: DataProvider<CanonicalDecompositionDataV1Marker>
22922319
+ DataProvider<Uts46DecompositionSupplementV1Marker>
@@ -2297,9 +2324,7 @@ impl ComposingNormalizer {
22972324
+ ?Sized,
22982325
{
22992326
let decomposing_normalizer =
2300-
DecomposingNormalizer::try_new_uts46_decomposed_without_ignored_and_disallowed_unstable(
2301-
provider,
2302-
)?;
2327+
DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;
23032328

23042329
let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
23052330
provider.load(Default::default())?.take_payload()?;
@@ -2313,6 +2338,14 @@ impl ComposingNormalizer {
23132338
/// Wraps a delegate iterator into a composing iterator
23142339
/// adapter by using the data already held by this normalizer.
23152340
pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> {
2341+
self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
2342+
}
2343+
2344+
fn normalize_iter_private<I: Iterator<Item = char>>(
2345+
&self,
2346+
iter: I,
2347+
ignorable_behavior: IgnorableBehavior,
2348+
) -> Composition<I> {
23162349
Composition::new(
23172350
Decomposition::new_with_supplements(
23182351
iter,
@@ -2327,6 +2360,7 @@ impl ComposingNormalizer {
23272360
.as_ref()
23282361
.map(|s| s.get()),
23292362
self.decomposing_normalizer.decomposition_passthrough_bound,
2363+
ignorable_behavior,
23302364
),
23312365
ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions),
23322366
self.decomposing_normalizer.composition_passthrough_bound,

0 commit comments

Comments
 (0)