@@ -70,6 +70,7 @@ extern crate alloc;
70
70
mod error;
71
71
pub mod properties;
72
72
pub mod provider;
73
+ pub mod uts46;
73
74
74
75
pub use crate :: error:: NormalizerError ;
75
76
@@ -79,7 +80,6 @@ pub use NormalizerError as Error;
79
80
use crate :: provider:: CanonicalDecompositionDataV1Marker ;
80
81
use crate :: provider:: CompatibilityDecompositionSupplementV1Marker ;
81
82
use crate :: provider:: DecompositionDataV1 ;
82
- #[ cfg( feature = "experimental" ) ]
83
83
use crate :: provider:: Uts46DecompositionSupplementV1Marker ;
84
84
use alloc:: string:: String ;
85
85
use alloc:: vec:: Vec ;
@@ -106,20 +106,30 @@ use zerovec::{zeroslice, ZeroSlice};
106
106
#[ derive( Debug ) ]
107
107
enum SupplementPayloadHolder {
108
108
Compatibility ( DataPayload < CompatibilityDecompositionSupplementV1Marker > ) ,
109
- #[ cfg( feature = "experimental" ) ]
110
109
Uts46 ( DataPayload < Uts46DecompositionSupplementV1Marker > ) ,
111
110
}
112
111
113
112
impl SupplementPayloadHolder {
114
113
fn get ( & self ) -> & DecompositionSupplementV1 {
115
114
match self {
116
115
SupplementPayloadHolder :: Compatibility ( d) => d. get ( ) ,
117
- #[ cfg( feature = "experimental" ) ]
118
116
SupplementPayloadHolder :: Uts46 ( d) => d. get ( ) ,
119
117
}
120
118
}
121
119
}
122
120
121
+ /// Treatment of the ignorable marker (0xFFFFFFFF) in data.
122
+ #[ derive( Debug , PartialEq , Eq ) ]
123
+ enum IgnorableBehavior {
124
+ /// 0xFFFFFFFF in data is not supported.
125
+ Unsupported ,
126
+ /// Ignorables are ignored.
127
+ Ignored ,
128
+ /// Ignorables are treated as singleton decompositions
129
+ /// to the REPLACEMENT CHARACTER.
130
+ ReplacementCharacter ,
131
+ }
132
+
123
133
/// Number of iterations allowed on the fast path before flushing.
124
134
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
125
135
/// character, this means two memory pages.
@@ -132,6 +142,9 @@ impl SupplementPayloadHolder {
132
142
/// passes an error through from `Write`.
133
143
const UTF16_FAST_PATH_FLUSH_THRESHOLD : usize = 4096 ;
134
144
145
+ /// Marker for UTS 46 ignorables.
146
+ const IGNORABLE_MARKER : u32 = 0xFFFFFFFF ;
147
+
135
148
/// Marker for starters that decompose to themselves but may
136
149
/// combine backwards under canonical composition.
137
150
/// (Main trie only; not used in the supplementary trie.)
@@ -528,6 +541,7 @@ where
528
541
/// 1. Decomposes to self.
529
542
/// 2. Decomposition starts with a non-starter
530
543
decomposition_passthrough_bound : u32 , // never above 0xC0
544
+ ignorable_behavior : IgnorableBehavior , // Arguably should be a type parameter
531
545
}
532
546
533
547
impl < ' data , I > Decomposition < ' data , I >
@@ -549,7 +563,15 @@ where
549
563
decompositions : & ' data DecompositionDataV1 ,
550
564
tables : & ' data DecompositionTablesV1 ,
551
565
) -> Self {
552
- Self :: new_with_supplements ( delegate, decompositions, None , tables, None , 0xC0 )
566
+ Self :: new_with_supplements (
567
+ delegate,
568
+ decompositions,
569
+ None ,
570
+ tables,
571
+ None ,
572
+ 0xC0 ,
573
+ IgnorableBehavior :: Unsupported ,
574
+ )
553
575
}
554
576
555
577
/// Constructs a decomposing iterator adapter from a delegate
@@ -565,6 +587,7 @@ where
565
587
tables : & ' data DecompositionTablesV1 ,
566
588
supplementary_tables : Option < & ' data DecompositionTablesV1 > ,
567
589
decomposition_passthrough_bound : u8 ,
590
+ ignorable_behavior : IgnorableBehavior ,
568
591
) -> Self {
569
592
let half_width_voicing_marks_become_non_starters =
570
593
if let Some ( supplementary) = supplementary_decompositions {
@@ -595,6 +618,7 @@ where
595
618
} ,
596
619
half_width_voicing_marks_become_non_starters,
597
620
decomposition_passthrough_bound : u32:: from ( decomposition_passthrough_bound) ,
621
+ ignorable_behavior,
598
622
} ;
599
623
let _ = ret. next ( ) ; // Remove the U+FFFF placeholder
600
624
ret
@@ -721,16 +745,42 @@ where
721
745
722
746
fn delegate_next_no_pending ( & mut self ) -> Option < CharacterAndTrieValue > {
723
747
debug_assert ! ( self . pending. is_none( ) ) ;
724
- let c = self . delegate . next ( ) ?;
748
+ loop {
749
+ let c = self . delegate . next ( ) ?;
725
750
726
- // TODO(#2384): Measure if this check is actually an optimization even in the
727
- // non-supplementary case of if this should go inside the supplementary
728
- // `if` below.
729
- if u32:: from ( c) < self . decomposition_passthrough_bound {
730
- return Some ( CharacterAndTrieValue :: new ( c, 0 ) ) ;
731
- }
751
+ // TODO(#2384): Measure if this check is actually an optimization even in the
752
+ // non-supplementary case or if this should go inside the supplementary
753
+ // `if` below.
754
+ if u32:: from ( c) < self . decomposition_passthrough_bound {
755
+ return Some ( CharacterAndTrieValue :: new ( c, 0 ) ) ;
756
+ }
732
757
733
- Some ( self . attach_trie_value ( c) )
758
+ if let Some ( supplementary) = self . supplementary_trie {
759
+ if let Some ( value) = self . attach_supplementary_trie_value ( c, supplementary) {
760
+ if value. trie_val == IGNORABLE_MARKER {
761
+ match self . ignorable_behavior {
762
+ IgnorableBehavior :: Unsupported => {
763
+ debug_assert ! ( false ) ;
764
+ }
765
+ IgnorableBehavior :: ReplacementCharacter => {
766
+ return Some ( CharacterAndTrieValue :: new (
767
+ c,
768
+ u32:: from ( REPLACEMENT_CHARACTER ) ,
769
+ ) ) ;
770
+ }
771
+ IgnorableBehavior :: Ignored => {
772
+ // Else ignore this character by reading the next one from the delegate.
773
+ continue ;
774
+ }
775
+ }
776
+ }
777
+ return Some ( value) ;
778
+ }
779
+ }
780
+ let trie_val = self . trie . get ( c) ;
781
+ debug_assert_ne ! ( trie_val, IGNORABLE_MARKER ) ;
782
+ return Some ( CharacterAndTrieValue :: new ( c, trie_val) ) ;
783
+ }
734
784
}
735
785
736
786
fn delegate_next ( & mut self ) -> Option < CharacterAndTrieValue > {
@@ -1229,6 +1279,7 @@ macro_rules! composing_normalize_to {
1229
1279
) -> core:: fmt:: Result {
1230
1280
$prolog
1231
1281
let mut $composition = self . normalize_iter( $text. chars( ) ) ;
1282
+ debug_assert_eq!( $composition. decomposition. ignorable_behavior, IgnorableBehavior :: Unsupported ) ;
1232
1283
for cc in $composition. decomposition. buffer. drain( ..) {
1233
1284
$sink. write_char( cc. character( ) ) ?;
1234
1285
}
@@ -1416,6 +1467,7 @@ macro_rules! decomposing_normalize_to {
1416
1467
$prolog
1417
1468
1418
1469
let mut $decomposition = self . normalize_iter( $text. chars( ) ) ;
1470
+ debug_assert_eq!( $decomposition. ignorable_behavior, IgnorableBehavior :: Unsupported ) ;
1419
1471
1420
1472
// Try to get the compiler to hoist the bound to a register.
1421
1473
let $decomposition_passthrough_bound = $decomposition. decomposition_passthrough_bound;
@@ -1730,8 +1782,8 @@ impl DecomposingNormalizer {
1730
1782
}
1731
1783
1732
1784
#[ doc( hidden) ]
1733
- #[ cfg( all ( feature = "experimental" , feature = " compiled_data") ) ]
1734
- pub const fn new_uts46_decomposed_without_ignored_and_disallowed ( ) -> Self {
1785
+ #[ cfg( feature = "compiled_data" ) ]
1786
+ pub ( crate ) const fn new_uts46_decomposed ( ) -> Self {
1735
1787
const _: ( ) = assert ! (
1736
1788
crate :: provider:: Baked :: SINGLETON_NORMALIZER_NFDEX_V1
1737
1789
. scalars16
@@ -1807,8 +1859,7 @@ impl DecomposingNormalizer {
1807
1859
///
1808
1860
/// Crate-internal only.
1809
1861
#[ doc( hidden) ]
1810
- #[ cfg( feature = "experimental" ) ]
1811
- pub fn try_new_uts46_decomposed_without_ignored_and_disallowed_unstable < D > (
1862
+ pub ( crate ) fn try_new_uts46_decomposed_unstable < D > (
1812
1863
provider : & D ,
1813
1864
) -> Result < Self , NormalizerError >
1814
1865
where
@@ -1872,6 +1923,7 @@ impl DecomposingNormalizer {
1872
1923
self . tables . get ( ) ,
1873
1924
self . supplementary_tables . as_ref ( ) . map ( |s| s. get ( ) ) ,
1874
1925
self . decomposition_passthrough_bound ,
1926
+ IgnorableBehavior :: Unsupported ,
1875
1927
)
1876
1928
}
1877
1929
@@ -2241,52 +2293,27 @@ impl ComposingNormalizer {
2241
2293
} )
2242
2294
}
2243
2295
2244
- /// See [`Self::try_new_uts46_without_ignored_and_disallowed_unstable`].
2245
- #[ cfg( all( feature = "experimental" , feature = "compiled_data" ) ) ]
2246
- pub const fn new_uts46_without_ignored_and_disallowed ( ) -> Self {
2247
- ComposingNormalizer {
2248
- decomposing_normalizer :
2249
- DecomposingNormalizer :: new_uts46_decomposed_without_ignored_and_disallowed ( ) ,
2250
- canonical_compositions : DataPayload :: from_static_ref (
2251
- crate :: provider:: Baked :: SINGLETON_NORMALIZER_COMP_V1 ,
2252
- ) ,
2253
- }
2254
- }
2255
-
2256
- /// 🚧 \[Experimental\] UTS 46 constructor
2257
- ///
2258
2296
/// This is a special building block normalization for IDNA that implements parts of the Map
2259
- /// step and the following Normalize step. The caller is responsible for performing the
2260
- /// "disallowed", "ignored", and "deviation" parts of the Map step before passing data to
2261
- /// this normalizer such that disallowed and ignored characters aren't passed to this
2262
- /// normalizer.
2263
- ///
2264
- /// This is ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows
2265
- /// and ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as
2266
- /// in NFC in this normalization. Making the disallowed characters behave like this is beneficial
2267
- /// to data size, and this normalizer implementation cannot deal with a character normalizing
2268
- /// to the empty string, which doesn't happen in NFC or NFKC as of Unicode 14.
2297
+ /// step and the following Normalize step.
2269
2298
///
2270
2299
/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
2271
2300
/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
2272
2301
/// U+0345 from a reordered character into a non-reordered character before reordering happens.
2273
2302
/// Therefore, the output of this normalization may differ for different inputs that are
2274
2303
/// canonically equivalent to each other if they differ by how U+0345 is ordered relative
2275
2304
/// to other reorderable characters.
2276
- ///
2277
- /// NOTE: This method remains experimental until suitability of this feature as part of
2278
- /// IDNA processing has been demonstrated.
2279
- ///
2280
- /// <div class="stab unstable">
2281
- /// 🚧 This code is experimental; it may change at any time, in breaking or non-breaking ways,
2282
- /// including in SemVer minor releases. It can be enabled with the "experimental" Cargo feature
2283
- /// of the icu meta-crate. Use with caution.
2284
- /// <a href="https://github.com/unicode-org/icu4x/issues/2614">#2614</a>
2285
- /// </div>
2286
- #[ cfg( feature = "experimental" ) ]
2287
- pub fn try_new_uts46_without_ignored_and_disallowed_unstable < D > (
2288
- provider : & D ,
2289
- ) -> Result < Self , NormalizerError >
2305
+ #[ cfg( feature = "compiled_data" ) ]
2306
+ pub ( crate ) const fn new_uts46 ( ) -> Self {
2307
+ ComposingNormalizer {
2308
+ decomposing_normalizer : DecomposingNormalizer :: new_uts46_decomposed ( ) ,
2309
+ canonical_compositions : DataPayload :: from_static_ref (
2310
+ crate :: provider:: Baked :: SINGLETON_NORMALIZER_COMP_V1 ,
2311
+ ) ,
2312
+ }
2313
+ }
2314
+
2315
+ #[ doc = icu_provider:: gen_any_buffer_unstable_docs!( UNSTABLE , Self :: new_uts46) ]
2316
+ pub ( crate ) fn try_new_uts46_unstable < D > ( provider : & D ) -> Result < Self , NormalizerError >
2290
2317
where
2291
2318
D : DataProvider < CanonicalDecompositionDataV1Marker >
2292
2319
+ DataProvider < Uts46DecompositionSupplementV1Marker >
@@ -2297,9 +2324,7 @@ impl ComposingNormalizer {
2297
2324
+ ?Sized ,
2298
2325
{
2299
2326
let decomposing_normalizer =
2300
- DecomposingNormalizer :: try_new_uts46_decomposed_without_ignored_and_disallowed_unstable (
2301
- provider,
2302
- ) ?;
2327
+ DecomposingNormalizer :: try_new_uts46_decomposed_unstable ( provider) ?;
2303
2328
2304
2329
let canonical_compositions: DataPayload < CanonicalCompositionsV1Marker > =
2305
2330
provider. load ( Default :: default ( ) ) ?. take_payload ( ) ?;
@@ -2313,6 +2338,14 @@ impl ComposingNormalizer {
2313
2338
/// Wraps a delegate iterator into a composing iterator
2314
2339
/// adapter by using the data already held by this normalizer.
2315
2340
pub fn normalize_iter < I : Iterator < Item = char > > ( & self , iter : I ) -> Composition < I > {
2341
+ self . normalize_iter_private ( iter, IgnorableBehavior :: Unsupported )
2342
+ }
2343
+
2344
+ fn normalize_iter_private < I : Iterator < Item = char > > (
2345
+ & self ,
2346
+ iter : I ,
2347
+ ignorable_behavior : IgnorableBehavior ,
2348
+ ) -> Composition < I > {
2316
2349
Composition :: new (
2317
2350
Decomposition :: new_with_supplements (
2318
2351
iter,
@@ -2327,6 +2360,7 @@ impl ComposingNormalizer {
2327
2360
. as_ref ( )
2328
2361
. map ( |s| s. get ( ) ) ,
2329
2362
self . decomposing_normalizer . decomposition_passthrough_bound ,
2363
+ ignorable_behavior,
2330
2364
) ,
2331
2365
ZeroFrom :: zero_from ( & self . canonical_compositions . get ( ) . canonical_compositions ) ,
2332
2366
self . decomposing_normalizer . composition_passthrough_bound ,
0 commit comments