@@ -10,6 +10,7 @@ use alloc::vec;
10
10
use alloc:: vec:: Vec ;
11
11
use core:: char;
12
12
use core:: str:: CharIndices ;
13
+ use icu_locale_core:: subtags:: language;
13
14
use icu_provider:: prelude:: * ;
14
15
use utf8_iter:: Utf8CharIndices ;
15
16
@@ -183,29 +184,29 @@ pub enum LineBreakWordOption {
183
184
184
185
/// Options to tailor line-breaking behavior.
185
186
#[ non_exhaustive]
186
- #[ derive( Copy , Clone , PartialEq , Eq , Debug ) ]
187
+ #[ derive( Clone , PartialEq , Eq , Debug ) ]
187
188
pub struct LineBreakOptions {
188
189
/// Strictness of line-breaking rules. See [`LineBreakStrictness`].
189
190
pub strictness : LineBreakStrictness ,
190
191
191
192
/// Line break opportunities between letters. See [`LineBreakWordOption`].
192
193
pub word_option : LineBreakWordOption ,
193
194
194
- /// Use `true` as a hint to the line segmenter that the writing
195
- /// system is Chinese or Japanese. This allows more break opportunities when
196
- /// `LineBreakStrictness` is `Normal` or `Loose`. See
197
- /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
195
+ /// Content locale hint for the line segmenter.
198
196
///
197
+ /// When the locale's language is Japanese (`ja`) or Chinese (`zh`), this allows more break opportunities when `LineBreakStrictness` is
198
+ /// `Normal` or `Loose`; `None` or any other language leaves this tailoring disabled. See
199
+ /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
199
200
/// This option has no effect in Latin-1 mode.
200
- pub ja_zh : bool ,
201
+ pub content_locale : Option < DataLocale > ,
201
202
}
202
203
203
204
impl Default for LineBreakOptions {
204
205
/// Returns options with `Strict` strictness, `Normal` word breaking,
/// and no content locale (so no Chinese/Japanese tailoring is requested).
fn default ( ) -> Self {
205
206
Self {
206
207
strictness : LineBreakStrictness :: Strict ,
207
208
word_option : LineBreakWordOption :: Normal ,
208
- ja_zh : false ,
209
+ content_locale : None ,
209
210
}
210
211
}
211
212
}
@@ -303,7 +304,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
303
304
/// let mut options = LineBreakOptions::default();
304
305
/// options.strictness = LineBreakStrictness::Strict;
305
306
/// options.word_option = LineBreakWordOption::BreakAll;
306
- /// options.ja_zh = false ;
307
+ /// options.content_locale = None ;
307
308
/// let segmenter = LineSegmenter::new_auto_with_options(options);
308
309
///
309
310
/// let breakpoints: Vec<usize> =
@@ -641,6 +642,11 @@ impl LineSegmenter {
641
642
///
642
643
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
643
644
pub fn segment_str < ' l , ' s > ( & ' l self , input : & ' s str ) -> LineBreakIteratorUtf8 < ' l , ' s > {
645
// Resolve the Chinese/Japanese hint once per iterator: it is true only when a
// content locale is set and its language subtag is `ja` or `zh`.
// NOTE(review): the same expression is repeated in the other UTF-8 and UTF-16
// segment methods; a shared private helper would keep them in sync.
+ let ja_zh = if let Some ( content_locale) = & self . options . content_locale {
646
+ content_locale. language == language ! ( "ja" ) || content_locale. language == language ! ( "zh" )
647
+ } else {
648
+ false
649
+ } ;
644
650
LineBreakIterator {
645
651
iter : input. char_indices ( ) ,
646
652
len : input. len ( ) ,
@@ -649,6 +655,7 @@ impl LineSegmenter {
649
655
data : self . payload . get ( ) ,
650
656
options : & self . options ,
651
657
complex : & self . complex ,
658
// Cached hint computed above, stored on the iterator itself.
+ ja_zh,
652
659
}
653
660
}
654
661
/// Creates a line break iterator for a potentially ill-formed UTF8 string
@@ -660,6 +667,11 @@ impl LineSegmenter {
660
667
& ' l self ,
661
668
input : & ' s [ u8 ] ,
662
669
) -> LineBreakIteratorPotentiallyIllFormedUtf8 < ' l , ' s > {
670
+ let ja_zh = if let Some ( content_locale) = & self . options . content_locale {
671
+ content_locale. language == language ! ( "ja" ) || content_locale. language == language ! ( "zh" )
672
+ } else {
673
+ false
674
+ } ;
663
675
LineBreakIterator {
664
676
iter : Utf8CharIndices :: new ( input) ,
665
677
len : input. len ( ) ,
@@ -668,6 +680,7 @@ impl LineSegmenter {
668
680
data : self . payload . get ( ) ,
669
681
options : & self . options ,
670
682
complex : & self . complex ,
683
+ ja_zh,
671
684
}
672
685
}
673
686
/// Creates a line break iterator for a Latin-1 (8-bit) string.
@@ -682,13 +695,19 @@ impl LineSegmenter {
682
695
data : self . payload . get ( ) ,
683
696
options : & self . options ,
684
697
complex : & self . complex ,
698
+ ja_zh : false ,
685
699
}
686
700
}
687
701
688
702
/// Creates a line break iterator for a UTF-16 string.
689
703
///
690
704
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
691
705
pub fn segment_utf16 < ' l , ' s > ( & ' l self , input : & ' s [ u16 ] ) -> LineBreakIteratorUtf16 < ' l , ' s > {
706
// Resolve the Chinese/Japanese hint once per iterator: it is true only when a
// content locale is set and its language subtag is `ja` or `zh`.
// NOTE(review): the same expression is repeated in the UTF-8 segment methods;
// a shared private helper would keep them in sync.
+ let ja_zh = if let Some ( content_locale) = & self . options . content_locale {
707
+ content_locale. language == language ! ( "ja" ) || content_locale. language == language ! ( "zh" )
708
+ } else {
709
+ false
710
+ } ;
692
711
LineBreakIterator {
693
712
iter : Utf16Indices :: new ( input) ,
694
713
len : input. len ( ) ,
@@ -697,6 +716,7 @@ impl LineSegmenter {
697
716
data : self . payload . get ( ) ,
698
717
options : & self . options ,
699
718
complex : & self . complex ,
719
// Cached hint computed above, stored on the iterator itself.
+ ja_zh,
700
720
}
701
721
}
702
722
}
@@ -853,6 +873,7 @@ pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
853
873
data : & ' l RuleBreakDataV2 < ' l > ,
854
874
options : & ' l LineBreakOptions ,
855
875
complex : & ' l ComplexPayloads ,
876
+ ja_zh : bool ,
856
877
}
857
878
858
879
impl < ' l , ' s , Y : LineBreakType < ' l , ' s > > Iterator for LineBreakIterator < ' l , ' s , Y > {
@@ -948,7 +969,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
948
969
right_codepoint. into ( ) ,
949
970
left_prop,
950
971
right_prop,
951
- self . options . ja_zh ,
972
+ self . ja_zh ,
952
973
) {
953
974
if breakable && !after_zwj {
954
975
return self . get_current_position ( ) ;
@@ -1151,7 +1172,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
1151
1172
1152
1173
/// Returns the iterator's `ja_zh` flag when `codepoint` is U+301C (WAVE DASH)
/// or U+30A0 (KATAKANA-HIRAGANA DOUBLE HYPHEN), and `false` for every other
/// code point.
// NOTE(review): presumably consulted for the `Normal` strictness level, going by
// the name — confirm against the caller.
fn is_break_by_normal ( & self , codepoint : Y :: CharType ) -> bool {
1153
1174
match codepoint. into ( ) {
1154
- 0x301C | 0x30A0 => self . options . ja_zh ,
1175
+ 0x301C | 0x30A0 => self . ja_zh ,
1155
1176
_ => false ,
1156
1177
}
1157
1178
}
0 commit comments