Skip to content

Commit d704ef7

Browse files
authored
Add content_locale member to LineBreakOptions (#5565)
Fixes #3284.
1 parent 3524f8e commit d704ef7

18 files changed

+190
-159
lines changed

components/segmenter/src/line.rs

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use alloc::vec;
1010
use alloc::vec::Vec;
1111
use core::char;
1212
use core::str::CharIndices;
13+
use icu_locale_core::subtags::language;
1314
use icu_provider::prelude::*;
1415
use utf8_iter::Utf8CharIndices;
1516

@@ -183,29 +184,29 @@ pub enum LineBreakWordOption {
183184

184185
/// Options to tailor line-breaking behavior.
185186
#[non_exhaustive]
186-
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
187+
#[derive(Clone, PartialEq, Eq, Debug)]
187188
pub struct LineBreakOptions {
188189
/// Strictness of line-breaking rules. See [`LineBreakStrictness`].
189190
pub strictness: LineBreakStrictness,
190191

191192
/// Line break opportunities between letters. See [`LineBreakWordOption`].
192193
pub word_option: LineBreakWordOption,
193194

194-
/// Use `true` as a hint to the line segmenter that the writing
195-
/// system is Chinese or Japanese. This allows more break opportunities when
196-
/// `LineBreakStrictness` is `Normal` or `Loose`. See
197-
/// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
195+
/// Content locale for the line segmenter.
198196
///
197+
/// A Japanese (`ja`) or Chinese (`zh`) locale allows more break opportunities when `LineBreakStrictness` is
198+
/// `Normal` or `Loose`. See
199+
/// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
199200
/// This option has no effect in Latin-1 mode.
200-
pub ja_zh: bool,
201+
pub content_locale: Option<DataLocale>,
201202
}
202203

203204
impl Default for LineBreakOptions {
204205
fn default() -> Self {
205206
Self {
206207
strictness: LineBreakStrictness::Strict,
207208
word_option: LineBreakWordOption::Normal,
208-
ja_zh: false,
209+
content_locale: None,
209210
}
210211
}
211212
}
@@ -303,7 +304,7 @@ pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTyp
303304
/// let mut options = LineBreakOptions::default();
304305
/// options.strictness = LineBreakStrictness::Strict;
305306
/// options.word_option = LineBreakWordOption::BreakAll;
306-
/// options.ja_zh = false;
307+
/// options.content_locale = None;
307308
/// let segmenter = LineSegmenter::new_auto_with_options(options);
308309
///
309310
/// let breakpoints: Vec<usize> =
@@ -641,6 +642,11 @@ impl LineSegmenter {
641642
///
642643
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
643644
pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
645+
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
646+
content_locale.language == language!("ja") || content_locale.language == language!("zh")
647+
} else {
648+
false
649+
};
644650
LineBreakIterator {
645651
iter: input.char_indices(),
646652
len: input.len(),
@@ -649,6 +655,7 @@ impl LineSegmenter {
649655
data: self.payload.get(),
650656
options: &self.options,
651657
complex: &self.complex,
658+
ja_zh,
652659
}
653660
}
654661
/// Creates a line break iterator for a potentially ill-formed UTF8 string
@@ -660,6 +667,11 @@ impl LineSegmenter {
660667
&'l self,
661668
input: &'s [u8],
662669
) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
670+
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
671+
content_locale.language == language!("ja") || content_locale.language == language!("zh")
672+
} else {
673+
false
674+
};
663675
LineBreakIterator {
664676
iter: Utf8CharIndices::new(input),
665677
len: input.len(),
@@ -668,6 +680,7 @@ impl LineSegmenter {
668680
data: self.payload.get(),
669681
options: &self.options,
670682
complex: &self.complex,
683+
ja_zh,
671684
}
672685
}
673686
/// Creates a line break iterator for a Latin-1 (8-bit) string.
@@ -682,13 +695,19 @@ impl LineSegmenter {
682695
data: self.payload.get(),
683696
options: &self.options,
684697
complex: &self.complex,
698+
ja_zh: false,
685699
}
686700
}
687701

688702
/// Creates a line break iterator for a UTF-16 string.
689703
///
690704
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
691705
pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
706+
let ja_zh = if let Some(content_locale) = &self.options.content_locale {
707+
content_locale.language == language!("ja") || content_locale.language == language!("zh")
708+
} else {
709+
false
710+
};
692711
LineBreakIterator {
693712
iter: Utf16Indices::new(input),
694713
len: input.len(),
@@ -697,6 +716,7 @@ impl LineSegmenter {
697716
data: self.payload.get(),
698717
options: &self.options,
699718
complex: &self.complex,
719+
ja_zh,
700720
}
701721
}
702722
}
@@ -853,6 +873,7 @@ pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
853873
data: &'l RuleBreakDataV2<'l>,
854874
options: &'l LineBreakOptions,
855875
complex: &'l ComplexPayloads,
876+
ja_zh: bool,
856877
}
857878

858879
impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
@@ -948,7 +969,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
948969
right_codepoint.into(),
949970
left_prop,
950971
right_prop,
951-
self.options.ja_zh,
972+
self.ja_zh,
952973
) {
953974
if breakable && !after_zwj {
954975
return self.get_current_position();
@@ -1151,7 +1172,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
11511172

11521173
fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
11531174
match codepoint.into() {
1154-
0x301C | 0x30A0 => self.options.ja_zh,
1175+
0x301C | 0x30A0 => self.ja_zh,
11551176
_ => false,
11561177
}
11571178
}

components/segmenter/tests/css_line_break.rs

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// called LICENSE at the top level of the ICU4X source tree
33
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
44

5+
use icu::locale::locale;
56
use icu_segmenter::LineBreakOptions;
67
use icu_segmenter::LineBreakStrictness;
78
use icu_segmenter::LineBreakWordOption;
@@ -31,31 +32,47 @@ fn strict(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize
3132
let mut options = LineBreakOptions::default();
3233
options.strictness = LineBreakStrictness::Strict;
3334
options.word_option = LineBreakWordOption::Normal;
34-
options.ja_zh = ja_zh;
35+
options.content_locale = if ja_zh {
36+
Some(locale!("ja").into())
37+
} else {
38+
None
39+
};
3540
check_with_options(s, expect_utf8, expect_utf16, options);
3641
}
3742

3843
fn normal(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
3944
let mut options = LineBreakOptions::default();
4045
options.strictness = LineBreakStrictness::Normal;
4146
options.word_option = LineBreakWordOption::Normal;
42-
options.ja_zh = ja_zh;
47+
options.content_locale = if ja_zh {
48+
Some(locale!("ja").into())
49+
} else {
50+
None
51+
};
4352
check_with_options(s, expect_utf8, expect_utf16, options);
4453
}
4554

4655
fn loose(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
4756
let mut options = LineBreakOptions::default();
4857
options.strictness = LineBreakStrictness::Loose;
4958
options.word_option = LineBreakWordOption::Normal;
50-
options.ja_zh = ja_zh;
59+
options.content_locale = if ja_zh {
60+
Some(locale!("ja").into())
61+
} else {
62+
None
63+
};
5164
check_with_options(s, expect_utf8, expect_utf16, options);
5265
}
5366

5467
fn anywhere(s: &str, ja_zh: bool, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
5568
let mut options = LineBreakOptions::default();
5669
options.strictness = LineBreakStrictness::Anywhere;
5770
options.word_option = LineBreakWordOption::Normal;
58-
options.ja_zh = ja_zh;
71+
options.content_locale = if ja_zh {
72+
Some(locale!("ja").into())
73+
} else {
74+
None
75+
};
5976
check_with_options(s, expect_utf8, expect_utf16, options);
6077
}
6178

components/segmenter/tests/css_word_break.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,23 +31,23 @@ fn break_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
3131
let mut options = LineBreakOptions::default();
3232
options.strictness = LineBreakStrictness::Strict;
3333
options.word_option = LineBreakWordOption::BreakAll;
34-
options.ja_zh = false;
34+
options.content_locale = None;
3535
check_with_options(s, expect_utf8, expect_utf16, options);
3636
}
3737

3838
fn keep_all(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
3939
let mut options = LineBreakOptions::default();
4040
options.strictness = LineBreakStrictness::Strict;
4141
options.word_option = LineBreakWordOption::KeepAll;
42-
options.ja_zh = false;
42+
options.content_locale = None;
4343
check_with_options(s, expect_utf8, expect_utf16, options);
4444
}
4545

4646
fn normal(s: &str, expect_utf8: Vec<usize>, expect_utf16: Vec<usize>) {
4747
let mut options = LineBreakOptions::default();
4848
options.strictness = LineBreakStrictness::Strict;
4949
options.word_option = LineBreakWordOption::Normal;
50-
options.ja_zh = false;
50+
options.content_locale = None;
5151
check_with_options(s, expect_utf8, expect_utf16, options);
5252
}
5353

ffi/capi/bindings/c/LineBreakOptionsV1.d.h

Lines changed: 0 additions & 26 deletions
This file was deleted.

ffi/capi/bindings/c/LineBreakOptionsV2.d.h

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/c/LineBreakOptionsV1.h renamed to ffi/capi/bindings/c/LineBreakOptionsV2.h

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/c/LineSegmenter.h

Lines changed: 8 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ffi/capi/bindings/cpp/icu4x/LineBreakOptionsV1.d.hpp renamed to ffi/capi/bindings/cpp/icu4x/LineBreakOptionsV2.d.hpp

Lines changed: 8 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)