Skip to content

Commit 7cb4f39

Browse files
authored
Merge pull request #48 from Jules-Bertholet/lisu-tone
Lisu tone letters
2 parents 6edfc60 + b3ab633 commit 7cb4f39

File tree

4 files changed

+63 additions, -1312 deletions (lines changed)

4 files changed

+63 additions, -1312 deletions (lines changed)

scripts/unicode.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -754,9 +754,8 @@ def main(module_path: str):
754754
{EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS},
755755
)
756756

757-
# Download files for use by tests
757+
# Download normalization test file for use by tests
758758
fetch_open("NormalizationTest.txt", "../tests/")
759-
fetch_open("auxiliary/GraphemeBreakTest.txt", "../tests/")
760759

761760
print("------------------------")
762761
total_size = 0

src/lib.rs

Lines changed: 38 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -30,17 +30,18 @@
3030
//! # Rules for determining width
3131
//!
3232
//! This crate currently uses the following rules to determine the width of a
33-
//! character or string, in order of decreasing precedence. These may be tweaked in the future;
34-
//! however see [guarantees](#guarantees) below.
33+
//! character or string, in order of decreasing precedence. These may be tweaked in the future.
3534
//!
3635
//! 1. [Emoji presentation sequences] have width 2.
3736
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
3837
//! if their base character:
3938
//! - Has the [`Emoji_Presentation`] property, and
4039
//! - Is not in the [Enclosed Ideographic Supplement] block.
4140
//! 3. The sequence `"\r\n"` has width 1.
42-
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43-
//! 5. The following have width 0:
41+
//! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
42+
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
43+
//! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
44+
//! 6. The following have width 0:
4445
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
4546
//! with the [`Default_Ignorable_Code_Point`] property.
4647
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -56,11 +57,11 @@
5657
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
5758
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
5859
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
59-
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
60+
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
6061
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
61-
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
62+
//! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
6263
//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
63-
//! 8. All other characters have width 1.
64+
//! 9. All other characters have width 1.
6465
//!
6566
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
6667
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
@@ -77,16 +78,12 @@
7778
//!
7879
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
7980
//!
80-
//! ## Guarantees
81+
//! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
8182
//!
82-
//! - Any two canonically equivalent strings have the same non-CJK width.
83-
//! This will not change in any future semver-compatible version.
84-
//! (This guarantee does not currently hold for the CJK width variants.)
85-
//! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
86-
//! This is unlikely to change in any future semver-compatible version.
87-
//! (This guarantee holds for both CJK and non-CJK width.)
83+
//! ## Canonical equivalence
8884
//!
89-
//! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
85+
//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
86+
//! However, this guarantee does not currently hold for the CJK width variants.
9087
9188
#![forbid(unsafe_code)]
9289
#![deny(missing_docs)]
@@ -102,14 +99,6 @@ pub use tables::UNICODE_VERSION;
10299
mod tables;
103100

104101
/// Methods for determining displayed width of Unicode characters.
105-
///
106-
/// **NB:** the width of a string may differ from the sum of the widths of its characters;
107-
/// see the [crate-level documentation](crate#rules-for-determining-width) for more.
108-
/// Instead of working with individual characters, consider using [extended grapheme clusters],
109-
/// perhaps with the [`unicode-segmentation`] crate.
110-
///
111-
/// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
112-
/// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
113102
pub trait UnicodeWidthChar {
114103
/// Returns the character's displayed width in columns, or `None` if the
115104
/// character is a control character.
@@ -200,8 +189,14 @@ impl UnicodeWidthStr for str {
200189
enum NextCharInfo {
201190
#[default]
202191
Default,
192+
/// `'\n'`
203193
LineFeed = 0x0A,
194+
/// `'\u{A4FC}'..='\u{A4FD}'`
195+
/// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
196+
TrailingLisuToneLetter,
197+
/// `'\u{FE0E}'`
204198
Vs15 = 0x0E,
199+
/// `'\u{FE0F}'`
205200
Vs16 = 0x0F,
206201
}
207202

@@ -219,25 +214,28 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
219214
/// they're treated as single width.
220215
#[inline]
221216
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
222-
match next_info {
223-
NextCharInfo::Vs15 if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) => {
224-
(1, NextCharInfo::Default)
217+
if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
218+
(2, NextCharInfo::Default)
219+
} else if c <= '\u{A0}' {
220+
match c {
221+
'\n' => (1, NextCharInfo::LineFeed),
222+
'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
223+
_ => (1, NextCharInfo::Default),
225224
}
226-
NextCharInfo::Vs16 if cw::starts_emoji_presentation_seq(c) => (2, NextCharInfo::Default),
227-
_ => {
228-
if c <= '\u{A0}' {
229-
match c {
230-
'\n' => (1, NextCharInfo::LineFeed),
231-
'\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
232-
_ => (1, NextCharInfo::Default),
233-
}
234-
} else {
235-
match c {
236-
'\u{FE0E}' => (0, NextCharInfo::Vs15),
237-
'\u{FE0F}' => (0, NextCharInfo::Vs16),
238-
_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
239-
}
225+
} else {
226+
match (c, next_info) {
227+
('\u{A4F8}'..='\u{A4FB}', NextCharInfo::TrailingLisuToneLetter) => {
228+
(0, NextCharInfo::Default)
229+
}
230+
('\u{A4FC}'..='\u{A4FD}', _) => (1, NextCharInfo::TrailingLisuToneLetter),
231+
('\u{FE0E}', _) => (0, NextCharInfo::Vs15),
232+
('\u{FE0F}', _) => (0, NextCharInfo::Vs16),
233+
(_, NextCharInfo::Vs15)
234+
if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
235+
{
236+
(1, NextCharInfo::Default)
240237
}
238+
_ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
241239
}
242240
}
243241
}

0 commit comments

Comments
 (0)