30
30
//! # Rules for determining width
31
31
//!
32
32
//! This crate currently uses the following rules to determine the width of a
33
- //! character or string, in order of decreasing precedence. These may be tweaked in the future;
34
- //! however see [guarantees](#guarantees) below.
33
+ //! character or string, in order of decreasing precedence. These may be tweaked in the future.
35
34
//!
36
35
//! 1. [Emoji presentation sequences] have width 2.
37
36
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
38
37
//! if their base character:
39
38
//! - Has the [`Emoji_Presentation`] property, and
40
39
//! - Is not in the [Enclosed Ideographic Supplement] block.
41
40
//! 3. The sequence `"\r\n"` has width 1.
42
- //! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
43
- //! 5. The following have width 0:
41
+ //! 4. [Lisu tone letter] combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
42
+ //! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1.
43
+ //! 5. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
44
+ //! 6. The following have width 0:
44
45
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
45
46
//! with the [`Default_Ignorable_Code_Point`] property.
46
47
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
56
57
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
57
58
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
58
59
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
59
- //! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
60
+ //! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
60
61
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
61
- //! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
62
+ //! 8. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
62
63
//! with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
63
- //! 8. All other characters have width 1.
64
+ //! 9. All other characters have width 1.
64
65
//!
65
66
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
66
67
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
77
78
//!
78
79
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
79
80
//!
80
- //! ## Guarantees
81
+ //! [Lisu tone letter]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078
81
82
//!
82
- //! - Any two canonically equivalent strings have the same non-CJK width.
83
- //! This will not change in any future semver-compatible version.
84
- //! (This guarantee does not currently hold for the CJK width variants.)
85
- //! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
86
- //! This is unlikely to change in any future semver-compatible version.
87
- //! (This guarantee holds for both CJK and non-CJK width.)
83
+ //! ## Canonical equivalence
88
84
//!
89
- //! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
85
+ //! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
86
+ //! However, this guarantee does not currently hold for the CJK width variants.
90
87
91
88
#![ forbid( unsafe_code) ]
92
89
#![ deny( missing_docs) ]
@@ -102,14 +99,6 @@ pub use tables::UNICODE_VERSION;
102
99
mod tables;
103
100
104
101
/// Methods for determining displayed width of Unicode characters.
105
- ///
106
- /// **NB:** the width of a string may differ from the sum of the widths of its characters;
107
- /// see the [crate-level documentation](crate#rules-for-determining-width) for more.
108
- /// Instead of working with individual characters, consider using [extended grapheme clusters],
109
- /// perhaps with the [`unicode-segmentation`] crate.
110
- ///
111
- /// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
112
- /// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
113
102
pub trait UnicodeWidthChar {
114
103
/// Returns the character's displayed width in columns, or `None` if the
115
104
/// character is a control character.
@@ -200,8 +189,14 @@ impl UnicodeWidthStr for str {
200
189
enum NextCharInfo {
201
190
#[ default]
202
191
Default ,
192
+ /// `'\n'`
203
193
LineFeed = 0x0A ,
194
+ /// `'\u{A4FC}'..='\u{A4FD}'`
195
+ /// <https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078>
196
+ TrailingLisuToneLetter ,
197
+ /// `'\u{FE0E}'`
204
198
Vs15 = 0x0E ,
199
+ /// `'\u{FE0F}'`
205
200
Vs16 = 0x0F ,
206
201
}
207
202
@@ -219,25 +214,28 @@ fn str_width(s: &str, is_cjk: bool) -> usize {
219
214
/// they're treated as single width.
220
215
#[ inline]
221
216
fn width_in_str ( c : char , is_cjk : bool , next_info : NextCharInfo ) -> ( usize , NextCharInfo ) {
222
- match next_info {
223
- NextCharInfo :: Vs15 if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) => {
224
- ( 1 , NextCharInfo :: Default )
217
+ if next_info == NextCharInfo :: Vs16 && cw:: starts_emoji_presentation_seq ( c) {
218
+ ( 2 , NextCharInfo :: Default )
219
+ } else if c <= '\u{A0}' {
220
+ match c {
221
+ '\n' => ( 1 , NextCharInfo :: LineFeed ) ,
222
+ '\r' if next_info == NextCharInfo :: LineFeed => ( 0 , NextCharInfo :: Default ) ,
223
+ _ => ( 1 , NextCharInfo :: Default ) ,
225
224
}
226
- NextCharInfo :: Vs16 if cw:: starts_emoji_presentation_seq ( c) => ( 2 , NextCharInfo :: Default ) ,
227
- _ => {
228
- if c <= '\u{A0}' {
229
- match c {
230
- '\n' => ( 1 , NextCharInfo :: LineFeed ) ,
231
- '\r' if next_info == NextCharInfo :: LineFeed => ( 0 , NextCharInfo :: Default ) ,
232
- _ => ( 1 , NextCharInfo :: Default ) ,
233
- }
234
- } else {
235
- match c {
236
- '\u{FE0E}' => ( 0 , NextCharInfo :: Vs15 ) ,
237
- '\u{FE0F}' => ( 0 , NextCharInfo :: Vs16 ) ,
238
- _ => ( cw:: lookup_width ( c, is_cjk) , NextCharInfo :: Default ) ,
239
- }
225
+ } else {
226
+ match ( c, next_info) {
227
+ ( '\u{A4F8}' ..='\u{A4FB}' , NextCharInfo :: TrailingLisuToneLetter ) => {
228
+ ( 0 , NextCharInfo :: Default )
229
+ }
230
+ ( '\u{A4FC}' ..='\u{A4FD}' , _) => ( 1 , NextCharInfo :: TrailingLisuToneLetter ) ,
231
+ ( '\u{FE0E}' , _) => ( 0 , NextCharInfo :: Vs15 ) ,
232
+ ( '\u{FE0F}' , _) => ( 0 , NextCharInfo :: Vs16 ) ,
233
+ ( _, NextCharInfo :: Vs15 )
234
+ if !is_cjk && cw:: starts_non_ideographic_text_presentation_seq ( c) =>
235
+ {
236
+ ( 1 , NextCharInfo :: Default )
240
237
}
238
+ _ => ( cw:: lookup_width ( c, is_cjk) , NextCharInfo :: Default ) ,
241
239
}
242
240
}
243
241
}
0 commit comments