@@ -42,7 +42,7 @@ fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
42
42
}
43
43
44
44
// Perform decomposition for Hangul
45
- if is_hangul ( c) {
45
+ if is_hangul_syllable ( c) {
46
46
decompose_hangul ( c, emit_char) ;
47
47
return ;
48
48
}
@@ -77,26 +77,34 @@ const T_COUNT: u32 = 28;
77
77
const N_COUNT : u32 = ( V_COUNT * T_COUNT ) ;
78
78
const S_COUNT : u32 = ( L_COUNT * N_COUNT ) ;
79
79
80
- pub ( crate ) fn is_hangul ( c : char ) -> bool {
80
+ const S_LAST : u32 = S_BASE + S_COUNT - 1 ;
81
+ const L_LAST : u32 = L_BASE + L_COUNT - 1 ;
82
+ const V_LAST : u32 = V_BASE + V_COUNT - 1 ;
83
+ const T_LAST : u32 = T_BASE + T_COUNT - 1 ;
84
+
85
+ // Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
86
+ // i.e. `T_BASE + 1 ... T_LAST`.
87
+ const T_FIRST : u32 = T_BASE + 1 ;
88
+
89
+ pub ( crate ) fn is_hangul_syllable ( c : char ) -> bool {
81
90
( c as u32 ) >= S_BASE && ( c as u32 ) < ( S_BASE + S_COUNT )
82
91
}
83
92
84
93
// Decompose a precomposed Hangul syllable
85
94
#[ allow( unsafe_code) ]
86
95
#[ inline( always) ]
87
96
fn decompose_hangul < F > ( s : char , mut emit_char : F ) where F : FnMut ( char ) {
88
- let si = s as u32 - S_BASE ;
89
-
90
- let li = si / N_COUNT ;
97
+ let s_index = s as u32 - S_BASE ;
98
+ let l_index = s_index / N_COUNT ;
91
99
unsafe {
92
- emit_char ( char:: from_u32_unchecked ( L_BASE + li ) ) ;
100
+ emit_char ( char:: from_u32_unchecked ( L_BASE + l_index ) ) ;
93
101
94
- let vi = ( si % N_COUNT ) / T_COUNT ;
95
- emit_char ( char:: from_u32_unchecked ( V_BASE + vi ) ) ;
102
+ let v_index = ( s_index % N_COUNT ) / T_COUNT ;
103
+ emit_char ( char:: from_u32_unchecked ( V_BASE + v_index ) ) ;
96
104
97
- let ti = si % T_COUNT ;
98
- if ti > 0 {
99
- emit_char ( char:: from_u32_unchecked ( T_BASE + ti ) ) ;
105
+ let t_index = s_index % T_COUNT ;
106
+ if t_index > 0 {
107
+ emit_char ( char:: from_u32_unchecked ( T_BASE + t_index ) ) ;
100
108
}
101
109
}
102
110
}
@@ -112,20 +120,33 @@ pub(crate) fn hangul_decomposition_length(s: char) -> usize {
112
120
#[ allow( unsafe_code) ]
113
121
#[ inline( always) ]
114
122
fn compose_hangul ( a : char , b : char ) -> Option < char > {
115
- let l = a as u32 ;
116
- let v = b as u32 ;
117
- // Compose an LPart and a VPart
118
- if L_BASE <= l && l < ( L_BASE + L_COUNT ) // l should be an L choseong jamo
119
- && V_BASE <= v && v < ( V_BASE + V_COUNT ) { // v should be a V jungseong jamo
120
- let r = S_BASE + ( l - L_BASE ) * N_COUNT + ( v - V_BASE ) * T_COUNT ;
121
- return unsafe { Some ( char:: from_u32_unchecked ( r) ) } ;
123
+ let ( a, b) = ( a as u32 , b as u32 ) ;
124
+ match ( a, b) {
125
+ // Compose a leading consonant and a vowel together into an LV_Syllable
126
+ ( L_BASE ... L_LAST , V_BASE ... V_LAST ) => {
127
+ let l_index = a - L_BASE ;
128
+ let v_index = b - V_BASE ;
129
+ let lv_index = l_index * N_COUNT + v_index * T_COUNT ;
130
+ let s = S_BASE + lv_index;
131
+ Some ( unsafe { char:: from_u32_unchecked ( s) } )
132
+ } ,
133
+ // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
134
+ ( S_BASE ... S_LAST , T_FIRST ... T_LAST ) if ( a - S_BASE ) % T_COUNT == 0 => {
135
+ Some ( unsafe { char:: from_u32_unchecked ( a + ( b - T_BASE ) ) } )
136
+ } ,
137
+ _ => None ,
122
138
}
123
- // Compose an LVPart and a TPart
124
- if S_BASE <= l && l <= ( S_BASE +S_COUNT -T_COUNT ) // l should be a syllable block
125
- && T_BASE <= v && v < ( T_BASE +T_COUNT ) // v should be a T jongseong jamo
126
- && ( l - S_BASE ) % T_COUNT == 0 { // l should be an LV syllable block (not LVT)
127
- let r = l + ( v - T_BASE ) ;
128
- return unsafe { Some ( char:: from_u32_unchecked ( r) ) } ;
139
+ }
140
+
141
#[cfg(test)]
mod tests {
    use super::compose_hangul;

    // Regression test from a bugfix where we were composing an LV_Syllable with
    // T_BASE directly. (We should only compose an LV_Syllable with a character
    // in the range `T_BASE + 1 ..= T_LAST`.)
    #[test]
    fn test_hangul_composition() {
        assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
    }

    // Positive and boundary cases, so a change to the range patterns cannot
    // silently stop valid pairs from composing. The expected values assume
    // the standard Unicode Hangul constants (`S_BASE = U+AC00`,
    // `L_BASE = U+1100`, `V_BASE = U+1161`, `T_BASE = U+11A7`).
    #[test]
    fn test_hangul_composition_valid_pairs() {
        // Leading consonant + vowel -> LV_Syllable
        assert_eq!(compose_hangul('\u{1100}', '\u{1161}'), Some('\u{ac00}'));
        // LV_Syllable + first / last composable trailing consonant
        assert_eq!(compose_hangul('\u{ac00}', '\u{11a8}'), Some('\u{ac01}'));
        assert_eq!(compose_hangul('\u{ac00}', '\u{11c2}'), Some('\u{ac1b}'));
        // An LVT_Syllable must not absorb another trailing consonant
        assert_eq!(compose_hangul('\u{ac01}', '\u{11a8}'), None);
    }
}
0 commit comments