@@ -135,28 +135,51 @@ pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndice
135
135
}
136
136
137
137
// maybe unify with PairResult?
138
+ // An enum describing information about a potential boundary.
138
139
#[ derive( PartialEq , Eq , Clone ) ]
139
140
enum GraphemeState {
141
+ // No information is known.
140
142
Unknown ,
143
+ // It is known to not be a boundary.
141
144
NotBreak ,
145
+ // It is known to be a boundary.
142
146
Break ,
147
+ // The codepoint after is LF, so a boundary iff the codepoint before is not CR. (GB3)
143
148
CheckCrlf ,
149
+ // The codepoint after is a Regional Indicator Symbol, so a boundary iff
150
+ // it is preceded by an even number of RIS codepoints. (GB12, GB13)
144
151
Regional ,
152
+ // The codepoint after is in the E_Modifier category, so whether it's a boundary
153
+ // depends on pre-context according to GB10.
145
154
Emoji ,
146
155
}
147
156
148
157
/// Cursor-based segmenter for grapheme clusters.
149
158
#[ derive( Clone ) ]
150
159
pub struct GraphemeCursor {
151
- offset : usize , // current cursor position
152
- len : usize , // total length of the string
160
+ // Current cursor position.
161
+ offset : usize ,
162
+ // Total length of the string.
163
+ len : usize ,
164
+ // A config flag indicating whether this cursor computes legacy or extended
165
+ // grapheme cluster boundaries (enables GB9a and GB9b if set).
153
166
is_extended : bool ,
167
+ // Information about the potential boundary at `offset`.
154
168
state : GraphemeState ,
155
- cat_before : Option < GraphemeCat > , // category of codepoint immediately preceding cursor
156
- cat_after : Option < GraphemeCat > , // category of codepoint immediately after cursor
169
+ // Category of codepoint immediately preceding cursor, if known.
170
+ cat_before : Option < GraphemeCat > ,
171
+ // Category of codepoint immediately after cursor, if known.
172
+ cat_after : Option < GraphemeCat > ,
173
+ // If set, at least one more codepoint immediately preceding this offset
174
+ // is needed to resolve whether there's a boundary at `offset`.
157
175
pre_context_offset : Option < usize > ,
176
+ // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
177
+ // is set, this counts the RIS between that offset and `offset`; otherwise it
178
+ // is an accurate count relative to the string.
158
179
ris_count : Option < usize > ,
159
- resuming : bool , // query was suspended
180
+ // Set if a call to `prev_boundary` or `next_boundary` was suspended due
181
+ // to needing more input.
182
+ resuming : bool ,
160
183
}
161
184
162
185
/// An error return indicating that not enough content was available in the
@@ -183,14 +206,15 @@ pub enum GraphemeIncomplete {
183
206
InvalidOffset ,
184
207
}
185
208
209
+ // An enum describing the result from lookup of a pair of categories.
186
210
#[ derive( PartialEq , Eq ) ]
187
211
enum PairResult {
188
212
NotBreak , // definitely not a break
189
213
Break , // definitely a break
190
- Extended , // a break if not in extended mode
214
+ Extended , // a break iff not in extended mode
191
215
CheckCrlf , // a break unless it's a CR LF pair
192
216
Regional , // a break if preceded by an even number of RIS
193
- Emoji , // a break if preceded by emoji base and extend
217
+ Emoji , // a break if preceded by emoji base and (Extend)*
194
218
}
195
219
196
220
fn check_pair ( before : GraphemeCat , after : GraphemeCat ) -> PairResult {
@@ -213,7 +237,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
213
237
( _, GC_Extend ) => NotBreak , // GB9
214
238
( _, GC_ZWJ ) => NotBreak , // GB9
215
239
( _, GC_SpacingMark ) => Extended , // GB9a
216
- ( GC_Prepend , _) => Extended , // GB9a
240
+ ( GC_Prepend , _) => Extended , // GB9b
217
241
( GC_E_Base , GC_E_Modifier ) => NotBreak , // GB10
218
242
( GC_E_Base_GAZ , GC_E_Modifier ) => NotBreak , // GB10
219
243
( GC_Extend , GC_E_Modifier ) => Emoji , // GB10
@@ -230,6 +254,15 @@ impl GraphemeCursor {
230
254
/// controls whether extended grapheme clusters are selected.
231
255
///
232
256
/// The `offset` parameter must be on a codepoint boundary.
257
+ ///
258
+ /// ```rust
259
+ /// # use unicode_segmentation::GraphemeCursor;
260
+ /// let s = "हिन्दी";
261
+ /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
262
+ /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
263
+ /// let mut extended = GraphemeCursor::new(0, s.len(), true);
264
+ /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
265
+ /// ```
233
266
pub fn new ( offset : usize , len : usize , is_extended : bool ) -> GraphemeCursor {
234
267
let state = if offset == 0 || offset == len {
235
268
GraphemeState :: Break
@@ -252,6 +285,15 @@ impl GraphemeCursor {
252
285
// Not sure I'm going to keep this; the advantage over new() seems thin.
253
286
254
287
/// Set the cursor to a new location in the same string.
288
+ ///
289
+ /// ```rust
290
+ /// # use unicode_segmentation::GraphemeCursor;
291
+ /// let s = "abcd";
292
+ /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
293
+ /// assert_eq!(cursor.cur_cursor(), 0);
294
+ /// cursor.set_cursor(2);
295
+ /// assert_eq!(cursor.cur_cursor(), 2);
296
+ /// ```
255
297
pub fn set_cursor ( & mut self , offset : usize ) {
256
298
if offset != self . offset {
257
299
self . offset = offset;
@@ -270,13 +312,39 @@ impl GraphemeCursor {
270
312
/// The current offset of the cursor. Equal to the last value provided to
271
313
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
272
314
/// `prev_boundary()`.
315
+ ///
316
+ /// ```rust
317
+ /// # use unicode_segmentation::GraphemeCursor;
318
+ /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
319
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
320
+ /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
321
+ /// assert_eq!(cursor.cur_cursor(), 4);
322
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
323
+ /// assert_eq!(cursor.cur_cursor(), 8);
324
+ /// ```
273
325
pub fn cur_cursor ( & self ) -> usize {
274
326
self . offset
275
327
}
276
328
277
329
/// Provide additional pre-context when it is needed to decide a boundary.
278
330
/// The end of the chunk must coincide with the value given in the
279
331
/// `GraphemeIncomplete::PreContext` request.
332
+ ///
333
+ /// ```rust
334
+ /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
335
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
336
+ /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
337
+ /// // Not enough pre-context to decide if there's a boundary between the two flags.
338
+ /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
339
+ /// // Provide one more Regional Indicator Symbol of pre-context
340
+ /// cursor.provide_context(&flags[4..8], 4);
341
+ /// // Still not enough context to decide.
342
+ /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
343
+ /// // Provide additional requested context.
344
+ /// cursor.provide_context(&flags[0..4], 0);
345
+ /// // That's enough to decide (it always is when context goes to the start of the string)
346
+ /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
347
+ /// ```
280
348
pub fn provide_context ( & mut self , chunk : & str , chunk_start : usize ) {
281
349
use tables:: grapheme as gr;
282
350
assert ! ( chunk_start + chunk. len( ) == self . pre_context_offset. unwrap( ) ) ;
@@ -379,6 +447,15 @@ impl GraphemeCursor {
379
447
/// All calls should have consistent chunk contents (ie, if a chunk provides
380
448
/// content for a given slice, all further chunks covering that slice must have
381
449
/// the same content for it).
450
+ ///
451
+ /// ```rust
452
+ /// # use unicode_segmentation::GraphemeCursor;
453
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
454
+ /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
455
+ /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
456
+ /// cursor.set_cursor(12);
457
+ /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
458
+ /// ```
382
459
pub fn is_boundary ( & mut self , chunk : & str , chunk_start : usize ) -> Result < bool , GraphemeIncomplete > {
383
460
use tables:: grapheme as gr;
384
461
if self . state == GraphemeState :: Break {
@@ -388,7 +465,9 @@ impl GraphemeCursor {
388
465
return Ok ( false )
389
466
}
390
467
if self . offset < chunk_start || self . offset >= chunk_start + chunk. len ( ) {
391
- return Err ( GraphemeIncomplete :: InvalidOffset )
468
+ if self . offset > chunk_start + chunk. len ( ) || self . cat_after . is_none ( ) {
469
+ return Err ( GraphemeIncomplete :: InvalidOffset )
470
+ }
392
471
}
393
472
if let Some ( pre_context_offset) = self . pre_context_offset {
394
473
return Err ( GraphemeIncomplete :: PreContext ( pre_context_offset) ) ;
@@ -399,6 +478,7 @@ impl GraphemeCursor {
399
478
self . cat_after = Some ( gr:: grapheme_category ( ch) ) ;
400
479
}
401
480
if self . offset == chunk_start {
481
+ let mut need_pre_context = true ;
402
482
match self . cat_after . unwrap ( ) {
403
483
gr:: GC_Control => {
404
484
if chunk. as_bytes ( ) [ offset_in_chunk] == b'\n' {
@@ -407,10 +487,12 @@ impl GraphemeCursor {
407
487
}
408
488
gr:: GC_Regional_Indicator => self . state = GraphemeState :: Regional ,
409
489
gr:: GC_E_Modifier => self . state = GraphemeState :: Emoji ,
410
- _ => ( )
490
+ _ => need_pre_context = self . cat_before . is_none ( ) ,
491
+ }
492
+ if need_pre_context {
493
+ self . pre_context_offset = Some ( chunk_start) ;
494
+ return Err ( GraphemeIncomplete :: PreContext ( chunk_start) ) ;
411
495
}
412
- self . pre_context_offset = Some ( chunk_start) ;
413
- return Err ( GraphemeIncomplete :: PreContext ( chunk_start) ) ;
414
496
}
415
497
if self . cat_before . is_none ( ) {
416
498
let ch = chunk[ ..offset_in_chunk] . chars ( ) . rev ( ) . next ( ) . unwrap ( ) ;
@@ -457,6 +539,29 @@ impl GraphemeCursor {
457
539
/// given, then retry.
458
540
///
459
541
/// See `is_boundary` for expectations on the provided chunk.
542
+ ///
543
+ /// ```rust
544
+ /// # use unicode_segmentation::GraphemeCursor;
545
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
546
+ /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
547
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
548
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
549
+ /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
550
+ /// ```
551
+ ///
552
+ /// And an example that uses partial strings:
553
+ ///
554
+ /// ```rust
555
+ /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
556
+ /// let s = "abcd";
557
+ /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
558
+ /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
559
+ /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
560
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
561
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
562
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
563
+ /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
564
+ /// ```
460
565
pub fn next_boundary ( & mut self , chunk : & str , chunk_start : usize ) -> Result < Option < usize > , GraphemeIncomplete > {
461
566
use tables:: grapheme as gr;
462
567
if self . offset == self . len {
@@ -509,6 +614,30 @@ impl GraphemeCursor {
509
614
/// given, then retry.
510
615
///
511
616
/// See `is_boundary` for expectations on the provided chunk.
617
+ ///
618
+ /// ```rust
619
+ /// # use unicode_segmentation::GraphemeCursor;
620
+ /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
621
+ /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
622
+ /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
623
+ /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
624
+ /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
625
+ /// ```
626
+ ///
627
+ /// And an example that uses partial strings (note the exact return is not
628
+ /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
629
+ ///
630
+ /// ```rust
631
+ /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
632
+ /// let s = "abcd";
633
+ /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
634
+ /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
635
+ /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
636
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
637
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
638
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
639
+ /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
640
+ /// ```
512
641
pub fn prev_boundary ( & mut self , chunk : & str , chunk_start : usize ) -> Result < Option < usize > , GraphemeIncomplete > {
513
642
use tables:: grapheme as gr;
514
643
if self . offset == 0 {
0 commit comments