unicode_segmentation/
grapheme.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12
13use tables::grapheme::GraphemeCat;
14
/// External iterator for grapheme clusters and byte offsets.
///
/// Each item is an `(offset, cluster)` pair, where `offset` is the byte
/// position of `cluster` relative to the start of the string the iterator
/// was created from.
#[derive(Clone)]
pub struct GraphemeIndices<'a> {
    // Address of the first byte of the original string; it is subtracted from
    // each yielded cluster's address to obtain that cluster's byte offset.
    start_offset: usize,
    // Underlying cluster iterator that performs the actual segmentation.
    iter: Graphemes<'a>,
}
21
22impl<'a> GraphemeIndices<'a> {
23    #[inline]
24    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
25    ///
26    /// ```rust
27    /// # use unicode_segmentation::UnicodeSegmentation;
28    /// let mut iter = "abc".grapheme_indices(true);
29    /// assert_eq!(iter.as_str(), "abc");
30    /// iter.next();
31    /// assert_eq!(iter.as_str(), "bc");
32    /// iter.next();
33    /// iter.next();
34    /// assert_eq!(iter.as_str(), "");
35    /// ```
36    pub fn as_str(&self) -> &'a str {
37        self.iter.as_str()
38    }
39}
40
41impl<'a> Iterator for GraphemeIndices<'a> {
42    type Item = (usize, &'a str);
43
44    #[inline]
45    fn next(&mut self) -> Option<(usize, &'a str)> {
46        self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
47    }
48
49    #[inline]
50    fn size_hint(&self) -> (usize, Option<usize>) {
51        self.iter.size_hint()
52    }
53}
54
55impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
56    #[inline]
57    fn next_back(&mut self) -> Option<(usize, &'a str)> {
58        self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
59    }
60}
61
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// The iterator is double-ended: it keeps one cursor for the front and one
/// for the back, and is exhausted once the two cursors meet.
#[derive(Clone)]
pub struct Graphemes<'a> {
    // The complete string being segmented.
    string: &'a str,
    // Cursor marking the front of the not-yet-iterated region.
    cursor: GraphemeCursor,
    // Cursor marking the back of the not-yet-iterated region.
    cursor_back: GraphemeCursor,
}
70
71impl<'a> Graphemes<'a> {
72    #[inline]
73    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
74    ///
75    /// ```rust
76    /// # use unicode_segmentation::UnicodeSegmentation;
77    /// let mut iter = "abc".graphemes(true);
78    /// assert_eq!(iter.as_str(), "abc");
79    /// iter.next();
80    /// assert_eq!(iter.as_str(), "bc");
81    /// iter.next();
82    /// iter.next();
83    /// assert_eq!(iter.as_str(), "");
84    /// ```
85    pub fn as_str(&self) -> &'a str {
86        &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
87    }
88}
89
90impl<'a> Iterator for Graphemes<'a> {
91    type Item = &'a str;
92
93    #[inline]
94    fn size_hint(&self) -> (usize, Option<usize>) {
95        let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
96        (cmp::min(slen, 1), Some(slen))
97    }
98
99    #[inline]
100    fn next(&mut self) -> Option<&'a str> {
101        let start = self.cursor.cur_cursor();
102        if start == self.cursor_back.cur_cursor() {
103            return None;
104        }
105        let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
106        Some(&self.string[start..next])
107    }
108}
109
110impl<'a> DoubleEndedIterator for Graphemes<'a> {
111    #[inline]
112    fn next_back(&mut self) -> Option<&'a str> {
113        let end = self.cursor_back.cur_cursor();
114        if end == self.cursor.cur_cursor() {
115            return None;
116        }
117        let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap();
118        Some(&self.string[prev..end])
119    }
120}
121
122#[inline]
123pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
124    let len = s.len();
125    Graphemes {
126        string: s,
127        cursor: GraphemeCursor::new(0, len, is_extended),
128        cursor_back: GraphemeCursor::new(len, len, is_extended),
129    }
130}
131
132#[inline]
133pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
134    GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }
135}
136
// maybe unify with PairResult?
// An enum describing information about a potential boundary at the cursor.
// `Break` and `NotBreak` are final answers; `Regional` and `Emoji` mark a
// decision that is still pending on pre-context.
#[derive(PartialEq, Eq, Clone)]
enum GraphemeState {
    // No information is known.
    Unknown,
    // It is known to not be a boundary.
    NotBreak,
    // It is known to be a boundary.
    Break,
    // The codepoint after is a Regional Indicator Symbol, so a boundary iff
    // it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    // The codepoint after is Extended_Pictographic,
    // so whether it's a boundary depends on pre-context according to GB11.
    Emoji,
}
154
/// Cursor-based segmenter for grapheme clusters.
///
/// The cursor stores only a position and cached boundary state; the text
/// itself is supplied chunk by chunk to each query method.
#[derive(Clone)]
pub struct GraphemeCursor {
    // Current cursor position, as a byte offset into the whole string.
    offset: usize,
    // Total length of the string, in bytes.
    len: usize,
    // A config flag indicating whether this cursor computes legacy or extended
    // grapheme cluster boundaries (enables GB9a and GB9b if set).
    is_extended: bool,
    // Information about the potential boundary at `offset`
    state: GraphemeState,
    // Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    // Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    // If set, at least one more codepoint immediately preceding this offset
    // is needed to resolve whether there's a boundary at `offset`.
    pre_context_offset: Option<usize>,
    // The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    // is set, then counts the number of RIS between that and `offset`, otherwise
    // is an accurate count relative to the string.
    ris_count: Option<usize>,
    // Set if a call to `prev_boundary` or `next_boundary` was suspended due
    // to needing more input. While set, the next call does not move `offset`
    // again before re-checking the boundary.
    resuming: bool,
}
182
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk,  // requesting chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
206
// An enum describing the result from lookup of a pair of categories
// (the codepoint before the cursor and the codepoint after it).
#[derive(PartialEq, Eq)]
enum PairResult {
    NotBreak,  // definitely not a break
    Break,  // definitely a break
    Extended,  // a break iff not in extended mode
    Regional,  // a break if preceded by an even number of RIS
    Emoji,  // not a break if the ZWJ is preceded by Extended_Pictographic and (Extend)*, otherwise a break
}
216
217fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
218    use tables::grapheme::GraphemeCat::*;
219    use self::PairResult::*;
220    match (before, after) {
221        (GC_CR, GC_LF) => NotBreak,  // GB3
222        (GC_Control, _) => Break,  // GB4
223        (GC_CR, _) => Break,  // GB4
224        (GC_LF, _) => Break,  // GB4
225        (_, GC_Control) => Break,  // GB5
226        (_, GC_CR) => Break,  // GB5
227        (_, GC_LF) => Break,  // GB5
228        (GC_L, GC_L) => NotBreak,  // GB6
229        (GC_L, GC_V) => NotBreak,  // GB6
230        (GC_L, GC_LV) => NotBreak,  // GB6
231        (GC_L, GC_LVT) => NotBreak,  // GB6
232        (GC_LV, GC_V) => NotBreak,  // GB7
233        (GC_LV, GC_T) => NotBreak,  // GB7
234        (GC_V, GC_V) => NotBreak,  // GB7
235        (GC_V, GC_T) => NotBreak,  // GB7
236        (GC_LVT, GC_T) => NotBreak,  // GB8
237        (GC_T, GC_T) => NotBreak,  // GB8
238        (_, GC_Extend) => NotBreak, // GB9
239        (_, GC_ZWJ) => NotBreak,  // GB9
240        (_, GC_SpacingMark) => Extended,  // GB9a
241        (GC_Prepend, _) => Extended,  // GB9b
242        (GC_ZWJ, GC_Extended_Pictographic) => Emoji,  // GB11
243        (GC_Regional_Indicator, GC_Regional_Indicator) => Regional,  // GB12, GB13
244        (_, _) => Break,  // GB999
245    }
246}
247
248impl GraphemeCursor {
249    /// Create a new cursor. The string and initial offset are given at creation
250    /// time, but the contents of the string are not. The `is_extended` parameter
251    /// controls whether extended grapheme clusters are selected.
252    ///
253    /// The `offset` parameter must be on a codepoint boundary.
254    ///
255    /// ```rust
256    /// # use unicode_segmentation::GraphemeCursor;
257    /// let s = "हिन्दी";
258    /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
259    /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
260    /// let mut extended = GraphemeCursor::new(0, s.len(), true);
261    /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
262    /// ```
263    pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
264        let state = if offset == 0 || offset == len {
265            GraphemeState::Break
266        } else {
267            GraphemeState::Unknown
268        };
269        GraphemeCursor {
270            offset: offset,
271            len: len,
272            state: state,
273            is_extended: is_extended,
274            cat_before: None,
275            cat_after: None,
276            pre_context_offset: None,
277            ris_count: None,
278            resuming: false,
279        }
280    }
281
282    // Not sure I'm gonna keep this, the advantage over new() seems thin.
283
284    /// Set the cursor to a new location in the same string.
285    ///
286    /// ```rust
287    /// # use unicode_segmentation::GraphemeCursor;
288    /// let s = "abcd";
289    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
290    /// assert_eq!(cursor.cur_cursor(), 0);
291    /// cursor.set_cursor(2);
292    /// assert_eq!(cursor.cur_cursor(), 2);
293    /// ```
294    pub fn set_cursor(&mut self, offset: usize) {
295        if offset != self.offset {
296            self.offset = offset;
297            self.state = if offset == 0 || offset == self.len {
298                GraphemeState::Break
299            } else {
300                GraphemeState::Unknown
301            };
302            // reset state derived from text around cursor
303            self.cat_before = None;
304            self.cat_after = None;
305            self.ris_count = None;
306        }
307    }
308
    #[inline]
    /// The current offset of the cursor, in bytes. Equal to the last value
    /// provided to `new()` or `set_cursor()`, or returned from
    /// `next_boundary()` or `prev_boundary()`.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
    /// assert_eq!(cursor.cur_cursor(), 4);
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.cur_cursor(), 8);
    /// ```
    pub fn cur_cursor(&self) -> usize {
        self.offset
    }
326
327    /// Provide additional pre-context when it is needed to decide a boundary.
328    /// The end of the chunk must coincide with the value given in the
329    /// `GraphemeIncomplete::PreContext` request.
330    ///
331    /// ```rust
332    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
333    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
334    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
335    /// // Not enough pre-context to decide if there's a boundary between the two flags.
336    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
337    /// // Provide one more Regional Indicator Symbol of pre-context
338    /// cursor.provide_context(&flags[4..8], 4);
339    /// // Still not enough context to decide.
340    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
341    /// // Provide additional requested context.
342    /// cursor.provide_context(&flags[0..4], 0);
343    /// // That's enough to decide (it always is when context goes to the start of the string)
344    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
345    /// ```
346    pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
347        use tables::grapheme as gr;
348        assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
349        self.pre_context_offset = None;
350        if self.is_extended && chunk_start + chunk.len() == self.offset {
351            let ch = chunk.chars().rev().next().unwrap();
352            if gr::grapheme_category(ch) == gr::GC_Prepend {
353                self.decide(false);  // GB9b
354                return;
355            }
356        }
357        match self.state {
358            GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
359            GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
360            _ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
361                let ch = chunk.chars().rev().next().unwrap();
362                self.cat_before = Some(gr::grapheme_category(ch));
363            },
364        }
365    }
366
367    fn decide(&mut self, is_break: bool) {
368        self.state = if is_break {
369            GraphemeState::Break
370        } else {
371            GraphemeState::NotBreak
372        };
373    }
374
375    fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
376        self.decide(is_break);
377        Ok(is_break)
378    }
379
380    fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
381        if self.state == GraphemeState::Break {
382            Ok(true)
383        } else if self.state == GraphemeState::NotBreak {
384            Ok(false)
385        } else if let Some(pre_context_offset) = self.pre_context_offset {
386            Err(GraphemeIncomplete::PreContext(pre_context_offset))
387        } else {
388            unreachable!("inconsistent state");
389        }
390    }
391
392    fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
393        use tables::grapheme as gr;
394        let mut ris_count = self.ris_count.unwrap_or(0);
395        for ch in chunk.chars().rev() {
396            if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
397                self.ris_count = Some(ris_count);
398                self.decide((ris_count % 2) == 0);
399                return;
400            }
401            ris_count += 1;
402        }
403        self.ris_count = Some(ris_count);
404        if chunk_start == 0 {
405            self.decide((ris_count % 2) == 0);
406            return;
407        }
408        self.pre_context_offset = Some(chunk_start);
409        self.state = GraphemeState::Regional;
410    }
411
412    fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
413        use tables::grapheme as gr;
414        let mut iter = chunk.chars().rev();
415        if let Some(ch) = iter.next() {
416            if gr::grapheme_category(ch) != gr::GC_ZWJ {
417                self.decide(true);
418                return;
419            }
420        }
421        for ch in iter {
422            match gr::grapheme_category(ch) {
423                gr::GC_Extend => (),
424                gr::GC_Extended_Pictographic => {
425                    self.decide(false);
426                    return;
427                }
428                _ => {
429                    self.decide(true);
430                    return;
431                }
432            }
433        }
434        if chunk_start == 0 {
435            self.decide(true);
436            return;
437        }
438        self.pre_context_offset = Some(chunk_start);
439        self.state = GraphemeState::Emoji;
440    }
441
442    /// Determine whether the current cursor location is a grapheme cluster boundary.
443    /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
444    /// the length of `chunk` is not equal to `len` on creation, then this method
445    /// may return `GraphemeIncomplete::PreContext`. The caller should then
446    /// call `provide_context` with the requested chunk, then retry calling this
447    /// method.
448    ///
449    /// For partial chunks, if the cursor is not at the beginning or end of the
450    /// string, the chunk should contain at least the codepoint following the cursor.
451    /// If the string is nonempty, the chunk must be nonempty.
452    ///
453    /// All calls should have consistent chunk contents (ie, if a chunk provides
454    /// content for a given slice, all further chunks covering that slice must have
455    /// the same content for it).
456    ///
457    /// ```rust
458    /// # use unicode_segmentation::GraphemeCursor;
459    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
460    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
461    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
462    /// cursor.set_cursor(12);
463    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
464    /// ```
465    pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<bool, GraphemeIncomplete> {
466        use tables::grapheme as gr;
467        if self.state == GraphemeState::Break {
468            return Ok(true)
469        }
470        if self.state == GraphemeState::NotBreak {
471            return Ok(false)
472        }
473        if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
474            if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
475                return Err(GraphemeIncomplete::InvalidOffset)
476            }
477        }
478        if let Some(pre_context_offset) = self.pre_context_offset {
479            return Err(GraphemeIncomplete::PreContext(pre_context_offset));
480        }
481        let offset_in_chunk = self.offset - chunk_start;
482        if self.cat_after.is_none() {
483            let ch = chunk[offset_in_chunk..].chars().next().unwrap();
484            self.cat_after = Some(gr::grapheme_category(ch));
485        }
486        if self.offset == chunk_start {
487            let mut need_pre_context = true;
488            match self.cat_after.unwrap() {
489                gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
490                gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
491                _ => need_pre_context = self.cat_before.is_none(),
492            }
493            if need_pre_context {
494                self.pre_context_offset = Some(chunk_start);
495                return Err(GraphemeIncomplete::PreContext(chunk_start));
496            }
497        }
498        if self.cat_before.is_none() {
499            let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
500            self.cat_before = Some(gr::grapheme_category(ch));
501        }
502        match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
503            PairResult::NotBreak => return self.decision(false),
504            PairResult::Break => return self.decision(true),
505            PairResult::Extended => {
506                let is_extended = self.is_extended;
507                return self.decision(!is_extended);
508            }
509            PairResult::Regional => {
510                if let Some(ris_count) = self.ris_count {
511                    return self.decision((ris_count % 2) == 0);
512                }
513                self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
514                self.is_boundary_result()
515            }
516            PairResult::Emoji => {
517                self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
518                self.is_boundary_result()
519            }
520        }
521    }
522
523    /// Find the next boundary after the current cursor position. Only a part of
524    /// the string need be supplied. If the chunk is incomplete, then this
525    /// method might return `GraphemeIncomplete::PreContext` or
526    /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
527    /// call `provide_context` with the requested chunk, then retry. In the
528    /// latter case, the caller should provide the chunk following the one
529    /// given, then retry.
530    ///
531    /// See `is_boundary` for expectations on the provided chunk.
532    ///
533    /// ```rust
534    /// # use unicode_segmentation::GraphemeCursor;
535    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
536    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
537    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
538    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
539    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
540    /// ```
541    ///
542    /// And an example that uses partial strings:
543    ///
544    /// ```rust
545    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
546    /// let s = "abcd";
547    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
548    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
549    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
550    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
551    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
552    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
553    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
554    /// ```
555    pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
556        use tables::grapheme as gr;
557        if self.offset == self.len {
558            return Ok(None);
559        }
560        let mut iter = chunk[self.offset - chunk_start..].chars();
561        let mut ch = iter.next().unwrap();
562        loop {
563            if self.resuming {
564                if self.cat_after.is_none() {
565                    self.cat_after = Some(gr::grapheme_category(ch));
566                }
567            } else {
568                self.offset += ch.len_utf8();
569                self.state = GraphemeState::Unknown;
570                self.cat_before = self.cat_after.take();
571                if self.cat_before.is_none() {
572                    self.cat_before = Some(gr::grapheme_category(ch));
573                }
574                if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
575                    self.ris_count = self.ris_count.map(|c| c + 1);
576                } else {
577                    self.ris_count = Some(0);
578                }
579                if let Some(next_ch) = iter.next() {
580                    ch = next_ch;
581                    self.cat_after = Some(gr::grapheme_category(ch));
582                } else if self.offset == self.len {
583                    self.decide(true);
584                } else {
585                    self.resuming = true;
586                    return Err(GraphemeIncomplete::NextChunk);
587                }
588            }
589            self.resuming = true;
590            if self.is_boundary(chunk, chunk_start)? {
591                self.resuming = false;
592                return Ok(Some(self.offset));
593            }
594            self.resuming = false;
595        }
596    }
597
598    /// Find the previous boundary after the current cursor position. Only a part
599    /// of the string need be supplied. If the chunk is incomplete, then this
600    /// method might return `GraphemeIncomplete::PreContext` or
601    /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
602    /// call `provide_context` with the requested chunk, then retry. In the
603    /// latter case, the caller should provide the chunk preceding the one
604    /// given, then retry.
605    ///
606    /// See `is_boundary` for expectations on the provided chunk.
607    ///
608    /// ```rust
609    /// # use unicode_segmentation::GraphemeCursor;
610    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
611    /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
612    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
613    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
614    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
615    /// ```
616    ///
617    /// And an example that uses partial strings (note the exact return is not
618    /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
619    ///
620    /// ```rust
621    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
622    /// let s = "abcd";
623    /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
624    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
625    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
626    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
627    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
628    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
629    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
630    /// ```
631    pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
632        use tables::grapheme as gr;
633        if self.offset == 0 {
634            return Ok(None);
635        }
636        if self.offset == chunk_start {
637            return Err(GraphemeIncomplete::PrevChunk);
638        }
639        let mut iter = chunk[..self.offset - chunk_start].chars().rev();
640        let mut ch = iter.next().unwrap();
641        loop {
642            if self.offset == chunk_start {
643                self.resuming = true;
644                return Err(GraphemeIncomplete::PrevChunk);
645            }
646            if self.resuming {
647                self.cat_before = Some(gr::grapheme_category(ch));
648            } else {
649                self.offset -= ch.len_utf8();
650                self.cat_after = self.cat_before.take();
651                self.state = GraphemeState::Unknown;
652                if let Some(ris_count) = self.ris_count {
653                    self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None };
654                }
655                if let Some(prev_ch) = iter.next() {
656                    ch = prev_ch;
657                    self.cat_before = Some(gr::grapheme_category(ch));
658                } else if self.offset == 0 {
659                    self.decide(true);
660                } else {
661                    self.resuming = true;
662                    self.cat_after = Some(gr::grapheme_category(ch));
663                    return Err(GraphemeIncomplete::PrevChunk);
664                }
665            }
666            self.resuming = true;
667            if self.is_boundary(chunk, chunk_start)? {
668                self.resuming = false;
669                return Ok(Some(self.offset));
670            }
671            self.resuming = false;
672        }
673    }
674}
675
// Three flags (six RIS codepoints). Checking offset 8 with a chunk that
// starts at offset 4 requires the pre-context ending at 4 to be supplied.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let text = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut cursor = GraphemeCursor::new(8, text.len(), true);
    let tail = &text[4..];
    assert_eq!(cursor.is_boundary(tail, 4), Err(GraphemeIncomplete::PreContext(4)));
    cursor.provide_context(&text[..4], 0);
    assert_eq!(cursor.is_boundary(tail, 4), Ok(true));
}
684
// With the cursor at the very start of the chunk, the preceding codepoint
// must be supplied as pre-context before CR LF can be recognised as unbroken.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let text = "\r\n";
    let mut cursor = GraphemeCursor::new(1, text.len(), true);
    let tail = &text[1..];
    assert_eq!(cursor.is_boundary(tail, 1), Err(GraphemeIncomplete::PreContext(1)));
    cursor.provide_context(&text[..1], 0);
    assert_eq!(cursor.is_boundary(tail, 1), Ok(false));
}
693
// Moving backwards from the middle of a chunk runs out of text and asks for
// the previous chunk; the retry then reports the boundary at the chunk seam.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);
    let second_half = &text[2..];
    let first_half = &text[..2];
    assert_eq!(cursor.prev_boundary(second_half, 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(first_half, 0), Ok(Some(2)));
}
701
// Starting exactly at the start of a chunk asks for the previous chunk
// right away; the retry then steps back by one cluster.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);
    let second_half = &text[2..];
    let first_half = &text[..2];
    assert_eq!(cursor.prev_boundary(second_half, 2), Err(GraphemeIncomplete::PrevChunk));
    assert_eq!(cursor.prev_boundary(first_half, 0), Ok(Some(1)));
}