unicode_segmentation/
word.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14use tables::word::WordCat;
15
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
pub struct UnicodeWords<'a> {
    // Word-boundary segments filtered through a plain function pointer, which
    // keeps the full iterator type nameable in this field.
    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
25
26impl<'a> Iterator for UnicodeWords<'a> {
27    type Item = &'a str;
28
29    #[inline]
30    fn next(&mut self) -> Option<&'a str> { self.inner.next() }
31}
32impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
33    #[inline]
34    fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
35}
36
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
#[derive(Clone)]
pub struct UWordBounds<'a> {
    // The text not yet yielded; `next` trims it from the front and
    // `next_back` trims it from the back.
    string: &'a str,
    // Word category of the first character of `string`, cached by `next`
    // when it examined that character without consuming it.
    cat: Option<WordCat>,
    // Word category of the last character of `string`, cached by `next_back`
    // when it examined that character without consuming it.
    catb: Option<WordCat>,
}
45
/// External iterator for word boundaries and byte offsets.
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string (`as_ptr() as usize`);
    // subtracted from each segment's address to obtain its byte offset.
    start_offset: usize,
    // The segment iterator whose items get paired with offsets.
    iter: UWordBounds<'a>,
}
52
53impl<'a> UWordBoundIndices<'a> {
54    #[inline]
55    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
56    ///
57    /// ```rust
58    /// # use unicode_segmentation::UnicodeSegmentation;
59    /// let mut iter = "Hello world".split_word_bound_indices();
60    /// assert_eq!(iter.as_str(), "Hello world");
61    /// iter.next();
62    /// assert_eq!(iter.as_str(), " world");
63    /// iter.next();
64    /// assert_eq!(iter.as_str(), "world");
65    /// ```
66    pub fn as_str(&self) -> &'a str {
67        self.iter.as_str()
68    }
69}
70
71impl<'a> Iterator for UWordBoundIndices<'a> {
72    type Item = (usize, &'a str);
73
74    #[inline]
75    fn next(&mut self) -> Option<(usize, &'a str)> {
76        self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
77    }
78
79    #[inline]
80    fn size_hint(&self) -> (usize, Option<usize>) {
81        self.iter.size_hint()
82    }
83}
84
85impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
86    #[inline]
87    fn next_back(&mut self) -> Option<(usize, &'a str)> {
88        self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
89    }
90}
91
// state machine for word boundary rules
//
// Each variant (other than `Start`) records what kind of character the
// scanner most recently accepted into the current segment; the next
// character's word category then decides whether the segment continues.
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum UWordBoundsState {
    // nothing has been consumed yet for the current segment
    Start,
    // last significant character was an ALetter
    Letter,
    // last significant character was a Hebrew_Letter
    HLetter,
    // last significant character was Numeric
    Numeric,
    // last significant character was Katakana
    Katakana,
    // last significant character was ExtendNumLet
    ExtendNumLet,
    // inside a run of Regional_Indicator characters; the payload tracks pairing
    Regional(RegionalState),
    // waiting on what follows a quote/mid character or a Format/Extend/ZWJ run;
    // the payload says what is allowed to continue the segment
    FormatExtend(FormatExtendType),
    // forward: the segment started with a ZWJ;
    // backward: an emoji was seen and a ZWJ may precede it (rule WB3c)
    Zwj,
    // an emoji was joined to a preceding ZWJ (rule WB3c)
    Emoji,
    // inside a run of WSegSpace characters (rule WB3d)
    WSegSpace,
}
107
// subtypes for FormatExtend state in UWordBoundsState
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum FormatExtendType {
    // backward iteration only: the segment so far consists of Extend/Format/ZWJ
    // characters and is handled like the `Start` state
    AcceptAny,
    // a run of Format/Extend/ZWJ characters is being absorbed; the next
    // character that is not one of those ends the run
    AcceptNone,
    // a MidLetter/MidNumLet/Single_Quote was seen next to a letter; only an
    // ALetter or Hebrew_Letter continues the segment, otherwise rewind
    RequireLetter,
    // a Double_Quote was seen next to a Hebrew_Letter; only another
    // Hebrew_Letter continues the segment, otherwise rewind
    RequireHLetter,
    // a Single_Quote was seen next to (forward) or possibly next to (backward)
    // a Hebrew_Letter (rule WB7a)
    AcceptQLetter,
    // a MidNum/MidNumLet/Single_Quote was seen next to a Numeric; only
    // another Numeric continues the segment, otherwise rewind
    RequireNumeric,
}
118
// Pairing progress within a run of Regional_Indicator characters (rule WB13c).
#[derive(Clone,Copy,PartialEq,Eq,Debug)]
enum RegionalState {
    // one indicator of a pair has been consumed
    Half,
    // both indicators of a pair have been consumed; another indicator starts
    // a new segment
    Full,
    // backward iteration only: an indicator was consumed but it is not yet
    // known whether it completes a pair
    Unknown,
}
125
126fn is_emoji(ch: char) -> bool {
127    use tables::emoji;
128    emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
129}
130
131impl<'a> Iterator for UWordBounds<'a> {
132    type Item = &'a str;
133
134    #[inline]
135    fn size_hint(&self) -> (usize, Option<usize>) {
136        let slen = self.string.len();
137        (cmp::min(slen, 1), Some(slen))
138    }
139
140    #[inline]
141    fn next(&mut self) -> Option<&'a str> {
142        use self::UWordBoundsState::*;
143        use self::FormatExtendType::*;
144        use tables::word as wd;
145        if self.string.len() == 0 {
146            return None;
147        }
148
149        let mut take_curr = true;
150        let mut take_cat = true;
151        let mut idx = 0;
152        let mut saveidx = 0;
153        let mut state = Start;
154        let mut cat = wd::WC_Any;
155        let mut savecat = wd::WC_Any;
156
157        // Whether or not the previous category was ZWJ
158        // ZWJs get collapsed, so this handles precedence of WB3c over WB4
159        let mut prev_zwj;
160        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
161        let mut skipped_format_extend = false;
162        for (curr, ch) in self.string.char_indices() {
163            idx = curr;
164            prev_zwj = cat == wd::WC_ZWJ;
165            // if there's a category cached, grab it
166            cat = match self.cat {
167                None => wd::word_category(ch),
168                _ => self.cat.take().unwrap()
169            };
170            take_cat = true;
171
172            // handle rule WB4
173            // just skip all format, extend, and zwj chars
174            // note that Start is a special case: if there's a bunch of Format | Extend
175            // characters at the beginning of a block of text, dump them out as one unit.
176            //
177            // (This is not obvious from the wording of UAX#29, but if you look at the
178            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
179            // then the "correct" interpretation of WB4 becomes apparent.)
180            if state != Start {
181                match cat {
182                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
183                        skipped_format_extend = true;
184                        continue
185                    }
186                    _ => {}
187                }
188            }
189
190            // rule WB3c
191            // WB4 makes all ZWJs collapse into the previous state
192            // but you can still be in a Zwj state if you started with Zwj
193            //
194            // This means that an EP + Zwj will collapse into EP, which is wrong,
195            // since EP+EP is not a boundary but EP+ZWJ+EP is
196            //
197            // Thus, we separately keep track of whether or not the last character
198            // was a ZWJ. This is an additional bit of state tracked outside of the
199            // state enum; the state enum represents the last non-zwj state encountered.
200            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
201            // however we are in the previous state for the purposes of all other rules.
202            if prev_zwj {
203                if is_emoji(ch) {
204                    state = Emoji;
205                    continue;
206                }
207            }
208            // Don't use `continue` in this match without updating `cat`
209            state = match state {
210                Start if cat == wd::WC_CR => {
211                    idx += match self.get_next_cat(idx) {
212                        Some(ncat) if ncat == wd::WC_LF => 1,       // rule WB3
213                        _ => 0
214                    };
215                    break;                                          // rule WB3a
216                },
217                Start => match cat {
218                    wd::WC_ALetter => Letter,           // rule WB5, WB6, WB9, WB13a
219                    wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
220                    wd::WC_Numeric => Numeric,          // rule WB8, WB10, WB12, WB13a
221                    wd::WC_Katakana => Katakana,        // rule WB13, WB13a
222                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a, WB13b
223                    wd::WC_Regional_Indicator => Regional(RegionalState::Half),  // rule WB13c
224                    wd::WC_LF | wd::WC_Newline => break,    // rule WB3a
225                    wd::WC_ZWJ => Zwj,                      // rule WB3c
226                    wd::WC_WSegSpace => WSegSpace,          // rule WB3d
227                    _ => {
228                        if let Some(ncat) = self.get_next_cat(idx) {                // rule WB4
229                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
230                                state = FormatExtend(AcceptNone);
231                                self.cat = Some(ncat);
232                                continue;
233                            }
234                        }
235                        break;                                                      // rule WB999
236                    }
237                },
238                WSegSpace => match cat {
239                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
240                    _ => {
241                        take_curr = false;
242                        break;
243                    }
244                },
245                Zwj => {
246                    // We already handle WB3c above.
247                    take_curr = false;
248                    break;
249                }
250                Letter | HLetter => match cat {
251                    wd::WC_ALetter => Letter,                   // rule WB5
252                    wd::WC_Hebrew_Letter => HLetter,            // rule WB5
253                    wd::WC_Numeric => Numeric,                  // rule WB9
254                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
255                    wd::WC_Double_Quote if state == HLetter => {
256                        savecat = cat;
257                        saveidx = idx;
258                        FormatExtend(RequireHLetter)                        // rule WB7b
259                    },
260                    wd::WC_Single_Quote if state == HLetter => {
261                        FormatExtend(AcceptQLetter)                         // rule WB7a
262                    },
263                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
264                        savecat = cat;
265                        saveidx = idx;
266                        FormatExtend(RequireLetter)                         // rule WB6
267                    },
268                    _ => {
269                        take_curr = false;
270                        break;
271                    }
272                },
273                Numeric => match cat {
274                    wd::WC_Numeric => Numeric,                  // rule WB8
275                    wd::WC_ALetter => Letter,                   // rule WB10
276                    wd::WC_Hebrew_Letter => HLetter,            // rule WB10
277                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
278                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
279                        savecat = cat;
280                        saveidx = idx;
281                        FormatExtend(RequireNumeric)            // rule WB12
282                    },
283                    _ => {
284                        take_curr = false;
285                        break;
286                    }
287                },
288                Katakana => match cat {
289                    wd::WC_Katakana => Katakana,                // rule WB13
290                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
291                    _ => {
292                        take_curr = false;
293                        break;
294                    }
295                },
296                ExtendNumLet => match cat {
297                    wd::WC_ExtendNumLet => ExtendNumLet,        // rule WB13a
298                    wd::WC_ALetter => Letter,                   // rule WB13b
299                    wd::WC_Hebrew_Letter => HLetter,            // rule WB13b
300                    wd::WC_Numeric => Numeric,                  // rule WB13b
301                    wd::WC_Katakana => Katakana,                // rule WB13b
302                    _ => {
303                        take_curr = false;
304                        break;
305                    }
306                },
307                Regional(RegionalState::Full) => {
308                    // if it reaches here we've gone too far,
309                    // a full flag can only compose with ZWJ/Extend/Format
310                    // proceeding it.
311                    take_curr = false;
312                    break;
313                }
314                Regional(RegionalState::Half) => match cat {
315                    wd::WC_Regional_Indicator => Regional(RegionalState::Full),      // rule WB13c
316                    _ => {
317                        take_curr = false;
318                        break;
319                    }
320                },
321                Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
322                Emoji => {
323                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
324                    take_curr = false;
325                    break;
326                },
327                FormatExtend(t) => match t {    // handle FormatExtends depending on what type
328                    RequireNumeric if cat == wd::WC_Numeric => Numeric,     // rule WB11
329                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter,   // rule WB7
330                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
331                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,   // rule WB7b
332                    AcceptNone | AcceptQLetter => {
333                        take_curr = false;  // emit all the Format|Extend characters
334                        take_cat = false;
335                        break;
336                    },
337                    _ => break      // rewind (in if statement below)
338                }
339            }
340        }
341
342        if let FormatExtend(t) = state {
343            // we were looking for something and didn't find it; we have to back up
344            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
345                idx = saveidx;
346                cat = savecat;
347                take_curr = false;
348            }
349        }
350
351        self.cat = if take_curr {
352            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
353            None
354        } else if take_cat {
355            Some(cat)
356        } else {
357            None
358        };
359
360        let retstr = &self.string[..idx];
361        self.string = &self.string[idx..];
362        Some(retstr)
363    }
364}
365
366impl<'a> DoubleEndedIterator for UWordBounds<'a> {
367    #[inline]
368    fn next_back(&mut self) -> Option<&'a str> {
369        use self::UWordBoundsState::*;
370        use self::FormatExtendType::*;
371        use tables::word as wd;
372        if self.string.len() == 0 {
373            return None;
374        }
375
376        let mut take_curr = true;
377        let mut take_cat = true;
378        let mut idx = self.string.len();
379        idx -= self.string.chars().next_back().unwrap().len_utf8();
380        let mut previdx = idx;
381        let mut saveidx = idx;
382        let mut state = Start;
383        let mut savestate = Start;
384        let mut cat = wd::WC_Any;
385
386        let mut skipped_format_extend = false;
387
388        for (curr, ch) in self.string.char_indices().rev() {
389            previdx = idx;
390            idx = curr;
391
392            // if there's a category cached, grab it
393            cat = match self.catb {
394                None => wd::word_category(ch),
395                _ => self.catb.take().unwrap()
396            };
397            take_cat = true;
398
399            // backward iterator over word boundaries. Mostly the same as the forward
400            // iterator, with two weirdnesses:
401            // (1) If we encounter a single quote in the Start state, we have to check for a
402            //     Hebrew Letter immediately before it.
403            // (2) Format and Extend char handling takes some gymnastics.
404
405            if cat == wd::WC_Extend
406                || cat == wd::WC_Format
407                || (cat == wd::WC_ZWJ && state != Zwj) { // WB3c has more priority so we should not
408                                                         // fold in that case
409                if match state {
410                    FormatExtend(_) | Start => false,
411                    _ => true
412                } {
413                    saveidx = previdx;
414                    savestate = state;
415                    state = FormatExtend(AcceptNone);
416                }
417
418                if state != Start {
419                    continue;
420                }
421            } else if state == FormatExtend(AcceptNone) {
422                // finished a scan of some Format|Extend chars, restore previous state
423                state = savestate;
424                previdx = saveidx;
425                take_cat = false;
426                skipped_format_extend = true;
427            }
428
429            // Don't use `continue` in this match without updating `catb`
430            state = match state {
431                Start | FormatExtend(AcceptAny) => match cat {
432                    _ if is_emoji(ch) => Zwj,
433                    wd::WC_ALetter => Letter,           // rule WB5, WB7, WB10, WB13b
434                    wd::WC_Hebrew_Letter => HLetter,    // rule WB5, WB7, WB7c, WB10, WB13b
435                    wd::WC_Numeric => Numeric,          // rule WB8, WB9, WB11, WB13b
436                    wd::WC_Katakana => Katakana,                    // rule WB13, WB13b
437                    wd::WC_ExtendNumLet => ExtendNumLet,                    // rule WB13a
438                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
439                    // rule WB4:
440                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
441                    wd::WC_Single_Quote => {
442                        saveidx = idx;
443                        FormatExtend(AcceptQLetter)                         // rule WB7a
444                    },
445                    wd::WC_WSegSpace => WSegSpace,
446                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
447                        if state == Start {
448                            if cat == wd::WC_LF {
449                                idx -= match self.get_prev_cat(idx) {
450                                    Some(pcat) if pcat == wd::WC_CR => 1,   // rule WB3
451                                    _ => 0
452                                };
453                            }
454                        } else {
455                            take_curr = false;
456                        }
457                        break;                                              // rule WB3a
458                    },
459                    _ => break                              // rule WB999
460                },
461                Zwj => match cat {                          // rule WB3c
462                    wd::WC_ZWJ => {
463                        FormatExtend(AcceptAny)
464                    }
465                    _ => {
466                        take_curr = false;
467                        break;
468                    }
469                },
470                WSegSpace => match cat {                          // rule WB3d
471                    wd::WC_WSegSpace if !skipped_format_extend => {
472                        WSegSpace
473                    }
474                    _ => {
475                        take_curr = false;
476                        break;
477                    }
478                },
479                Letter | HLetter => match cat {
480                    wd::WC_ALetter => Letter,               // rule WB5
481                    wd::WC_Hebrew_Letter => HLetter,        // rule WB5
482                    wd::WC_Numeric => Numeric,              // rule WB10
483                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
484                    wd::WC_Double_Quote if state == HLetter => {
485                        saveidx = previdx;
486                        FormatExtend(RequireHLetter)         // rule WB7c
487                    },
488                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
489                        saveidx = previdx;
490                        FormatExtend(RequireLetter)          // rule WB7
491                    },
492                    _ => {
493                        take_curr = false;
494                        break;
495                    }
496                },
497                Numeric => match cat {
498                    wd::WC_Numeric => Numeric,              // rule WB8
499                    wd::WC_ALetter => Letter,               // rule WB9
500                    wd::WC_Hebrew_Letter => HLetter,        // rule WB9
501                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
502                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
503                        saveidx = previdx;
504                        FormatExtend(RequireNumeric)         // rule WB11
505                    },
506                    _ => {
507                        take_curr = false;
508                        break;
509                    }
510                },
511                Katakana => match cat {
512                    wd::WC_Katakana => Katakana,            // rule WB13
513                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13b
514                    _ => {
515                        take_curr = false;
516                        break;
517                    }
518                },
519                ExtendNumLet => match cat {
520                    wd::WC_ExtendNumLet => ExtendNumLet,    // rule WB13a
521                    wd::WC_ALetter => Letter,               // rule WB13a
522                    wd::WC_Hebrew_Letter => HLetter,        // rule WB13a
523                    wd::WC_Numeric => Numeric,              // rule WB13a
524                    wd::WC_Katakana => Katakana,            // rule WB13a
525                    _ => {
526                        take_curr = false;
527                        break;
528                    }
529                },
530                Regional(mut regional_state) => match cat {
531                    // rule WB13c
532                    wd::WC_Regional_Indicator => {
533                        if regional_state == RegionalState::Unknown {
534                            let count = self.string[..previdx]
535                                            .chars().rev()
536                                            .map(|c| wd::word_category(c))
537                                            .filter(|&c| ! (c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format))
538                                            .take_while(|&c| c == wd::WC_Regional_Indicator)
539                                            .count();
540                            regional_state = if count % 2 == 0 {
541                                RegionalState::Full
542                            } else {
543                                RegionalState::Half
544                            };
545                        }
546                        if regional_state == RegionalState::Full {
547                            take_curr = false;
548                            break;
549                        } else {
550                            Regional(RegionalState::Full)
551                        }
552                    }
553                    _ => {
554                        take_curr = false;
555                        break;
556                    }
557                },
558                Emoji => {
559                    if is_emoji(ch) {           // rule WB3c
560                        Zwj
561                    } else {
562                        take_curr = false;
563                        break;
564                    }
565                },
566                FormatExtend(t) => match t {
567                    RequireNumeric if cat == wd::WC_Numeric => Numeric,          // rule WB12
568                    RequireLetter if cat == wd::WC_ALetter => Letter,            // rule WB6
569                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB6
570                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter,     // rule WB7a
571                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter,    // rule WB7b
572                    _ => break  // backtrack will happens
573                }
574            }
575        }
576
577        if let FormatExtend(t) = state {
578            // if we required something but didn't find it, backtrack
579            if t == RequireLetter || t == RequireHLetter ||
580                t == RequireNumeric || t == AcceptNone || t == AcceptQLetter {
581                previdx = saveidx;
582                take_cat = false;
583                take_curr = false;
584            }
585        }
586
587        self.catb = if take_curr {
588            None
589        } else {
590            idx = previdx;
591            if take_cat {
592                Some(cat)
593            } else {
594                None
595            }
596        };
597
598        let retstr = &self.string[idx..];
599        self.string = &self.string[..idx];
600        Some(retstr)
601    }
602}
603
604impl<'a> UWordBounds<'a> {
605    #[inline]
606    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
607    ///
608    /// ```rust
609    /// # use unicode_segmentation::UnicodeSegmentation;
610    /// let mut iter = "Hello world".split_word_bounds();
611    /// assert_eq!(iter.as_str(), "Hello world");
612    /// iter.next();
613    /// assert_eq!(iter.as_str(), " world");
614    /// iter.next();
615    /// assert_eq!(iter.as_str(), "world");
616    /// ```
617    pub fn as_str(&self) -> &'a str {
618        self.string
619    }
620
621    #[inline]
622    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
623        use tables::word as wd;
624        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
625        if nidx < self.string.len() {
626            let nch = self.string[nidx..].chars().next().unwrap();
627            Some(wd::word_category(nch))
628        } else {
629            None
630        }
631    }
632
633    #[inline]
634    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
635        use tables::word as wd;
636        if idx > 0 {
637            let nch = self.string[..idx].chars().next_back().unwrap();
638            Some(wd::word_category(nch))
639        } else {
640            None
641        }
642    }
643}
644
645#[inline]
646pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
647    UWordBounds { string: s, cat: None, catb: None }
648}
649
650#[inline]
651pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
652    UWordBoundIndices { start_offset: s.as_ptr() as usize, iter: new_word_bounds(s) }
653}
654
655#[inline]
656pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
657    use super::UnicodeSegmentation;
658    use tables::util::is_alphanumeric;
659
660    fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
661    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
662
663    UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
664}