unicode_segmentation/
sentence.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14// All of the logic for forward iteration over sentences
15mod fwd {
16    use tables::sentence::SentenceCat;
17    use core::cmp;
18
19    // Describe a parsed part of source string as described in this table:
20    // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
21    #[derive(Clone, Copy, PartialEq, Eq)]
22    enum StatePart {
23        Sot,
24        Eot,
25        Other,
26        CR,
27        LF,
28        Sep,
29        ATerm,
30        UpperLower,
31        ClosePlus,
32        SpPlus,
33        STerm
34    }
35
36    #[derive(Clone, PartialEq, Eq)]
37    struct SentenceBreaksState(pub [StatePart; 4]);
38
39    const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40        StatePart::Sot,
41        StatePart::Sot,
42        StatePart::Sot,
43        StatePart::Sot
44    ]);
45
46    #[derive(Clone)]
47    pub struct SentenceBreaks<'a> {
48        pub string: &'a str,
49        pos: usize,
50        state: SentenceBreaksState
51    }
52
53    impl SentenceBreaksState {
54        // Attempt to advance the internal state by one part
55        // Whitespace and some punctutation will be collapsed
56        fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57            let &SentenceBreaksState(parts) = self;
58            let parts = match (parts[3], cat) {
59                (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60                (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61                _ => [
62                    parts[1],
63                    parts[2],
64                    parts[3],
65                    match cat {
66                        SentenceCat::SC_CR => StatePart::CR,
67                        SentenceCat::SC_LF => StatePart::LF,
68                        SentenceCat::SC_Sep => StatePart::Sep,
69                        SentenceCat::SC_ATerm => StatePart::ATerm,
70                        SentenceCat::SC_Upper |
71                        SentenceCat::SC_Lower => StatePart::UpperLower,
72                        SentenceCat::SC_Close => StatePart::ClosePlus,
73                        SentenceCat::SC_Sp => StatePart::SpPlus,
74                        SentenceCat::SC_STerm => StatePart::STerm,
75                        _ => StatePart::Other
76                    }
77                ]
78            };
79            SentenceBreaksState(parts)
80        }
81
82        fn end(&self) -> SentenceBreaksState {
83            let &SentenceBreaksState(parts) = self;
84            SentenceBreaksState([
85                parts[1],
86                parts[2],
87                parts[3],
88                StatePart::Eot
89            ])
90        }
91
92        // Helper function to check if state head matches a single `StatePart`
93        fn match1(&self, part: StatePart) -> bool {
94            let &SentenceBreaksState(parts) = self;
95            part == parts[3]
96        }
97
98        // Helper function to check if first two `StateParts` in state match
99        // the given two
100        fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
101            let &SentenceBreaksState(parts) = self;
102            part1 == parts[2] && part2 == parts[3]
103        }
104    }
105
106    // https://unicode.org/reports/tr29/#SB8
107    // TODO cache this, it is currently quadratic
108    fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
109        let &SentenceBreaksState(parts) = state;
110        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
111        if parts[idx] == StatePart::ClosePlus { idx -= 1 }
112
113        if parts[idx] == StatePart::ATerm {
114            use tables::sentence as se;
115
116            for next_char in ahead.chars() {
117                //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
118                match se::sentence_category(next_char) {
119                    se::SC_Lower => return true,
120                    se::SC_OLetter |
121                    se::SC_Upper |
122                    se::SC_Sep | se::SC_CR | se::SC_LF |
123                    se::SC_STerm | se::SC_ATerm => return false,
124                    _ => continue
125                }
126            }
127        }
128
129        false
130    }
131
132    // https://unicode.org/reports/tr29/#SB8a
133    fn match_sb8a(state: &SentenceBreaksState) -> bool {
134        // SATerm Close* Sp*
135        let &SentenceBreaksState(parts) = state;
136        let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
137        if parts[idx] == StatePart::ClosePlus { idx -= 1 }
138        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
139    }
140
141    // https://unicode.org/reports/tr29/#SB9
142    fn match_sb9(state: &SentenceBreaksState) -> bool {
143        // SATerm Close*
144        let &SentenceBreaksState(parts) = state;
145        let idx = if parts[3] == StatePart::ClosePlus { 2 } else { 3 };
146        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
147    }
148
149    // https://unicode.org/reports/tr29/#SB11
150    fn match_sb11(state: &SentenceBreaksState) -> bool {
151        // SATerm Close* Sp* ParaSep?
152        let &SentenceBreaksState(parts) = state;
153        let mut idx = match parts[3] {
154            StatePart::Sep |
155            StatePart::CR |
156            StatePart::LF => 2,
157            _ => 3
158        };
159
160        if parts[idx] == StatePart::SpPlus { idx -= 1 }
161        if parts[idx] == StatePart::ClosePlus { idx -= 1}
162
163        parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
164    }
165
166    impl<'a> Iterator for SentenceBreaks<'a> {
167        // Returns the index of the character which follows a break
168        type Item = usize;
169
170        #[inline]
171        fn size_hint(&self) -> (usize, Option<usize>) {
172            let slen = self.string.len();
173            // A sentence could be one character
174            (cmp::min(slen, 2), Some(slen + 1))
175        }
176
177        #[inline]
178        fn next(&mut self) -> Option<usize> {
179            use tables::sentence as se;
180
181            for next_char in self.string[self.pos..].chars() {
182                let position_before = self.pos;
183                let state_before = self.state.clone();
184
185                let next_cat = se::sentence_category(next_char);
186
187                self.pos += next_char.len_utf8();
188                self.state = self.state.next(next_cat);
189
190                match next_cat {
191                    // SB1 https://unicode.org/reports/tr29/#SB1
192                    _ if state_before.match1(StatePart::Sot) =>
193                        return Some(position_before),
194
195                    // SB2 is handled when inner iterator (chars) is finished
196
197                    // SB3 https://unicode.org/reports/tr29/#SB3
198                    SentenceCat::SC_LF if state_before.match1(StatePart::CR) =>
199                        continue,
200
201                    // SB4 https://unicode.org/reports/tr29/#SB4
202                    _ if state_before.match1(StatePart::Sep)
203                        || state_before.match1(StatePart::CR)
204                        || state_before.match1(StatePart::LF)
205                    => return Some(position_before),
206
207                    // SB5 https://unicode.org/reports/tr29/#SB5
208                    SentenceCat::SC_Extend |
209                    SentenceCat::SC_Format => self.state = state_before,
210
211                    // SB6 https://unicode.org/reports/tr29/#SB6
212                    SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) =>
213                        continue,
214
215                    // SB7 https://unicode.org/reports/tr29/#SB7
216                    SentenceCat::SC_Upper if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
217                        continue,
218
219                    // SB8 https://unicode.org/reports/tr29/#SB8
220                    _ if match_sb8(&state_before, &self.string[position_before..]) =>
221                        continue,
222
223                    // SB8a https://unicode.org/reports/tr29/#SB8a
224                    SentenceCat::SC_SContinue |
225                    SentenceCat::SC_STerm |
226                    SentenceCat::SC_ATerm if match_sb8a(&state_before) =>
227                        continue,
228
229                    // SB9 https://unicode.org/reports/tr29/#SB9
230                    SentenceCat::SC_Close |
231                    SentenceCat::SC_Sp |
232                    SentenceCat::SC_Sep |
233                    SentenceCat::SC_CR |
234                    SentenceCat::SC_LF if match_sb9(&state_before) =>
235                        continue,
236
237                    // SB10 https://unicode.org/reports/tr29/#SB10
238                    SentenceCat::SC_Sp |
239                    SentenceCat::SC_Sep |
240                    SentenceCat::SC_CR |
241                    SentenceCat::SC_LF if match_sb8a(&state_before) =>
242                        continue,
243
244                    // SB11 https://unicode.org/reports/tr29/#SB11
245                    _ if match_sb11(&state_before) =>
246                        return Some(position_before),
247
248                    // SB998 https://unicode.org/reports/tr29/#SB998
249                    _ => continue
250                }
251            }
252
253            // SB2 https://unicode.org/reports/tr29/#SB2
254            if self.state.match1(StatePart::Sot) {
255                None
256            } else if self.state.match1(StatePart::Eot) {
257                None
258            } else {
259                self.state = self.state.end();
260                Some(self.pos)
261            }
262        }
263    }
264
265    pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
266        SentenceBreaks { string: source, pos: 0, state: INITIAL_STATE }
267    }
268
269}
270
271/// An iterator over the substrings of a string which, after splitting the string on
272/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
273/// contain any characters with the
274/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
275/// property, or with
276/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
277#[derive(Clone)]
278pub struct UnicodeSentences<'a> {
279    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
280}
281
282/// External iterator for a string's
283/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
284#[derive(Clone)]
285pub struct USentenceBounds<'a> {
286    iter: fwd::SentenceBreaks<'a>,
287    sentence_start: Option<usize>
288}
289
290/// External iterator for sentence boundaries and byte offsets.
291#[derive(Clone)]
292pub struct USentenceBoundIndices<'a> {
293    start_offset: usize,
294    iter: USentenceBounds<'a>,
295}
296
297#[inline]
298pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
299    USentenceBounds {
300        iter: fwd::new_sentence_breaks(source),
301        sentence_start: None
302    }
303}
304
305#[inline]
306pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
307    USentenceBoundIndices {
308        start_offset: source.as_ptr() as usize,
309        iter: new_sentence_bounds(source)
310    }
311}
312
313#[inline]
314pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
315    use super::UnicodeSegmentation;
316    use tables::util::is_alphanumeric;
317
318    fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
319    let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
320
321    UnicodeSentences { inner: s.split_sentence_bounds().filter(has_alphanumeric) }
322}
323
324impl<'a> Iterator for UnicodeSentences<'a> {
325    type Item = &'a str;
326
327    #[inline]
328    fn next(&mut self) -> Option<&'a str> { self.inner.next() }
329}
330
331impl<'a> Iterator for USentenceBounds<'a> {
332    type Item = &'a str;
333
334    #[inline]
335    fn size_hint(&self) -> (usize, Option<usize>) {
336        let (lower, upper) = self.iter.size_hint();
337        (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
338    }
339
340    #[inline]
341    fn next(&mut self) -> Option<&'a str> {
342        if self.sentence_start == None {
343            if let Some(start_pos) = self.iter.next() {
344                self.sentence_start = Some(start_pos)
345            } else {
346                return None
347            }
348        }
349
350        if let Some(break_pos) = self.iter.next() {
351            let start_pos = self.sentence_start.unwrap();
352            let sentence = &self.iter.string[start_pos..break_pos];
353            self.sentence_start = Some(break_pos);
354            Some(sentence)
355        } else {
356            None
357        }
358    }
359}
360
361impl<'a> Iterator for USentenceBoundIndices<'a> {
362    type Item = (usize, &'a str);
363
364    #[inline]
365    fn next(&mut self) -> Option<(usize, &'a str)> {
366        self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
367    }
368
369    #[inline]
370    fn size_hint(&self) -> (usize, Option<usize>) {
371        self.iter.size_hint()
372    }
373}