unicode_segmentation/
lib.rs

1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
12//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
13//!
14//! ```rust
15//! extern crate unicode_segmentation;
16//!
17//! use unicode_segmentation::UnicodeSegmentation;
18//!
19//! fn main() {
20//!     let s = "a̐éö̲\r\n";
21//!     let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
22//!     let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
23//!     assert_eq!(g, b);
24//!
25//!     let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
26//!     let w = s.unicode_words().collect::<Vec<&str>>();
27//!     let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
28//!     assert_eq!(w, b);
29//!
30//!     let s = "The quick (\"brown\")  fox";
31//!     let w = s.split_word_bounds().collect::<Vec<&str>>();
32//!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
33//!     assert_eq!(w, b);
34//! }
35//! ```
36//!
37//! # no_std
38//!
39//! unicode-segmentation does not depend on libstd, so it can be used in crates
40//! with the `#![no_std]` attribute.
41//!
42//! # crates.io
43//!
44//! You can use this package in your project by adding the following
45//! to your `Cargo.toml`:
46//!
47//! ```toml
48//! [dependencies]
49//! unicode-segmentation = "1.3.0"
50//! ```
51
52#![deny(missing_docs, unsafe_code)]
53#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
54       html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
55
56#![no_std]
57
58#[cfg(test)]
59#[macro_use]
60extern crate std;
61
62#[cfg(test)]
63#[macro_use]
64extern crate quickcheck;
65
66pub use grapheme::{Graphemes, GraphemeIndices};
67pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
68pub use tables::UNICODE_VERSION;
69pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
70pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
71
72mod grapheme;
73mod tables;
74mod word;
75mod sentence;
76
77#[cfg(test)]
78mod test;
79#[cfg(test)]
80mod testdata;
81
82/// Methods for segmenting strings according to
83/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
84pub trait UnicodeSegmentation {
85    /// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
86    ///
87    /// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
88    ///
89    /// If `is_extended` is true, the iterator is over the
90    /// *extended grapheme clusters*;
91    /// otherwise, the iterator is over the *legacy grapheme clusters*.
92    /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
93    /// recommends extended grapheme cluster boundaries for general processing.
94    ///
95    /// # Examples
96    ///
97    /// ```
98    /// # use self::unicode_segmentation::UnicodeSegmentation;
99    /// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
100    ///           .collect::<Vec<&str>>();
101    /// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
102    ///
103    /// assert_eq!(&gr1[..], b);
104    ///
105    /// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
106    /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
107    ///
108    /// assert_eq!(&gr2[..], b);
109    /// ```
110    fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
111
112    /// Returns an iterator over the grapheme clusters of `self` and their
113    /// byte offsets. See `graphemes()` for more information.
114    ///
115    /// # Examples
116    ///
117    /// ```
118    /// # use self::unicode_segmentation::UnicodeSegmentation;
119    /// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
120    ///               .collect::<Vec<(usize, &str)>>();
121    /// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
122    ///
123    /// assert_eq!(&gr_inds[..], b);
124    /// ```
125    fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
126
127    /// Returns an iterator over the words of `self`, separated on
128    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
129    ///
130    /// Here, "words" are just those substrings which, after splitting on
131    /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
132    /// substring must contain at least one character with the
133    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
134    /// property, or with
135    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
136    ///
137    /// # Example
138    ///
139    /// ```
140    /// # use self::unicode_segmentation::UnicodeSegmentation;
141    /// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
142    /// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
143    /// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
144    ///
145    /// assert_eq!(&uw1[..], b);
146    /// ```
147    fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
148
149    /// Returns an iterator over substrings of `self` separated on
150    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
151    ///
152    /// The concatenation of the substrings returned by this function is just the original string.
153    ///
154    /// # Example
155    ///
156    /// ```
157    /// # use self::unicode_segmentation::UnicodeSegmentation;
158    /// let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>();
159    /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
160    ///
161    /// assert_eq!(&swu1[..], b);
162    /// ```
163    fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
164
165    /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
166    /// and their offsets. See `split_word_bounds()` for more information.
167    ///
168    /// # Example
169    ///
170    /// ```
171    /// # use self::unicode_segmentation::UnicodeSegmentation;
172    /// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
173    /// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
174    ///                 (14, "°"), (16, "F"), (17, "!")];
175    ///
176    /// assert_eq!(&swi1[..], b);
177    /// ```
178    fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
179
180    /// Returns an iterator over substrings of `self` separated on
181    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
182    ///
183    /// The concatenation of the substrings returned by this function is just the original string.
184    fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
185
186    /// Returns an iterator over substrings of `self` separated on
187    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
188    ///
189    /// Here, "sentences" are just those substrings which, after splitting on
190    /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
191    /// substring must contain at least one character with the
192    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
193    /// property, or with
194    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
195    fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
196
197    /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
198    /// and their offsets. See `split_sentence_bounds()` for more information.
199    fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
200}
201
202impl UnicodeSegmentation for str {
203    #[inline]
204    fn graphemes(&self, is_extended: bool) -> Graphemes {
205        grapheme::new_graphemes(self, is_extended)
206    }
207
208    #[inline]
209    fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
210        grapheme::new_grapheme_indices(self, is_extended)
211    }
212
213    #[inline]
214    fn unicode_words(&self) -> UnicodeWords {
215        word::new_unicode_words(self)
216    }
217
218    #[inline]
219    fn split_word_bounds(&self) -> UWordBounds {
220        word::new_word_bounds(self)
221    }
222
223    #[inline]
224    fn split_word_bound_indices(&self) -> UWordBoundIndices {
225        word::new_word_bound_indices(self)
226    }
227
228    #[inline]
229    fn unicode_sentences(&self) -> UnicodeSentences {
230        sentence::new_unicode_sentences(self)
231    }
232
233    #[inline]
234    fn split_sentence_bounds(&self) -> USentenceBounds {
235        sentence::new_sentence_bounds(self)
236    }
237
238    #[inline]
239    fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
240        sentence::new_sentence_bound_indices(self)
241    }
242}