regex/
re_trait.rs

1/// Slot is a single saved capture location. Note that there are two slots for
2/// every capture in a regular expression (one slot each for the start and end
3/// of the capture).
4pub type Slot = Option<usize>;
5
6/// Locations represents the offsets of each capturing group in a regex for
7/// a single match.
8///
9/// Unlike `Captures`, a `Locations` value only stores offsets.
10#[doc(hidden)]
11#[derive(Clone, Debug)]
12pub struct Locations(Vec<Slot>);
13
14impl Locations {
15    /// Returns the start and end positions of the Nth capture group. Returns
16    /// `None` if `i` is not a valid capture group or if the capture group did
17    /// not match anything. The positions returned are *always* byte indices
18    /// with respect to the original string matched.
19    pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
20        let (s, e) = (i * 2, i * 2 + 1);
21        match (self.0.get(s), self.0.get(e)) {
22            (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
23            _ => None,
24        }
25    }
26
27    /// Creates an iterator of all the capture group positions in order of
28    /// appearance in the regular expression. Positions are byte indices
29    /// in terms of the original string matched.
30    pub fn iter(&self) -> SubCapturesPosIter {
31        SubCapturesPosIter { idx: 0, locs: self }
32    }
33
34    /// Returns the total number of capturing groups.
35    ///
36    /// This is always at least `1` since every regex has at least `1`
37    /// capturing group that corresponds to the entire match.
38    pub fn len(&self) -> usize {
39        self.0.len() / 2
40    }
41
42    /// Return the individual slots as a slice.
43    pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
44        &mut self.0
45    }
46}
47
48/// An iterator over capture group positions for a particular match of a
49/// regular expression.
50///
51/// Positions are byte indices in terms of the original string matched.
52///
53/// `'c` is the lifetime of the captures.
54pub struct SubCapturesPosIter<'c> {
55    idx: usize,
56    locs: &'c Locations,
57}
58
59impl<'c> Iterator for SubCapturesPosIter<'c> {
60    type Item = Option<(usize, usize)>;
61
62    fn next(&mut self) -> Option<Option<(usize, usize)>> {
63        if self.idx >= self.locs.len() {
64            return None;
65        }
66        let x = match self.locs.pos(self.idx) {
67            None => Some(None),
68            Some((s, e)) => Some(Some((s, e))),
69        };
70        self.idx += 1;
71        x
72    }
73}
74
75/// `RegularExpression` describes types that can implement regex searching.
76///
77/// This trait is my attempt at reducing code duplication and to standardize
78/// the internal API. Specific duplication that is avoided are the `find`
79/// and `capture` iterators, which are slightly tricky.
80///
81/// It's not clear whether this trait is worth it, and it also isn't
82/// clear whether it's useful as a public trait or not. Methods like
83/// `next_after_empty` reak of bad design, but the rest of the methods seem
84/// somewhat reasonable. One particular thing this trait would expose would be
85/// the ability to start the search of a regex anywhere in a haystack, which
86/// isn't possible in the current public API.
87pub trait RegularExpression: Sized {
88    /// The type of the haystack.
89    type Text: ?Sized;
90
91    /// The number of capture slots in the compiled regular expression. This is
92    /// always two times the number of capture groups (two slots per group).
93    fn slots_len(&self) -> usize;
94
95    /// Allocates fresh space for all capturing groups in this regex.
96    fn locations(&self) -> Locations {
97        Locations(vec![None; self.slots_len()])
98    }
99
100    /// Returns the position of the next character after `i`.
101    ///
102    /// For example, a haystack with type `&[u8]` probably returns `i+1`,
103    /// whereas a haystack with type `&str` probably returns `i` plus the
104    /// length of the next UTF-8 sequence.
105    fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
106
107    /// Returns the location of the shortest match.
108    fn shortest_match_at(
109        &self,
110        text: &Self::Text,
111        start: usize,
112    ) -> Option<usize>;
113
114    /// Returns whether the regex matches the text given.
115    fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
116
117    /// Returns the leftmost-first match location if one exists.
118    fn find_at(
119        &self,
120        text: &Self::Text,
121        start: usize,
122    ) -> Option<(usize, usize)>;
123
124    /// Returns the leftmost-first match location if one exists, and also
125    /// fills in any matching capture slot locations.
126    fn captures_read_at(
127        &self,
128        locs: &mut Locations,
129        text: &Self::Text,
130        start: usize,
131    ) -> Option<(usize, usize)>;
132
133    /// Returns an iterator over all non-overlapping successive leftmost-first
134    /// matches.
135    fn find_iter(self, text: &Self::Text) -> Matches<Self> {
136        Matches { re: self, text: text, last_end: 0, last_match: None }
137    }
138
139    /// Returns an iterator over all non-overlapping successive leftmost-first
140    /// matches with captures.
141    fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> {
142        CaptureMatches(self.find_iter(text))
143    }
144}
145
146/// An iterator over all non-overlapping successive leftmost-first matches.
147pub struct Matches<'t, R>
148where
149    R: RegularExpression,
150    R::Text: 't,
151{
152    re: R,
153    text: &'t R::Text,
154    last_end: usize,
155    last_match: Option<usize>,
156}
157
158impl<'t, R> Matches<'t, R>
159where
160    R: RegularExpression,
161    R::Text: 't,
162{
163    /// Return the text being searched.
164    pub fn text(&self) -> &'t R::Text {
165        self.text
166    }
167
168    /// Return the underlying regex.
169    pub fn regex(&self) -> &R {
170        &self.re
171    }
172}
173
174impl<'t, R> Iterator for Matches<'t, R>
175where
176    R: RegularExpression,
177    R::Text: 't + AsRef<[u8]>,
178{
179    type Item = (usize, usize);
180
181    fn next(&mut self) -> Option<(usize, usize)> {
182        if self.last_end > self.text.as_ref().len() {
183            return None;
184        }
185        let (s, e) = match self.re.find_at(self.text, self.last_end) {
186            None => return None,
187            Some((s, e)) => (s, e),
188        };
189        if s == e {
190            // This is an empty match. To ensure we make progress, start
191            // the next search at the smallest possible starting position
192            // of the next match following this one.
193            self.last_end = self.re.next_after_empty(self.text, e);
194            // Don't accept empty matches immediately following a match.
195            // Just move on to the next match.
196            if Some(e) == self.last_match {
197                return self.next();
198            }
199        } else {
200            self.last_end = e;
201        }
202        self.last_match = Some(e);
203        Some((s, e))
204    }
205}
206
207/// An iterator over all non-overlapping successive leftmost-first matches with
208/// captures.
209pub struct CaptureMatches<'t, R>(Matches<'t, R>)
210where
211    R: RegularExpression,
212    R::Text: 't;
213
214impl<'t, R> CaptureMatches<'t, R>
215where
216    R: RegularExpression,
217    R::Text: 't,
218{
219    /// Return the text being searched.
220    pub fn text(&self) -> &'t R::Text {
221        self.0.text()
222    }
223
224    /// Return the underlying regex.
225    pub fn regex(&self) -> &R {
226        self.0.regex()
227    }
228}
229
230impl<'t, R> Iterator for CaptureMatches<'t, R>
231where
232    R: RegularExpression,
233    R::Text: 't + AsRef<[u8]>,
234{
235    type Item = Locations;
236
237    fn next(&mut self) -> Option<Locations> {
238        if self.0.last_end > self.0.text.as_ref().len() {
239            return None;
240        }
241        let mut locs = self.0.re.locations();
242        let (s, e) = match self.0.re.captures_read_at(
243            &mut locs,
244            self.0.text,
245            self.0.last_end,
246        ) {
247            None => return None,
248            Some((s, e)) => (s, e),
249        };
250        if s == e {
251            self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
252            if Some(e) == self.0.last_match {
253                return self.next();
254            }
255        } else {
256            self.0.last_end = e;
257        }
258        self.0.last_match = Some(e);
259        Some(locs)
260    }
261}