regex/
input.rs

1use std::char;
2use std::cmp::Ordering;
3use std::fmt;
4use std::ops;
5use std::u32;
6
7use syntax;
8
9use literal::LiteralSearcher;
10use prog::InstEmptyLook;
11use utf8::{decode_last_utf8, decode_utf8};
12
13/// Represents a location in the input.
14#[derive(Clone, Copy, Debug)]
15pub struct InputAt {
16    pos: usize,
17    c: Char,
18    byte: Option<u8>,
19    len: usize,
20}
21
22impl InputAt {
23    /// Returns true iff this position is at the beginning of the input.
24    pub fn is_start(&self) -> bool {
25        self.pos == 0
26    }
27
28    /// Returns true iff this position is past the end of the input.
29    pub fn is_end(&self) -> bool {
30        self.c.is_none() && self.byte.is_none()
31    }
32
33    /// Returns the character at this position.
34    ///
35    /// If this position is just before or after the input, then an absent
36    /// character is returned.
37    pub fn char(&self) -> Char {
38        self.c
39    }
40
41    /// Returns the byte at this position.
42    pub fn byte(&self) -> Option<u8> {
43        self.byte
44    }
45
46    /// Returns the UTF-8 width of the character at this position.
47    pub fn len(&self) -> usize {
48        self.len
49    }
50
51    /// Returns whether the UTF-8 width of the character at this position
52    /// is zero.
53    pub fn is_empty(&self) -> bool {
54        self.len == 0
55    }
56
57    /// Returns the byte offset of this position.
58    pub fn pos(&self) -> usize {
59        self.pos
60    }
61
62    /// Returns the byte offset of the next position in the input.
63    pub fn next_pos(&self) -> usize {
64        self.pos + self.len
65    }
66}
67
68/// An abstraction over input used in the matching engines.
69pub trait Input: fmt::Debug {
70    /// Return an encoding of the position at byte offset `i`.
71    fn at(&self, i: usize) -> InputAt;
72
73    /// Return the Unicode character occurring next to `at`.
74    ///
75    /// If no such character could be decoded, then `Char` is absent.
76    fn next_char(&self, at: InputAt) -> Char;
77
78    /// Return the Unicode character occurring previous to `at`.
79    ///
80    /// If no such character could be decoded, then `Char` is absent.
81    fn previous_char(&self, at: InputAt) -> Char;
82
83    /// Return true if the given empty width instruction matches at the
84    /// input position given.
85    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
86
87    /// Scan the input for a matching prefix.
88    fn prefix_at(
89        &self,
90        prefixes: &LiteralSearcher,
91        at: InputAt,
92    ) -> Option<InputAt>;
93
94    /// The number of bytes in the input.
95    fn len(&self) -> usize;
96
97    /// Whether the input is empty.
98    fn is_empty(&self) -> bool {
99        self.len() == 0
100    }
101
102    /// Return the given input as a sequence of bytes.
103    fn as_bytes(&self) -> &[u8];
104}
105
106impl<'a, T: Input> Input for &'a T {
107    fn at(&self, i: usize) -> InputAt {
108        (**self).at(i)
109    }
110
111    fn next_char(&self, at: InputAt) -> Char {
112        (**self).next_char(at)
113    }
114
115    fn previous_char(&self, at: InputAt) -> Char {
116        (**self).previous_char(at)
117    }
118
119    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
120        (**self).is_empty_match(at, empty)
121    }
122
123    fn prefix_at(
124        &self,
125        prefixes: &LiteralSearcher,
126        at: InputAt,
127    ) -> Option<InputAt> {
128        (**self).prefix_at(prefixes, at)
129    }
130
131    fn len(&self) -> usize {
132        (**self).len()
133    }
134
135    fn as_bytes(&self) -> &[u8] {
136        (**self).as_bytes()
137    }
138}
139
140/// An input reader over characters.
141#[derive(Clone, Copy, Debug)]
142pub struct CharInput<'t>(&'t [u8]);
143
144impl<'t> CharInput<'t> {
145    /// Return a new character input reader for the given string.
146    pub fn new(s: &'t [u8]) -> CharInput<'t> {
147        CharInput(s)
148    }
149}
150
151impl<'t> ops::Deref for CharInput<'t> {
152    type Target = [u8];
153
154    fn deref(&self) -> &[u8] {
155        self.0
156    }
157}
158
159impl<'t> Input for CharInput<'t> {
160    fn at(&self, i: usize) -> InputAt {
161        if i >= self.len() {
162            InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
163        } else {
164            let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
165            InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
166        }
167    }
168
169    fn next_char(&self, at: InputAt) -> Char {
170        at.char()
171    }
172
173    fn previous_char(&self, at: InputAt) -> Char {
174        decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
175    }
176
177    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
178        use prog::EmptyLook::*;
179        match empty.look {
180            StartLine => {
181                let c = self.previous_char(at);
182                at.pos() == 0 || c == '\n'
183            }
184            EndLine => {
185                let c = self.next_char(at);
186                at.pos() == self.len() || c == '\n'
187            }
188            StartText => at.pos() == 0,
189            EndText => at.pos() == self.len(),
190            WordBoundary => {
191                let (c1, c2) = (self.previous_char(at), self.next_char(at));
192                c1.is_word_char() != c2.is_word_char()
193            }
194            NotWordBoundary => {
195                let (c1, c2) = (self.previous_char(at), self.next_char(at));
196                c1.is_word_char() == c2.is_word_char()
197            }
198            WordBoundaryAscii => {
199                let (c1, c2) = (self.previous_char(at), self.next_char(at));
200                c1.is_word_byte() != c2.is_word_byte()
201            }
202            NotWordBoundaryAscii => {
203                let (c1, c2) = (self.previous_char(at), self.next_char(at));
204                c1.is_word_byte() == c2.is_word_byte()
205            }
206        }
207    }
208
209    fn prefix_at(
210        &self,
211        prefixes: &LiteralSearcher,
212        at: InputAt,
213    ) -> Option<InputAt> {
214        prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
215    }
216
217    fn len(&self) -> usize {
218        self.0.len()
219    }
220
221    fn as_bytes(&self) -> &[u8] {
222        self.0
223    }
224}
225
226/// An input reader over bytes.
227#[derive(Clone, Copy, Debug)]
228pub struct ByteInput<'t> {
229    text: &'t [u8],
230    only_utf8: bool,
231}
232
233impl<'t> ByteInput<'t> {
234    /// Return a new byte-based input reader for the given string.
235    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
236        ByteInput { text: text, only_utf8: only_utf8 }
237    }
238}
239
240impl<'t> ops::Deref for ByteInput<'t> {
241    type Target = [u8];
242
243    fn deref(&self) -> &[u8] {
244        self.text
245    }
246}
247
248impl<'t> Input for ByteInput<'t> {
249    fn at(&self, i: usize) -> InputAt {
250        if i >= self.len() {
251            InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
252        } else {
253            InputAt {
254                pos: i,
255                c: None.into(),
256                byte: self.get(i).cloned(),
257                len: 1,
258            }
259        }
260    }
261
262    fn next_char(&self, at: InputAt) -> Char {
263        decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
264    }
265
266    fn previous_char(&self, at: InputAt) -> Char {
267        decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
268    }
269
270    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
271        use prog::EmptyLook::*;
272        match empty.look {
273            StartLine => {
274                let c = self.previous_char(at);
275                at.pos() == 0 || c == '\n'
276            }
277            EndLine => {
278                let c = self.next_char(at);
279                at.pos() == self.len() || c == '\n'
280            }
281            StartText => at.pos() == 0,
282            EndText => at.pos() == self.len(),
283            WordBoundary => {
284                let (c1, c2) = (self.previous_char(at), self.next_char(at));
285                c1.is_word_char() != c2.is_word_char()
286            }
287            NotWordBoundary => {
288                let (c1, c2) = (self.previous_char(at), self.next_char(at));
289                c1.is_word_char() == c2.is_word_char()
290            }
291            WordBoundaryAscii => {
292                let (c1, c2) = (self.previous_char(at), self.next_char(at));
293                if self.only_utf8 {
294                    // If we must match UTF-8, then we can't match word
295                    // boundaries at invalid UTF-8.
296                    if c1.is_none() && !at.is_start() {
297                        return false;
298                    }
299                    if c2.is_none() && !at.is_end() {
300                        return false;
301                    }
302                }
303                c1.is_word_byte() != c2.is_word_byte()
304            }
305            NotWordBoundaryAscii => {
306                let (c1, c2) = (self.previous_char(at), self.next_char(at));
307                if self.only_utf8 {
308                    // If we must match UTF-8, then we can't match word
309                    // boundaries at invalid UTF-8.
310                    if c1.is_none() && !at.is_start() {
311                        return false;
312                    }
313                    if c2.is_none() && !at.is_end() {
314                        return false;
315                    }
316                }
317                c1.is_word_byte() == c2.is_word_byte()
318            }
319        }
320    }
321
322    fn prefix_at(
323        &self,
324        prefixes: &LiteralSearcher,
325        at: InputAt,
326    ) -> Option<InputAt> {
327        prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
328    }
329
330    fn len(&self) -> usize {
331        self.text.len()
332    }
333
334    fn as_bytes(&self) -> &[u8] {
335        self.text
336    }
337}
338
339/// An inline representation of `Option<char>`.
340///
341/// This eliminates the need to do case analysis on `Option<char>` to determine
342/// ordinality with other characters.
343///
344/// (The `Option<char>` is not related to encoding. Instead, it is used in the
345/// matching engines to represent the beginning and ending boundaries of the
346/// search text.)
347#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
348pub struct Char(u32);
349
350impl fmt::Debug for Char {
351    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
352        match char::from_u32(self.0) {
353            None => write!(f, "Empty"),
354            Some(c) => write!(f, "{:?}", c),
355        }
356    }
357}
358
359impl Char {
360    /// Returns true iff the character is absent.
361    #[inline]
362    pub fn is_none(self) -> bool {
363        self.0 == u32::MAX
364    }
365
366    /// Returns the length of the character's UTF-8 encoding.
367    ///
368    /// If the character is absent, then `1` is returned.
369    #[inline]
370    pub fn len_utf8(self) -> usize {
371        char::from_u32(self.0).map_or(1, |c| c.len_utf8())
372    }
373
374    /// Returns true iff the character is a word character.
375    ///
376    /// If the character is absent, then false is returned.
377    pub fn is_word_char(self) -> bool {
378        // is_word_character can panic if the Unicode data for \w isn't
379        // available. However, our compiler ensures that if a Unicode word
380        // boundary is used, then the data must also be available. If it isn't,
381        // then the compiler returns an error.
382        char::from_u32(self.0).map_or(false, syntax::is_word_character)
383    }
384
385    /// Returns true iff the byte is a word byte.
386    ///
387    /// If the byte is absent, then false is returned.
388    pub fn is_word_byte(self) -> bool {
389        match char::from_u32(self.0) {
390            Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
391            None | Some(_) => false,
392        }
393    }
394}
395
396impl From<char> for Char {
397    fn from(c: char) -> Char {
398        Char(c as u32)
399    }
400}
401
402impl From<Option<char>> for Char {
403    fn from(c: Option<char>) -> Char {
404        c.map_or(Char(u32::MAX), |c| c.into())
405    }
406}
407
408impl PartialEq<char> for Char {
409    #[inline]
410    fn eq(&self, other: &char) -> bool {
411        self.0 == *other as u32
412    }
413}
414
415impl PartialEq<Char> for char {
416    #[inline]
417    fn eq(&self, other: &Char) -> bool {
418        *self as u32 == other.0
419    }
420}
421
422impl PartialOrd<char> for Char {
423    #[inline]
424    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
425        self.0.partial_cmp(&(*other as u32))
426    }
427}
428
429impl PartialOrd<Char> for char {
430    #[inline]
431    fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
432        (*self as u32).partial_cmp(&other.0)
433    }
434}