regex_syntax/
utf8.rs

1/*!
2Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4This is sub-module is useful for constructing byte based automatons that need
5to embed UTF-8 decoding. The most common use of this module is in conjunction
6with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.
7
8See the documentation on the `Utf8Sequences` iterator for more details and
9an example.
10
11# Wait, what is this?
12
13This is simplest to explain with an example. Let's say you wanted to test
14whether a particular byte sequence was a Cyrillic character. One possible
15scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16range can be expressed as a sequence of byte ranges:
17
18```ignore
19[D0-D3][80-BF]
20```
21
22This is simple enough: simply encode the boundaries, `0400` encodes to
23`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26However, what if you wanted to add the Cyrillic Supplementary characters to
27your range? Your range might then become `[0400-052F]`. The same procedure
28as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30this isn't quite correct because this range doesn't capture many characters,
31for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33Instead, you need multiple sequences of byte ranges:
34
35```ignore
36[D0-D3][80-BF]  # matches codepoints 0400-04FF
37[D4][80-AF]     # matches codepoints 0500-052F
38```
39
40This gets even more complicated if you want bigger ranges, particularly if
41they naively contain surrogate codepoints. For example, the sequence of byte
42ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44```ignore
45[0-7F]
46[C2-DF][80-BF]
47[E0][A0-BF][80-BF]
48[E1-EC][80-BF][80-BF]
49[ED][80-9F][80-BF]
50[EE-EF][80-BF][80-BF]
51```
52
53Note that the byte ranges above will *not* match any erroneous encoding of
54UTF-8, including encodings of surrogate codepoints.
55
56And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58```ignore
59[0-7F]
60[C2-DF][80-BF]
61[E0][A0-BF][80-BF]
62[E1-EC][80-BF][80-BF]
63[ED][80-9F][80-BF]
64[EE-EF][80-BF][80-BF]
65[F0][90-BF][80-BF][80-BF]
66[F1-F3][80-BF][80-BF][80-BF]
67[F4][80-8F][80-BF][80-BF]
68```
69
70This module automates the process of creating these byte ranges from ranges of
71Unicode scalar values.
72
73# Lineage
74
75I got the idea and general implementation strategy from Russ Cox in his
76[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78I also got the idea from
79[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80which uses it for executing automata on their term index.
81*/
82
83#![deny(missing_docs)]
84
85use std::char;
86use std::fmt;
87use std::slice;
88
89const MAX_UTF8_BYTES: usize = 4;
90
91/// Utf8Sequence represents a sequence of byte ranges.
92///
93/// To match a Utf8Sequence, a candidate byte sequence must match each
94/// successive range.
95///
96/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
97/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
98#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
99pub enum Utf8Sequence {
100    /// One byte range.
101    One(Utf8Range),
102    /// Two successive byte ranges.
103    Two([Utf8Range; 2]),
104    /// Three successive byte ranges.
105    Three([Utf8Range; 3]),
106    /// Four successive byte ranges.
107    Four([Utf8Range; 4]),
108}
109
110impl Utf8Sequence {
111    /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
112    /// range.
113    ///
114    /// This assumes that `start` and `end` have the same length.
115    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
116        assert_eq!(start.len(), end.len());
117        match start.len() {
118            2 => Utf8Sequence::Two([
119                Utf8Range::new(start[0], end[0]),
120                Utf8Range::new(start[1], end[1]),
121            ]),
122            3 => Utf8Sequence::Three([
123                Utf8Range::new(start[0], end[0]),
124                Utf8Range::new(start[1], end[1]),
125                Utf8Range::new(start[2], end[2]),
126            ]),
127            4 => Utf8Sequence::Four([
128                Utf8Range::new(start[0], end[0]),
129                Utf8Range::new(start[1], end[1]),
130                Utf8Range::new(start[2], end[2]),
131                Utf8Range::new(start[3], end[3]),
132            ]),
133            n => unreachable!("invalid encoded length: {}", n),
134        }
135    }
136
137    /// Returns the underlying sequence of byte ranges as a slice.
138    pub fn as_slice(&self) -> &[Utf8Range] {
139        use self::Utf8Sequence::*;
140        match *self {
141            One(ref r) => slice::from_ref(r),
142            Two(ref r) => &r[..],
143            Three(ref r) => &r[..],
144            Four(ref r) => &r[..],
145        }
146    }
147
148    /// Returns the number of byte ranges in this sequence.
149    ///
150    /// The length is guaranteed to be in the closed interval `[1, 4]`.
151    pub fn len(&self) -> usize {
152        self.as_slice().len()
153    }
154
155    /// Returns true if and only if a prefix of `bytes` matches this sequence
156    /// of byte ranges.
157    pub fn matches(&self, bytes: &[u8]) -> bool {
158        if bytes.len() < self.len() {
159            return false;
160        }
161        for (&b, r) in bytes.iter().zip(self) {
162            if !r.matches(b) {
163                return false;
164            }
165        }
166        true
167    }
168}
169
170impl<'a> IntoIterator for &'a Utf8Sequence {
171    type IntoIter = slice::Iter<'a, Utf8Range>;
172    type Item = &'a Utf8Range;
173
174    fn into_iter(self) -> Self::IntoIter {
175        self.as_slice().into_iter()
176    }
177}
178
179impl fmt::Debug for Utf8Sequence {
180    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
181        use self::Utf8Sequence::*;
182        match *self {
183            One(ref r) => write!(f, "{:?}", r),
184            Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
185            Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
186            Four(ref r) => {
187                write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
188            }
189        }
190    }
191}
192
193/// A single inclusive range of UTF-8 bytes.
194#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
195pub struct Utf8Range {
196    /// Start of byte range (inclusive).
197    pub start: u8,
198    /// End of byte range (inclusive).
199    pub end: u8,
200}
201
202impl Utf8Range {
203    fn new(start: u8, end: u8) -> Self {
204        Utf8Range { start: start, end: end }
205    }
206
207    /// Returns true if and only if the given byte is in this range.
208    pub fn matches(&self, b: u8) -> bool {
209        self.start <= b && b <= self.end
210    }
211}
212
213impl fmt::Debug for Utf8Range {
214    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
215        if self.start == self.end {
216            write!(f, "[{:X}]", self.start)
217        } else {
218            write!(f, "[{:X}-{:X}]", self.start, self.end)
219        }
220    }
221}
222
223/// An iterator over ranges of matching UTF-8 byte sequences.
224///
225/// The iteration represents an alternation of comprehensive byte sequences
226/// that match precisely the set of UTF-8 encoded scalar values.
227///
228/// A byte sequence corresponds to one of the scalar values in the range given
229/// if and only if it completely matches exactly one of the sequences of byte
230/// ranges produced by this iterator.
231///
232/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
233/// sequences will match the same bytes.
234///
235/// # Example
236///
237/// This shows how to match an arbitrary byte sequence against a range of
238/// scalar values.
239///
240/// ```rust
241/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
242///
243/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
244///     for range in seqs {
245///         if range.matches(bytes) {
246///             return true;
247///         }
248///     }
249///     false
250/// }
251///
252/// // Test the basic multilingual plane.
253/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
254///
255/// // UTF-8 encoding of 'a'.
256/// assert!(matches(&seqs, &[0x61]));
257/// // UTF-8 encoding of '☃' (`\u{2603}`).
258/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
259/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
260/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
261/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
262/// // which is invalid UTF-8, and therefore fails, despite the fact that
263/// // the corresponding codepoint (0xD800) falls in the range given.
264/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
265/// // And fails against plain old invalid UTF-8.
266/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
267/// ```
268///
269/// If this example seems circuitous, that's because it is! It's meant to be
270/// illustrative. In practice, you could just try to decode your byte sequence
271/// and compare it with the scalar value range directly. However, this is not
272/// always possible (for example, in a byte based automaton).
273pub struct Utf8Sequences {
274    range_stack: Vec<ScalarRange>,
275}
276
277impl Utf8Sequences {
278    /// Create a new iterator over UTF-8 byte ranges for the scalar value range
279    /// given.
280    pub fn new(start: char, end: char) -> Self {
281        let mut it = Utf8Sequences { range_stack: vec![] };
282        it.push(start as u32, end as u32);
283        it
284    }
285
286    /// reset resets the scalar value range.
287    /// Any existing state is cleared, but resources may be reused.
288    ///
289    /// N.B. Benchmarks say that this method is dubious.
290    #[doc(hidden)]
291    pub fn reset(&mut self, start: char, end: char) {
292        self.range_stack.clear();
293        self.push(start as u32, end as u32);
294    }
295
296    fn push(&mut self, start: u32, end: u32) {
297        self.range_stack.push(ScalarRange { start: start, end: end });
298    }
299}
300
301struct ScalarRange {
302    start: u32,
303    end: u32,
304}
305
306impl fmt::Debug for ScalarRange {
307    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
308        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
309    }
310}
311
312impl Iterator for Utf8Sequences {
313    type Item = Utf8Sequence;
314
315    fn next(&mut self) -> Option<Self::Item> {
316        'TOP: while let Some(mut r) = self.range_stack.pop() {
317            'INNER: loop {
318                if let Some((r1, r2)) = r.split() {
319                    self.push(r2.start, r2.end);
320                    r.start = r1.start;
321                    r.end = r1.end;
322                    continue 'INNER;
323                }
324                if !r.is_valid() {
325                    continue 'TOP;
326                }
327                for i in 1..MAX_UTF8_BYTES {
328                    let max = max_scalar_value(i);
329                    if r.start <= max && max < r.end {
330                        self.push(max + 1, r.end);
331                        r.end = max;
332                        continue 'INNER;
333                    }
334                }
335                if let Some(ascii_range) = r.as_ascii() {
336                    return Some(Utf8Sequence::One(ascii_range));
337                }
338                for i in 1..MAX_UTF8_BYTES {
339                    let m = (1 << (6 * i)) - 1;
340                    if (r.start & !m) != (r.end & !m) {
341                        if (r.start & m) != 0 {
342                            self.push((r.start | m) + 1, r.end);
343                            r.end = r.start | m;
344                            continue 'INNER;
345                        }
346                        if (r.end & m) != m {
347                            self.push(r.end & !m, r.end);
348                            r.end = (r.end & !m) - 1;
349                            continue 'INNER;
350                        }
351                    }
352                }
353                let mut start = [0; MAX_UTF8_BYTES];
354                let mut end = [0; MAX_UTF8_BYTES];
355                let n = r.encode(&mut start, &mut end);
356                return Some(Utf8Sequence::from_encoded_range(
357                    &start[0..n],
358                    &end[0..n],
359                ));
360            }
361        }
362        None
363    }
364}
365
366impl ScalarRange {
367    /// split splits this range if it overlaps with a surrogate codepoint.
368    ///
369    /// Either or both ranges may be invalid.
370    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
371        if self.start < 0xE000 && self.end > 0xD7FF {
372            Some((
373                ScalarRange { start: self.start, end: 0xD7FF },
374                ScalarRange { start: 0xE000, end: self.end },
375            ))
376        } else {
377            None
378        }
379    }
380
381    /// is_valid returns true if and only if start <= end.
382    fn is_valid(&self) -> bool {
383        self.start <= self.end
384    }
385
386    /// as_ascii returns this range as a Utf8Range if and only if all scalar
387    /// values in this range can be encoded as a single byte.
388    fn as_ascii(&self) -> Option<Utf8Range> {
389        if self.is_ascii() {
390            Some(Utf8Range::new(self.start as u8, self.end as u8))
391        } else {
392            None
393        }
394    }
395
396    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
397    /// byte to encode any scalar value).
398    fn is_ascii(&self) -> bool {
399        self.is_valid() && self.end <= 0x7f
400    }
401
402    /// encode writes the UTF-8 encoding of the start and end of this range
403    /// to the corresponding destination slices, and returns the number of
404    /// bytes written.
405    ///
406    /// The slices should have room for at least `MAX_UTF8_BYTES`.
407    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
408        let cs = char::from_u32(self.start).unwrap();
409        let ce = char::from_u32(self.end).unwrap();
410        let ss = cs.encode_utf8(start);
411        let se = ce.encode_utf8(end);
412        assert_eq!(ss.len(), se.len());
413        ss.len()
414    }
415}
416
417fn max_scalar_value(nbytes: usize) -> u32 {
418    match nbytes {
419        1 => 0x007F,
420        2 => 0x07FF,
421        3 => 0xFFFF,
422        4 => 0x10FFFF,
423        _ => unreachable!("invalid UTF-8 byte sequence size"),
424    }
425}
426
427#[cfg(test)]
428mod tests {
429    use std::char;
430
431    use utf8::{Utf8Range, Utf8Sequences};
432
433    fn rutf8(s: u8, e: u8) -> Utf8Range {
434        Utf8Range::new(s, e)
435    }
436
437    fn never_accepts_surrogate_codepoints(start: char, end: char) {
438        for cp in 0xD800..0xE000 {
439            let buf = encode_surrogate(cp);
440            for r in Utf8Sequences::new(start, end) {
441                if r.matches(&buf) {
442                    panic!(
443                        "Sequence ({:X}, {:X}) contains range {:?}, \
444                         which matches surrogate code point {:X} \
445                         with encoded bytes {:?}",
446                        start as u32, end as u32, r, cp, buf,
447                    );
448                }
449            }
450        }
451    }
452
453    #[test]
454    fn codepoints_no_surrogates() {
455        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
456        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
457        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
458        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
459        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
460    }
461
462    #[test]
463    fn single_codepoint_one_sequence() {
464        // Tests that every range of scalar values that contains a single
465        // scalar value is recognized by one sequence of byte ranges.
466        for i in 0x0..(0x10FFFF + 1) {
467            let c = match char::from_u32(i) {
468                None => continue,
469                Some(c) => c,
470            };
471            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
472            assert_eq!(seqs.len(), 1);
473        }
474    }
475
476    #[test]
477    fn bmp() {
478        use utf8::Utf8Sequence::*;
479
480        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
481        assert_eq!(
482            seqs,
483            vec![
484                One(rutf8(0x0, 0x7F)),
485                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
486                Three([
487                    rutf8(0xE0, 0xE0),
488                    rutf8(0xA0, 0xBF),
489                    rutf8(0x80, 0xBF)
490                ]),
491                Three([
492                    rutf8(0xE1, 0xEC),
493                    rutf8(0x80, 0xBF),
494                    rutf8(0x80, 0xBF)
495                ]),
496                Three([
497                    rutf8(0xED, 0xED),
498                    rutf8(0x80, 0x9F),
499                    rutf8(0x80, 0xBF)
500                ]),
501                Three([
502                    rutf8(0xEE, 0xEF),
503                    rutf8(0x80, 0xBF),
504                    rutf8(0x80, 0xBF)
505                ]),
506            ]
507        );
508    }
509
510    fn encode_surrogate(cp: u32) -> [u8; 3] {
511        const TAG_CONT: u8 = 0b1000_0000;
512        const TAG_THREE_B: u8 = 0b1110_0000;
513
514        assert!(0xD800 <= cp && cp < 0xE000);
515        let mut dst = [0; 3];
516        dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
517        dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
518        dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
519        dst
520    }
521}