regex_syntax/hir/
mod.rs

1/*!
2Defines a high-level intermediate representation for regular expressions.
3*/
4use std::char;
5use std::cmp;
6use std::error;
7use std::fmt;
8use std::result;
9use std::u8;
10
11use ast::Span;
12use hir::interval::{Interval, IntervalSet, IntervalSetIter};
13use unicode;
14
15pub use hir::visitor::{visit, Visitor};
16pub use unicode::CaseFoldError;
17
18mod interval;
19pub mod literal;
20pub mod print;
21pub mod translate;
22mod visitor;
23
24/// An error that can occur while translating an `Ast` to a `Hir`.
25#[derive(Clone, Debug, Eq, PartialEq)]
26pub struct Error {
27    /// The kind of error.
28    kind: ErrorKind,
29    /// The original pattern that the translator's Ast was parsed from. Every
30    /// span in an error is a valid range into this string.
31    pattern: String,
32    /// The span of this error, derived from the Ast given to the translator.
33    span: Span,
34}
35
36impl Error {
37    /// Return the type of this error.
38    pub fn kind(&self) -> &ErrorKind {
39        &self.kind
40    }
41
42    /// The original pattern string in which this error occurred.
43    ///
44    /// Every span reported by this error is reported in terms of this string.
45    pub fn pattern(&self) -> &str {
46        &self.pattern
47    }
48
49    /// Return the span at which this error occurred.
50    pub fn span(&self) -> &Span {
51        &self.span
52    }
53}
54
55/// The type of an error that occurred while building an `Hir`.
56#[derive(Clone, Debug, Eq, PartialEq)]
57pub enum ErrorKind {
58    /// This error occurs when a Unicode feature is used when Unicode
59    /// support is disabled. For example `(?-u:\pL)` would trigger this error.
60    UnicodeNotAllowed,
61    /// This error occurs when translating a pattern that could match a byte
62    /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
63    InvalidUtf8,
64    /// This occurs when an unrecognized Unicode property name could not
65    /// be found.
66    UnicodePropertyNotFound,
67    /// This occurs when an unrecognized Unicode property value could not
68    /// be found.
69    UnicodePropertyValueNotFound,
70    /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
71    /// `\d`) could not be found. This can occur when the `unicode-perl`
72    /// crate feature is not enabled.
73    UnicodePerlClassNotFound,
74    /// This occurs when the Unicode simple case mapping tables are not
75    /// available, and the regular expression required Unicode aware case
76    /// insensitivity.
77    UnicodeCaseUnavailable,
78    /// This occurs when the translator attempts to construct a character class
79    /// that is empty.
80    ///
81    /// Note that this restriction in the translator may be removed in the
82    /// future.
83    EmptyClassNotAllowed,
84    /// Hints that destructuring should not be exhaustive.
85    ///
86    /// This enum may grow additional variants, so this makes sure clients
87    /// don't count on exhaustive matching. (Otherwise, adding a new variant
88    /// could break existing code.)
89    #[doc(hidden)]
90    __Nonexhaustive,
91}
92
93impl ErrorKind {
94    fn description(&self) -> &str {
95        use self::ErrorKind::*;
96        match *self {
97            UnicodeNotAllowed => "Unicode not allowed here",
98            InvalidUtf8 => "pattern can match invalid UTF-8",
99            UnicodePropertyNotFound => "Unicode property not found",
100            UnicodePropertyValueNotFound => "Unicode property value not found",
101            UnicodePerlClassNotFound => {
102                "Unicode-aware Perl class not found \
103                 (make sure the unicode-perl feature is enabled)"
104            }
105            UnicodeCaseUnavailable => {
106                "Unicode-aware case insensitivity matching is not available \
107                 (make sure the unicode-case feature is enabled)"
108            }
109            EmptyClassNotAllowed => "empty character classes are not allowed",
110            __Nonexhaustive => unreachable!(),
111        }
112    }
113}
114
115impl error::Error for Error {
116    fn description(&self) -> &str {
117        self.kind.description()
118    }
119}
120
121impl fmt::Display for Error {
122    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
123        ::error::Formatter::from(self).fmt(f)
124    }
125}
126
127impl fmt::Display for ErrorKind {
128    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
129        f.write_str(self.description())
130    }
131}
132
133/// A high-level intermediate representation (HIR) for a regular expression.
134///
135/// The HIR of a regular expression represents an intermediate step between its
136/// abstract syntax (a structured description of the concrete syntax) and
137/// compiled byte codes. The purpose of HIR is to make regular expressions
138/// easier to analyze. In particular, the AST is much more complex than the
139/// HIR. For example, while an AST supports arbitrarily nested character
140/// classes, the HIR will flatten all nested classes into a single set. The HIR
141/// will also "compile away" every flag present in the concrete syntax. For
142/// example, users of HIR expressions never need to worry about case folding;
143/// it is handled automatically by the translator (e.g., by translating `(?i)A`
144/// to `[aA]`).
145///
146/// If the HIR was produced by a translator that disallows invalid UTF-8, then
147/// the HIR is guaranteed to match UTF-8 exclusively.
148///
149/// This type defines its own destructor that uses constant stack space and
150/// heap space proportional to the size of the HIR.
151///
152/// The specific type of an HIR expression can be accessed via its `kind`
153/// or `into_kind` methods. This extra level of indirection exists for two
154/// reasons:
155///
156/// 1. Construction of an HIR expression *must* use the constructor methods
157///    on this `Hir` type instead of building the `HirKind` values directly.
158///    This permits construction to enforce invariants like "concatenations
159///    always consist of two or more sub-expressions."
160/// 2. Every HIR expression contains attributes that are defined inductively,
161///    and can be computed cheaply during the construction process. For
162///    example, one such attribute is whether the expression must match at the
163///    beginning of the text.
164///
165/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
166/// expression pattern string, and uses constant stack space and heap space
167/// proportional to the size of the `Hir`.
168#[derive(Clone, Debug, Eq, PartialEq)]
169pub struct Hir {
170    /// The underlying HIR kind.
171    kind: HirKind,
172    /// Analysis info about this HIR, computed during construction.
173    info: HirInfo,
174}
175
176/// The kind of an arbitrary `Hir` expression.
177#[derive(Clone, Debug, Eq, PartialEq)]
178pub enum HirKind {
179    /// The empty regular expression, which matches everything, including the
180    /// empty string.
181    Empty,
182    /// A single literal character that matches exactly this character.
183    Literal(Literal),
184    /// A single character class that matches any of the characters in the
185    /// class. A class can either consist of Unicode scalar values as
186    /// characters, or it can use bytes.
187    Class(Class),
188    /// An anchor assertion. An anchor assertion match always has zero length.
189    Anchor(Anchor),
190    /// A word boundary assertion, which may or may not be Unicode aware. A
191    /// word boundary assertion match always has zero length.
192    WordBoundary(WordBoundary),
193    /// A repetition operation applied to a child expression.
194    Repetition(Repetition),
195    /// A possibly capturing group, which contains a child expression.
196    Group(Group),
197    /// A concatenation of expressions. A concatenation always has at least two
198    /// child expressions.
199    ///
200    /// A concatenation matches only if each of its child expression matches
201    /// one after the other.
202    Concat(Vec<Hir>),
203    /// An alternation of expressions. An alternation always has at least two
204    /// child expressions.
205    ///
206    /// An alternation matches only if at least one of its child expression
207    /// matches. If multiple expressions match, then the leftmost is preferred.
208    Alternation(Vec<Hir>),
209}
210
211impl Hir {
212    /// Returns a reference to the underlying HIR kind.
213    pub fn kind(&self) -> &HirKind {
214        &self.kind
215    }
216
217    /// Consumes ownership of this HIR expression and returns its underlying
218    /// `HirKind`.
219    pub fn into_kind(mut self) -> HirKind {
220        use std::mem;
221        mem::replace(&mut self.kind, HirKind::Empty)
222    }
223
224    /// Returns an empty HIR expression.
225    ///
226    /// An empty HIR expression always matches, including the empty string.
227    pub fn empty() -> Hir {
228        let mut info = HirInfo::new();
229        info.set_always_utf8(true);
230        info.set_all_assertions(true);
231        info.set_anchored_start(false);
232        info.set_anchored_end(false);
233        info.set_line_anchored_start(false);
234        info.set_line_anchored_end(false);
235        info.set_any_anchored_start(false);
236        info.set_any_anchored_end(false);
237        info.set_match_empty(true);
238        info.set_literal(true);
239        info.set_alternation_literal(true);
240        Hir { kind: HirKind::Empty, info: info }
241    }
242
243    /// Creates a literal HIR expression.
244    ///
245    /// If the given literal has a `Byte` variant with an ASCII byte, then this
246    /// method panics. This enforces the invariant that `Byte` variants are
247    /// only used to express matching of invalid UTF-8.
248    pub fn literal(lit: Literal) -> Hir {
249        if let Literal::Byte(b) = lit {
250            assert!(b > 0x7F);
251        }
252
253        let mut info = HirInfo::new();
254        info.set_always_utf8(lit.is_unicode());
255        info.set_all_assertions(false);
256        info.set_anchored_start(false);
257        info.set_anchored_end(false);
258        info.set_line_anchored_start(false);
259        info.set_line_anchored_end(false);
260        info.set_any_anchored_start(false);
261        info.set_any_anchored_end(false);
262        info.set_match_empty(false);
263        info.set_literal(true);
264        info.set_alternation_literal(true);
265        Hir { kind: HirKind::Literal(lit), info: info }
266    }
267
268    /// Creates a class HIR expression.
269    pub fn class(class: Class) -> Hir {
270        let mut info = HirInfo::new();
271        info.set_always_utf8(class.is_always_utf8());
272        info.set_all_assertions(false);
273        info.set_anchored_start(false);
274        info.set_anchored_end(false);
275        info.set_line_anchored_start(false);
276        info.set_line_anchored_end(false);
277        info.set_any_anchored_start(false);
278        info.set_any_anchored_end(false);
279        info.set_match_empty(false);
280        info.set_literal(false);
281        info.set_alternation_literal(false);
282        Hir { kind: HirKind::Class(class), info: info }
283    }
284
285    /// Creates an anchor assertion HIR expression.
286    pub fn anchor(anchor: Anchor) -> Hir {
287        let mut info = HirInfo::new();
288        info.set_always_utf8(true);
289        info.set_all_assertions(true);
290        info.set_anchored_start(false);
291        info.set_anchored_end(false);
292        info.set_line_anchored_start(false);
293        info.set_line_anchored_end(false);
294        info.set_any_anchored_start(false);
295        info.set_any_anchored_end(false);
296        info.set_match_empty(true);
297        info.set_literal(false);
298        info.set_alternation_literal(false);
299        if let Anchor::StartText = anchor {
300            info.set_anchored_start(true);
301            info.set_line_anchored_start(true);
302            info.set_any_anchored_start(true);
303        }
304        if let Anchor::EndText = anchor {
305            info.set_anchored_end(true);
306            info.set_line_anchored_end(true);
307            info.set_any_anchored_end(true);
308        }
309        if let Anchor::StartLine = anchor {
310            info.set_line_anchored_start(true);
311        }
312        if let Anchor::EndLine = anchor {
313            info.set_line_anchored_end(true);
314        }
315        Hir { kind: HirKind::Anchor(anchor), info: info }
316    }
317
318    /// Creates a word boundary assertion HIR expression.
319    pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
320        let mut info = HirInfo::new();
321        info.set_always_utf8(true);
322        info.set_all_assertions(true);
323        info.set_anchored_start(false);
324        info.set_anchored_end(false);
325        info.set_line_anchored_start(false);
326        info.set_line_anchored_end(false);
327        info.set_any_anchored_start(false);
328        info.set_any_anchored_end(false);
329        info.set_literal(false);
330        info.set_alternation_literal(false);
331        // A negated word boundary matches the empty string, but a normal
332        // word boundary does not!
333        info.set_match_empty(word_boundary.is_negated());
334        // Negated ASCII word boundaries can match invalid UTF-8.
335        if let WordBoundary::AsciiNegate = word_boundary {
336            info.set_always_utf8(false);
337        }
338        Hir { kind: HirKind::WordBoundary(word_boundary), info: info }
339    }
340
341    /// Creates a repetition HIR expression.
342    pub fn repetition(rep: Repetition) -> Hir {
343        let mut info = HirInfo::new();
344        info.set_always_utf8(rep.hir.is_always_utf8());
345        info.set_all_assertions(rep.hir.is_all_assertions());
346        // If this operator can match the empty string, then it can never
347        // be anchored.
348        info.set_anchored_start(
349            !rep.is_match_empty() && rep.hir.is_anchored_start(),
350        );
351        info.set_anchored_end(
352            !rep.is_match_empty() && rep.hir.is_anchored_end(),
353        );
354        info.set_line_anchored_start(
355            !rep.is_match_empty() && rep.hir.is_anchored_start(),
356        );
357        info.set_line_anchored_end(
358            !rep.is_match_empty() && rep.hir.is_anchored_end(),
359        );
360        info.set_any_anchored_start(rep.hir.is_any_anchored_start());
361        info.set_any_anchored_end(rep.hir.is_any_anchored_end());
362        info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
363        info.set_literal(false);
364        info.set_alternation_literal(false);
365        Hir { kind: HirKind::Repetition(rep), info: info }
366    }
367
368    /// Creates a group HIR expression.
369    pub fn group(group: Group) -> Hir {
370        let mut info = HirInfo::new();
371        info.set_always_utf8(group.hir.is_always_utf8());
372        info.set_all_assertions(group.hir.is_all_assertions());
373        info.set_anchored_start(group.hir.is_anchored_start());
374        info.set_anchored_end(group.hir.is_anchored_end());
375        info.set_line_anchored_start(group.hir.is_line_anchored_start());
376        info.set_line_anchored_end(group.hir.is_line_anchored_end());
377        info.set_any_anchored_start(group.hir.is_any_anchored_start());
378        info.set_any_anchored_end(group.hir.is_any_anchored_end());
379        info.set_match_empty(group.hir.is_match_empty());
380        info.set_literal(false);
381        info.set_alternation_literal(false);
382        Hir { kind: HirKind::Group(group), info: info }
383    }
384
385    /// Returns the concatenation of the given expressions.
386    ///
387    /// This flattens the concatenation as appropriate.
388    pub fn concat(mut exprs: Vec<Hir>) -> Hir {
389        match exprs.len() {
390            0 => Hir::empty(),
391            1 => exprs.pop().unwrap(),
392            _ => {
393                let mut info = HirInfo::new();
394                info.set_always_utf8(true);
395                info.set_all_assertions(true);
396                info.set_any_anchored_start(false);
397                info.set_any_anchored_end(false);
398                info.set_match_empty(true);
399                info.set_literal(true);
400                info.set_alternation_literal(true);
401
402                // Some attributes require analyzing all sub-expressions.
403                for e in &exprs {
404                    let x = info.is_always_utf8() && e.is_always_utf8();
405                    info.set_always_utf8(x);
406
407                    let x = info.is_all_assertions() && e.is_all_assertions();
408                    info.set_all_assertions(x);
409
410                    let x = info.is_any_anchored_start()
411                        || e.is_any_anchored_start();
412                    info.set_any_anchored_start(x);
413
414                    let x =
415                        info.is_any_anchored_end() || e.is_any_anchored_end();
416                    info.set_any_anchored_end(x);
417
418                    let x = info.is_match_empty() && e.is_match_empty();
419                    info.set_match_empty(x);
420
421                    let x = info.is_literal() && e.is_literal();
422                    info.set_literal(x);
423
424                    let x = info.is_alternation_literal()
425                        && e.is_alternation_literal();
426                    info.set_alternation_literal(x);
427                }
428                // Anchored attributes require something slightly more
429                // sophisticated. Normally, WLOG, to determine whether an
430                // expression is anchored to the start, we'd only need to check
431                // the first expression of a concatenation. However,
432                // expressions like `$\b^` are still anchored to the start,
433                // but the first expression in the concatenation *isn't*
434                // anchored to the start. So the "first" expression to look at
435                // is actually one that is either not an assertion or is
436                // specifically the StartText assertion.
437                info.set_anchored_start(
438                    exprs
439                        .iter()
440                        .take_while(|e| {
441                            e.is_anchored_start() || e.is_all_assertions()
442                        })
443                        .any(|e| e.is_anchored_start()),
444                );
445                // Similarly for the end anchor, but in reverse.
446                info.set_anchored_end(
447                    exprs
448                        .iter()
449                        .rev()
450                        .take_while(|e| {
451                            e.is_anchored_end() || e.is_all_assertions()
452                        })
453                        .any(|e| e.is_anchored_end()),
454                );
455                // Repeat the process for line anchors.
456                info.set_line_anchored_start(
457                    exprs
458                        .iter()
459                        .take_while(|e| {
460                            e.is_line_anchored_start() || e.is_all_assertions()
461                        })
462                        .any(|e| e.is_line_anchored_start()),
463                );
464                info.set_line_anchored_end(
465                    exprs
466                        .iter()
467                        .rev()
468                        .take_while(|e| {
469                            e.is_line_anchored_end() || e.is_all_assertions()
470                        })
471                        .any(|e| e.is_line_anchored_end()),
472                );
473                Hir { kind: HirKind::Concat(exprs), info: info }
474            }
475        }
476    }
477
478    /// Returns the alternation of the given expressions.
479    ///
480    /// This flattens the alternation as appropriate.
481    pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
482        match exprs.len() {
483            0 => Hir::empty(),
484            1 => exprs.pop().unwrap(),
485            _ => {
486                let mut info = HirInfo::new();
487                info.set_always_utf8(true);
488                info.set_all_assertions(true);
489                info.set_anchored_start(true);
490                info.set_anchored_end(true);
491                info.set_line_anchored_start(true);
492                info.set_line_anchored_end(true);
493                info.set_any_anchored_start(false);
494                info.set_any_anchored_end(false);
495                info.set_match_empty(false);
496                info.set_literal(false);
497                info.set_alternation_literal(true);
498
499                // Some attributes require analyzing all sub-expressions.
500                for e in &exprs {
501                    let x = info.is_always_utf8() && e.is_always_utf8();
502                    info.set_always_utf8(x);
503
504                    let x = info.is_all_assertions() && e.is_all_assertions();
505                    info.set_all_assertions(x);
506
507                    let x = info.is_anchored_start() && e.is_anchored_start();
508                    info.set_anchored_start(x);
509
510                    let x = info.is_anchored_end() && e.is_anchored_end();
511                    info.set_anchored_end(x);
512
513                    let x = info.is_line_anchored_start()
514                        && e.is_line_anchored_start();
515                    info.set_line_anchored_start(x);
516
517                    let x = info.is_line_anchored_end()
518                        && e.is_line_anchored_end();
519                    info.set_line_anchored_end(x);
520
521                    let x = info.is_any_anchored_start()
522                        || e.is_any_anchored_start();
523                    info.set_any_anchored_start(x);
524
525                    let x =
526                        info.is_any_anchored_end() || e.is_any_anchored_end();
527                    info.set_any_anchored_end(x);
528
529                    let x = info.is_match_empty() || e.is_match_empty();
530                    info.set_match_empty(x);
531
532                    let x = info.is_alternation_literal() && e.is_literal();
533                    info.set_alternation_literal(x);
534                }
535                Hir { kind: HirKind::Alternation(exprs), info: info }
536            }
537        }
538    }
539
540    /// Build an HIR expression for `.`.
541    ///
542    /// A `.` expression matches any character except for `\n`. To build an
543    /// expression that matches any character, including `\n`, use the `any`
544    /// method.
545    ///
546    /// If `bytes` is `true`, then this assumes characters are limited to a
547    /// single byte.
548    pub fn dot(bytes: bool) -> Hir {
549        if bytes {
550            let mut cls = ClassBytes::empty();
551            cls.push(ClassBytesRange::new(b'\0', b'\x09'));
552            cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
553            Hir::class(Class::Bytes(cls))
554        } else {
555            let mut cls = ClassUnicode::empty();
556            cls.push(ClassUnicodeRange::new('\0', '\x09'));
557            cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
558            Hir::class(Class::Unicode(cls))
559        }
560    }
561
562    /// Build an HIR expression for `(?s).`.
563    ///
564    /// A `(?s).` expression matches any character, including `\n`. To build an
565    /// expression that matches any character except for `\n`, then use the
566    /// `dot` method.
567    ///
568    /// If `bytes` is `true`, then this assumes characters are limited to a
569    /// single byte.
570    pub fn any(bytes: bool) -> Hir {
571        if bytes {
572            let mut cls = ClassBytes::empty();
573            cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
574            Hir::class(Class::Bytes(cls))
575        } else {
576            let mut cls = ClassUnicode::empty();
577            cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
578            Hir::class(Class::Unicode(cls))
579        }
580    }
581
582    /// Return true if and only if this HIR will always match valid UTF-8.
583    ///
584    /// When this returns false, then it is possible for this HIR expression
585    /// to match invalid UTF-8.
586    pub fn is_always_utf8(&self) -> bool {
587        self.info.is_always_utf8()
588    }
589
590    /// Returns true if and only if this entire HIR expression is made up of
591    /// zero-width assertions.
592    ///
593    /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
594    /// not `^a`.
595    pub fn is_all_assertions(&self) -> bool {
596        self.info.is_all_assertions()
597    }
598
599    /// Return true if and only if this HIR is required to match from the
600    /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
601    /// `^foo|^bar` but not `^foo|bar`.
602    pub fn is_anchored_start(&self) -> bool {
603        self.info.is_anchored_start()
604    }
605
606    /// Return true if and only if this HIR is required to match at the end
607    /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
608    /// `foo$|bar$` but not `foo$|bar`.
609    pub fn is_anchored_end(&self) -> bool {
610        self.info.is_anchored_end()
611    }
612
613    /// Return true if and only if this HIR is required to match from the
614    /// beginning of text or the beginning of a line. This includes expressions
615    /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
616    /// but not `^foo|bar` or `(?m)^foo|bar`.
617    ///
618    /// Note that if `is_anchored_start` is `true`, then
619    /// `is_line_anchored_start` will also be `true`. The reverse implication
620    /// is not true. For example, `(?m)^foo` is line anchored, but not
621    /// `is_anchored_start`.
622    pub fn is_line_anchored_start(&self) -> bool {
623        self.info.is_line_anchored_start()
624    }
625
626    /// Return true if and only if this HIR is required to match at the
627    /// end of text or the end of a line. This includes expressions like
628    /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
629    /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
630    ///
631    /// Note that if `is_anchored_end` is `true`, then
632    /// `is_line_anchored_end` will also be `true`. The reverse implication
633    /// is not true. For example, `(?m)foo$` is line anchored, but not
634    /// `is_anchored_end`.
635    pub fn is_line_anchored_end(&self) -> bool {
636        self.info.is_line_anchored_end()
637    }
638
639    /// Return true if and only if this HIR contains any sub-expression that
640    /// is required to match at the beginning of text. Specifically, this
641    /// returns true if the `^` symbol (when multiline mode is disabled) or the
642    /// `\A` escape appear anywhere in the regex.
643    pub fn is_any_anchored_start(&self) -> bool {
644        self.info.is_any_anchored_start()
645    }
646
647    /// Return true if and only if this HIR contains any sub-expression that is
648    /// required to match at the end of text. Specifically, this returns true
649    /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
650    /// appear anywhere in the regex.
651    pub fn is_any_anchored_end(&self) -> bool {
652        self.info.is_any_anchored_end()
653    }
654
655    /// Return true if and only if the empty string is part of the language
656    /// matched by this regular expression.
657    ///
658    /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`,
659    /// but not `a`, `a+` or `\b`.
660    pub fn is_match_empty(&self) -> bool {
661        self.info.is_match_empty()
662    }
663
664    /// Return true if and only if this HIR is a simple literal. This is only
665    /// true when this HIR expression is either itself a `Literal` or a
666    /// concatenation of only `Literal`s.
667    ///
668    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`
669    /// are not (even though that contain sub-expressions that are literals).
670    pub fn is_literal(&self) -> bool {
671        self.info.is_literal()
672    }
673
674    /// Return true if and only if this HIR is either a simple literal or an
675    /// alternation of simple literals. This is only
676    /// true when this HIR expression is either itself a `Literal` or a
677    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
678    ///
679    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternaiton
680    /// literals, but `f+`, `(foo)`, `foo()`
681    /// are not (even though that contain sub-expressions that are literals).
682    pub fn is_alternation_literal(&self) -> bool {
683        self.info.is_alternation_literal()
684    }
685}
686
687impl HirKind {
688    /// Return true if and only if this HIR is the empty regular expression.
689    ///
690    /// Note that this is not defined inductively. That is, it only tests if
691    /// this kind is the `Empty` variant. To get the inductive definition,
692    /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
693    pub fn is_empty(&self) -> bool {
694        match *self {
695            HirKind::Empty => true,
696            _ => false,
697        }
698    }
699
700    /// Returns true if and only if this kind has any (including possibly
701    /// empty) subexpressions.
702    pub fn has_subexprs(&self) -> bool {
703        match *self {
704            HirKind::Empty
705            | HirKind::Literal(_)
706            | HirKind::Class(_)
707            | HirKind::Anchor(_)
708            | HirKind::WordBoundary(_) => false,
709            HirKind::Group(_)
710            | HirKind::Repetition(_)
711            | HirKind::Concat(_)
712            | HirKind::Alternation(_) => true,
713        }
714    }
715}
716
717/// Print a display representation of this Hir.
718///
719/// The result of this is a valid regular expression pattern string.
720///
721/// This implementation uses constant stack space and heap space proportional
722/// to the size of the `Hir`.
723impl fmt::Display for Hir {
724    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
725        use hir::print::Printer;
726        Printer::new().print(self, f)
727    }
728}
729
730/// The high-level intermediate representation of a literal.
731///
732/// A literal corresponds to a single character, where a character is either
733/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
734/// are preferred whenever possible. In particular, a `Byte` variant is only
735/// ever produced when it could match invalid UTF-8.
736#[derive(Clone, Debug, Eq, PartialEq)]
737pub enum Literal {
738    /// A single character represented by a Unicode scalar value.
739    Unicode(char),
740    /// A single character represented by an arbitrary byte.
741    Byte(u8),
742}
743
744impl Literal {
745    /// Returns true if and only if this literal corresponds to a Unicode
746    /// scalar value.
747    pub fn is_unicode(&self) -> bool {
748        match *self {
749            Literal::Unicode(_) => true,
750            Literal::Byte(b) if b <= 0x7F => true,
751            Literal::Byte(_) => false,
752        }
753    }
754}
755
756/// The high-level intermediate representation of a character class.
757///
758/// A character class corresponds to a set of characters. A character is either
759/// defined by a Unicode scalar value or a byte. Unicode characters are used
760/// by default, while bytes are used when Unicode mode (via the `u` flag) is
761/// disabled.
762///
763/// A character class, regardless of its character type, is represented by a
764/// sequence of non-overlapping non-adjacent ranges of characters.
765///
766/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
767/// be produced even when it exclusively matches valid UTF-8. This is because
768/// a `Bytes` variant represents an intention by the author of the regular
769/// expression to disable Unicode mode, which in turn impacts the semantics of
770/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
771/// match the same set of strings.
772#[derive(Clone, Debug, Eq, PartialEq)]
773pub enum Class {
774    /// A set of characters represented by Unicode scalar values.
775    Unicode(ClassUnicode),
776    /// A set of characters represented by arbitrary bytes (one byte per
777    /// character).
778    Bytes(ClassBytes),
779}
780
781impl Class {
782    /// Apply Unicode simple case folding to this character class, in place.
783    /// The character class will be expanded to include all simple case folded
784    /// character variants.
785    ///
786    /// If this is a byte oriented character class, then this will be limited
787    /// to the ASCII ranges `A-Z` and `a-z`.
788    pub fn case_fold_simple(&mut self) {
789        match *self {
790            Class::Unicode(ref mut x) => x.case_fold_simple(),
791            Class::Bytes(ref mut x) => x.case_fold_simple(),
792        }
793    }
794
795    /// Negate this character class in place.
796    ///
797    /// After completion, this character class will contain precisely the
798    /// characters that weren't previously in the class.
799    pub fn negate(&mut self) {
800        match *self {
801            Class::Unicode(ref mut x) => x.negate(),
802            Class::Bytes(ref mut x) => x.negate(),
803        }
804    }
805
806    /// Returns true if and only if this character class will only ever match
807    /// valid UTF-8.
808    ///
809    /// A character class can match invalid UTF-8 only when the following
810    /// conditions are met:
811    ///
812    /// 1. The translator was configured to permit generating an expression
813    ///    that can match invalid UTF-8. (By default, this is disabled.)
814    /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
815    ///    syntax or in the parser builder. By default, Unicode mode is
816    ///    enabled.
817    pub fn is_always_utf8(&self) -> bool {
818        match *self {
819            Class::Unicode(_) => true,
820            Class::Bytes(ref x) => x.is_all_ascii(),
821        }
822    }
823}
824
825/// A set of characters represented by Unicode scalar values.
826#[derive(Clone, Debug, Eq, PartialEq)]
827pub struct ClassUnicode {
828    set: IntervalSet<ClassUnicodeRange>,
829}
830
831impl ClassUnicode {
832    /// Create a new class from a sequence of ranges.
833    ///
834    /// The given ranges do not need to be in any specific order, and ranges
835    /// may overlap.
836    pub fn new<I>(ranges: I) -> ClassUnicode
837    where
838        I: IntoIterator<Item = ClassUnicodeRange>,
839    {
840        ClassUnicode { set: IntervalSet::new(ranges) }
841    }
842
843    /// Create a new class with no ranges.
844    pub fn empty() -> ClassUnicode {
845        ClassUnicode::new(vec![])
846    }
847
848    /// Add a new range to this set.
849    pub fn push(&mut self, range: ClassUnicodeRange) {
850        self.set.push(range);
851    }
852
853    /// Return an iterator over all ranges in this class.
854    ///
855    /// The iterator yields ranges in ascending order.
856    pub fn iter(&self) -> ClassUnicodeIter {
857        ClassUnicodeIter(self.set.iter())
858    }
859
860    /// Return the underlying ranges as a slice.
861    pub fn ranges(&self) -> &[ClassUnicodeRange] {
862        self.set.intervals()
863    }
864
865    /// Expand this character class such that it contains all case folded
866    /// characters, according to Unicode's "simple" mapping. For example, if
867    /// this class consists of the range `a-z`, then applying case folding will
868    /// result in the class containing both the ranges `a-z` and `A-Z`.
869    ///
870    /// # Panics
871    ///
872    /// This routine panics when the case mapping data necessary for this
873    /// routine to complete is unavailable. This occurs when the `unicode-case`
874    /// feature is not enabled.
875    ///
876    /// Callers should prefer using `try_case_fold_simple` instead, which will
877    /// return an error instead of panicking.
878    pub fn case_fold_simple(&mut self) {
879        self.set
880            .case_fold_simple()
881            .expect("unicode-case feature must be enabled");
882    }
883
884    /// Expand this character class such that it contains all case folded
885    /// characters, according to Unicode's "simple" mapping. For example, if
886    /// this class consists of the range `a-z`, then applying case folding will
887    /// result in the class containing both the ranges `a-z` and `A-Z`.
888    ///
889    /// # Panics
890    ///
891    /// This routine panics when the case mapping data necessary for this
892    /// routine to complete is unavailable. This occurs when the `unicode-case`
893    /// feature is not enabled.
894    ///
895    /// Callers should prefer using `try_case_fold_simple` instead, which will
896    /// return an error instead of panicking.
897    pub fn try_case_fold_simple(
898        &mut self,
899    ) -> result::Result<(), CaseFoldError> {
900        self.set.case_fold_simple()
901    }
902
903    /// Negate this character class.
904    ///
905    /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
906    /// set, then it will not be in this set after negation.
907    pub fn negate(&mut self) {
908        self.set.negate();
909    }
910
911    /// Union this character class with the given character class, in place.
912    pub fn union(&mut self, other: &ClassUnicode) {
913        self.set.union(&other.set);
914    }
915
916    /// Intersect this character class with the given character class, in
917    /// place.
918    pub fn intersect(&mut self, other: &ClassUnicode) {
919        self.set.intersect(&other.set);
920    }
921
922    /// Subtract the given character class from this character class, in place.
923    pub fn difference(&mut self, other: &ClassUnicode) {
924        self.set.difference(&other.set);
925    }
926
927    /// Compute the symmetric difference of the given character classes, in
928    /// place.
929    ///
930    /// This computes the symmetric difference of two character classes. This
931    /// removes all elements in this class that are also in the given class,
932    /// but all adds all elements from the given class that aren't in this
933    /// class. That is, the class will contain all elements in either class,
934    /// but will not contain any elements that are in both classes.
935    pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
936        self.set.symmetric_difference(&other.set);
937    }
938}
939
940/// An iterator over all ranges in a Unicode character class.
941///
942/// The lifetime `'a` refers to the lifetime of the underlying class.
943#[derive(Debug)]
944pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
945
946impl<'a> Iterator for ClassUnicodeIter<'a> {
947    type Item = &'a ClassUnicodeRange;
948
949    fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
950        self.0.next()
951    }
952}
953
954/// A single range of characters represented by Unicode scalar values.
955///
956/// The range is closed. That is, the start and end of the range are included
957/// in the range.
958#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
959pub struct ClassUnicodeRange {
960    start: char,
961    end: char,
962}
963
964impl fmt::Debug for ClassUnicodeRange {
965    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
966        let start = if !self.start.is_whitespace() && !self.start.is_control()
967        {
968            self.start.to_string()
969        } else {
970            format!("0x{:X}", self.start as u32)
971        };
972        let end = if !self.end.is_whitespace() && !self.end.is_control() {
973            self.end.to_string()
974        } else {
975            format!("0x{:X}", self.end as u32)
976        };
977        f.debug_struct("ClassUnicodeRange")
978            .field("start", &start)
979            .field("end", &end)
980            .finish()
981    }
982}
983
984impl Interval for ClassUnicodeRange {
985    type Bound = char;
986
987    #[inline]
988    fn lower(&self) -> char {
989        self.start
990    }
991    #[inline]
992    fn upper(&self) -> char {
993        self.end
994    }
995    #[inline]
996    fn set_lower(&mut self, bound: char) {
997        self.start = bound;
998    }
999    #[inline]
1000    fn set_upper(&mut self, bound: char) {
1001        self.end = bound;
1002    }
1003
1004    /// Apply simple case folding to this Unicode scalar value range.
1005    ///
1006    /// Additional ranges are appended to the given vector. Canonical ordering
1007    /// is *not* maintained in the given vector.
1008    fn case_fold_simple(
1009        &self,
1010        ranges: &mut Vec<ClassUnicodeRange>,
1011    ) -> Result<(), unicode::CaseFoldError> {
1012        if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1013            return Ok(());
1014        }
1015        let start = self.start as u32;
1016        let end = (self.end as u32).saturating_add(1);
1017        let mut next_simple_cp = None;
1018        for cp in (start..end).filter_map(char::from_u32) {
1019            if next_simple_cp.map_or(false, |next| cp < next) {
1020                continue;
1021            }
1022            let it = match unicode::simple_fold(cp)? {
1023                Ok(it) => it,
1024                Err(next) => {
1025                    next_simple_cp = next;
1026                    continue;
1027                }
1028            };
1029            for cp_folded in it {
1030                ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1031            }
1032        }
1033        Ok(())
1034    }
1035}
1036
1037impl ClassUnicodeRange {
1038    /// Create a new Unicode scalar value range for a character class.
1039    ///
1040    /// The returned range is always in a canonical form. That is, the range
1041    /// returned always satisfies the invariant that `start <= end`.
1042    pub fn new(start: char, end: char) -> ClassUnicodeRange {
1043        ClassUnicodeRange::create(start, end)
1044    }
1045
1046    /// Return the start of this range.
1047    ///
1048    /// The start of a range is always less than or equal to the end of the
1049    /// range.
1050    pub fn start(&self) -> char {
1051        self.start
1052    }
1053
1054    /// Return the end of this range.
1055    ///
1056    /// The end of a range is always greater than or equal to the start of the
1057    /// range.
1058    pub fn end(&self) -> char {
1059        self.end
1060    }
1061}
1062
1063/// A set of characters represented by arbitrary bytes (where one byte
1064/// corresponds to one character).
1065#[derive(Clone, Debug, Eq, PartialEq)]
1066pub struct ClassBytes {
1067    set: IntervalSet<ClassBytesRange>,
1068}
1069
1070impl ClassBytes {
1071    /// Create a new class from a sequence of ranges.
1072    ///
1073    /// The given ranges do not need to be in any specific order, and ranges
1074    /// may overlap.
1075    pub fn new<I>(ranges: I) -> ClassBytes
1076    where
1077        I: IntoIterator<Item = ClassBytesRange>,
1078    {
1079        ClassBytes { set: IntervalSet::new(ranges) }
1080    }
1081
1082    /// Create a new class with no ranges.
1083    pub fn empty() -> ClassBytes {
1084        ClassBytes::new(vec![])
1085    }
1086
1087    /// Add a new range to this set.
1088    pub fn push(&mut self, range: ClassBytesRange) {
1089        self.set.push(range);
1090    }
1091
1092    /// Return an iterator over all ranges in this class.
1093    ///
1094    /// The iterator yields ranges in ascending order.
1095    pub fn iter(&self) -> ClassBytesIter {
1096        ClassBytesIter(self.set.iter())
1097    }
1098
1099    /// Return the underlying ranges as a slice.
1100    pub fn ranges(&self) -> &[ClassBytesRange] {
1101        self.set.intervals()
1102    }
1103
1104    /// Expand this character class such that it contains all case folded
1105    /// characters. For example, if this class consists of the range `a-z`,
1106    /// then applying case folding will result in the class containing both the
1107    /// ranges `a-z` and `A-Z`.
1108    ///
1109    /// Note that this only applies ASCII case folding, which is limited to the
1110    /// characters `a-z` and `A-Z`.
1111    pub fn case_fold_simple(&mut self) {
1112        self.set.case_fold_simple().expect("ASCII case folding never fails");
1113    }
1114
1115    /// Negate this byte class.
1116    ///
1117    /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1118    /// will not be in this set after negation.
1119    pub fn negate(&mut self) {
1120        self.set.negate();
1121    }
1122
1123    /// Union this byte class with the given byte class, in place.
1124    pub fn union(&mut self, other: &ClassBytes) {
1125        self.set.union(&other.set);
1126    }
1127
1128    /// Intersect this byte class with the given byte class, in place.
1129    pub fn intersect(&mut self, other: &ClassBytes) {
1130        self.set.intersect(&other.set);
1131    }
1132
1133    /// Subtract the given byte class from this byte class, in place.
1134    pub fn difference(&mut self, other: &ClassBytes) {
1135        self.set.difference(&other.set);
1136    }
1137
1138    /// Compute the symmetric difference of the given byte classes, in place.
1139    ///
1140    /// This computes the symmetric difference of two byte classes. This
1141    /// removes all elements in this class that are also in the given class,
1142    /// but all adds all elements from the given class that aren't in this
1143    /// class. That is, the class will contain all elements in either class,
1144    /// but will not contain any elements that are in both classes.
1145    pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1146        self.set.symmetric_difference(&other.set);
1147    }
1148
1149    /// Returns true if and only if this character class will either match
1150    /// nothing or only ASCII bytes. Stated differently, this returns false
1151    /// if and only if this class contains a non-ASCII byte.
1152    pub fn is_all_ascii(&self) -> bool {
1153        self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1154    }
1155}
1156
1157/// An iterator over all ranges in a byte character class.
1158///
1159/// The lifetime `'a` refers to the lifetime of the underlying class.
1160#[derive(Debug)]
1161pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1162
1163impl<'a> Iterator for ClassBytesIter<'a> {
1164    type Item = &'a ClassBytesRange;
1165
1166    fn next(&mut self) -> Option<&'a ClassBytesRange> {
1167        self.0.next()
1168    }
1169}
1170
1171/// A single range of characters represented by arbitrary bytes.
1172///
1173/// The range is closed. That is, the start and end of the range are included
1174/// in the range.
1175#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1176pub struct ClassBytesRange {
1177    start: u8,
1178    end: u8,
1179}
1180
1181impl Interval for ClassBytesRange {
1182    type Bound = u8;
1183
1184    #[inline]
1185    fn lower(&self) -> u8 {
1186        self.start
1187    }
1188    #[inline]
1189    fn upper(&self) -> u8 {
1190        self.end
1191    }
1192    #[inline]
1193    fn set_lower(&mut self, bound: u8) {
1194        self.start = bound;
1195    }
1196    #[inline]
1197    fn set_upper(&mut self, bound: u8) {
1198        self.end = bound;
1199    }
1200
1201    /// Apply simple case folding to this byte range. Only ASCII case mappings
1202    /// (for a-z) are applied.
1203    ///
1204    /// Additional ranges are appended to the given vector. Canonical ordering
1205    /// is *not* maintained in the given vector.
1206    fn case_fold_simple(
1207        &self,
1208        ranges: &mut Vec<ClassBytesRange>,
1209    ) -> Result<(), unicode::CaseFoldError> {
1210        if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1211            let lower = cmp::max(self.start, b'a');
1212            let upper = cmp::min(self.end, b'z');
1213            ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1214        }
1215        if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1216            let lower = cmp::max(self.start, b'A');
1217            let upper = cmp::min(self.end, b'Z');
1218            ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1219        }
1220        Ok(())
1221    }
1222}
1223
1224impl ClassBytesRange {
1225    /// Create a new byte range for a character class.
1226    ///
1227    /// The returned range is always in a canonical form. That is, the range
1228    /// returned always satisfies the invariant that `start <= end`.
1229    pub fn new(start: u8, end: u8) -> ClassBytesRange {
1230        ClassBytesRange::create(start, end)
1231    }
1232
1233    /// Return the start of this range.
1234    ///
1235    /// The start of a range is always less than or equal to the end of the
1236    /// range.
1237    pub fn start(&self) -> u8 {
1238        self.start
1239    }
1240
1241    /// Return the end of this range.
1242    ///
1243    /// The end of a range is always greater than or equal to the start of the
1244    /// range.
1245    pub fn end(&self) -> u8 {
1246        self.end
1247    }
1248}
1249
1250impl fmt::Debug for ClassBytesRange {
1251    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1252        let mut debug = f.debug_struct("ClassBytesRange");
1253        if self.start <= 0x7F {
1254            debug.field("start", &(self.start as char));
1255        } else {
1256            debug.field("start", &self.start);
1257        }
1258        if self.end <= 0x7F {
1259            debug.field("end", &(self.end as char));
1260        } else {
1261            debug.field("end", &self.end);
1262        }
1263        debug.finish()
1264    }
1265}
1266
1267/// The high-level intermediate representation for an anchor assertion.
1268///
1269/// A matching anchor assertion is always zero-length.
1270#[derive(Clone, Debug, Eq, PartialEq)]
1271pub enum Anchor {
1272    /// Match the beginning of a line or the beginning of text. Specifically,
1273    /// this matches at the starting position of the input, or at the position
1274    /// immediately following a `\n` character.
1275    StartLine,
1276    /// Match the end of a line or the end of text. Specifically,
1277    /// this matches at the end position of the input, or at the position
1278    /// immediately preceding a `\n` character.
1279    EndLine,
1280    /// Match the beginning of text. Specifically, this matches at the starting
1281    /// position of the input.
1282    StartText,
1283    /// Match the end of text. Specifically, this matches at the ending
1284    /// position of the input.
1285    EndText,
1286}
1287
1288/// The high-level intermediate representation for a word-boundary assertion.
1289///
1290/// A matching word boundary assertion is always zero-length.
1291#[derive(Clone, Debug, Eq, PartialEq)]
1292pub enum WordBoundary {
1293    /// Match a Unicode-aware word boundary. That is, this matches a position
1294    /// where the left adjacent character and right adjacent character
1295    /// correspond to a word and non-word or a non-word and word character.
1296    Unicode,
1297    /// Match a Unicode-aware negation of a word boundary.
1298    UnicodeNegate,
1299    /// Match an ASCII-only word boundary. That is, this matches a position
1300    /// where the left adjacent character and right adjacent character
1301    /// correspond to a word and non-word or a non-word and word character.
1302    Ascii,
1303    /// Match an ASCII-only negation of a word boundary.
1304    AsciiNegate,
1305}
1306
1307impl WordBoundary {
1308    /// Returns true if and only if this word boundary assertion is negated.
1309    pub fn is_negated(&self) -> bool {
1310        match *self {
1311            WordBoundary::Unicode | WordBoundary::Ascii => false,
1312            WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
1313        }
1314    }
1315}
1316
1317/// The high-level intermediate representation for a group.
1318///
1319/// This represents one of three possible group types:
1320///
1321/// 1. A non-capturing group (e.g., `(?:expr)`).
1322/// 2. A capturing group (e.g., `(expr)`).
1323/// 3. A named capturing group (e.g., `(?P<name>expr)`).
1324#[derive(Clone, Debug, Eq, PartialEq)]
1325pub struct Group {
1326    /// The kind of this group. If it is a capturing group, then the kind
1327    /// contains the capture group index (and the name, if it is a named
1328    /// group).
1329    pub kind: GroupKind,
1330    /// The expression inside the capturing group, which may be empty.
1331    pub hir: Box<Hir>,
1332}
1333
1334/// The kind of group.
1335#[derive(Clone, Debug, Eq, PartialEq)]
1336pub enum GroupKind {
1337    /// A normal unnamed capturing group.
1338    ///
1339    /// The value is the capture index of the group.
1340    CaptureIndex(u32),
1341    /// A named capturing group.
1342    CaptureName {
1343        /// The name of the group.
1344        name: String,
1345        /// The capture index of the group.
1346        index: u32,
1347    },
1348    /// A non-capturing group.
1349    NonCapturing,
1350}
1351
1352/// The high-level intermediate representation of a repetition operator.
1353///
1354/// A repetition operator permits the repetition of an arbitrary
1355/// sub-expression.
1356#[derive(Clone, Debug, Eq, PartialEq)]
1357pub struct Repetition {
1358    /// The kind of this repetition operator.
1359    pub kind: RepetitionKind,
1360    /// Whether this repetition operator is greedy or not. A greedy operator
1361    /// will match as much as it can. A non-greedy operator will match as
1362    /// little as it can.
1363    ///
1364    /// Typically, operators are greedy by default and are only non-greedy when
1365    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1366    /// not. However, this can be inverted via the `U` "ungreedy" flag.
1367    pub greedy: bool,
1368    /// The expression being repeated.
1369    pub hir: Box<Hir>,
1370}
1371
1372impl Repetition {
1373    /// Returns true if and only if this repetition operator makes it possible
1374    /// to match the empty string.
1375    ///
1376    /// Note that this is not defined inductively. For example, while `a*`
1377    /// will report `true`, `()+` will not, even though `()` matches the empty
1378    /// string and one or more occurrences of something that matches the empty
1379    /// string will always match the empty string. In order to get the
1380    /// inductive definition, see the corresponding method on
1381    /// [`Hir`](struct.Hir.html).
1382    pub fn is_match_empty(&self) -> bool {
1383        match self.kind {
1384            RepetitionKind::ZeroOrOne => true,
1385            RepetitionKind::ZeroOrMore => true,
1386            RepetitionKind::OneOrMore => false,
1387            RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1388            RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1389            RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1390        }
1391    }
1392}
1393
1394/// The kind of a repetition operator.
1395#[derive(Clone, Debug, Eq, PartialEq)]
1396pub enum RepetitionKind {
1397    /// Matches a sub-expression zero or one times.
1398    ZeroOrOne,
1399    /// Matches a sub-expression zero or more times.
1400    ZeroOrMore,
1401    /// Matches a sub-expression one or more times.
1402    OneOrMore,
1403    /// Matches a sub-expression within a bounded range of times.
1404    Range(RepetitionRange),
1405}
1406
1407/// The kind of a counted repetition operator.
1408#[derive(Clone, Debug, Eq, PartialEq)]
1409pub enum RepetitionRange {
1410    /// Matches a sub-expression exactly this many times.
1411    Exactly(u32),
1412    /// Matches a sub-expression at least this many times.
1413    AtLeast(u32),
1414    /// Matches a sub-expression at least `m` times and at most `n` times.
1415    Bounded(u32, u32),
1416}
1417
1418/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1419/// space but heap space proportional to the depth of the total `Hir`.
1420impl Drop for Hir {
1421    fn drop(&mut self) {
1422        use std::mem;
1423
1424        match *self.kind() {
1425            HirKind::Empty
1426            | HirKind::Literal(_)
1427            | HirKind::Class(_)
1428            | HirKind::Anchor(_)
1429            | HirKind::WordBoundary(_) => return,
1430            HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1431            HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1432            HirKind::Concat(ref x) if x.is_empty() => return,
1433            HirKind::Alternation(ref x) if x.is_empty() => return,
1434            _ => {}
1435        }
1436
1437        let mut stack = vec![mem::replace(self, Hir::empty())];
1438        while let Some(mut expr) = stack.pop() {
1439            match expr.kind {
1440                HirKind::Empty
1441                | HirKind::Literal(_)
1442                | HirKind::Class(_)
1443                | HirKind::Anchor(_)
1444                | HirKind::WordBoundary(_) => {}
1445                HirKind::Group(ref mut x) => {
1446                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
1447                }
1448                HirKind::Repetition(ref mut x) => {
1449                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
1450                }
1451                HirKind::Concat(ref mut x) => {
1452                    stack.extend(x.drain(..));
1453                }
1454                HirKind::Alternation(ref mut x) => {
1455                    stack.extend(x.drain(..));
1456                }
1457            }
1458        }
1459    }
1460}
1461
1462/// A type that documents various attributes of an HIR expression.
1463///
1464/// These attributes are typically defined inductively on the HIR.
1465#[derive(Clone, Debug, Eq, PartialEq)]
1466struct HirInfo {
1467    /// Represent yes/no questions by a bitfield to conserve space, since
1468    /// this is included in every HIR expression.
1469    ///
1470    /// If more attributes need to be added, it is OK to increase the size of
1471    /// this as appropriate.
1472    bools: u16,
1473}
1474
1475// A simple macro for defining bitfield accessors/mutators.
1476macro_rules! define_bool {
1477    ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1478        fn $is_fn_name(&self) -> bool {
1479            self.bools & (0b1 << $bit) > 0
1480        }
1481
1482        fn $set_fn_name(&mut self, yes: bool) {
1483            if yes {
1484                self.bools |= 1 << $bit;
1485            } else {
1486                self.bools &= !(1 << $bit);
1487            }
1488        }
1489    }
1490}
1491
1492impl HirInfo {
1493    fn new() -> HirInfo {
1494        HirInfo { bools: 0 }
1495    }
1496
1497    define_bool!(0, is_always_utf8, set_always_utf8);
1498    define_bool!(1, is_all_assertions, set_all_assertions);
1499    define_bool!(2, is_anchored_start, set_anchored_start);
1500    define_bool!(3, is_anchored_end, set_anchored_end);
1501    define_bool!(4, is_line_anchored_start, set_line_anchored_start);
1502    define_bool!(5, is_line_anchored_end, set_line_anchored_end);
1503    define_bool!(6, is_any_anchored_start, set_any_anchored_start);
1504    define_bool!(7, is_any_anchored_end, set_any_anchored_end);
1505    define_bool!(8, is_match_empty, set_match_empty);
1506    define_bool!(9, is_literal, set_literal);
1507    define_bool!(10, is_alternation_literal, set_alternation_literal);
1508}
1509
1510#[cfg(test)]
1511mod tests {
1512    use super::*;
1513
1514    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1515        let ranges: Vec<ClassUnicodeRange> = ranges
1516            .iter()
1517            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1518            .collect();
1519        ClassUnicode::new(ranges)
1520    }
1521
1522    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1523        let ranges: Vec<ClassBytesRange> =
1524            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
1525        ClassBytes::new(ranges)
1526    }
1527
1528    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1529        cls.iter().map(|x| (x.start(), x.end())).collect()
1530    }
1531
1532    #[cfg(feature = "unicode-case")]
1533    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1534        let mut cls_ = cls.clone();
1535        cls_.case_fold_simple();
1536        cls_
1537    }
1538
1539    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1540        let mut cls_ = cls1.clone();
1541        cls_.union(cls2);
1542        cls_
1543    }
1544
1545    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1546        let mut cls_ = cls1.clone();
1547        cls_.intersect(cls2);
1548        cls_
1549    }
1550
1551    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1552        let mut cls_ = cls1.clone();
1553        cls_.difference(cls2);
1554        cls_
1555    }
1556
1557    fn usymdifference(
1558        cls1: &ClassUnicode,
1559        cls2: &ClassUnicode,
1560    ) -> ClassUnicode {
1561        let mut cls_ = cls1.clone();
1562        cls_.symmetric_difference(cls2);
1563        cls_
1564    }
1565
1566    fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1567        let mut cls_ = cls.clone();
1568        cls_.negate();
1569        cls_
1570    }
1571
1572    fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1573        cls.iter().map(|x| (x.start(), x.end())).collect()
1574    }
1575
1576    fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1577        let mut cls_ = cls.clone();
1578        cls_.case_fold_simple();
1579        cls_
1580    }
1581
1582    fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1583        let mut cls_ = cls1.clone();
1584        cls_.union(cls2);
1585        cls_
1586    }
1587
1588    fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1589        let mut cls_ = cls1.clone();
1590        cls_.intersect(cls2);
1591        cls_
1592    }
1593
1594    fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1595        let mut cls_ = cls1.clone();
1596        cls_.difference(cls2);
1597        cls_
1598    }
1599
1600    fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1601        let mut cls_ = cls1.clone();
1602        cls_.symmetric_difference(cls2);
1603        cls_
1604    }
1605
1606    fn bnegate(cls: &ClassBytes) -> ClassBytes {
1607        let mut cls_ = cls.clone();
1608        cls_.negate();
1609        cls_
1610    }
1611
1612    #[test]
1613    fn class_range_canonical_unicode() {
1614        let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1615        assert_eq!('\0', range.start());
1616        assert_eq!('\u{00FF}', range.end());
1617    }
1618
1619    #[test]
1620    fn class_range_canonical_bytes() {
1621        let range = ClassBytesRange::new(b'\xFF', b'\0');
1622        assert_eq!(b'\0', range.start());
1623        assert_eq!(b'\xFF', range.end());
1624    }
1625
1626    #[test]
1627    fn class_canonicalize_unicode() {
1628        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1629        let expected = vec![('a', 'c'), ('x', 'z')];
1630        assert_eq!(expected, uranges(&cls));
1631
1632        let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1633        let expected = vec![('a', 'c'), ('x', 'z')];
1634        assert_eq!(expected, uranges(&cls));
1635
1636        let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1637        let expected = vec![('w', 'z')];
1638        assert_eq!(expected, uranges(&cls));
1639
1640        let cls = uclass(&[
1641            ('c', 'f'),
1642            ('a', 'g'),
1643            ('d', 'j'),
1644            ('a', 'c'),
1645            ('m', 'p'),
1646            ('l', 's'),
1647        ]);
1648        let expected = vec![('a', 'j'), ('l', 's')];
1649        assert_eq!(expected, uranges(&cls));
1650
1651        let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1652        let expected = vec![('u', 'z')];
1653        assert_eq!(expected, uranges(&cls));
1654
1655        let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1656        let expected = vec![('\x00', '\u{10FFFF}')];
1657        assert_eq!(expected, uranges(&cls));
1658
1659        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1660        let expected = vec![('a', 'b')];
1661        assert_eq!(expected, uranges(&cls));
1662    }
1663
1664    #[test]
1665    fn class_canonicalize_bytes() {
1666        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1667        let expected = vec![(b'a', b'c'), (b'x', b'z')];
1668        assert_eq!(expected, branges(&cls));
1669
1670        let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1671        let expected = vec![(b'a', b'c'), (b'x', b'z')];
1672        assert_eq!(expected, branges(&cls));
1673
1674        let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1675        let expected = vec![(b'w', b'z')];
1676        assert_eq!(expected, branges(&cls));
1677
1678        let cls = bclass(&[
1679            (b'c', b'f'),
1680            (b'a', b'g'),
1681            (b'd', b'j'),
1682            (b'a', b'c'),
1683            (b'm', b'p'),
1684            (b'l', b's'),
1685        ]);
1686        let expected = vec![(b'a', b'j'), (b'l', b's')];
1687        assert_eq!(expected, branges(&cls));
1688
1689        let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1690        let expected = vec![(b'u', b'z')];
1691        assert_eq!(expected, branges(&cls));
1692
1693        let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1694        let expected = vec![(b'\x00', b'\xFF')];
1695        assert_eq!(expected, branges(&cls));
1696
1697        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1698        let expected = vec![(b'a', b'b')];
1699        assert_eq!(expected, branges(&cls));
1700    }
1701
1702    #[test]
1703    #[cfg(feature = "unicode-case")]
1704    fn class_case_fold_unicode() {
1705        let cls = uclass(&[
1706            ('C', 'F'),
1707            ('A', 'G'),
1708            ('D', 'J'),
1709            ('A', 'C'),
1710            ('M', 'P'),
1711            ('L', 'S'),
1712            ('c', 'f'),
1713        ]);
1714        let expected = uclass(&[
1715            ('A', 'J'),
1716            ('L', 'S'),
1717            ('a', 'j'),
1718            ('l', 's'),
1719            ('\u{17F}', '\u{17F}'),
1720        ]);
1721        assert_eq!(expected, ucasefold(&cls));
1722
1723        let cls = uclass(&[('A', 'Z')]);
1724        let expected = uclass(&[
1725            ('A', 'Z'),
1726            ('a', 'z'),
1727            ('\u{17F}', '\u{17F}'),
1728            ('\u{212A}', '\u{212A}'),
1729        ]);
1730        assert_eq!(expected, ucasefold(&cls));
1731
1732        let cls = uclass(&[('a', 'z')]);
1733        let expected = uclass(&[
1734            ('A', 'Z'),
1735            ('a', 'z'),
1736            ('\u{17F}', '\u{17F}'),
1737            ('\u{212A}', '\u{212A}'),
1738        ]);
1739        assert_eq!(expected, ucasefold(&cls));
1740
1741        let cls = uclass(&[('A', 'A'), ('_', '_')]);
1742        let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1743        assert_eq!(expected, ucasefold(&cls));
1744
1745        let cls = uclass(&[('A', 'A'), ('=', '=')]);
1746        let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1747        assert_eq!(expected, ucasefold(&cls));
1748
1749        let cls = uclass(&[('\x00', '\x10')]);
1750        assert_eq!(cls, ucasefold(&cls));
1751
1752        let cls = uclass(&[('k', 'k')]);
1753        let expected =
1754            uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
1755        assert_eq!(expected, ucasefold(&cls));
1756
1757        let cls = uclass(&[('@', '@')]);
1758        assert_eq!(cls, ucasefold(&cls));
1759    }
1760
1761    #[test]
1762    #[cfg(not(feature = "unicode-case"))]
1763    fn class_case_fold_unicode_disabled() {
1764        let mut cls = uclass(&[
1765            ('C', 'F'),
1766            ('A', 'G'),
1767            ('D', 'J'),
1768            ('A', 'C'),
1769            ('M', 'P'),
1770            ('L', 'S'),
1771            ('c', 'f'),
1772        ]);
1773        assert!(cls.try_case_fold_simple().is_err());
1774    }
1775
1776    #[test]
1777    #[should_panic]
1778    #[cfg(not(feature = "unicode-case"))]
1779    fn class_case_fold_unicode_disabled_panics() {
1780        let mut cls = uclass(&[
1781            ('C', 'F'),
1782            ('A', 'G'),
1783            ('D', 'J'),
1784            ('A', 'C'),
1785            ('M', 'P'),
1786            ('L', 'S'),
1787            ('c', 'f'),
1788        ]);
1789        cls.case_fold_simple();
1790    }
1791
1792    #[test]
1793    fn class_case_fold_bytes() {
1794        let cls = bclass(&[
1795            (b'C', b'F'),
1796            (b'A', b'G'),
1797            (b'D', b'J'),
1798            (b'A', b'C'),
1799            (b'M', b'P'),
1800            (b'L', b'S'),
1801            (b'c', b'f'),
1802        ]);
1803        let expected =
1804            bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1805        assert_eq!(expected, bcasefold(&cls));
1806
1807        let cls = bclass(&[(b'A', b'Z')]);
1808        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1809        assert_eq!(expected, bcasefold(&cls));
1810
1811        let cls = bclass(&[(b'a', b'z')]);
1812        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1813        assert_eq!(expected, bcasefold(&cls));
1814
1815        let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1816        let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1817        assert_eq!(expected, bcasefold(&cls));
1818
1819        let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1820        let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1821        assert_eq!(expected, bcasefold(&cls));
1822
1823        let cls = bclass(&[(b'\x00', b'\x10')]);
1824        assert_eq!(cls, bcasefold(&cls));
1825
1826        let cls = bclass(&[(b'k', b'k')]);
1827        let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1828        assert_eq!(expected, bcasefold(&cls));
1829
1830        let cls = bclass(&[(b'@', b'@')]);
1831        assert_eq!(cls, bcasefold(&cls));
1832    }
1833
1834    #[test]
1835    fn class_negate_unicode() {
1836        let cls = uclass(&[('a', 'a')]);
1837        let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1838        assert_eq!(expected, unegate(&cls));
1839
1840        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1841        let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1842        assert_eq!(expected, unegate(&cls));
1843
1844        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1845        let expected = uclass(&[
1846            ('\x00', '\x60'),
1847            ('\x64', '\x77'),
1848            ('\x7B', '\u{10FFFF}'),
1849        ]);
1850        assert_eq!(expected, unegate(&cls));
1851
1852        let cls = uclass(&[('\x00', 'a')]);
1853        let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1854        assert_eq!(expected, unegate(&cls));
1855
1856        let cls = uclass(&[('a', '\u{10FFFF}')]);
1857        let expected = uclass(&[('\x00', '\x60')]);
1858        assert_eq!(expected, unegate(&cls));
1859
1860        let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1861        let expected = uclass(&[]);
1862        assert_eq!(expected, unegate(&cls));
1863
1864        let cls = uclass(&[]);
1865        let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1866        assert_eq!(expected, unegate(&cls));
1867
1868        let cls =
1869            uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
1870        let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1871        assert_eq!(expected, unegate(&cls));
1872
1873        let cls = uclass(&[('\x00', '\u{D7FF}')]);
1874        let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1875        assert_eq!(expected, unegate(&cls));
1876
1877        let cls = uclass(&[('\x00', '\u{D7FE}')]);
1878        let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1879        assert_eq!(expected, unegate(&cls));
1880
1881        let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1882        let expected = uclass(&[('\x00', '\u{D7FF}')]);
1883        assert_eq!(expected, unegate(&cls));
1884
1885        let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1886        let expected = uclass(&[('\x00', '\u{E000}')]);
1887        assert_eq!(expected, unegate(&cls));
1888    }
1889
1890    #[test]
1891    fn class_negate_bytes() {
1892        let cls = bclass(&[(b'a', b'a')]);
1893        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1894        assert_eq!(expected, bnegate(&cls));
1895
1896        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1897        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1898        assert_eq!(expected, bnegate(&cls));
1899
1900        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1901        let expected = bclass(&[
1902            (b'\x00', b'\x60'),
1903            (b'\x64', b'\x77'),
1904            (b'\x7B', b'\xFF'),
1905        ]);
1906        assert_eq!(expected, bnegate(&cls));
1907
1908        let cls = bclass(&[(b'\x00', b'a')]);
1909        let expected = bclass(&[(b'\x62', b'\xFF')]);
1910        assert_eq!(expected, bnegate(&cls));
1911
1912        let cls = bclass(&[(b'a', b'\xFF')]);
1913        let expected = bclass(&[(b'\x00', b'\x60')]);
1914        assert_eq!(expected, bnegate(&cls));
1915
1916        let cls = bclass(&[(b'\x00', b'\xFF')]);
1917        let expected = bclass(&[]);
1918        assert_eq!(expected, bnegate(&cls));
1919
1920        let cls = bclass(&[]);
1921        let expected = bclass(&[(b'\x00', b'\xFF')]);
1922        assert_eq!(expected, bnegate(&cls));
1923
1924        let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1925        let expected = bclass(&[(b'\xFE', b'\xFE')]);
1926        assert_eq!(expected, bnegate(&cls));
1927    }
1928
1929    #[test]
1930    fn class_union_unicode() {
1931        let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1932        let cls2 = uclass(&[('a', 'z')]);
1933        let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1934        assert_eq!(expected, uunion(&cls1, &cls2));
1935    }
1936
1937    #[test]
1938    fn class_union_bytes() {
1939        let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1940        let cls2 = bclass(&[(b'a', b'z')]);
1941        let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1942        assert_eq!(expected, bunion(&cls1, &cls2));
1943    }
1944
1945    #[test]
1946    fn class_intersect_unicode() {
1947        let cls1 = uclass(&[]);
1948        let cls2 = uclass(&[('a', 'a')]);
1949        let expected = uclass(&[]);
1950        assert_eq!(expected, uintersect(&cls1, &cls2));
1951
1952        let cls1 = uclass(&[('a', 'a')]);
1953        let cls2 = uclass(&[('a', 'a')]);
1954        let expected = uclass(&[('a', 'a')]);
1955        assert_eq!(expected, uintersect(&cls1, &cls2));
1956
1957        let cls1 = uclass(&[('a', 'a')]);
1958        let cls2 = uclass(&[('b', 'b')]);
1959        let expected = uclass(&[]);
1960        assert_eq!(expected, uintersect(&cls1, &cls2));
1961
1962        let cls1 = uclass(&[('a', 'a')]);
1963        let cls2 = uclass(&[('a', 'c')]);
1964        let expected = uclass(&[('a', 'a')]);
1965        assert_eq!(expected, uintersect(&cls1, &cls2));
1966
1967        let cls1 = uclass(&[('a', 'b')]);
1968        let cls2 = uclass(&[('a', 'c')]);
1969        let expected = uclass(&[('a', 'b')]);
1970        assert_eq!(expected, uintersect(&cls1, &cls2));
1971
1972        let cls1 = uclass(&[('a', 'b')]);
1973        let cls2 = uclass(&[('b', 'c')]);
1974        let expected = uclass(&[('b', 'b')]);
1975        assert_eq!(expected, uintersect(&cls1, &cls2));
1976
1977        let cls1 = uclass(&[('a', 'b')]);
1978        let cls2 = uclass(&[('c', 'd')]);
1979        let expected = uclass(&[]);
1980        assert_eq!(expected, uintersect(&cls1, &cls2));
1981
1982        let cls1 = uclass(&[('b', 'c')]);
1983        let cls2 = uclass(&[('a', 'd')]);
1984        let expected = uclass(&[('b', 'c')]);
1985        assert_eq!(expected, uintersect(&cls1, &cls2));
1986
1987        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1988        let cls2 = uclass(&[('a', 'h')]);
1989        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1990        assert_eq!(expected, uintersect(&cls1, &cls2));
1991
1992        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1993        let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1994        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1995        assert_eq!(expected, uintersect(&cls1, &cls2));
1996
1997        let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
1998        let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
1999        let expected = uclass(&[]);
2000        assert_eq!(expected, uintersect(&cls1, &cls2));
2001
2002        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2003        let cls2 = uclass(&[('h', 'h')]);
2004        let expected = uclass(&[('h', 'h')]);
2005        assert_eq!(expected, uintersect(&cls1, &cls2));
2006
2007        let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2008        let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2009        let expected = uclass(&[]);
2010        assert_eq!(expected, uintersect(&cls1, &cls2));
2011
2012        let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2013        let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2014        let expected = uclass(&[('b', 'f')]);
2015        assert_eq!(expected, uintersect(&cls1, &cls2));
2016    }
2017
2018    #[test]
2019    fn class_intersect_bytes() {
2020        let cls1 = bclass(&[]);
2021        let cls2 = bclass(&[(b'a', b'a')]);
2022        let expected = bclass(&[]);
2023        assert_eq!(expected, bintersect(&cls1, &cls2));
2024
2025        let cls1 = bclass(&[(b'a', b'a')]);
2026        let cls2 = bclass(&[(b'a', b'a')]);
2027        let expected = bclass(&[(b'a', b'a')]);
2028        assert_eq!(expected, bintersect(&cls1, &cls2));
2029
2030        let cls1 = bclass(&[(b'a', b'a')]);
2031        let cls2 = bclass(&[(b'b', b'b')]);
2032        let expected = bclass(&[]);
2033        assert_eq!(expected, bintersect(&cls1, &cls2));
2034
2035        let cls1 = bclass(&[(b'a', b'a')]);
2036        let cls2 = bclass(&[(b'a', b'c')]);
2037        let expected = bclass(&[(b'a', b'a')]);
2038        assert_eq!(expected, bintersect(&cls1, &cls2));
2039
2040        let cls1 = bclass(&[(b'a', b'b')]);
2041        let cls2 = bclass(&[(b'a', b'c')]);
2042        let expected = bclass(&[(b'a', b'b')]);
2043        assert_eq!(expected, bintersect(&cls1, &cls2));
2044
2045        let cls1 = bclass(&[(b'a', b'b')]);
2046        let cls2 = bclass(&[(b'b', b'c')]);
2047        let expected = bclass(&[(b'b', b'b')]);
2048        assert_eq!(expected, bintersect(&cls1, &cls2));
2049
2050        let cls1 = bclass(&[(b'a', b'b')]);
2051        let cls2 = bclass(&[(b'c', b'd')]);
2052        let expected = bclass(&[]);
2053        assert_eq!(expected, bintersect(&cls1, &cls2));
2054
2055        let cls1 = bclass(&[(b'b', b'c')]);
2056        let cls2 = bclass(&[(b'a', b'd')]);
2057        let expected = bclass(&[(b'b', b'c')]);
2058        assert_eq!(expected, bintersect(&cls1, &cls2));
2059
2060        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2061        let cls2 = bclass(&[(b'a', b'h')]);
2062        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2063        assert_eq!(expected, bintersect(&cls1, &cls2));
2064
2065        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2066        let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2067        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2068        assert_eq!(expected, bintersect(&cls1, &cls2));
2069
2070        let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
2071        let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
2072        let expected = bclass(&[]);
2073        assert_eq!(expected, bintersect(&cls1, &cls2));
2074
2075        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2076        let cls2 = bclass(&[(b'h', b'h')]);
2077        let expected = bclass(&[(b'h', b'h')]);
2078        assert_eq!(expected, bintersect(&cls1, &cls2));
2079
2080        let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
2081        let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
2082        let expected = bclass(&[]);
2083        assert_eq!(expected, bintersect(&cls1, &cls2));
2084
2085        let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
2086        let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
2087        let expected = bclass(&[(b'b', b'f')]);
2088        assert_eq!(expected, bintersect(&cls1, &cls2));
2089    }
2090
2091    #[test]
2092    fn class_difference_unicode() {
2093        let cls1 = uclass(&[('a', 'a')]);
2094        let cls2 = uclass(&[('a', 'a')]);
2095        let expected = uclass(&[]);
2096        assert_eq!(expected, udifference(&cls1, &cls2));
2097
2098        let cls1 = uclass(&[('a', 'a')]);
2099        let cls2 = uclass(&[]);
2100        let expected = uclass(&[('a', 'a')]);
2101        assert_eq!(expected, udifference(&cls1, &cls2));
2102
2103        let cls1 = uclass(&[]);
2104        let cls2 = uclass(&[('a', 'a')]);
2105        let expected = uclass(&[]);
2106        assert_eq!(expected, udifference(&cls1, &cls2));
2107
2108        let cls1 = uclass(&[('a', 'z')]);
2109        let cls2 = uclass(&[('a', 'a')]);
2110        let expected = uclass(&[('b', 'z')]);
2111        assert_eq!(expected, udifference(&cls1, &cls2));
2112
2113        let cls1 = uclass(&[('a', 'z')]);
2114        let cls2 = uclass(&[('z', 'z')]);
2115        let expected = uclass(&[('a', 'y')]);
2116        assert_eq!(expected, udifference(&cls1, &cls2));
2117
2118        let cls1 = uclass(&[('a', 'z')]);
2119        let cls2 = uclass(&[('m', 'm')]);
2120        let expected = uclass(&[('a', 'l'), ('n', 'z')]);
2121        assert_eq!(expected, udifference(&cls1, &cls2));
2122
2123        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2124        let cls2 = uclass(&[('a', 'z')]);
2125        let expected = uclass(&[]);
2126        assert_eq!(expected, udifference(&cls1, &cls2));
2127
2128        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2129        let cls2 = uclass(&[('d', 'v')]);
2130        let expected = uclass(&[('a', 'c')]);
2131        assert_eq!(expected, udifference(&cls1, &cls2));
2132
2133        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2134        let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
2135        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2136        assert_eq!(expected, udifference(&cls1, &cls2));
2137
2138        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2139        let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
2140        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2141        assert_eq!(expected, udifference(&cls1, &cls2));
2142
2143        let cls1 = uclass(&[('x', 'z')]);
2144        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2145        let expected = uclass(&[('x', 'z')]);
2146        assert_eq!(expected, udifference(&cls1, &cls2));
2147
2148        let cls1 = uclass(&[('a', 'z')]);
2149        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2150        let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
2151        assert_eq!(expected, udifference(&cls1, &cls2));
2152    }
2153
2154    #[test]
2155    fn class_difference_bytes() {
2156        let cls1 = bclass(&[(b'a', b'a')]);
2157        let cls2 = bclass(&[(b'a', b'a')]);
2158        let expected = bclass(&[]);
2159        assert_eq!(expected, bdifference(&cls1, &cls2));
2160
2161        let cls1 = bclass(&[(b'a', b'a')]);
2162        let cls2 = bclass(&[]);
2163        let expected = bclass(&[(b'a', b'a')]);
2164        assert_eq!(expected, bdifference(&cls1, &cls2));
2165
2166        let cls1 = bclass(&[]);
2167        let cls2 = bclass(&[(b'a', b'a')]);
2168        let expected = bclass(&[]);
2169        assert_eq!(expected, bdifference(&cls1, &cls2));
2170
2171        let cls1 = bclass(&[(b'a', b'z')]);
2172        let cls2 = bclass(&[(b'a', b'a')]);
2173        let expected = bclass(&[(b'b', b'z')]);
2174        assert_eq!(expected, bdifference(&cls1, &cls2));
2175
2176        let cls1 = bclass(&[(b'a', b'z')]);
2177        let cls2 = bclass(&[(b'z', b'z')]);
2178        let expected = bclass(&[(b'a', b'y')]);
2179        assert_eq!(expected, bdifference(&cls1, &cls2));
2180
2181        let cls1 = bclass(&[(b'a', b'z')]);
2182        let cls2 = bclass(&[(b'm', b'm')]);
2183        let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
2184        assert_eq!(expected, bdifference(&cls1, &cls2));
2185
2186        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2187        let cls2 = bclass(&[(b'a', b'z')]);
2188        let expected = bclass(&[]);
2189        assert_eq!(expected, bdifference(&cls1, &cls2));
2190
2191        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2192        let cls2 = bclass(&[(b'd', b'v')]);
2193        let expected = bclass(&[(b'a', b'c')]);
2194        assert_eq!(expected, bdifference(&cls1, &cls2));
2195
2196        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2197        let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
2198        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2199        assert_eq!(expected, bdifference(&cls1, &cls2));
2200
2201        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2202        let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
2203        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2204        assert_eq!(expected, bdifference(&cls1, &cls2));
2205
2206        let cls1 = bclass(&[(b'x', b'z')]);
2207        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2208        let expected = bclass(&[(b'x', b'z')]);
2209        assert_eq!(expected, bdifference(&cls1, &cls2));
2210
2211        let cls1 = bclass(&[(b'a', b'z')]);
2212        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2213        let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
2214        assert_eq!(expected, bdifference(&cls1, &cls2));
2215    }
2216
2217    #[test]
2218    fn class_symmetric_difference_unicode() {
2219        let cls1 = uclass(&[('a', 'm')]);
2220        let cls2 = uclass(&[('g', 't')]);
2221        let expected = uclass(&[('a', 'f'), ('n', 't')]);
2222        assert_eq!(expected, usymdifference(&cls1, &cls2));
2223    }
2224
2225    #[test]
2226    fn class_symmetric_difference_bytes() {
2227        let cls1 = bclass(&[(b'a', b'm')]);
2228        let cls2 = bclass(&[(b'g', b't')]);
2229        let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2230        assert_eq!(expected, bsymdifference(&cls1, &cls2));
2231    }
2232
2233    #[test]
2234    #[should_panic]
2235    fn hir_byte_literal_non_ascii() {
2236        Hir::literal(Literal::Byte(b'a'));
2237    }
2238
2239    // We use a thread with an explicit stack size to test that our destructor
2240    // for Hir can handle arbitrarily sized expressions in constant stack
2241    // space. In case we run on a platform without threads (WASM?), we limit
2242    // this test to Windows/Unix.
2243    #[test]
2244    #[cfg(any(unix, windows))]
2245    fn no_stack_overflow_on_drop() {
2246        use std::thread;
2247
2248        let run = || {
2249            let mut expr = Hir::empty();
2250            for _ in 0..100 {
2251                expr = Hir::group(Group {
2252                    kind: GroupKind::NonCapturing,
2253                    hir: Box::new(expr),
2254                });
2255                expr = Hir::repetition(Repetition {
2256                    kind: RepetitionKind::ZeroOrOne,
2257                    greedy: true,
2258                    hir: Box::new(expr),
2259                });
2260
2261                expr = Hir {
2262                    kind: HirKind::Concat(vec![expr]),
2263                    info: HirInfo::new(),
2264                };
2265                expr = Hir {
2266                    kind: HirKind::Alternation(vec![expr]),
2267                    info: HirInfo::new(),
2268                };
2269            }
2270            assert!(!expr.kind.is_empty());
2271        };
2272
2273        // We run our test on a thread with a small stack size so we can
2274        // force the issue more easily.
2275        thread::Builder::new()
2276            .stack_size(1 << 10)
2277            .spawn(run)
2278            .unwrap()
2279            .join()
2280            .unwrap();
2281    }
2282}