regex_syntax/ast/
mod.rs

1/*!
2Defines an abstract syntax for regular expressions.
3*/
4
5use std::cmp::Ordering;
6use std::error;
7use std::fmt;
8
9pub use ast::visitor::{visit, Visitor};
10
11pub mod parse;
12pub mod print;
13mod visitor;
14
15/// An error that occurred while parsing a regular expression into an abstract
16/// syntax tree.
17///
18/// Note that note all ASTs represents a valid regular expression. For example,
19/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20/// valid Unicode property name. That particular error is reported when
21/// translating an AST to the high-level intermediate representation (`HIR`).
22#[derive(Clone, Debug, Eq, PartialEq)]
23pub struct Error {
24    /// The kind of error.
25    kind: ErrorKind,
26    /// The original pattern that the parser generated the error from. Every
27    /// span in an error is a valid range into this string.
28    pattern: String,
29    /// The span of this error.
30    span: Span,
31}
32
33impl Error {
34    /// Return the type of this error.
35    pub fn kind(&self) -> &ErrorKind {
36        &self.kind
37    }
38
39    /// The original pattern string in which this error occurred.
40    ///
41    /// Every span reported by this error is reported in terms of this string.
42    pub fn pattern(&self) -> &str {
43        &self.pattern
44    }
45
46    /// Return the span at which this error occurred.
47    pub fn span(&self) -> &Span {
48        &self.span
49    }
50
51    /// Return an auxiliary span. This span exists only for some errors that
52    /// benefit from being able to point to two locations in the original
53    /// regular expression. For example, "duplicate" errors will have the
54    /// main error position set to the duplicate occurrence while its
55    /// auxiliary span will be set to the initial occurrence.
56    pub fn auxiliary_span(&self) -> Option<&Span> {
57        use self::ErrorKind::*;
58        match self.kind {
59            FlagDuplicate { ref original } => Some(original),
60            FlagRepeatedNegation { ref original, .. } => Some(original),
61            GroupNameDuplicate { ref original, .. } => Some(original),
62            _ => None,
63        }
64    }
65}
66
67/// The type of an error that occurred while building an AST.
68#[derive(Clone, Debug, Eq, PartialEq)]
69pub enum ErrorKind {
70    /// The capturing group limit was exceeded.
71    ///
72    /// Note that this represents a limit on the total number of capturing
73    /// groups in a regex and not necessarily the number of nested capturing
74    /// groups. That is, the nest limit can be low and it is still possible for
75    /// this error to occur.
76    CaptureLimitExceeded,
77    /// An invalid escape sequence was found in a character class set.
78    ClassEscapeInvalid,
79    /// An invalid character class range was found. An invalid range is any
80    /// range where the start is greater than the end.
81    ClassRangeInvalid,
82    /// An invalid range boundary was found in a character class. Range
83    /// boundaries must be a single literal codepoint, but this error indicates
84    /// that something else was found, such as a nested class.
85    ClassRangeLiteral,
86    /// An opening `[` was found with no corresponding closing `]`.
87    ClassUnclosed,
88    /// Note that this error variant is no longer used. Namely, a decimal
89    /// number can only appear as a repetition quantifier. When the number
90    /// in a repetition quantifier is empty, then it gets its own specialized
91    /// error, `RepetitionCountDecimalEmpty`.
92    DecimalEmpty,
93    /// An invalid decimal number was given where one was expected.
94    DecimalInvalid,
95    /// A bracketed hex literal was empty.
96    EscapeHexEmpty,
97    /// A bracketed hex literal did not correspond to a Unicode scalar value.
98    EscapeHexInvalid,
99    /// An invalid hexadecimal digit was found.
100    EscapeHexInvalidDigit,
101    /// EOF was found before an escape sequence was completed.
102    EscapeUnexpectedEof,
103    /// An unrecognized escape sequence.
104    EscapeUnrecognized,
105    /// A dangling negation was used when setting flags, e.g., `i-`.
106    FlagDanglingNegation,
107    /// A flag was used twice, e.g., `i-i`.
108    FlagDuplicate {
109        /// The position of the original flag. The error position
110        /// points to the duplicate flag.
111        original: Span,
112    },
113    /// The negation operator was used twice, e.g., `-i-s`.
114    FlagRepeatedNegation {
115        /// The position of the original negation operator. The error position
116        /// points to the duplicate negation operator.
117        original: Span,
118    },
119    /// Expected a flag but got EOF, e.g., `(?`.
120    FlagUnexpectedEof,
121    /// Unrecognized flag, e.g., `a`.
122    FlagUnrecognized,
123    /// A duplicate capture name was found.
124    GroupNameDuplicate {
125        /// The position of the initial occurrence of the capture name. The
126        /// error position itself points to the duplicate occurrence.
127        original: Span,
128    },
129    /// A capture group name is empty, e.g., `(?P<>abc)`.
130    GroupNameEmpty,
131    /// An invalid character was seen for a capture group name. This includes
132    /// errors where the first character is a digit (even though subsequent
133    /// characters are allowed to be digits).
134    GroupNameInvalid,
135    /// A closing `>` could not be found for a capture group name.
136    GroupNameUnexpectedEof,
137    /// An unclosed group, e.g., `(ab`.
138    ///
139    /// The span of this error corresponds to the unclosed parenthesis.
140    GroupUnclosed,
141    /// An unopened group, e.g., `ab)`.
142    GroupUnopened,
143    /// The nest limit was exceeded. The limit stored here is the limit
144    /// configured in the parser.
145    NestLimitExceeded(u32),
146    /// The range provided in a counted repetition operator is invalid. The
147    /// range is invalid if the start is greater than the end.
148    RepetitionCountInvalid,
149    /// An opening `{` was not followed by a valid decimal value.
150    /// For example, `x{}` or `x{]}` would fail.
151    RepetitionCountDecimalEmpty,
152    /// An opening `{` was found with no corresponding closing `}`.
153    RepetitionCountUnclosed,
154    /// A repetition operator was applied to a missing sub-expression. This
155    /// occurs, for example, in the regex consisting of just a `*` or even
156    /// `(?i)*`. It is, however, possible to create a repetition operating on
157    /// an empty sub-expression. For example, `()*` is still considered valid.
158    RepetitionMissing,
159    /// When octal support is disabled, this error is produced when an octal
160    /// escape is used. The octal escape is assumed to be an invocation of
161    /// a backreference, which is the common case.
162    UnsupportedBackreference,
163    /// When syntax similar to PCRE's look-around is used, this error is
164    /// returned. Some example syntaxes that are rejected include, but are
165    /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
166    /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
167    /// error is used to improve the user experience.
168    UnsupportedLookAround,
169    /// Hints that destructuring should not be exhaustive.
170    ///
171    /// This enum may grow additional variants, so this makes sure clients
172    /// don't count on exhaustive matching. (Otherwise, adding a new variant
173    /// could break existing code.)
174    #[doc(hidden)]
175    __Nonexhaustive,
176}
177
178impl error::Error for Error {
179    fn description(&self) -> &str {
180        use self::ErrorKind::*;
181        match self.kind {
182            CaptureLimitExceeded => "capture group limit exceeded",
183            ClassEscapeInvalid => "invalid escape sequence in character class",
184            ClassRangeInvalid => "invalid character class range",
185            ClassRangeLiteral => "invalid range boundary, must be a literal",
186            ClassUnclosed => "unclosed character class",
187            DecimalEmpty => "empty decimal literal",
188            DecimalInvalid => "invalid decimal literal",
189            EscapeHexEmpty => "empty hexadecimal literal",
190            EscapeHexInvalid => "invalid hexadecimal literal",
191            EscapeHexInvalidDigit => "invalid hexadecimal digit",
192            EscapeUnexpectedEof => "unexpected eof (escape sequence)",
193            EscapeUnrecognized => "unrecognized escape sequence",
194            FlagDanglingNegation => "dangling flag negation operator",
195            FlagDuplicate { .. } => "duplicate flag",
196            FlagRepeatedNegation { .. } => "repeated negation",
197            FlagUnexpectedEof => "unexpected eof (flag)",
198            FlagUnrecognized => "unrecognized flag",
199            GroupNameDuplicate { .. } => "duplicate capture group name",
200            GroupNameEmpty => "empty capture group name",
201            GroupNameInvalid => "invalid capture group name",
202            GroupNameUnexpectedEof => "unclosed capture group name",
203            GroupUnclosed => "unclosed group",
204            GroupUnopened => "unopened group",
205            NestLimitExceeded(_) => "nest limit exceeded",
206            RepetitionCountInvalid => "invalid repetition count range",
207            RepetitionCountUnclosed => "unclosed counted repetition",
208            RepetitionMissing => "repetition operator missing expression",
209            UnsupportedBackreference => "backreferences are not supported",
210            UnsupportedLookAround => "look-around is not supported",
211            _ => unreachable!(),
212        }
213    }
214}
215
216impl fmt::Display for Error {
217    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
218        ::error::Formatter::from(self).fmt(f)
219    }
220}
221
222impl fmt::Display for ErrorKind {
223    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
224        use self::ErrorKind::*;
225        match *self {
226            CaptureLimitExceeded => write!(
227                f,
228                "exceeded the maximum number of \
229                 capturing groups ({})",
230                ::std::u32::MAX
231            ),
232            ClassEscapeInvalid => {
233                write!(f, "invalid escape sequence found in character class")
234            }
235            ClassRangeInvalid => write!(
236                f,
237                "invalid character class range, \
238                 the start must be <= the end"
239            ),
240            ClassRangeLiteral => {
241                write!(f, "invalid range boundary, must be a literal")
242            }
243            ClassUnclosed => write!(f, "unclosed character class"),
244            DecimalEmpty => write!(f, "decimal literal empty"),
245            DecimalInvalid => write!(f, "decimal literal invalid"),
246            EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
247            EscapeHexInvalid => {
248                write!(f, "hexadecimal literal is not a Unicode scalar value")
249            }
250            EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
251            EscapeUnexpectedEof => write!(
252                f,
253                "incomplete escape sequence, \
254                 reached end of pattern prematurely"
255            ),
256            EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
257            FlagDanglingNegation => {
258                write!(f, "dangling flag negation operator")
259            }
260            FlagDuplicate { .. } => write!(f, "duplicate flag"),
261            FlagRepeatedNegation { .. } => {
262                write!(f, "flag negation operator repeated")
263            }
264            FlagUnexpectedEof => {
265                write!(f, "expected flag but got end of regex")
266            }
267            FlagUnrecognized => write!(f, "unrecognized flag"),
268            GroupNameDuplicate { .. } => {
269                write!(f, "duplicate capture group name")
270            }
271            GroupNameEmpty => write!(f, "empty capture group name"),
272            GroupNameInvalid => write!(f, "invalid capture group character"),
273            GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
274            GroupUnclosed => write!(f, "unclosed group"),
275            GroupUnopened => write!(f, "unopened group"),
276            NestLimitExceeded(limit) => write!(
277                f,
278                "exceed the maximum number of \
279                 nested parentheses/brackets ({})",
280                limit
281            ),
282            RepetitionCountInvalid => write!(
283                f,
284                "invalid repetition count range, \
285                 the start must be <= the end"
286            ),
287            RepetitionCountDecimalEmpty => {
288                write!(f, "repetition quantifier expects a valid decimal")
289            }
290            RepetitionCountUnclosed => {
291                write!(f, "unclosed counted repetition")
292            }
293            RepetitionMissing => {
294                write!(f, "repetition operator missing expression")
295            }
296            UnsupportedBackreference => {
297                write!(f, "backreferences are not supported")
298            }
299            UnsupportedLookAround => write!(
300                f,
301                "look-around, including look-ahead and look-behind, \
302                 is not supported"
303            ),
304            _ => unreachable!(),
305        }
306    }
307}
308
309/// Span represents the position information of a single AST item.
310///
311/// All span positions are absolute byte offsets that can be used on the
312/// original regular expression that was parsed.
313#[derive(Clone, Copy, Eq, PartialEq)]
314pub struct Span {
315    /// The start byte offset.
316    pub start: Position,
317    /// The end byte offset.
318    pub end: Position,
319}
320
321impl fmt::Debug for Span {
322    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
323        write!(f, "Span({:?}, {:?})", self.start, self.end)
324    }
325}
326
327impl Ord for Span {
328    fn cmp(&self, other: &Span) -> Ordering {
329        (&self.start, &self.end).cmp(&(&other.start, &other.end))
330    }
331}
332
333impl PartialOrd for Span {
334    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
335        Some(self.cmp(other))
336    }
337}
338
339/// A single position in a regular expression.
340///
341/// A position encodes one half of a span, and include the byte offset, line
342/// number and column number.
343#[derive(Clone, Copy, Eq, PartialEq)]
344pub struct Position {
345    /// The absolute offset of this position, starting at `0` from the
346    /// beginning of the regular expression pattern string.
347    pub offset: usize,
348    /// The line number, starting at `1`.
349    pub line: usize,
350    /// The approximate column number, starting at `1`.
351    pub column: usize,
352}
353
354impl fmt::Debug for Position {
355    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
356        write!(
357            f,
358            "Position(o: {:?}, l: {:?}, c: {:?})",
359            self.offset, self.line, self.column
360        )
361    }
362}
363
364impl Ord for Position {
365    fn cmp(&self, other: &Position) -> Ordering {
366        self.offset.cmp(&other.offset)
367    }
368}
369
370impl PartialOrd for Position {
371    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
372        Some(self.cmp(other))
373    }
374}
375
376impl Span {
377    /// Create a new span with the given positions.
378    pub fn new(start: Position, end: Position) -> Span {
379        Span { start: start, end: end }
380    }
381
382    /// Create a new span using the given position as the start and end.
383    pub fn splat(pos: Position) -> Span {
384        Span::new(pos, pos)
385    }
386
387    /// Create a new span by replacing the starting the position with the one
388    /// given.
389    pub fn with_start(self, pos: Position) -> Span {
390        Span { start: pos, ..self }
391    }
392
393    /// Create a new span by replacing the ending the position with the one
394    /// given.
395    pub fn with_end(self, pos: Position) -> Span {
396        Span { end: pos, ..self }
397    }
398
399    /// Returns true if and only if this span occurs on a single line.
400    pub fn is_one_line(&self) -> bool {
401        self.start.line == self.end.line
402    }
403
404    /// Returns true if and only if this span is empty. That is, it points to
405    /// a single position in the concrete syntax of a regular expression.
406    pub fn is_empty(&self) -> bool {
407        self.start.offset == self.end.offset
408    }
409}
410
411impl Position {
412    /// Create a new position with the given information.
413    ///
414    /// `offset` is the absolute offset of the position, starting at `0` from
415    /// the beginning of the regular expression pattern string.
416    ///
417    /// `line` is the line number, starting at `1`.
418    ///
419    /// `column` is the approximate column number, starting at `1`.
420    pub fn new(offset: usize, line: usize, column: usize) -> Position {
421        Position { offset: offset, line: line, column: column }
422    }
423}
424
425/// An abstract syntax tree for a singular expression along with comments
426/// found.
427///
428/// Comments are not stored in the tree itself to avoid complexity. Each
429/// comment contains a span of precisely where it occurred in the original
430/// regular expression.
431#[derive(Clone, Debug, Eq, PartialEq)]
432pub struct WithComments {
433    /// The actual ast.
434    pub ast: Ast,
435    /// All comments found in the original regular expression.
436    pub comments: Vec<Comment>,
437}
438
439/// A comment from a regular expression with an associated span.
440///
441/// A regular expression can only contain comments when the `x` flag is
442/// enabled.
443#[derive(Clone, Debug, Eq, PartialEq)]
444pub struct Comment {
445    /// The span of this comment, including the beginning `#` and ending `\n`.
446    pub span: Span,
447    /// The comment text, starting with the first character following the `#`
448    /// and ending with the last character preceding the `\n`.
449    pub comment: String,
450}
451
452/// An abstract syntax tree for a single regular expression.
453///
454/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
455/// space proportional to the size of the `Ast`.
456///
457/// This type defines its own destructor that uses constant stack space and
458/// heap space proportional to the size of the `Ast`.
459#[derive(Clone, Debug, Eq, PartialEq)]
460pub enum Ast {
461    /// An empty regex that matches everything.
462    Empty(Span),
463    /// A set of flags, e.g., `(?is)`.
464    Flags(SetFlags),
465    /// A single character literal, which includes escape sequences.
466    Literal(Literal),
467    /// The "any character" class.
468    Dot(Span),
469    /// A single zero-width assertion.
470    Assertion(Assertion),
471    /// A single character class. This includes all forms of character classes
472    /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
473    Class(Class),
474    /// A repetition operator applied to an arbitrary regular expression.
475    Repetition(Repetition),
476    /// A grouped regular expression.
477    Group(Group),
478    /// An alternation of regular expressions.
479    Alternation(Alternation),
480    /// A concatenation of regular expressions.
481    Concat(Concat),
482}
483
484impl Ast {
485    /// Return the span of this abstract syntax tree.
486    pub fn span(&self) -> &Span {
487        match *self {
488            Ast::Empty(ref span) => span,
489            Ast::Flags(ref x) => &x.span,
490            Ast::Literal(ref x) => &x.span,
491            Ast::Dot(ref span) => span,
492            Ast::Assertion(ref x) => &x.span,
493            Ast::Class(ref x) => x.span(),
494            Ast::Repetition(ref x) => &x.span,
495            Ast::Group(ref x) => &x.span,
496            Ast::Alternation(ref x) => &x.span,
497            Ast::Concat(ref x) => &x.span,
498        }
499    }
500
501    /// Return true if and only if this Ast is empty.
502    pub fn is_empty(&self) -> bool {
503        match *self {
504            Ast::Empty(_) => true,
505            _ => false,
506        }
507    }
508
509    /// Returns true if and only if this AST has any (including possibly empty)
510    /// subexpressions.
511    fn has_subexprs(&self) -> bool {
512        match *self {
513            Ast::Empty(_)
514            | Ast::Flags(_)
515            | Ast::Literal(_)
516            | Ast::Dot(_)
517            | Ast::Assertion(_) => false,
518            Ast::Class(_)
519            | Ast::Repetition(_)
520            | Ast::Group(_)
521            | Ast::Alternation(_)
522            | Ast::Concat(_) => true,
523        }
524    }
525}
526
527/// Print a display representation of this Ast.
528///
529/// This does not preserve any of the original whitespace formatting that may
530/// have originally been present in the concrete syntax from which this Ast
531/// was generated.
532///
533/// This implementation uses constant stack space and heap space proportional
534/// to the size of the `Ast`.
535impl fmt::Display for Ast {
536    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
537        use ast::print::Printer;
538        Printer::new().print(self, f)
539    }
540}
541
542/// An alternation of regular expressions.
543#[derive(Clone, Debug, Eq, PartialEq)]
544pub struct Alternation {
545    /// The span of this alternation.
546    pub span: Span,
547    /// The alternate regular expressions.
548    pub asts: Vec<Ast>,
549}
550
551impl Alternation {
552    /// Return this alternation as an AST.
553    ///
554    /// If this alternation contains zero ASTs, then Ast::Empty is
555    /// returned. If this alternation contains exactly 1 AST, then the
556    /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
557    pub fn into_ast(mut self) -> Ast {
558        match self.asts.len() {
559            0 => Ast::Empty(self.span),
560            1 => self.asts.pop().unwrap(),
561            _ => Ast::Alternation(self),
562        }
563    }
564}
565
566/// A concatenation of regular expressions.
567#[derive(Clone, Debug, Eq, PartialEq)]
568pub struct Concat {
569    /// The span of this concatenation.
570    pub span: Span,
571    /// The concatenation regular expressions.
572    pub asts: Vec<Ast>,
573}
574
575impl Concat {
576    /// Return this concatenation as an AST.
577    ///
578    /// If this concatenation contains zero ASTs, then Ast::Empty is
579    /// returned. If this concatenation contains exactly 1 AST, then the
580    /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
581    pub fn into_ast(mut self) -> Ast {
582        match self.asts.len() {
583            0 => Ast::Empty(self.span),
584            1 => self.asts.pop().unwrap(),
585            _ => Ast::Concat(self),
586        }
587    }
588}
589
590/// A single literal expression.
591///
592/// A literal corresponds to a single Unicode scalar value. Literals may be
593/// represented in their literal form, e.g., `a` or in their escaped form,
594/// e.g., `\x61`.
595#[derive(Clone, Debug, Eq, PartialEq)]
596pub struct Literal {
597    /// The span of this literal.
598    pub span: Span,
599    /// The kind of this literal.
600    pub kind: LiteralKind,
601    /// The Unicode scalar value corresponding to this literal.
602    pub c: char,
603}
604
605impl Literal {
606    /// If this literal was written as a `\x` hex escape, then this returns
607    /// the corresponding byte value. Otherwise, this returns `None`.
608    pub fn byte(&self) -> Option<u8> {
609        let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
610        if self.c as u32 <= 255 && self.kind == short_hex {
611            Some(self.c as u8)
612        } else {
613            None
614        }
615    }
616}
617
618/// The kind of a single literal expression.
619#[derive(Clone, Debug, Eq, PartialEq)]
620pub enum LiteralKind {
621    /// The literal is written verbatim, e.g., `a` or `☃`.
622    Verbatim,
623    /// The literal is written as an escape because it is punctuation, e.g.,
624    /// `\*` or `\[`.
625    Punctuation,
626    /// The literal is written as an octal escape, e.g., `\141`.
627    Octal,
628    /// The literal is written as a hex code with a fixed number of digits
629    /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
630    /// `\U00000061`.
631    HexFixed(HexLiteralKind),
632    /// The literal is written as a hex code with a bracketed number of
633    /// digits. The only restriction is that the bracketed hex code must refer
634    /// to a valid Unicode scalar value.
635    HexBrace(HexLiteralKind),
636    /// The literal is written as a specially recognized escape, e.g., `\f`
637    /// or `\n`.
638    Special(SpecialLiteralKind),
639}
640
641/// The type of a special literal.
642///
643/// A special literal is a special escape sequence recognized by the regex
644/// parser, e.g., `\f` or `\n`.
645#[derive(Clone, Debug, Eq, PartialEq)]
646pub enum SpecialLiteralKind {
647    /// Bell, spelled `\a` (`\x07`).
648    Bell,
649    /// Form feed, spelled `\f` (`\x0C`).
650    FormFeed,
651    /// Tab, spelled `\t` (`\x09`).
652    Tab,
653    /// Line feed, spelled `\n` (`\x0A`).
654    LineFeed,
655    /// Carriage return, spelled `\r` (`\x0D`).
656    CarriageReturn,
657    /// Vertical tab, spelled `\v` (`\x0B`).
658    VerticalTab,
659    /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
660    /// parsing in verbose mode.
661    Space,
662}
663
664/// The type of a Unicode hex literal.
665///
666/// Note that all variants behave the same when used with brackets. They only
667/// differ when used without brackets in the number of hex digits that must
668/// follow.
669#[derive(Clone, Debug, Eq, PartialEq)]
670pub enum HexLiteralKind {
671    /// A `\x` prefix. When used without brackets, this form is limited to
672    /// two digits.
673    X,
674    /// A `\u` prefix. When used without brackets, this form is limited to
675    /// four digits.
676    UnicodeShort,
677    /// A `\U` prefix. When used without brackets, this form is limited to
678    /// eight digits.
679    UnicodeLong,
680}
681
682impl HexLiteralKind {
683    /// The number of digits that must be used with this literal form when
684    /// used without brackets. When used with brackets, there is no
685    /// restriction on the number of digits.
686    pub fn digits(&self) -> u32 {
687        match *self {
688            HexLiteralKind::X => 2,
689            HexLiteralKind::UnicodeShort => 4,
690            HexLiteralKind::UnicodeLong => 8,
691        }
692    }
693}
694
695/// A single character class expression.
696#[derive(Clone, Debug, Eq, PartialEq)]
697pub enum Class {
698    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
699    Unicode(ClassUnicode),
700    /// A perl character class, e.g., `\d` or `\W`.
701    Perl(ClassPerl),
702    /// A bracketed character class set, which may contain zero or more
703    /// character ranges and/or zero or more nested classes. e.g.,
704    /// `[a-zA-Z\pL]`.
705    Bracketed(ClassBracketed),
706}
707
708impl Class {
709    /// Return the span of this character class.
710    pub fn span(&self) -> &Span {
711        match *self {
712            Class::Perl(ref x) => &x.span,
713            Class::Unicode(ref x) => &x.span,
714            Class::Bracketed(ref x) => &x.span,
715        }
716    }
717}
718
719/// A Perl character class.
720#[derive(Clone, Debug, Eq, PartialEq)]
721pub struct ClassPerl {
722    /// The span of this class.
723    pub span: Span,
724    /// The kind of Perl class.
725    pub kind: ClassPerlKind,
726    /// Whether the class is negated or not. e.g., `\d` is not negated but
727    /// `\D` is.
728    pub negated: bool,
729}
730
731/// The available Perl character classes.
732#[derive(Clone, Debug, Eq, PartialEq)]
733pub enum ClassPerlKind {
734    /// Decimal numbers.
735    Digit,
736    /// Whitespace.
737    Space,
738    /// Word characters.
739    Word,
740}
741
742/// An ASCII character class.
743#[derive(Clone, Debug, Eq, PartialEq)]
744pub struct ClassAscii {
745    /// The span of this class.
746    pub span: Span,
747    /// The kind of ASCII class.
748    pub kind: ClassAsciiKind,
749    /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
750    /// but `[[:^alpha:]]` is.
751    pub negated: bool,
752}
753
754/// The available ASCII character classes.
755#[derive(Clone, Debug, Eq, PartialEq)]
756pub enum ClassAsciiKind {
757    /// `[0-9A-Za-z]`
758    Alnum,
759    /// `[A-Za-z]`
760    Alpha,
761    /// `[\x00-\x7F]`
762    Ascii,
763    /// `[ \t]`
764    Blank,
765    /// `[\x00-\x1F\x7F]`
766    Cntrl,
767    /// `[0-9]`
768    Digit,
769    /// `[!-~]`
770    Graph,
771    /// `[a-z]`
772    Lower,
773    /// `[ -~]`
774    Print,
775    /// `[!-/:-@\[-`{-~]`
776    Punct,
777    /// `[\t\n\v\f\r ]`
778    Space,
779    /// `[A-Z]`
780    Upper,
781    /// `[0-9A-Za-z_]`
782    Word,
783    /// `[0-9A-Fa-f]`
784    Xdigit,
785}
786
787impl ClassAsciiKind {
788    /// Return the corresponding ClassAsciiKind variant for the given name.
789    ///
790    /// The name given should correspond to the lowercase version of the
791    /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
792    ///
793    /// If no variant with the corresponding name exists, then `None` is
794    /// returned.
795    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
796        use self::ClassAsciiKind::*;
797        match name {
798            "alnum" => Some(Alnum),
799            "alpha" => Some(Alpha),
800            "ascii" => Some(Ascii),
801            "blank" => Some(Blank),
802            "cntrl" => Some(Cntrl),
803            "digit" => Some(Digit),
804            "graph" => Some(Graph),
805            "lower" => Some(Lower),
806            "print" => Some(Print),
807            "punct" => Some(Punct),
808            "space" => Some(Space),
809            "upper" => Some(Upper),
810            "word" => Some(Word),
811            "xdigit" => Some(Xdigit),
812            _ => None,
813        }
814    }
815}
816
817/// A Unicode character class.
818#[derive(Clone, Debug, Eq, PartialEq)]
819pub struct ClassUnicode {
820    /// The span of this class.
821    pub span: Span,
822    /// Whether this class is negated or not.
823    ///
824    /// Note: be careful when using this attribute. This specifically refers
825    /// to whether the class is written as `\p` or `\P`, where the latter
826    /// is `negated = true`. However, it also possible to write something like
827    /// `\P{scx!=Katakana}` which is actually equivalent to
828    /// `\p{scx=Katakana}` and is therefore not actually negated even though
829    /// `negated = true` here. To test whether this class is truly negated
830    /// or not, use the `is_negated` method.
831    pub negated: bool,
832    /// The kind of Unicode class.
833    pub kind: ClassUnicodeKind,
834}
835
836impl ClassUnicode {
837    /// Returns true if this class has been negated.
838    ///
839    /// Note that this takes the Unicode op into account, if it's present.
840    /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
841    pub fn is_negated(&self) -> bool {
842        match self.kind {
843            ClassUnicodeKind::NamedValue {
844                op: ClassUnicodeOpKind::NotEqual,
845                ..
846            } => !self.negated,
847            _ => self.negated,
848        }
849    }
850}
851
852/// The available forms of Unicode character classes.
853#[derive(Clone, Debug, Eq, PartialEq)]
854pub enum ClassUnicodeKind {
855    /// A one letter abbreviated class, e.g., `\pN`.
856    OneLetter(char),
857    /// A binary property, general category or script. The string may be
858    /// empty.
859    Named(String),
860    /// A property name and an associated value.
861    NamedValue {
862        /// The type of Unicode op used to associate `name` with `value`.
863        op: ClassUnicodeOpKind,
864        /// The property name (which may be empty).
865        name: String,
866        /// The property value (which may be empty).
867        value: String,
868    },
869}
870
871/// The type of op used in a Unicode character class.
872#[derive(Clone, Debug, Eq, PartialEq)]
873pub enum ClassUnicodeOpKind {
874    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
875    Equal,
876    /// A property set to a specific value using a colon, e.g.,
877    /// `\p{scx:Katakana}`.
878    Colon,
879    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
880    NotEqual,
881}
882
883impl ClassUnicodeOpKind {
884    /// Whether the op is an equality op or not.
885    pub fn is_equal(&self) -> bool {
886        match *self {
887            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
888            _ => false,
889        }
890    }
891}
892
893/// A bracketed character class, e.g., `[a-z0-9]`.
894#[derive(Clone, Debug, Eq, PartialEq)]
895pub struct ClassBracketed {
896    /// The span of this class.
897    pub span: Span,
898    /// Whether this class is negated or not. e.g., `[a]` is not negated but
899    /// `[^a]` is.
900    pub negated: bool,
901    /// The type of this set. A set is either a normal union of things, e.g.,
902    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
903    pub kind: ClassSet,
904}
905
906/// A character class set.
907///
908/// This type corresponds to the internal structure of a bracketed character
909/// class. That is, every bracketed character is one of two types: a union of
910/// items (literals, ranges, other bracketed classes) or a tree of binary set
911/// operations.
912#[derive(Clone, Debug, Eq, PartialEq)]
913pub enum ClassSet {
914    /// An item, which can be a single literal, range, nested character class
915    /// or a union of items.
916    Item(ClassSetItem),
917    /// A single binary operation (i.e., &&, -- or ~~).
918    BinaryOp(ClassSetBinaryOp),
919}
920
921impl ClassSet {
922    /// Build a set from a union.
923    pub fn union(ast: ClassSetUnion) -> ClassSet {
924        ClassSet::Item(ClassSetItem::Union(ast))
925    }
926
927    /// Return the span of this character class set.
928    pub fn span(&self) -> &Span {
929        match *self {
930            ClassSet::Item(ref x) => x.span(),
931            ClassSet::BinaryOp(ref x) => &x.span,
932        }
933    }
934
935    /// Return true if and only if this class set is empty.
936    fn is_empty(&self) -> bool {
937        match *self {
938            ClassSet::Item(ClassSetItem::Empty(_)) => true,
939            _ => false,
940        }
941    }
942}
943
944/// A single component of a character class set.
945#[derive(Clone, Debug, Eq, PartialEq)]
946pub enum ClassSetItem {
947    /// An empty item.
948    ///
949    /// Note that a bracketed character class cannot contain a single empty
950    /// item. Empty items can appear when using one of the binary operators.
951    /// For example, `[&&]` is the intersection of two empty classes.
952    Empty(Span),
953    /// A single literal.
954    Literal(Literal),
955    /// A range between two literals.
956    Range(ClassSetRange),
957    /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
958    Ascii(ClassAscii),
959    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
960    Unicode(ClassUnicode),
961    /// A perl character class, e.g., `\d` or `\W`.
962    Perl(ClassPerl),
963    /// A bracketed character class set, which may contain zero or more
964    /// character ranges and/or zero or more nested classes. e.g.,
965    /// `[a-zA-Z\pL]`.
966    Bracketed(Box<ClassBracketed>),
967    /// A union of items.
968    Union(ClassSetUnion),
969}
970
971impl ClassSetItem {
972    /// Return the span of this character class set item.
973    pub fn span(&self) -> &Span {
974        match *self {
975            ClassSetItem::Empty(ref span) => span,
976            ClassSetItem::Literal(ref x) => &x.span,
977            ClassSetItem::Range(ref x) => &x.span,
978            ClassSetItem::Ascii(ref x) => &x.span,
979            ClassSetItem::Perl(ref x) => &x.span,
980            ClassSetItem::Unicode(ref x) => &x.span,
981            ClassSetItem::Bracketed(ref x) => &x.span,
982            ClassSetItem::Union(ref x) => &x.span,
983        }
984    }
985}
986
987/// A single character class range in a set.
988#[derive(Clone, Debug, Eq, PartialEq)]
989pub struct ClassSetRange {
990    /// The span of this range.
991    pub span: Span,
992    /// The start of this range.
993    pub start: Literal,
994    /// The end of this range.
995    pub end: Literal,
996}
997
998impl ClassSetRange {
999    /// Returns true if and only if this character class range is valid.
1000    ///
1001    /// The only case where a range is invalid is if its start is greater than
1002    /// its end.
1003    pub fn is_valid(&self) -> bool {
1004        self.start.c <= self.end.c
1005    }
1006}
1007
1008/// A union of items inside a character class set.
1009#[derive(Clone, Debug, Eq, PartialEq)]
1010pub struct ClassSetUnion {
1011    /// The span of the items in this operation. e.g., the `a-z0-9` in
1012    /// `[^a-z0-9]`
1013    pub span: Span,
1014    /// The sequence of items that make up this union.
1015    pub items: Vec<ClassSetItem>,
1016}
1017
1018impl ClassSetUnion {
1019    /// Push a new item in this union.
1020    ///
1021    /// The ending position of this union's span is updated to the ending
1022    /// position of the span of the item given. If the union is empty, then
1023    /// the starting position of this union is set to the starting position
1024    /// of this item.
1025    ///
1026    /// In other words, if you only use this method to add items to a union
1027    /// and you set the spans on each item correctly, then you should never
1028    /// need to adjust the span of the union directly.
1029    pub fn push(&mut self, item: ClassSetItem) {
1030        if self.items.is_empty() {
1031            self.span.start = item.span().start;
1032        }
1033        self.span.end = item.span().end;
1034        self.items.push(item);
1035    }
1036
1037    /// Return this union as a character class set item.
1038    ///
1039    /// If this union contains zero items, then an empty union is
1040    /// returned. If this concatenation contains exactly 1 item, then the
1041    /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1042    /// returned.
1043    pub fn into_item(mut self) -> ClassSetItem {
1044        match self.items.len() {
1045            0 => ClassSetItem::Empty(self.span),
1046            1 => self.items.pop().unwrap(),
1047            _ => ClassSetItem::Union(self),
1048        }
1049    }
1050}
1051
1052/// A Unicode character class set operation.
1053#[derive(Clone, Debug, Eq, PartialEq)]
1054pub struct ClassSetBinaryOp {
1055    /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1056    pub span: Span,
1057    /// The type of this set operation.
1058    pub kind: ClassSetBinaryOpKind,
1059    /// The left hand side of the operation.
1060    pub lhs: Box<ClassSet>,
1061    /// The right hand side of the operation.
1062    pub rhs: Box<ClassSet>,
1063}
1064
1065/// The type of a Unicode character class set operation.
1066///
1067/// Note that this doesn't explicitly represent union since there is no
1068/// explicit union operator. Concatenation inside a character class corresponds
1069/// to the union operation.
1070#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1071pub enum ClassSetBinaryOpKind {
1072    /// The intersection of two sets, e.g., `\pN&&[a-z]`.
1073    Intersection,
1074    /// The difference of two sets, e.g., `\pN--[0-9]`.
1075    Difference,
1076    /// The symmetric difference of two sets. The symmetric difference is the
1077    /// set of elements belonging to one but not both sets.
1078    /// e.g., `[\pL~~[:ascii:]]`.
1079    SymmetricDifference,
1080}
1081
1082/// A single zero-width assertion.
1083#[derive(Clone, Debug, Eq, PartialEq)]
1084pub struct Assertion {
1085    /// The span of this assertion.
1086    pub span: Span,
1087    /// The assertion kind, e.g., `\b` or `^`.
1088    pub kind: AssertionKind,
1089}
1090
1091/// An assertion kind.
1092#[derive(Clone, Debug, Eq, PartialEq)]
1093pub enum AssertionKind {
1094    /// `^`
1095    StartLine,
1096    /// `$`
1097    EndLine,
1098    /// `\A`
1099    StartText,
1100    /// `\z`
1101    EndText,
1102    /// `\b`
1103    WordBoundary,
1104    /// `\B`
1105    NotWordBoundary,
1106}
1107
1108/// A repetition operation applied to a regular expression.
1109#[derive(Clone, Debug, Eq, PartialEq)]
1110pub struct Repetition {
1111    /// The span of this operation.
1112    pub span: Span,
1113    /// The actual operation.
1114    pub op: RepetitionOp,
1115    /// Whether this operation was applied greedily or not.
1116    pub greedy: bool,
1117    /// The regular expression under repetition.
1118    pub ast: Box<Ast>,
1119}
1120
1121/// The repetition operator itself.
1122#[derive(Clone, Debug, Eq, PartialEq)]
1123pub struct RepetitionOp {
1124    /// The span of this operator. This includes things like `+`, `*?` and
1125    /// `{m,n}`.
1126    pub span: Span,
1127    /// The type of operation.
1128    pub kind: RepetitionKind,
1129}
1130
1131/// The kind of a repetition operator.
1132#[derive(Clone, Debug, Eq, PartialEq)]
1133pub enum RepetitionKind {
1134    /// `?`
1135    ZeroOrOne,
1136    /// `*`
1137    ZeroOrMore,
1138    /// `+`
1139    OneOrMore,
1140    /// `{m,n}`
1141    Range(RepetitionRange),
1142}
1143
1144/// A range repetition operator.
1145#[derive(Clone, Debug, Eq, PartialEq)]
1146pub enum RepetitionRange {
1147    /// `{m}`
1148    Exactly(u32),
1149    /// `{m,}`
1150    AtLeast(u32),
1151    /// `{m,n}`
1152    Bounded(u32, u32),
1153}
1154
1155impl RepetitionRange {
1156    /// Returns true if and only if this repetition range is valid.
1157    ///
1158    /// The only case where a repetition range is invalid is if it is bounded
1159    /// and its start is greater than its end.
1160    pub fn is_valid(&self) -> bool {
1161        match *self {
1162            RepetitionRange::Bounded(s, e) if s > e => false,
1163            _ => true,
1164        }
1165    }
1166}
1167
1168/// A grouped regular expression.
1169///
1170/// This includes both capturing and non-capturing groups. This does **not**
1171/// include flag-only groups like `(?is)`, but does contain any group that
1172/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1173/// `(?is:a)`.
1174#[derive(Clone, Debug, Eq, PartialEq)]
1175pub struct Group {
1176    /// The span of this group.
1177    pub span: Span,
1178    /// The kind of this group.
1179    pub kind: GroupKind,
1180    /// The regular expression in this group.
1181    pub ast: Box<Ast>,
1182}
1183
1184impl Group {
1185    /// If this group is non-capturing, then this returns the (possibly empty)
1186    /// set of flags. Otherwise, `None` is returned.
1187    pub fn flags(&self) -> Option<&Flags> {
1188        match self.kind {
1189            GroupKind::NonCapturing(ref flags) => Some(flags),
1190            _ => None,
1191        }
1192    }
1193
1194    /// Returns true if and only if this group is capturing.
1195    pub fn is_capturing(&self) -> bool {
1196        match self.kind {
1197            GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
1198            GroupKind::NonCapturing(_) => false,
1199        }
1200    }
1201
1202    /// Returns the capture index of this group, if this is a capturing group.
1203    ///
1204    /// This returns a capture index precisely when `is_capturing` is `true`.
1205    pub fn capture_index(&self) -> Option<u32> {
1206        match self.kind {
1207            GroupKind::CaptureIndex(i) => Some(i),
1208            GroupKind::CaptureName(ref x) => Some(x.index),
1209            GroupKind::NonCapturing(_) => None,
1210        }
1211    }
1212}
1213
1214/// The kind of a group.
1215#[derive(Clone, Debug, Eq, PartialEq)]
1216pub enum GroupKind {
1217    /// `(a)`
1218    CaptureIndex(u32),
1219    /// `(?P<name>a)`
1220    CaptureName(CaptureName),
1221    /// `(?:a)` and `(?i:a)`
1222    NonCapturing(Flags),
1223}
1224
1225/// A capture name.
1226///
1227/// This corresponds to the name itself between the angle brackets in, e.g.,
1228/// `(?P<foo>expr)`.
1229#[derive(Clone, Debug, Eq, PartialEq)]
1230pub struct CaptureName {
1231    /// The span of this capture name.
1232    pub span: Span,
1233    /// The capture name.
1234    pub name: String,
1235    /// The capture index.
1236    pub index: u32,
1237}
1238
1239/// A group of flags that is not applied to a particular regular expression.
1240#[derive(Clone, Debug, Eq, PartialEq)]
1241pub struct SetFlags {
1242    /// The span of these flags, including the grouping parentheses.
1243    pub span: Span,
1244    /// The actual sequence of flags.
1245    pub flags: Flags,
1246}
1247
1248/// A group of flags.
1249///
1250/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1251#[derive(Clone, Debug, Eq, PartialEq)]
1252pub struct Flags {
1253    /// The span of this group of flags.
1254    pub span: Span,
1255    /// A sequence of flag items. Each item is either a flag or a negation
1256    /// operator.
1257    pub items: Vec<FlagsItem>,
1258}
1259
1260impl Flags {
1261    /// Add the given item to this sequence of flags.
1262    ///
1263    /// If the item was added successfully, then `None` is returned. If the
1264    /// given item is a duplicate, then `Some(i)` is returned, where
1265    /// `items[i].kind == item.kind`.
1266    pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1267        for (i, x) in self.items.iter().enumerate() {
1268            if x.kind == item.kind {
1269                return Some(i);
1270            }
1271        }
1272        self.items.push(item);
1273        None
1274    }
1275
1276    /// Returns the state of the given flag in this set.
1277    ///
1278    /// If the given flag is in the set but is negated, then `Some(false)` is
1279    /// returned.
1280    ///
1281    /// If the given flag is in the set and is not negated, then `Some(true)`
1282    /// is returned.
1283    ///
1284    /// Otherwise, `None` is returned.
1285    pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1286        let mut negated = false;
1287        for x in &self.items {
1288            match x.kind {
1289                FlagsItemKind::Negation => {
1290                    negated = true;
1291                }
1292                FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1293                    return Some(!negated);
1294                }
1295                _ => {}
1296            }
1297        }
1298        None
1299    }
1300}
1301
1302/// A single item in a group of flags.
1303#[derive(Clone, Debug, Eq, PartialEq)]
1304pub struct FlagsItem {
1305    /// The span of this item.
1306    pub span: Span,
1307    /// The kind of this item.
1308    pub kind: FlagsItemKind,
1309}
1310
1311/// The kind of an item in a group of flags.
1312#[derive(Clone, Debug, Eq, PartialEq)]
1313pub enum FlagsItemKind {
1314    /// A negation operator applied to all subsequent flags in the enclosing
1315    /// group.
1316    Negation,
1317    /// A single flag in a group.
1318    Flag(Flag),
1319}
1320
1321impl FlagsItemKind {
1322    /// Returns true if and only if this item is a negation operator.
1323    pub fn is_negation(&self) -> bool {
1324        match *self {
1325            FlagsItemKind::Negation => true,
1326            _ => false,
1327        }
1328    }
1329}
1330
1331/// A single flag.
1332#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1333pub enum Flag {
1334    /// `i`
1335    CaseInsensitive,
1336    /// `m`
1337    MultiLine,
1338    /// `s`
1339    DotMatchesNewLine,
1340    /// `U`
1341    SwapGreed,
1342    /// `u`
1343    Unicode,
1344    /// `x`
1345    IgnoreWhitespace,
1346}
1347
1348/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1349/// space but heap space proportional to the depth of the `Ast`.
1350impl Drop for Ast {
1351    fn drop(&mut self) {
1352        use std::mem;
1353
1354        match *self {
1355            Ast::Empty(_)
1356            | Ast::Flags(_)
1357            | Ast::Literal(_)
1358            | Ast::Dot(_)
1359            | Ast::Assertion(_)
1360            // Classes are recursive, so they get their own Drop impl.
1361            | Ast::Class(_) => return,
1362            Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1363            Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1364            Ast::Alternation(ref x) if x.asts.is_empty() => return,
1365            Ast::Concat(ref x) if x.asts.is_empty() => return,
1366            _ => {}
1367        }
1368
1369        let empty_span = || Span::splat(Position::new(0, 0, 0));
1370        let empty_ast = || Ast::Empty(empty_span());
1371        let mut stack = vec![mem::replace(self, empty_ast())];
1372        while let Some(mut ast) = stack.pop() {
1373            match ast {
1374                Ast::Empty(_)
1375                | Ast::Flags(_)
1376                | Ast::Literal(_)
1377                | Ast::Dot(_)
1378                | Ast::Assertion(_)
1379                // Classes are recursive, so they get their own Drop impl.
1380                | Ast::Class(_) => {}
1381                Ast::Repetition(ref mut x) => {
1382                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1383                }
1384                Ast::Group(ref mut x) => {
1385                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1386                }
1387                Ast::Alternation(ref mut x) => {
1388                    stack.extend(x.asts.drain(..));
1389                }
1390                Ast::Concat(ref mut x) => {
1391                    stack.extend(x.asts.drain(..));
1392                }
1393            }
1394        }
1395    }
1396}
1397
1398/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1399/// stack space but heap space proportional to the depth of the `ClassSet`.
1400impl Drop for ClassSet {
1401    fn drop(&mut self) {
1402        use std::mem;
1403
1404        match *self {
1405            ClassSet::Item(ref item) => match *item {
1406                ClassSetItem::Empty(_)
1407                | ClassSetItem::Literal(_)
1408                | ClassSetItem::Range(_)
1409                | ClassSetItem::Ascii(_)
1410                | ClassSetItem::Unicode(_)
1411                | ClassSetItem::Perl(_) => return,
1412                ClassSetItem::Bracketed(ref x) => {
1413                    if x.kind.is_empty() {
1414                        return;
1415                    }
1416                }
1417                ClassSetItem::Union(ref x) => {
1418                    if x.items.is_empty() {
1419                        return;
1420                    }
1421                }
1422            },
1423            ClassSet::BinaryOp(ref op) => {
1424                if op.lhs.is_empty() && op.rhs.is_empty() {
1425                    return;
1426                }
1427            }
1428        }
1429
1430        let empty_span = || Span::splat(Position::new(0, 0, 0));
1431        let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
1432        let mut stack = vec![mem::replace(self, empty_set())];
1433        while let Some(mut set) = stack.pop() {
1434            match set {
1435                ClassSet::Item(ref mut item) => match *item {
1436                    ClassSetItem::Empty(_)
1437                    | ClassSetItem::Literal(_)
1438                    | ClassSetItem::Range(_)
1439                    | ClassSetItem::Ascii(_)
1440                    | ClassSetItem::Unicode(_)
1441                    | ClassSetItem::Perl(_) => {}
1442                    ClassSetItem::Bracketed(ref mut x) => {
1443                        stack.push(mem::replace(&mut x.kind, empty_set()));
1444                    }
1445                    ClassSetItem::Union(ref mut x) => {
1446                        stack.extend(x.items.drain(..).map(ClassSet::Item));
1447                    }
1448                },
1449                ClassSet::BinaryOp(ref mut op) => {
1450                    stack.push(mem::replace(&mut op.lhs, empty_set()));
1451                    stack.push(mem::replace(&mut op.rhs, empty_set()));
1452                }
1453            }
1454        }
1455    }
1456}
1457
1458#[cfg(test)]
1459mod tests {
1460    use super::*;
1461
1462    // We use a thread with an explicit stack size to test that our destructor
1463    // for Ast can handle arbitrarily sized expressions in constant stack
1464    // space. In case we run on a platform without threads (WASM?), we limit
1465    // this test to Windows/Unix.
1466    #[test]
1467    #[cfg(any(unix, windows))]
1468    fn no_stack_overflow_on_drop() {
1469        use std::thread;
1470
1471        let run = || {
1472            let span = || Span::splat(Position::new(0, 0, 0));
1473            let mut ast = Ast::Empty(span());
1474            for i in 0..200 {
1475                ast = Ast::Group(Group {
1476                    span: span(),
1477                    kind: GroupKind::CaptureIndex(i),
1478                    ast: Box::new(ast),
1479                });
1480            }
1481            assert!(!ast.is_empty());
1482        };
1483
1484        // We run our test on a thread with a small stack size so we can
1485        // force the issue more easily.
1486        thread::Builder::new()
1487            .stack_size(1 << 10)
1488            .spawn(run)
1489            .unwrap()
1490            .join()
1491            .unwrap();
1492    }
1493}