regex/
compile.rs

1use std::collections::HashMap;
2use std::iter;
3use std::result;
4use std::sync::Arc;
5
6use syntax::hir::{self, Hir};
7use syntax::is_word_byte;
8use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
9
10use prog::{
11    EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
12    InstSave, InstSplit, Program,
13};
14
15use Error;
16
17type Result = result::Result<Patch, Error>;
18
19#[derive(Debug)]
20struct Patch {
21    hole: Hole,
22    entry: InstPtr,
23}
24
25/// A compiler translates a regular expression AST to a sequence of
26/// instructions. The sequence of instructions represents an NFA.
27pub struct Compiler {
28    insts: Vec<MaybeInst>,
29    compiled: Program,
30    capture_name_idx: HashMap<String, usize>,
31    num_exprs: usize,
32    size_limit: usize,
33    suffix_cache: SuffixCache,
34    utf8_seqs: Option<Utf8Sequences>,
35    byte_classes: ByteClassSet,
36}
37
38impl Compiler {
39    /// Create a new regular expression compiler.
40    ///
41    /// Various options can be set before calling `compile` on an expression.
42    pub fn new() -> Self {
43        Compiler {
44            insts: vec![],
45            compiled: Program::new(),
46            capture_name_idx: HashMap::new(),
47            num_exprs: 0,
48            size_limit: 10 * (1 << 20),
49            suffix_cache: SuffixCache::new(1000),
50            utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
51            byte_classes: ByteClassSet::new(),
52        }
53    }
54
55    /// The size of the resulting program is limited by size_limit. If
56    /// the program approximately exceeds the given size (in bytes), then
57    /// compilation will stop and return an error.
58    pub fn size_limit(mut self, size_limit: usize) -> Self {
59        self.size_limit = size_limit;
60        self
61    }
62
63    /// If bytes is true, then the program is compiled as a byte based
64    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
65    /// false, then the automaton is Unicode scalar value based, e.g., an
66    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
67    ///
68    /// The specific invariant is that when returning a byte based machine,
69    /// the neither the `Char` nor `Ranges` instructions are produced.
70    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
71    /// instruction is never produced.
72    ///
73    /// Note that `dfa(true)` implies `bytes(true)`.
74    pub fn bytes(mut self, yes: bool) -> Self {
75        self.compiled.is_bytes = yes;
76        self
77    }
78
79    /// When disabled, the program compiled may match arbitrary bytes.
80    ///
81    /// When enabled (the default), all compiled programs exclusively match
82    /// valid UTF-8 bytes.
83    pub fn only_utf8(mut self, yes: bool) -> Self {
84        self.compiled.only_utf8 = yes;
85        self
86    }
87
88    /// When set, the machine returned is suitable for use in the DFA matching
89    /// engine.
90    ///
91    /// In particular, this ensures that if the regex is not anchored in the
92    /// beginning, then a preceding `.*?` is included in the program. (The NFA
93    /// based engines handle the preceding `.*?` explicitly, which is difficult
94    /// or impossible in the DFA engine.)
95    pub fn dfa(mut self, yes: bool) -> Self {
96        self.compiled.is_dfa = yes;
97        self
98    }
99
100    /// When set, the machine returned is suitable for matching text in
101    /// reverse. In particular, all concatenations are flipped.
102    pub fn reverse(mut self, yes: bool) -> Self {
103        self.compiled.is_reverse = yes;
104        self
105    }
106
107    /// Compile a regular expression given its AST.
108    ///
109    /// The compiler is guaranteed to succeed unless the program exceeds the
110    /// specified size limit. If the size limit is exceeded, then compilation
111    /// stops and returns an error.
112    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
113        debug_assert!(exprs.len() >= 1);
114        self.num_exprs = exprs.len();
115        if exprs.len() == 1 {
116            self.compile_one(&exprs[0])
117        } else {
118            self.compile_many(exprs)
119        }
120    }
121
122    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
123        // If we're compiling a forward DFA and we aren't anchored, then
124        // add a `.*?` before the first capture group.
125        // Other matching engines handle this by baking the logic into the
126        // matching engine itself.
127        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
128        self.compiled.is_anchored_start = expr.is_anchored_start();
129        self.compiled.is_anchored_end = expr.is_anchored_end();
130        if self.compiled.needs_dotstar() {
131            dotstar_patch = self.c_dotstar()?;
132            self.compiled.start = dotstar_patch.entry;
133        }
134        self.compiled.captures = vec![None];
135        let patch = self.c_capture(0, expr)?;
136        if self.compiled.needs_dotstar() {
137            self.fill(dotstar_patch.hole, patch.entry);
138        } else {
139            self.compiled.start = patch.entry;
140        }
141        self.fill_to_next(patch.hole);
142        self.compiled.matches = vec![self.insts.len()];
143        self.push_compiled(Inst::Match(0));
144        self.compile_finish()
145    }
146
147    fn compile_many(
148        mut self,
149        exprs: &[Hir],
150    ) -> result::Result<Program, Error> {
151        debug_assert!(exprs.len() > 1);
152
153        self.compiled.is_anchored_start =
154            exprs.iter().all(|e| e.is_anchored_start());
155        self.compiled.is_anchored_end =
156            exprs.iter().all(|e| e.is_anchored_end());
157        let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
158        if self.compiled.needs_dotstar() {
159            dotstar_patch = self.c_dotstar()?;
160            self.compiled.start = dotstar_patch.entry;
161        } else {
162            self.compiled.start = 0; // first instruction is always split
163        }
164        self.fill_to_next(dotstar_patch.hole);
165
166        let mut prev_hole = Hole::None;
167        for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
168            self.fill_to_next(prev_hole);
169            let split = self.push_split_hole();
170            let Patch { hole, entry } = self.c_capture(0, expr)?;
171            self.fill_to_next(hole);
172            self.compiled.matches.push(self.insts.len());
173            self.push_compiled(Inst::Match(i));
174            prev_hole = self.fill_split(split, Some(entry), None);
175        }
176        let i = exprs.len() - 1;
177        let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
178        self.fill(prev_hole, entry);
179        self.fill_to_next(hole);
180        self.compiled.matches.push(self.insts.len());
181        self.push_compiled(Inst::Match(i));
182        self.compile_finish()
183    }
184
185    fn compile_finish(mut self) -> result::Result<Program, Error> {
186        self.compiled.insts =
187            self.insts.into_iter().map(|inst| inst.unwrap()).collect();
188        self.compiled.byte_classes = self.byte_classes.byte_classes();
189        self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
190        Ok(self.compiled)
191    }
192
193    /// Compile expr into self.insts, returning a patch on success,
194    /// or an error if we run out of memory.
195    ///
196    /// All of the c_* methods of the compiler share the contract outlined
197    /// here.
198    ///
199    /// The main thing that a c_* method does is mutate `self.insts`
200    /// to add a list of mostly compiled instructions required to execute
201    /// the given expression. `self.insts` contains MaybeInsts rather than
202    /// Insts because there is some backpatching required.
203    ///
204    /// The `Patch` value returned by each c_* method provides metadata
205    /// about the compiled instructions emitted to `self.insts`. The
206    /// `entry` member of the patch refers to the first instruction
207    /// (the entry point), while the `hole` member contains zero or
208    /// more offsets to partial instructions that need to be backpatched.
209    /// The c_* routine can't know where its list of instructions are going to
210    /// jump to after execution, so it is up to the caller to patch
211    /// these jumps to point to the right place. So compiling some
212    /// expression, e, we would end up with a situation that looked like:
213    ///
214    /// ```text
215    /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
216    ///                     ^              ^             ^
217    ///                     |                \         /
218    ///                   entry                \     /
219    ///                                         hole
220    /// ```
221    ///
222    /// To compile two expressions, e1 and e2, concatinated together we
223    /// would do:
224    ///
225    /// ```ignore
226    /// let patch1 = self.c(e1);
227    /// let patch2 = self.c(e2);
228    /// ```
229    ///
230    /// while leaves us with a situation that looks like
231    ///
232    /// ```text
233    /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
234    ///                     ^        ^            ^        ^
235    ///                     |        |            |        |
236    ///                entry1        hole1   entry2        hole2
237    /// ```
238    ///
239    /// Then to merge the two patches together into one we would backpatch
240    /// hole1 with entry2 and return a new patch that enters at entry1
241    /// and has hole2 for a hole. In fact, if you look at the c_concat
242    /// method you will see that it does exactly this, though it handles
243    /// a list of expressions rather than just the two that we use for
244    /// an example.
245    fn c(&mut self, expr: &Hir) -> Result {
246        use prog;
247        use syntax::hir::HirKind::*;
248
249        self.check_size()?;
250        match *expr.kind() {
251            Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
252            Literal(hir::Literal::Unicode(c)) => self.c_char(c),
253            Literal(hir::Literal::Byte(b)) => {
254                assert!(self.compiled.uses_bytes());
255                self.c_byte(b)
256            }
257            Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
258            Class(hir::Class::Bytes(ref cls)) => {
259                if self.compiled.uses_bytes() {
260                    self.c_class_bytes(cls.ranges())
261                } else {
262                    assert!(cls.is_all_ascii());
263                    let mut char_ranges = vec![];
264                    for r in cls.iter() {
265                        let (s, e) = (r.start() as char, r.end() as char);
266                        char_ranges.push(hir::ClassUnicodeRange::new(s, e));
267                    }
268                    self.c_class(&char_ranges)
269                }
270            }
271            Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
272                self.byte_classes.set_range(b'\n', b'\n');
273                self.c_empty_look(prog::EmptyLook::EndLine)
274            }
275            Anchor(hir::Anchor::StartLine) => {
276                self.byte_classes.set_range(b'\n', b'\n');
277                self.c_empty_look(prog::EmptyLook::StartLine)
278            }
279            Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
280                self.byte_classes.set_range(b'\n', b'\n');
281                self.c_empty_look(prog::EmptyLook::StartLine)
282            }
283            Anchor(hir::Anchor::EndLine) => {
284                self.byte_classes.set_range(b'\n', b'\n');
285                self.c_empty_look(prog::EmptyLook::EndLine)
286            }
287            Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
288                self.c_empty_look(prog::EmptyLook::EndText)
289            }
290            Anchor(hir::Anchor::StartText) => {
291                self.c_empty_look(prog::EmptyLook::StartText)
292            }
293            Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
294                self.c_empty_look(prog::EmptyLook::StartText)
295            }
296            Anchor(hir::Anchor::EndText) => {
297                self.c_empty_look(prog::EmptyLook::EndText)
298            }
299            WordBoundary(hir::WordBoundary::Unicode) => {
300                if !cfg!(feature = "unicode-perl") {
301                    return Err(Error::Syntax(
302                        "Unicode word boundaries are unavailable when \
303                         the unicode-perl feature is disabled"
304                            .to_string(),
305                    ));
306                }
307                self.compiled.has_unicode_word_boundary = true;
308                self.byte_classes.set_word_boundary();
309                self.c_empty_look(prog::EmptyLook::WordBoundary)
310            }
311            WordBoundary(hir::WordBoundary::UnicodeNegate) => {
312                if !cfg!(feature = "unicode-perl") {
313                    return Err(Error::Syntax(
314                        "Unicode word boundaries are unavailable when \
315                         the unicode-perl feature is disabled"
316                            .to_string(),
317                    ));
318                }
319                self.compiled.has_unicode_word_boundary = true;
320                self.byte_classes.set_word_boundary();
321                self.c_empty_look(prog::EmptyLook::NotWordBoundary)
322            }
323            WordBoundary(hir::WordBoundary::Ascii) => {
324                self.byte_classes.set_word_boundary();
325                self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
326            }
327            WordBoundary(hir::WordBoundary::AsciiNegate) => {
328                self.byte_classes.set_word_boundary();
329                self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
330            }
331            Group(ref g) => match g.kind {
332                hir::GroupKind::NonCapturing => self.c(&g.hir),
333                hir::GroupKind::CaptureIndex(index) => {
334                    if index as usize >= self.compiled.captures.len() {
335                        self.compiled.captures.push(None);
336                    }
337                    self.c_capture(2 * index as usize, &g.hir)
338                }
339                hir::GroupKind::CaptureName { index, ref name } => {
340                    if index as usize >= self.compiled.captures.len() {
341                        let n = name.to_string();
342                        self.compiled.captures.push(Some(n.clone()));
343                        self.capture_name_idx.insert(n, index as usize);
344                    }
345                    self.c_capture(2 * index as usize, &g.hir)
346                }
347            },
348            Concat(ref es) => {
349                if self.compiled.is_reverse {
350                    self.c_concat(es.iter().rev())
351                } else {
352                    self.c_concat(es)
353                }
354            }
355            Alternation(ref es) => self.c_alternate(&**es),
356            Repetition(ref rep) => self.c_repeat(rep),
357        }
358    }
359
360    fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
361        if self.num_exprs > 1 || self.compiled.is_dfa {
362            // Don't ever compile Save instructions for regex sets because
363            // they are never used. They are also never used in DFA programs
364            // because DFAs can't handle captures.
365            self.c(expr)
366        } else {
367            let entry = self.insts.len();
368            let hole = self.push_hole(InstHole::Save { slot: first_slot });
369            let patch = self.c(expr)?;
370            self.fill(hole, patch.entry);
371            self.fill_to_next(patch.hole);
372            let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
373            Ok(Patch { hole: hole, entry: entry })
374        }
375    }
376
377    fn c_dotstar(&mut self) -> Result {
378        Ok(if !self.compiled.only_utf8() {
379            self.c(&Hir::repetition(hir::Repetition {
380                kind: hir::RepetitionKind::ZeroOrMore,
381                greedy: false,
382                hir: Box::new(Hir::any(true)),
383            }))?
384        } else {
385            self.c(&Hir::repetition(hir::Repetition {
386                kind: hir::RepetitionKind::ZeroOrMore,
387                greedy: false,
388                hir: Box::new(Hir::any(false)),
389            }))?
390        })
391    }
392
393    fn c_char(&mut self, c: char) -> Result {
394        self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
395    }
396
397    fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
398        assert!(!ranges.is_empty());
399        if self.compiled.uses_bytes() {
400            CompileClass { c: self, ranges: ranges }.compile()
401        } else {
402            let ranges: Vec<(char, char)> =
403                ranges.iter().map(|r| (r.start(), r.end())).collect();
404            let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
405                self.push_hole(InstHole::Char { c: ranges[0].0 })
406            } else {
407                self.push_hole(InstHole::Ranges { ranges: ranges })
408            };
409            Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
410        }
411    }
412
413    fn c_byte(&mut self, b: u8) -> Result {
414        self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
415    }
416
417    fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
418        debug_assert!(!ranges.is_empty());
419
420        let first_split_entry = self.insts.len();
421        let mut holes = vec![];
422        let mut prev_hole = Hole::None;
423        for r in &ranges[0..ranges.len() - 1] {
424            self.fill_to_next(prev_hole);
425            let split = self.push_split_hole();
426            let next = self.insts.len();
427            self.byte_classes.set_range(r.start(), r.end());
428            holes.push(self.push_hole(InstHole::Bytes {
429                start: r.start(),
430                end: r.end(),
431            }));
432            prev_hole = self.fill_split(split, Some(next), None);
433        }
434        let next = self.insts.len();
435        let r = &ranges[ranges.len() - 1];
436        self.byte_classes.set_range(r.start(), r.end());
437        holes.push(
438            self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
439        );
440        self.fill(prev_hole, next);
441        Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
442    }
443
444    fn c_empty_look(&mut self, look: EmptyLook) -> Result {
445        let hole = self.push_hole(InstHole::EmptyLook { look: look });
446        Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
447    }
448
449    fn c_concat<'a, I>(&mut self, exprs: I) -> Result
450    where
451        I: IntoIterator<Item = &'a Hir>,
452    {
453        let mut exprs = exprs.into_iter();
454        let first = match exprs.next() {
455            Some(expr) => expr,
456            None => {
457                return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
458            }
459        };
460        let Patch { mut hole, entry } = self.c(first)?;
461        for e in exprs {
462            let p = self.c(e)?;
463            self.fill(hole, p.entry);
464            hole = p.hole;
465        }
466        Ok(Patch { hole: hole, entry: entry })
467    }
468
469    fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
470        debug_assert!(
471            exprs.len() >= 2,
472            "alternates must have at least 2 exprs"
473        );
474
475        // Initial entry point is always the first split.
476        let first_split_entry = self.insts.len();
477
478        // Save up all of the holes from each alternate. They will all get
479        // patched to point to the same location.
480        let mut holes = vec![];
481
482        let mut prev_hole = Hole::None;
483        for e in &exprs[0..exprs.len() - 1] {
484            self.fill_to_next(prev_hole);
485            let split = self.push_split_hole();
486            let prev_entry = self.insts.len();
487            let Patch { hole, entry } = self.c(e)?;
488            if prev_entry == self.insts.len() {
489                // TODO(burntsushi): It is kind of silly that we don't support
490                // empty-subexpressions in alternates, but it is supremely
491                // awkward to support them in the existing compiler
492                // infrastructure. This entire compiler needs to be thrown out
493                // anyway, so don't feel too bad.
494                return Err(Error::Syntax(
495                    "alternations cannot currently contain \
496                     empty sub-expressions"
497                        .to_string(),
498                ));
499            }
500            holes.push(hole);
501            prev_hole = self.fill_split(split, Some(entry), None);
502        }
503        let prev_entry = self.insts.len();
504        let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
505        if prev_entry == self.insts.len() {
506            // TODO(burntsushi): See TODO above.
507            return Err(Error::Syntax(
508                "alternations cannot currently contain \
509                 empty sub-expressions"
510                    .to_string(),
511            ));
512        }
513        holes.push(hole);
514        self.fill(prev_hole, entry);
515        Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
516    }
517
518    fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
519        use syntax::hir::RepetitionKind::*;
520        match rep.kind {
521            ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
522            ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
523            OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
524            Range(hir::RepetitionRange::Exactly(min_max)) => {
525                self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
526            }
527            Range(hir::RepetitionRange::AtLeast(min)) => {
528                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
529            }
530            Range(hir::RepetitionRange::Bounded(min, max)) => {
531                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
532            }
533        }
534    }
535
536    fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
537        let split_entry = self.insts.len();
538        let split = self.push_split_hole();
539        let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
540
541        let split_hole = if greedy {
542            self.fill_split(split, Some(entry_rep), None)
543        } else {
544            self.fill_split(split, None, Some(entry_rep))
545        };
546        let holes = vec![hole_rep, split_hole];
547        Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
548    }
549
550    fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
551        let split_entry = self.insts.len();
552        let split = self.push_split_hole();
553        let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
554
555        self.fill(hole_rep, split_entry);
556        let split_hole = if greedy {
557            self.fill_split(split, Some(entry_rep), None)
558        } else {
559            self.fill_split(split, None, Some(entry_rep))
560        };
561        Ok(Patch { hole: split_hole, entry: split_entry })
562    }
563
564    fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
565        let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
566        self.fill_to_next(hole_rep);
567        let split = self.push_split_hole();
568
569        let split_hole = if greedy {
570            self.fill_split(split, Some(entry_rep), None)
571        } else {
572            self.fill_split(split, None, Some(entry_rep))
573        };
574        Ok(Patch { hole: split_hole, entry: entry_rep })
575    }
576
577    fn c_repeat_range_min_or_more(
578        &mut self,
579        expr: &Hir,
580        greedy: bool,
581        min: u32,
582    ) -> Result {
583        let min = u32_to_usize(min);
584        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
585        let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
586        self.fill(patch_concat.hole, patch_rep.entry);
587        Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
588    }
589
590    fn c_repeat_range(
591        &mut self,
592        expr: &Hir,
593        greedy: bool,
594        min: u32,
595        max: u32,
596    ) -> Result {
597        let (min, max) = (u32_to_usize(min), u32_to_usize(max));
598        let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
599        let initial_entry = patch_concat.entry;
600        if min == max {
601            return Ok(patch_concat);
602        }
603        // It is much simpler to compile, e.g., `a{2,5}` as:
604        //
605        //     aaa?a?a?
606        //
607        // But you end up with a sequence of instructions like this:
608        //
609        //     0: 'a'
610        //     1: 'a',
611        //     2: split(3, 4)
612        //     3: 'a'
613        //     4: split(5, 6)
614        //     5: 'a'
615        //     6: split(7, 8)
616        //     7: 'a'
617        //     8: MATCH
618        //
619        // This is *incredibly* inefficient because the splits end
620        // up forming a chain, which has to be resolved everything a
621        // transition is followed.
622        let mut holes = vec![];
623        let mut prev_hole = patch_concat.hole;
624        for _ in min..max {
625            self.fill_to_next(prev_hole);
626            let split = self.push_split_hole();
627            let Patch { hole, entry } = self.c(expr)?;
628            prev_hole = hole;
629            if greedy {
630                holes.push(self.fill_split(split, Some(entry), None));
631            } else {
632                holes.push(self.fill_split(split, None, Some(entry)));
633            }
634        }
635        holes.push(prev_hole);
636        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
637    }
638
639    fn fill(&mut self, hole: Hole, goto: InstPtr) {
640        match hole {
641            Hole::None => {}
642            Hole::One(pc) => {
643                self.insts[pc].fill(goto);
644            }
645            Hole::Many(holes) => {
646                for hole in holes {
647                    self.fill(hole, goto);
648                }
649            }
650        }
651    }
652
653    fn fill_to_next(&mut self, hole: Hole) {
654        let next = self.insts.len();
655        self.fill(hole, next);
656    }
657
658    fn fill_split(
659        &mut self,
660        hole: Hole,
661        goto1: Option<InstPtr>,
662        goto2: Option<InstPtr>,
663    ) -> Hole {
664        match hole {
665            Hole::None => Hole::None,
666            Hole::One(pc) => match (goto1, goto2) {
667                (Some(goto1), Some(goto2)) => {
668                    self.insts[pc].fill_split(goto1, goto2);
669                    Hole::None
670                }
671                (Some(goto1), None) => {
672                    self.insts[pc].half_fill_split_goto1(goto1);
673                    Hole::One(pc)
674                }
675                (None, Some(goto2)) => {
676                    self.insts[pc].half_fill_split_goto2(goto2);
677                    Hole::One(pc)
678                }
679                (None, None) => unreachable!(
680                    "at least one of the split \
681                     holes must be filled"
682                ),
683            },
684            Hole::Many(holes) => {
685                let mut new_holes = vec![];
686                for hole in holes {
687                    new_holes.push(self.fill_split(hole, goto1, goto2));
688                }
689                if new_holes.is_empty() {
690                    Hole::None
691                } else if new_holes.len() == 1 {
692                    new_holes.pop().unwrap()
693                } else {
694                    Hole::Many(new_holes)
695                }
696            }
697        }
698    }
699
700    fn push_compiled(&mut self, inst: Inst) {
701        self.insts.push(MaybeInst::Compiled(inst));
702    }
703
704    fn push_hole(&mut self, inst: InstHole) -> Hole {
705        let hole = self.insts.len();
706        self.insts.push(MaybeInst::Uncompiled(inst));
707        Hole::One(hole)
708    }
709
710    fn push_split_hole(&mut self) -> Hole {
711        let hole = self.insts.len();
712        self.insts.push(MaybeInst::Split);
713        Hole::One(hole)
714    }
715
716    fn check_size(&self) -> result::Result<(), Error> {
717        use std::mem::size_of;
718
719        if self.insts.len() * size_of::<Inst>() > self.size_limit {
720            Err(Error::CompiledTooBig(self.size_limit))
721        } else {
722            Ok(())
723        }
724    }
725}
726
727#[derive(Debug)]
728enum Hole {
729    None,
730    One(InstPtr),
731    Many(Vec<Hole>),
732}
733
734#[derive(Clone, Debug)]
735enum MaybeInst {
736    Compiled(Inst),
737    Uncompiled(InstHole),
738    Split,
739    Split1(InstPtr),
740    Split2(InstPtr),
741}
742
743impl MaybeInst {
744    fn fill(&mut self, goto: InstPtr) {
745        let filled = match *self {
746            MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
747            MaybeInst::Split1(goto1) => {
748                Inst::Split(InstSplit { goto1: goto1, goto2: goto })
749            }
750            MaybeInst::Split2(goto2) => {
751                Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
752            }
753            _ => unreachable!(
754                "not all instructions were compiled! \
755                 found uncompiled instruction: {:?}",
756                self
757            ),
758        };
759        *self = MaybeInst::Compiled(filled);
760    }
761
762    fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
763        let filled = match *self {
764            MaybeInst::Split => {
765                Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
766            }
767            _ => unreachable!(
768                "must be called on Split instruction, \
769                 instead it was called on: {:?}",
770                self
771            ),
772        };
773        *self = MaybeInst::Compiled(filled);
774    }
775
776    fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
777        let half_filled = match *self {
778            MaybeInst::Split => goto1,
779            _ => unreachable!(
780                "must be called on Split instruction, \
781                 instead it was called on: {:?}",
782                self
783            ),
784        };
785        *self = MaybeInst::Split1(half_filled);
786    }
787
788    fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
789        let half_filled = match *self {
790            MaybeInst::Split => goto2,
791            _ => unreachable!(
792                "must be called on Split instruction, \
793                 instead it was called on: {:?}",
794                self
795            ),
796        };
797        *self = MaybeInst::Split2(half_filled);
798    }
799
800    fn unwrap(self) -> Inst {
801        match self {
802            MaybeInst::Compiled(inst) => inst,
803            _ => unreachable!(
804                "must be called on a compiled instruction, \
805                 instead it was called on: {:?}",
806                self
807            ),
808        }
809    }
810}
811
812#[derive(Clone, Debug)]
813enum InstHole {
814    Save { slot: usize },
815    EmptyLook { look: EmptyLook },
816    Char { c: char },
817    Ranges { ranges: Vec<(char, char)> },
818    Bytes { start: u8, end: u8 },
819}
820
821impl InstHole {
822    fn fill(&self, goto: InstPtr) -> Inst {
823        match *self {
824            InstHole::Save { slot } => {
825                Inst::Save(InstSave { goto: goto, slot: slot })
826            }
827            InstHole::EmptyLook { look } => {
828                Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
829            }
830            InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
831            InstHole::Ranges { ref ranges } => {
832                Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
833            }
834            InstHole::Bytes { start, end } => {
835                Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
836            }
837        }
838    }
839}
840
841struct CompileClass<'a, 'b> {
842    c: &'a mut Compiler,
843    ranges: &'b [hir::ClassUnicodeRange],
844}
845
846impl<'a, 'b> CompileClass<'a, 'b> {
847    fn compile(mut self) -> Result {
848        let mut holes = vec![];
849        let mut initial_entry = None;
850        let mut last_split = Hole::None;
851        let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
852        self.c.suffix_cache.clear();
853
854        for (i, range) in self.ranges.iter().enumerate() {
855            let is_last_range = i + 1 == self.ranges.len();
856            utf8_seqs.reset(range.start(), range.end());
857            let mut it = (&mut utf8_seqs).peekable();
858            loop {
859                let utf8_seq = match it.next() {
860                    None => break,
861                    Some(utf8_seq) => utf8_seq,
862                };
863                if is_last_range && it.peek().is_none() {
864                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
865                    holes.push(hole);
866                    self.c.fill(last_split, entry);
867                    last_split = Hole::None;
868                    if initial_entry.is_none() {
869                        initial_entry = Some(entry);
870                    }
871                } else {
872                    if initial_entry.is_none() {
873                        initial_entry = Some(self.c.insts.len());
874                    }
875                    self.c.fill_to_next(last_split);
876                    last_split = self.c.push_split_hole();
877                    let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
878                    holes.push(hole);
879                    last_split =
880                        self.c.fill_split(last_split, Some(entry), None);
881                }
882            }
883        }
884        self.c.utf8_seqs = Some(utf8_seqs);
885        Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
886    }
887
888    fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
889        if self.c.compiled.is_reverse {
890            self.c_utf8_seq_(seq)
891        } else {
892            self.c_utf8_seq_(seq.into_iter().rev())
893        }
894    }
895
896    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
897    where
898        I: IntoIterator<Item = &'r Utf8Range>,
899    {
900        // The initial instruction for each UTF-8 sequence should be the same.
901        let mut from_inst = ::std::usize::MAX;
902        let mut last_hole = Hole::None;
903        for byte_range in seq {
904            let key = SuffixCacheKey {
905                from_inst: from_inst,
906                start: byte_range.start,
907                end: byte_range.end,
908            };
909            {
910                let pc = self.c.insts.len();
911                if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
912                    from_inst = cached_pc;
913                    continue;
914                }
915            }
916            self.c.byte_classes.set_range(byte_range.start, byte_range.end);
917            if from_inst == ::std::usize::MAX {
918                last_hole = self.c.push_hole(InstHole::Bytes {
919                    start: byte_range.start,
920                    end: byte_range.end,
921                });
922            } else {
923                self.c.push_compiled(Inst::Bytes(InstBytes {
924                    goto: from_inst,
925                    start: byte_range.start,
926                    end: byte_range.end,
927                }));
928            }
929            from_inst = self.c.insts.len().checked_sub(1).unwrap();
930            debug_assert!(from_inst < ::std::usize::MAX);
931        }
932        debug_assert!(from_inst < ::std::usize::MAX);
933        Ok(Patch { hole: last_hole, entry: from_inst })
934    }
935}
936
937/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
938/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
939/// The set of byte ranges looks like this:
940///
941/// [0-7F]
942/// [C2-DF][80-BF]
943/// [E0][A0-BF][80-BF]
944/// [E1-EC][80-BF][80-BF]
945/// [ED][80-9F][80-BF]
946/// [EE-EF][80-BF][80-BF]
947///
948/// Each line above translates to one alternate in the compiled regex program.
949/// However, all but one of the alternates end in the same suffix, which is
950/// a waste of an instruction. The suffix cache facilitates reusing them across
951/// alternates.
952///
953/// Note that a HashMap could be trivially used for this, but we don't need its
954/// overhead. Some small bounded space (LRU style) is more than enough.
955///
956/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
957/// except it uses hashes as original indices and then compares full keys for
958/// validation against `dense` array.
959struct SuffixCache {
960    sparse: Box<[usize]>,
961    dense: Vec<SuffixCacheEntry>,
962}
963
964#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
965struct SuffixCacheEntry {
966    key: SuffixCacheKey,
967    pc: InstPtr,
968}
969
970#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
971struct SuffixCacheKey {
972    from_inst: InstPtr,
973    start: u8,
974    end: u8,
975}
976
977impl SuffixCache {
978    fn new(size: usize) -> Self {
979        SuffixCache {
980            sparse: vec![0usize; size].into(),
981            dense: Vec::with_capacity(size),
982        }
983    }
984
985    fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
986        let hash = self.hash(&key);
987        let pos = &mut self.sparse[hash];
988        if let Some(entry) = self.dense.get(*pos) {
989            if entry.key == key {
990                return Some(entry.pc);
991            }
992        }
993        *pos = self.dense.len();
994        self.dense.push(SuffixCacheEntry { key: key, pc: pc });
995        None
996    }
997
998    fn clear(&mut self) {
999        self.dense.clear();
1000    }
1001
1002    fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1003        // Basic FNV-1a hash as described:
1004        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1005        const FNV_PRIME: u64 = 1099511628211;
1006        let mut h = 14695981039346656037;
1007        h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1008        h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1009        h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1010        (h as usize) % self.sparse.len()
1011    }
1012}
1013
1014struct ByteClassSet([bool; 256]);
1015
1016impl ByteClassSet {
1017    fn new() -> Self {
1018        ByteClassSet([false; 256])
1019    }
1020
1021    fn set_range(&mut self, start: u8, end: u8) {
1022        debug_assert!(start <= end);
1023        if start > 0 {
1024            self.0[start as usize - 1] = true;
1025        }
1026        self.0[end as usize] = true;
1027    }
1028
1029    fn set_word_boundary(&mut self) {
1030        // We need to mark all ranges of bytes whose pairs result in
1031        // evaluating \b differently.
1032        let iswb = is_word_byte;
1033        let mut b1: u16 = 0;
1034        let mut b2: u16;
1035        while b1 <= 255 {
1036            b2 = b1 + 1;
1037            while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1038                b2 += 1;
1039            }
1040            self.set_range(b1 as u8, (b2 - 1) as u8);
1041            b1 = b2;
1042        }
1043    }
1044
1045    fn byte_classes(&self) -> Vec<u8> {
1046        // N.B. If you're debugging the DFA, it's useful to simply return
1047        // `(0..256).collect()`, which effectively removes the byte classes
1048        // and makes the transitions easier to read.
1049        // (0usize..256).map(|x| x as u8).collect()
1050        let mut byte_classes = vec![0; 256];
1051        let mut class = 0u8;
1052        let mut i = 0;
1053        loop {
1054            byte_classes[i] = class as u8;
1055            if i >= 255 {
1056                break;
1057            }
1058            if self.0[i] {
1059                class = class.checked_add(1).unwrap();
1060            }
1061            i += 1;
1062        }
1063        byte_classes
1064    }
1065}
1066
1067fn u32_to_usize(n: u32) -> usize {
1068    // In case usize is less than 32 bits, we need to guard against overflow.
1069    // On most platforms this compiles to nothing.
1070    // TODO Use `std::convert::TryFrom` once it's stable.
1071    if (n as u64) > (::std::usize::MAX as u64) {
1072        panic!("BUG: {} is too big to be pointer sized", n)
1073    }
1074    n as usize
1075}
1076
1077#[cfg(test)]
1078mod tests {
1079    use super::ByteClassSet;
1080
1081    #[test]
1082    fn byte_classes() {
1083        let mut set = ByteClassSet::new();
1084        set.set_range(b'a', b'z');
1085        let classes = set.byte_classes();
1086        assert_eq!(classes[0], 0);
1087        assert_eq!(classes[1], 0);
1088        assert_eq!(classes[2], 0);
1089        assert_eq!(classes[b'a' as usize - 1], 0);
1090        assert_eq!(classes[b'a' as usize], 1);
1091        assert_eq!(classes[b'm' as usize], 1);
1092        assert_eq!(classes[b'z' as usize], 1);
1093        assert_eq!(classes[b'z' as usize + 1], 2);
1094        assert_eq!(classes[254], 2);
1095        assert_eq!(classes[255], 2);
1096
1097        let mut set = ByteClassSet::new();
1098        set.set_range(0, 2);
1099        set.set_range(4, 6);
1100        let classes = set.byte_classes();
1101        assert_eq!(classes[0], 0);
1102        assert_eq!(classes[1], 0);
1103        assert_eq!(classes[2], 0);
1104        assert_eq!(classes[3], 1);
1105        assert_eq!(classes[4], 2);
1106        assert_eq!(classes[5], 2);
1107        assert_eq!(classes[6], 2);
1108        assert_eq!(classes[7], 3);
1109        assert_eq!(classes[255], 3);
1110    }
1111
1112    #[test]
1113    fn full_byte_classes() {
1114        let mut set = ByteClassSet::new();
1115        for i in 0..256u16 {
1116            set.set_range(i as u8, i as u8);
1117        }
1118        assert_eq!(set.byte_classes().len(), 256);
1119    }
1120}