regex/
exec.rs

1use std::cell::RefCell;
2use std::collections::HashMap;
3use std::sync::Arc;
4
5#[cfg(feature = "perf-literal")]
6use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use syntax::hir::literal::Literals;
8use syntax::hir::Hir;
9use syntax::ParserBuilder;
10
11use backtrack;
12use cache::{Cached, CachedGuard};
13use compile::Compiler;
14#[cfg(feature = "perf-dfa")]
15use dfa;
16use error::Error;
17use input::{ByteInput, CharInput};
18use literal::LiteralSearcher;
19use pikevm;
20use prog::Program;
21use re_builder::RegexOptions;
22use re_bytes;
23use re_set;
24use re_trait::{Locations, RegularExpression, Slot};
25use re_unicode;
26use utf8::next_utf8;
27
28/// `Exec` manages the execution of a regular expression.
29///
30/// In particular, this manages the various compiled forms of a single regular
31/// expression and the choice of which matching engine to use to execute a
32/// regular expression.
33pub struct Exec {
34    /// All read only state.
35    ro: Arc<ExecReadOnly>,
36    /// Caches for the various matching engines.
37    cache: Cached<ProgramCache>,
38}
39
40/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
41/// means it is no longer Sync, but we can now avoid the overhead of
42/// synchronization to fetch the cache.
43#[derive(Debug)]
44pub struct ExecNoSync<'c> {
45    /// All read only state.
46    ro: &'c Arc<ExecReadOnly>,
47    /// Caches for the various matching engines.
48    cache: CachedGuard<'c, ProgramCache>,
49}
50
51/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
52pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
53
54/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
55/// state is determined at compile time and never changes during search.
56#[derive(Debug)]
57struct ExecReadOnly {
58    /// The original regular expressions given by the caller to compile.
59    res: Vec<String>,
60    /// A compiled program that is used in the NFA simulation and backtracking.
61    /// It can be byte-based or Unicode codepoint based.
62    ///
63    /// N.B. It is not possibly to make this byte-based from the public API.
64    /// It is only used for testing byte based programs in the NFA simulations.
65    nfa: Program,
66    /// A compiled byte based program for DFA execution. This is only used
67    /// if a DFA can be executed. (Currently, only word boundary assertions are
68    /// not supported.) Note that this program contains an embedded `.*?`
69    /// preceding the first capture group, unless the regex is anchored at the
70    /// beginning.
71    dfa: Program,
72    /// The same as above, except the program is reversed (and there is no
73    /// preceding `.*?`). This is used by the DFA to find the starting location
74    /// of matches.
75    dfa_reverse: Program,
76    /// A set of suffix literals extracted from the regex.
77    ///
78    /// Prefix literals are stored on the `Program`, since they are used inside
79    /// the matching engines.
80    suffixes: LiteralSearcher,
81    /// An Aho-Corasick automaton with leftmost-first match semantics.
82    ///
83    /// This is only set when the entire regex is a simple unanchored
84    /// alternation of literals. We could probably use it more circumstances,
85    /// but this is already hacky enough in this architecture.
86    ///
87    /// N.B. We use u32 as a state ID representation under the assumption that
88    /// if we were to exhaust the ID space, we probably would have long
89    /// surpassed the compilation size limit.
90    #[cfg(feature = "perf-literal")]
91    ac: Option<AhoCorasick<u32>>,
92    /// match_type encodes as much upfront knowledge about how we're going to
93    /// execute a search as possible.
94    match_type: MatchType,
95}
96
97/// Facilitates the construction of an executor by exposing various knobs
98/// to control how a regex is executed and what kinds of resources it's
99/// permitted to use.
100pub struct ExecBuilder {
101    options: RegexOptions,
102    match_type: Option<MatchType>,
103    bytes: bool,
104    only_utf8: bool,
105}
106
107/// Parsed represents a set of parsed regular expressions and their detected
108/// literals.
109struct Parsed {
110    exprs: Vec<Hir>,
111    prefixes: Literals,
112    suffixes: Literals,
113    bytes: bool,
114}
115
116impl ExecBuilder {
117    /// Create a regex execution builder.
118    ///
119    /// This uses default settings for everything except the regex itself,
120    /// which must be provided. Further knobs can be set by calling methods,
121    /// and then finally, `build` to actually create the executor.
122    pub fn new(re: &str) -> Self {
123        Self::new_many(&[re])
124    }
125
126    /// Like new, but compiles the union of the given regular expressions.
127    ///
128    /// Note that when compiling 2 or more regular expressions, capture groups
129    /// are completely unsupported. (This means both `find` and `captures`
130    /// wont work.)
131    pub fn new_many<I, S>(res: I) -> Self
132    where
133        S: AsRef<str>,
134        I: IntoIterator<Item = S>,
135    {
136        let mut opts = RegexOptions::default();
137        opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
138        Self::new_options(opts)
139    }
140
141    /// Create a regex execution builder.
142    pub fn new_options(opts: RegexOptions) -> Self {
143        ExecBuilder {
144            options: opts,
145            match_type: None,
146            bytes: false,
147            only_utf8: true,
148        }
149    }
150
151    /// Set the matching engine to be automatically determined.
152    ///
153    /// This is the default state and will apply whatever optimizations are
154    /// possible, such as running a DFA.
155    ///
156    /// This overrides whatever was previously set via the `nfa` or
157    /// `bounded_backtracking` methods.
158    pub fn automatic(mut self) -> Self {
159        self.match_type = None;
160        self
161    }
162
163    /// Sets the matching engine to use the NFA algorithm no matter what
164    /// optimizations are possible.
165    ///
166    /// This overrides whatever was previously set via the `automatic` or
167    /// `bounded_backtracking` methods.
168    pub fn nfa(mut self) -> Self {
169        self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
170        self
171    }
172
173    /// Sets the matching engine to use a bounded backtracking engine no
174    /// matter what optimizations are possible.
175    ///
176    /// One must use this with care, since the bounded backtracking engine
177    /// uses memory proportion to `len(regex) * len(text)`.
178    ///
179    /// This overrides whatever was previously set via the `automatic` or
180    /// `nfa` methods.
181    pub fn bounded_backtracking(mut self) -> Self {
182        self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
183        self
184    }
185
186    /// Compiles byte based programs for use with the NFA matching engines.
187    ///
188    /// By default, the NFA engines match on Unicode scalar values. They can
189    /// be made to use byte based programs instead. In general, the byte based
190    /// programs are slower because of a less efficient encoding of character
191    /// classes.
192    ///
193    /// Note that this does not impact DFA matching engines, which always
194    /// execute on bytes.
195    pub fn bytes(mut self, yes: bool) -> Self {
196        self.bytes = yes;
197        self
198    }
199
200    /// When disabled, the program compiled may match arbitrary bytes.
201    ///
202    /// When enabled (the default), all compiled programs exclusively match
203    /// valid UTF-8 bytes.
204    pub fn only_utf8(mut self, yes: bool) -> Self {
205        self.only_utf8 = yes;
206        self
207    }
208
209    /// Set the Unicode flag.
210    pub fn unicode(mut self, yes: bool) -> Self {
211        self.options.unicode = yes;
212        self
213    }
214
215    /// Parse the current set of patterns into their AST and extract literals.
216    fn parse(&self) -> Result<Parsed, Error> {
217        let mut exprs = Vec::with_capacity(self.options.pats.len());
218        let mut prefixes = Some(Literals::empty());
219        let mut suffixes = Some(Literals::empty());
220        let mut bytes = false;
221        let is_set = self.options.pats.len() > 1;
222        // If we're compiling a regex set and that set has any anchored
223        // expressions, then disable all literal optimizations.
224        for pat in &self.options.pats {
225            let mut parser = ParserBuilder::new()
226                .octal(self.options.octal)
227                .case_insensitive(self.options.case_insensitive)
228                .multi_line(self.options.multi_line)
229                .dot_matches_new_line(self.options.dot_matches_new_line)
230                .swap_greed(self.options.swap_greed)
231                .ignore_whitespace(self.options.ignore_whitespace)
232                .unicode(self.options.unicode)
233                .allow_invalid_utf8(!self.only_utf8)
234                .nest_limit(self.options.nest_limit)
235                .build();
236            let expr =
237                parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
238            bytes = bytes || !expr.is_always_utf8();
239
240            if cfg!(feature = "perf-literal") {
241                if !expr.is_anchored_start() && expr.is_any_anchored_start() {
242                    // Partial anchors unfortunately make it hard to use
243                    // prefixes, so disable them.
244                    prefixes = None;
245                } else if is_set && expr.is_anchored_start() {
246                    // Regex sets with anchors do not go well with literal
247                    // optimizations.
248                    prefixes = None;
249                }
250                prefixes = prefixes.and_then(|mut prefixes| {
251                    if !prefixes.union_prefixes(&expr) {
252                        None
253                    } else {
254                        Some(prefixes)
255                    }
256                });
257
258                if !expr.is_anchored_end() && expr.is_any_anchored_end() {
259                    // Partial anchors unfortunately make it hard to use
260                    // suffixes, so disable them.
261                    suffixes = None;
262                } else if is_set && expr.is_anchored_end() {
263                    // Regex sets with anchors do not go well with literal
264                    // optimizations.
265                    suffixes = None;
266                }
267                suffixes = suffixes.and_then(|mut suffixes| {
268                    if !suffixes.union_suffixes(&expr) {
269                        None
270                    } else {
271                        Some(suffixes)
272                    }
273                });
274            }
275            exprs.push(expr);
276        }
277        Ok(Parsed {
278            exprs: exprs,
279            prefixes: prefixes.unwrap_or_else(Literals::empty),
280            suffixes: suffixes.unwrap_or_else(Literals::empty),
281            bytes: bytes,
282        })
283    }
284
285    /// Build an executor that can run a regular expression.
286    pub fn build(self) -> Result<Exec, Error> {
287        // Special case when we have no patterns to compile.
288        // This can happen when compiling a regex set.
289        if self.options.pats.is_empty() {
290            let ro = Arc::new(ExecReadOnly {
291                res: vec![],
292                nfa: Program::new(),
293                dfa: Program::new(),
294                dfa_reverse: Program::new(),
295                suffixes: LiteralSearcher::empty(),
296                #[cfg(feature = "perf-literal")]
297                ac: None,
298                match_type: MatchType::Nothing,
299            });
300            return Ok(Exec { ro: ro, cache: Cached::new() });
301        }
302        let parsed = self.parse()?;
303        let mut nfa = Compiler::new()
304            .size_limit(self.options.size_limit)
305            .bytes(self.bytes || parsed.bytes)
306            .only_utf8(self.only_utf8)
307            .compile(&parsed.exprs)?;
308        let mut dfa = Compiler::new()
309            .size_limit(self.options.size_limit)
310            .dfa(true)
311            .only_utf8(self.only_utf8)
312            .compile(&parsed.exprs)?;
313        let mut dfa_reverse = Compiler::new()
314            .size_limit(self.options.size_limit)
315            .dfa(true)
316            .only_utf8(self.only_utf8)
317            .reverse(true)
318            .compile(&parsed.exprs)?;
319
320        #[cfg(feature = "perf-literal")]
321        let ac = self.build_aho_corasick(&parsed);
322        nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
323        dfa.prefixes = nfa.prefixes.clone();
324        dfa.dfa_size_limit = self.options.dfa_size_limit;
325        dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
326
327        let mut ro = ExecReadOnly {
328            res: self.options.pats,
329            nfa: nfa,
330            dfa: dfa,
331            dfa_reverse: dfa_reverse,
332            suffixes: LiteralSearcher::suffixes(parsed.suffixes),
333            #[cfg(feature = "perf-literal")]
334            ac: ac,
335            match_type: MatchType::Nothing,
336        };
337        ro.match_type = ro.choose_match_type(self.match_type);
338
339        let ro = Arc::new(ro);
340        Ok(Exec { ro: ro, cache: Cached::new() })
341    }
342
343    #[cfg(feature = "perf-literal")]
344    fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
345        if parsed.exprs.len() != 1 {
346            return None;
347        }
348        let lits = match alternation_literals(&parsed.exprs[0]) {
349            None => return None,
350            Some(lits) => lits,
351        };
352        // If we have a small number of literals, then let Teddy handle
353        // things (see literal/mod.rs).
354        if lits.len() <= 32 {
355            return None;
356        }
357        Some(
358            AhoCorasickBuilder::new()
359                .match_kind(MatchKind::LeftmostFirst)
360                .auto_configure(&lits)
361                // We always want this to reduce size, regardless
362                // of what auto-configure does.
363                .byte_classes(true)
364                .build_with_size::<u32, _, _>(&lits)
365                // This should never happen because we'd long exceed the
366                // compilation limit for regexes first.
367                .expect("AC automaton too big"),
368        )
369    }
370}
371
372impl<'c> RegularExpression for ExecNoSyncStr<'c> {
373    type Text = str;
374
375    fn slots_len(&self) -> usize {
376        self.0.slots_len()
377    }
378
379    fn next_after_empty(&self, text: &str, i: usize) -> usize {
380        next_utf8(text.as_bytes(), i)
381    }
382
383    #[cfg_attr(feature = "perf-inline", inline(always))]
384    fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
385        self.0.shortest_match_at(text.as_bytes(), start)
386    }
387
388    #[cfg_attr(feature = "perf-inline", inline(always))]
389    fn is_match_at(&self, text: &str, start: usize) -> bool {
390        self.0.is_match_at(text.as_bytes(), start)
391    }
392
393    #[cfg_attr(feature = "perf-inline", inline(always))]
394    fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
395        self.0.find_at(text.as_bytes(), start)
396    }
397
398    #[cfg_attr(feature = "perf-inline", inline(always))]
399    fn captures_read_at(
400        &self,
401        locs: &mut Locations,
402        text: &str,
403        start: usize,
404    ) -> Option<(usize, usize)> {
405        self.0.captures_read_at(locs, text.as_bytes(), start)
406    }
407}
408
409impl<'c> RegularExpression for ExecNoSync<'c> {
410    type Text = [u8];
411
412    /// Returns the number of capture slots in the regular expression. (There
413    /// are two slots for every capture group, corresponding to possibly empty
414    /// start and end locations of the capture.)
415    fn slots_len(&self) -> usize {
416        self.ro.nfa.captures.len() * 2
417    }
418
419    fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
420        i + 1
421    }
422
423    /// Returns the end of a match location, possibly occurring before the
424    /// end location of the correct leftmost-first match.
425    #[cfg_attr(feature = "perf-inline", inline(always))]
426    fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
427        if !self.is_anchor_end_match(text) {
428            return None;
429        }
430        match self.ro.match_type {
431            #[cfg(feature = "perf-literal")]
432            MatchType::Literal(ty) => {
433                self.find_literals(ty, text, start).map(|(_, e)| e)
434            }
435            #[cfg(feature = "perf-dfa")]
436            MatchType::Dfa | MatchType::DfaMany => {
437                match self.shortest_dfa(text, start) {
438                    dfa::Result::Match(end) => Some(end),
439                    dfa::Result::NoMatch(_) => None,
440                    dfa::Result::Quit => self.shortest_nfa(text, start),
441                }
442            }
443            #[cfg(feature = "perf-dfa")]
444            MatchType::DfaAnchoredReverse => {
445                match dfa::Fsm::reverse(
446                    &self.ro.dfa_reverse,
447                    self.cache.value(),
448                    true,
449                    &text[start..],
450                    text.len(),
451                ) {
452                    dfa::Result::Match(_) => Some(text.len()),
453                    dfa::Result::NoMatch(_) => None,
454                    dfa::Result::Quit => self.shortest_nfa(text, start),
455                }
456            }
457            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
458            MatchType::DfaSuffix => {
459                match self.shortest_dfa_reverse_suffix(text, start) {
460                    dfa::Result::Match(e) => Some(e),
461                    dfa::Result::NoMatch(_) => None,
462                    dfa::Result::Quit => self.shortest_nfa(text, start),
463                }
464            }
465            MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
466            MatchType::Nothing => None,
467        }
468    }
469
470    /// Returns true if and only if the regex matches text.
471    ///
472    /// For single regular expressions, this is equivalent to calling
473    /// shortest_match(...).is_some().
474    #[cfg_attr(feature = "perf-inline", inline(always))]
475    fn is_match_at(&self, text: &[u8], start: usize) -> bool {
476        if !self.is_anchor_end_match(text) {
477            return false;
478        }
479        // We need to do this dance because shortest_match relies on the NFA
480        // filling in captures[1], but a RegexSet has no captures. In other
481        // words, a RegexSet can't (currently) use shortest_match. ---AG
482        match self.ro.match_type {
483            #[cfg(feature = "perf-literal")]
484            MatchType::Literal(ty) => {
485                self.find_literals(ty, text, start).is_some()
486            }
487            #[cfg(feature = "perf-dfa")]
488            MatchType::Dfa | MatchType::DfaMany => {
489                match self.shortest_dfa(text, start) {
490                    dfa::Result::Match(_) => true,
491                    dfa::Result::NoMatch(_) => false,
492                    dfa::Result::Quit => self.match_nfa(text, start),
493                }
494            }
495            #[cfg(feature = "perf-dfa")]
496            MatchType::DfaAnchoredReverse => {
497                match dfa::Fsm::reverse(
498                    &self.ro.dfa_reverse,
499                    self.cache.value(),
500                    true,
501                    &text[start..],
502                    text.len(),
503                ) {
504                    dfa::Result::Match(_) => true,
505                    dfa::Result::NoMatch(_) => false,
506                    dfa::Result::Quit => self.match_nfa(text, start),
507                }
508            }
509            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
510            MatchType::DfaSuffix => {
511                match self.shortest_dfa_reverse_suffix(text, start) {
512                    dfa::Result::Match(_) => true,
513                    dfa::Result::NoMatch(_) => false,
514                    dfa::Result::Quit => self.match_nfa(text, start),
515                }
516            }
517            MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
518            MatchType::Nothing => false,
519        }
520    }
521
522    /// Finds the start and end location of the leftmost-first match, starting
523    /// at the given location.
524    #[cfg_attr(feature = "perf-inline", inline(always))]
525    fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
526        if !self.is_anchor_end_match(text) {
527            return None;
528        }
529        match self.ro.match_type {
530            #[cfg(feature = "perf-literal")]
531            MatchType::Literal(ty) => self.find_literals(ty, text, start),
532            #[cfg(feature = "perf-dfa")]
533            MatchType::Dfa => match self.find_dfa_forward(text, start) {
534                dfa::Result::Match((s, e)) => Some((s, e)),
535                dfa::Result::NoMatch(_) => None,
536                dfa::Result::Quit => {
537                    self.find_nfa(MatchNfaType::Auto, text, start)
538                }
539            },
540            #[cfg(feature = "perf-dfa")]
541            MatchType::DfaAnchoredReverse => {
542                match self.find_dfa_anchored_reverse(text, start) {
543                    dfa::Result::Match((s, e)) => Some((s, e)),
544                    dfa::Result::NoMatch(_) => None,
545                    dfa::Result::Quit => {
546                        self.find_nfa(MatchNfaType::Auto, text, start)
547                    }
548                }
549            }
550            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
551            MatchType::DfaSuffix => {
552                match self.find_dfa_reverse_suffix(text, start) {
553                    dfa::Result::Match((s, e)) => Some((s, e)),
554                    dfa::Result::NoMatch(_) => None,
555                    dfa::Result::Quit => {
556                        self.find_nfa(MatchNfaType::Auto, text, start)
557                    }
558                }
559            }
560            MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
561            MatchType::Nothing => None,
562            #[cfg(feature = "perf-dfa")]
563            MatchType::DfaMany => {
564                unreachable!("BUG: RegexSet cannot be used with find")
565            }
566        }
567    }
568
569    /// Finds the start and end location of the leftmost-first match and also
570    /// fills in all matching capture groups.
571    ///
572    /// The number of capture slots given should be equal to the total number
573    /// of capture slots in the compiled program.
574    ///
575    /// Note that the first two slots always correspond to the start and end
576    /// locations of the overall match.
577    fn captures_read_at(
578        &self,
579        locs: &mut Locations,
580        text: &[u8],
581        start: usize,
582    ) -> Option<(usize, usize)> {
583        let slots = locs.as_slots();
584        for slot in slots.iter_mut() {
585            *slot = None;
586        }
587        // If the caller unnecessarily uses this, then we try to save them
588        // from themselves.
589        match slots.len() {
590            0 => return self.find_at(text, start),
591            2 => {
592                return self.find_at(text, start).map(|(s, e)| {
593                    slots[0] = Some(s);
594                    slots[1] = Some(e);
595                    (s, e)
596                });
597            }
598            _ => {} // fallthrough
599        }
600        if !self.is_anchor_end_match(text) {
601            return None;
602        }
603        match self.ro.match_type {
604            #[cfg(feature = "perf-literal")]
605            MatchType::Literal(ty) => {
606                self.find_literals(ty, text, start).and_then(|(s, e)| {
607                    self.captures_nfa_type(
608                        MatchNfaType::Auto,
609                        slots,
610                        text,
611                        s,
612                        e,
613                    )
614                })
615            }
616            #[cfg(feature = "perf-dfa")]
617            MatchType::Dfa => {
618                if self.ro.nfa.is_anchored_start {
619                    self.captures_nfa(slots, text, start)
620                } else {
621                    match self.find_dfa_forward(text, start) {
622                        dfa::Result::Match((s, e)) => self.captures_nfa_type(
623                            MatchNfaType::Auto,
624                            slots,
625                            text,
626                            s,
627                            e,
628                        ),
629                        dfa::Result::NoMatch(_) => None,
630                        dfa::Result::Quit => {
631                            self.captures_nfa(slots, text, start)
632                        }
633                    }
634                }
635            }
636            #[cfg(feature = "perf-dfa")]
637            MatchType::DfaAnchoredReverse => {
638                match self.find_dfa_anchored_reverse(text, start) {
639                    dfa::Result::Match((s, e)) => self.captures_nfa_type(
640                        MatchNfaType::Auto,
641                        slots,
642                        text,
643                        s,
644                        e,
645                    ),
646                    dfa::Result::NoMatch(_) => None,
647                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
648                }
649            }
650            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
651            MatchType::DfaSuffix => {
652                match self.find_dfa_reverse_suffix(text, start) {
653                    dfa::Result::Match((s, e)) => self.captures_nfa_type(
654                        MatchNfaType::Auto,
655                        slots,
656                        text,
657                        s,
658                        e,
659                    ),
660                    dfa::Result::NoMatch(_) => None,
661                    dfa::Result::Quit => self.captures_nfa(slots, text, start),
662                }
663            }
664            MatchType::Nfa(ty) => {
665                self.captures_nfa_type(ty, slots, text, start, text.len())
666            }
667            MatchType::Nothing => None,
668            #[cfg(feature = "perf-dfa")]
669            MatchType::DfaMany => {
670                unreachable!("BUG: RegexSet cannot be used with captures")
671            }
672        }
673    }
674}
675
676impl<'c> ExecNoSync<'c> {
677    /// Finds the leftmost-first match using only literal search.
678    #[cfg(feature = "perf-literal")]
679    #[cfg_attr(feature = "perf-inline", inline(always))]
680    fn find_literals(
681        &self,
682        ty: MatchLiteralType,
683        text: &[u8],
684        start: usize,
685    ) -> Option<(usize, usize)> {
686        use self::MatchLiteralType::*;
687        match ty {
688            Unanchored => {
689                let lits = &self.ro.nfa.prefixes;
690                lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
691            }
692            AnchoredStart => {
693                let lits = &self.ro.nfa.prefixes;
694                if !self.ro.nfa.is_anchored_start
695                    || (self.ro.nfa.is_anchored_start && start == 0)
696                {
697                    lits.find_start(&text[start..])
698                        .map(|(s, e)| (start + s, start + e))
699                } else {
700                    None
701                }
702            }
703            AnchoredEnd => {
704                let lits = &self.ro.suffixes;
705                lits.find_end(&text[start..])
706                    .map(|(s, e)| (start + s, start + e))
707            }
708            AhoCorasick => self
709                .ro
710                .ac
711                .as_ref()
712                .unwrap()
713                .find(&text[start..])
714                .map(|m| (start + m.start(), start + m.end())),
715        }
716    }
717
718    /// Finds the leftmost-first match (start and end) using only the DFA.
719    ///
720    /// If the result returned indicates that the DFA quit, then another
721    /// matching engine should be used.
722    #[cfg(feature = "perf-dfa")]
723    #[cfg_attr(feature = "perf-inline", inline(always))]
724    fn find_dfa_forward(
725        &self,
726        text: &[u8],
727        start: usize,
728    ) -> dfa::Result<(usize, usize)> {
729        use dfa::Result::*;
730        let end = match dfa::Fsm::forward(
731            &self.ro.dfa,
732            self.cache.value(),
733            false,
734            text,
735            start,
736        ) {
737            NoMatch(i) => return NoMatch(i),
738            Quit => return Quit,
739            Match(end) if start == end => return Match((start, start)),
740            Match(end) => end,
741        };
742        // Now run the DFA in reverse to find the start of the match.
743        match dfa::Fsm::reverse(
744            &self.ro.dfa_reverse,
745            self.cache.value(),
746            false,
747            &text[start..],
748            end - start,
749        ) {
750            Match(s) => Match((start + s, end)),
751            NoMatch(i) => NoMatch(i),
752            Quit => Quit,
753        }
754    }
755
756    /// Finds the leftmost-first match (start and end) using only the DFA,
757    /// but assumes the regex is anchored at the end and therefore starts at
758    /// the end of the regex and matches in reverse.
759    ///
760    /// If the result returned indicates that the DFA quit, then another
761    /// matching engine should be used.
762    #[cfg(feature = "perf-dfa")]
763    #[cfg_attr(feature = "perf-inline", inline(always))]
764    fn find_dfa_anchored_reverse(
765        &self,
766        text: &[u8],
767        start: usize,
768    ) -> dfa::Result<(usize, usize)> {
769        use dfa::Result::*;
770        match dfa::Fsm::reverse(
771            &self.ro.dfa_reverse,
772            self.cache.value(),
773            false,
774            &text[start..],
775            text.len() - start,
776        ) {
777            Match(s) => Match((start + s, text.len())),
778            NoMatch(i) => NoMatch(i),
779            Quit => Quit,
780        }
781    }
782
783    /// Finds the end of the shortest match using only the DFA.
784    #[cfg(feature = "perf-dfa")]
785    #[cfg_attr(feature = "perf-inline", inline(always))]
786    fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
787        dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
788    }
789
790    /// Finds the end of the shortest match using only the DFA by scanning for
791    /// suffix literals.
792    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
793    #[cfg_attr(feature = "perf-inline", inline(always))]
794    fn shortest_dfa_reverse_suffix(
795        &self,
796        text: &[u8],
797        start: usize,
798    ) -> dfa::Result<usize> {
799        match self.exec_dfa_reverse_suffix(text, start) {
800            None => self.shortest_dfa(text, start),
801            Some(r) => r.map(|(_, end)| end),
802        }
803    }
804
805    /// Finds the end of the shortest match using only the DFA by scanning for
806    /// suffix literals. It also reports the start of the match.
807    ///
808    /// Note that if None is returned, then the optimization gave up to avoid
809    /// worst case quadratic behavior. A forward scanning DFA should be tried
810    /// next.
811    ///
812    /// If a match is returned and the full leftmost-first match is desired,
813    /// then a forward scan starting from the beginning of the match must be
814    /// done.
815    ///
816    /// If the result returned indicates that the DFA quit, then another
817    /// matching engine should be used.
818    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
819    #[cfg_attr(feature = "perf-inline", inline(always))]
820    fn exec_dfa_reverse_suffix(
821        &self,
822        text: &[u8],
823        original_start: usize,
824    ) -> Option<dfa::Result<(usize, usize)>> {
825        use dfa::Result::*;
826
827        let lcs = self.ro.suffixes.lcs();
828        debug_assert!(lcs.len() >= 1);
829        let mut start = original_start;
830        let mut end = start;
831        let mut last_literal = start;
832        while end <= text.len() {
833            last_literal += match lcs.find(&text[last_literal..]) {
834                None => return Some(NoMatch(text.len())),
835                Some(i) => i,
836            };
837            end = last_literal + lcs.len();
838            match dfa::Fsm::reverse(
839                &self.ro.dfa_reverse,
840                self.cache.value(),
841                false,
842                &text[start..end],
843                end - start,
844            ) {
845                Match(0) | NoMatch(0) => return None,
846                Match(i) => return Some(Match((start + i, end))),
847                NoMatch(i) => {
848                    start += i;
849                    last_literal += 1;
850                    continue;
851                }
852                Quit => return Some(Quit),
853            };
854        }
855        Some(NoMatch(text.len()))
856    }
857
858    /// Finds the leftmost-first match (start and end) using only the DFA
859    /// by scanning for suffix literals.
860    ///
861    /// If the result returned indicates that the DFA quit, then another
862    /// matching engine should be used.
863    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
864    #[cfg_attr(feature = "perf-inline", inline(always))]
865    fn find_dfa_reverse_suffix(
866        &self,
867        text: &[u8],
868        start: usize,
869    ) -> dfa::Result<(usize, usize)> {
870        use dfa::Result::*;
871
872        let match_start = match self.exec_dfa_reverse_suffix(text, start) {
873            None => return self.find_dfa_forward(text, start),
874            Some(Match((start, _))) => start,
875            Some(r) => return r,
876        };
877        // At this point, we've found a match. The only way to quit now
878        // without a match is if the DFA gives up (seems unlikely).
879        //
880        // Now run the DFA forwards to find the proper end of the match.
881        // (The suffix literal match can only indicate the earliest
882        // possible end location, which may appear before the end of the
883        // leftmost-first match.)
884        match dfa::Fsm::forward(
885            &self.ro.dfa,
886            self.cache.value(),
887            false,
888            text,
889            match_start,
890        ) {
891            NoMatch(_) => panic!("BUG: reverse match implies forward match"),
892            Quit => Quit,
893            Match(e) => Match((match_start, e)),
894        }
895    }
896
897    /// Executes the NFA engine to return whether there is a match or not.
898    ///
899    /// Ideally, we could use shortest_nfa(...).is_some() and get the same
900    /// performance characteristics, but regex sets don't have captures, which
901    /// shortest_nfa depends on.
902    #[cfg(feature = "perf-dfa")]
903    fn match_nfa(&self, text: &[u8], start: usize) -> bool {
904        self.match_nfa_type(MatchNfaType::Auto, text, start)
905    }
906
907    /// Like match_nfa, but allows specification of the type of NFA engine.
908    fn match_nfa_type(
909        &self,
910        ty: MatchNfaType,
911        text: &[u8],
912        start: usize,
913    ) -> bool {
914        self.exec_nfa(
915            ty,
916            &mut [false],
917            &mut [],
918            true,
919            false,
920            text,
921            start,
922            text.len(),
923        )
924    }
925
926    /// Finds the shortest match using an NFA.
927    #[cfg(feature = "perf-dfa")]
928    fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
929        self.shortest_nfa_type(MatchNfaType::Auto, text, start)
930    }
931
932    /// Like shortest_nfa, but allows specification of the type of NFA engine.
933    fn shortest_nfa_type(
934        &self,
935        ty: MatchNfaType,
936        text: &[u8],
937        start: usize,
938    ) -> Option<usize> {
939        let mut slots = [None, None];
940        if self.exec_nfa(
941            ty,
942            &mut [false],
943            &mut slots,
944            true,
945            true,
946            text,
947            start,
948            text.len(),
949        ) {
950            slots[1]
951        } else {
952            None
953        }
954    }
955
956    /// Like find, but executes an NFA engine.
957    fn find_nfa(
958        &self,
959        ty: MatchNfaType,
960        text: &[u8],
961        start: usize,
962    ) -> Option<(usize, usize)> {
963        let mut slots = [None, None];
964        if self.exec_nfa(
965            ty,
966            &mut [false],
967            &mut slots,
968            false,
969            false,
970            text,
971            start,
972            text.len(),
973        ) {
974            match (slots[0], slots[1]) {
975                (Some(s), Some(e)) => Some((s, e)),
976                _ => None,
977            }
978        } else {
979            None
980        }
981    }
982
983    /// Like find_nfa, but fills in captures.
984    ///
985    /// `slots` should have length equal to `2 * nfa.captures.len()`.
986    #[cfg(feature = "perf-dfa")]
987    fn captures_nfa(
988        &self,
989        slots: &mut [Slot],
990        text: &[u8],
991        start: usize,
992    ) -> Option<(usize, usize)> {
993        self.captures_nfa_type(
994            MatchNfaType::Auto,
995            slots,
996            text,
997            start,
998            text.len(),
999        )
1000    }
1001
1002    /// Like captures_nfa, but allows specification of type of NFA engine.
1003    fn captures_nfa_type(
1004        &self,
1005        ty: MatchNfaType,
1006        slots: &mut [Slot],
1007        text: &[u8],
1008        start: usize,
1009        end: usize,
1010    ) -> Option<(usize, usize)> {
1011        if self.exec_nfa(
1012            ty,
1013            &mut [false],
1014            slots,
1015            false,
1016            false,
1017            text,
1018            start,
1019            end,
1020        ) {
1021            match (slots[0], slots[1]) {
1022                (Some(s), Some(e)) => Some((s, e)),
1023                _ => None,
1024            }
1025        } else {
1026            None
1027        }
1028    }
1029
1030    fn exec_nfa(
1031        &self,
1032        mut ty: MatchNfaType,
1033        matches: &mut [bool],
1034        slots: &mut [Slot],
1035        quit_after_match: bool,
1036        quit_after_match_with_pos: bool,
1037        text: &[u8],
1038        start: usize,
1039        end: usize,
1040    ) -> bool {
1041        use self::MatchNfaType::*;
1042        if let Auto = ty {
1043            if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
1044                ty = Backtrack;
1045            } else {
1046                ty = PikeVM;
1047            }
1048        }
1049        // The backtracker can't return the shortest match position as it is
1050        // implemented today. So if someone calls `shortest_match` and we need
1051        // to run an NFA, then use the PikeVM.
1052        if quit_after_match_with_pos || ty == PikeVM {
1053            self.exec_pikevm(
1054                matches,
1055                slots,
1056                quit_after_match,
1057                text,
1058                start,
1059                end,
1060            )
1061        } else {
1062            self.exec_backtrack(matches, slots, text, start, end)
1063        }
1064    }
1065
1066    /// Always run the NFA algorithm.
1067    fn exec_pikevm(
1068        &self,
1069        matches: &mut [bool],
1070        slots: &mut [Slot],
1071        quit_after_match: bool,
1072        text: &[u8],
1073        start: usize,
1074        end: usize,
1075    ) -> bool {
1076        if self.ro.nfa.uses_bytes() {
1077            pikevm::Fsm::exec(
1078                &self.ro.nfa,
1079                self.cache.value(),
1080                matches,
1081                slots,
1082                quit_after_match,
1083                ByteInput::new(text, self.ro.nfa.only_utf8),
1084                start,
1085                end,
1086            )
1087        } else {
1088            pikevm::Fsm::exec(
1089                &self.ro.nfa,
1090                self.cache.value(),
1091                matches,
1092                slots,
1093                quit_after_match,
1094                CharInput::new(text),
1095                start,
1096                end,
1097            )
1098        }
1099    }
1100
1101    /// Always runs the NFA using bounded backtracking.
1102    fn exec_backtrack(
1103        &self,
1104        matches: &mut [bool],
1105        slots: &mut [Slot],
1106        text: &[u8],
1107        start: usize,
1108        end: usize,
1109    ) -> bool {
1110        if self.ro.nfa.uses_bytes() {
1111            backtrack::Bounded::exec(
1112                &self.ro.nfa,
1113                self.cache.value(),
1114                matches,
1115                slots,
1116                ByteInput::new(text, self.ro.nfa.only_utf8),
1117                start,
1118                end,
1119            )
1120        } else {
1121            backtrack::Bounded::exec(
1122                &self.ro.nfa,
1123                self.cache.value(),
1124                matches,
1125                slots,
1126                CharInput::new(text),
1127                start,
1128                end,
1129            )
1130        }
1131    }
1132
1133    /// Finds which regular expressions match the given text.
1134    ///
1135    /// `matches` should have length equal to the number of regexes being
1136    /// searched.
1137    ///
1138    /// This is only useful when one wants to know which regexes in a set
1139    /// match some text.
1140    pub fn many_matches_at(
1141        &self,
1142        matches: &mut [bool],
1143        text: &[u8],
1144        start: usize,
1145    ) -> bool {
1146        use self::MatchType::*;
1147        if !self.is_anchor_end_match(text) {
1148            return false;
1149        }
1150        match self.ro.match_type {
1151            #[cfg(feature = "perf-literal")]
1152            Literal(ty) => {
1153                debug_assert_eq!(matches.len(), 1);
1154                matches[0] = self.find_literals(ty, text, start).is_some();
1155                matches[0]
1156            }
1157            #[cfg(feature = "perf-dfa")]
1158            Dfa | DfaAnchoredReverse | DfaMany => {
1159                match dfa::Fsm::forward_many(
1160                    &self.ro.dfa,
1161                    self.cache.value(),
1162                    matches,
1163                    text,
1164                    start,
1165                ) {
1166                    dfa::Result::Match(_) => true,
1167                    dfa::Result::NoMatch(_) => false,
1168                    dfa::Result::Quit => self.exec_nfa(
1169                        MatchNfaType::Auto,
1170                        matches,
1171                        &mut [],
1172                        false,
1173                        false,
1174                        text,
1175                        start,
1176                        text.len(),
1177                    ),
1178                }
1179            }
1180            #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1181            DfaSuffix => {
1182                match dfa::Fsm::forward_many(
1183                    &self.ro.dfa,
1184                    self.cache.value(),
1185                    matches,
1186                    text,
1187                    start,
1188                ) {
1189                    dfa::Result::Match(_) => true,
1190                    dfa::Result::NoMatch(_) => false,
1191                    dfa::Result::Quit => self.exec_nfa(
1192                        MatchNfaType::Auto,
1193                        matches,
1194                        &mut [],
1195                        false,
1196                        false,
1197                        text,
1198                        start,
1199                        text.len(),
1200                    ),
1201                }
1202            }
1203            Nfa(ty) => self.exec_nfa(
1204                ty,
1205                matches,
1206                &mut [],
1207                false,
1208                false,
1209                text,
1210                start,
1211                text.len(),
1212            ),
1213            Nothing => false,
1214        }
1215    }
1216
1217    #[cfg_attr(feature = "perf-inline", inline(always))]
1218    fn is_anchor_end_match(&self, text: &[u8]) -> bool {
1219        #[cfg(not(feature = "perf-literal"))]
1220        fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
1221            true
1222        }
1223
1224        #[cfg(feature = "perf-literal")]
1225        fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
1226            // Only do this check if the haystack is big (>1MB).
1227            if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
1228                let lcs = ro.suffixes.lcs();
1229                if lcs.len() >= 1 && !lcs.is_suffix(text) {
1230                    return false;
1231                }
1232            }
1233            true
1234        }
1235
1236        imp(&self.ro, text)
1237    }
1238
1239    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1240        &self.ro.nfa.capture_name_idx
1241    }
1242}
1243
1244impl<'c> ExecNoSyncStr<'c> {
1245    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1246        self.0.capture_name_idx()
1247    }
1248}
1249
1250impl Exec {
1251    /// Get a searcher that isn't Sync.
1252    #[cfg_attr(feature = "perf-inline", inline(always))]
1253    pub fn searcher(&self) -> ExecNoSync {
1254        let create = || RefCell::new(ProgramCacheInner::new(&self.ro));
1255        ExecNoSync {
1256            ro: &self.ro, // a clone is too expensive here! (and not needed)
1257            cache: self.cache.get_or(create),
1258        }
1259    }
1260
1261    /// Get a searcher that isn't Sync and can match on &str.
1262    #[cfg_attr(feature = "perf-inline", inline(always))]
1263    pub fn searcher_str(&self) -> ExecNoSyncStr {
1264        ExecNoSyncStr(self.searcher())
1265    }
1266
1267    /// Build a Regex from this executor.
1268    pub fn into_regex(self) -> re_unicode::Regex {
1269        re_unicode::Regex::from(self)
1270    }
1271
1272    /// Build a RegexSet from this executor.
1273    pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
1274        re_set::unicode::RegexSet::from(self)
1275    }
1276
1277    /// Build a Regex from this executor that can match arbitrary bytes.
1278    pub fn into_byte_regex(self) -> re_bytes::Regex {
1279        re_bytes::Regex::from(self)
1280    }
1281
1282    /// Build a RegexSet from this executor that can match arbitrary bytes.
1283    pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
1284        re_set::bytes::RegexSet::from(self)
1285    }
1286
1287    /// The original regular expressions given by the caller that were
1288    /// compiled.
1289    pub fn regex_strings(&self) -> &[String] {
1290        &self.ro.res
1291    }
1292
1293    /// Return a slice of capture names.
1294    ///
1295    /// Any capture that isn't named is None.
1296    pub fn capture_names(&self) -> &[Option<String>] {
1297        &self.ro.nfa.captures
1298    }
1299
1300    /// Return a reference to named groups mapping (from group name to
1301    /// group position).
1302    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
1303        &self.ro.nfa.capture_name_idx
1304    }
1305}
1306
1307impl Clone for Exec {
1308    fn clone(&self) -> Exec {
1309        Exec { ro: self.ro.clone(), cache: Cached::new() }
1310    }
1311}
1312
1313impl ExecReadOnly {
1314    fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
1315        if let Some(MatchType::Nfa(_)) = hint {
1316            return hint.unwrap();
1317        }
1318        // If the NFA is empty, then we'll never match anything.
1319        if self.nfa.insts.is_empty() {
1320            return MatchType::Nothing;
1321        }
1322        if let Some(literalty) = self.choose_literal_match_type() {
1323            return literalty;
1324        }
1325        if let Some(dfaty) = self.choose_dfa_match_type() {
1326            return dfaty;
1327        }
1328        // We're so totally hosed.
1329        MatchType::Nfa(MatchNfaType::Auto)
1330    }
1331
1332    /// If a plain literal scan can be used, then a corresponding literal
1333    /// search type is returned.
1334    fn choose_literal_match_type(&self) -> Option<MatchType> {
1335        #[cfg(not(feature = "perf-literal"))]
1336        fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1337            None
1338        }
1339
1340        #[cfg(feature = "perf-literal")]
1341        fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1342            // If our set of prefixes is complete, then we can use it to find
1343            // a match in lieu of a regex engine. This doesn't quite work well
1344            // in the presence of multiple regexes, so only do it when there's
1345            // one.
1346            //
1347            // TODO(burntsushi): Also, don't try to match literals if the regex
1348            // is partially anchored. We could technically do it, but we'd need
1349            // to create two sets of literals: all of them and then the subset
1350            // that aren't anchored. We would then only search for all of them
1351            // when at the beginning of the input and use the subset in all
1352            // other cases.
1353            if ro.res.len() != 1 {
1354                return None;
1355            }
1356            if ro.ac.is_some() {
1357                return Some(MatchType::Literal(
1358                    MatchLiteralType::AhoCorasick,
1359                ));
1360            }
1361            if ro.nfa.prefixes.complete() {
1362                return if ro.nfa.is_anchored_start {
1363                    Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
1364                } else {
1365                    Some(MatchType::Literal(MatchLiteralType::Unanchored))
1366                };
1367            }
1368            if ro.suffixes.complete() {
1369                return if ro.nfa.is_anchored_end {
1370                    Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
1371                } else {
1372                    // This case shouldn't happen. When the regex isn't
1373                    // anchored, then complete prefixes should imply complete
1374                    // suffixes.
1375                    Some(MatchType::Literal(MatchLiteralType::Unanchored))
1376                };
1377            }
1378            None
1379        }
1380
1381        imp(self)
1382    }
1383
1384    /// If a DFA scan can be used, then choose the appropriate DFA strategy.
1385    fn choose_dfa_match_type(&self) -> Option<MatchType> {
1386        #[cfg(not(feature = "perf-dfa"))]
1387        fn imp(_: &ExecReadOnly) -> Option<MatchType> {
1388            None
1389        }
1390
1391        #[cfg(feature = "perf-dfa")]
1392        fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
1393            if !dfa::can_exec(&ro.dfa) {
1394                return None;
1395            }
1396            // Regex sets require a slightly specialized path.
1397            if ro.res.len() >= 2 {
1398                return Some(MatchType::DfaMany);
1399            }
1400            // If the regex is anchored at the end but not the start, then
1401            // just match in reverse from the end of the haystack.
1402            if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
1403                return Some(MatchType::DfaAnchoredReverse);
1404            }
1405            #[cfg(feature = "perf-literal")]
1406            {
1407                // If there's a longish suffix literal, then it might be faster
1408                // to look for that first.
1409                if ro.should_suffix_scan() {
1410                    return Some(MatchType::DfaSuffix);
1411                }
1412            }
1413            // Fall back to your garden variety forward searching lazy DFA.
1414            Some(MatchType::Dfa)
1415        }
1416
1417        imp(self)
1418    }
1419
1420    /// Returns true if the program is amenable to suffix scanning.
1421    ///
1422    /// When this is true, as a heuristic, we assume it is OK to quickly scan
1423    /// for suffix literals and then do a *reverse* DFA match from any matches
1424    /// produced by the literal scan. (And then followed by a forward DFA
1425    /// search, since the previously found suffix literal maybe not actually be
1426    /// the end of a match.)
1427    ///
1428    /// This is a bit of a specialized optimization, but can result in pretty
1429    /// big performance wins if 1) there are no prefix literals and 2) the
1430    /// suffix literals are pretty rare in the text. (1) is obviously easy to
1431    /// account for but (2) is harder. As a proxy, we assume that longer
1432    /// strings are generally rarer, so we only enable this optimization when
1433    /// we have a meaty suffix.
1434    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1435    fn should_suffix_scan(&self) -> bool {
1436        if self.suffixes.is_empty() {
1437            return false;
1438        }
1439        let lcs_len = self.suffixes.lcs().char_len();
1440        lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
1441    }
1442}
1443
1444#[derive(Clone, Copy, Debug)]
1445enum MatchType {
1446    /// A single or multiple literal search. This is only used when the regex
1447    /// can be decomposed into a literal search.
1448    #[cfg(feature = "perf-literal")]
1449    Literal(MatchLiteralType),
1450    /// A normal DFA search.
1451    #[cfg(feature = "perf-dfa")]
1452    Dfa,
1453    /// A reverse DFA search starting from the end of a haystack.
1454    #[cfg(feature = "perf-dfa")]
1455    DfaAnchoredReverse,
1456    /// A reverse DFA search with suffix literal scanning.
1457    #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
1458    DfaSuffix,
1459    /// Use the DFA on two or more regular expressions.
1460    #[cfg(feature = "perf-dfa")]
1461    DfaMany,
1462    /// An NFA variant.
1463    Nfa(MatchNfaType),
1464    /// No match is ever possible, so don't ever try to search.
1465    Nothing,
1466}
1467
1468#[derive(Clone, Copy, Debug)]
1469#[cfg(feature = "perf-literal")]
1470enum MatchLiteralType {
1471    /// Match literals anywhere in text.
1472    Unanchored,
1473    /// Match literals only at the start of text.
1474    AnchoredStart,
1475    /// Match literals only at the end of text.
1476    AnchoredEnd,
1477    /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1478    /// ExecReadOnly.
1479    AhoCorasick,
1480}
1481
1482#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1483enum MatchNfaType {
1484    /// Choose between Backtrack and PikeVM.
1485    Auto,
1486    /// NFA bounded backtracking.
1487    ///
1488    /// (This is only set by tests, since it never makes sense to always want
1489    /// backtracking.)
1490    Backtrack,
1491    /// The Pike VM.
1492    ///
1493    /// (This is only set by tests, since it never makes sense to always want
1494    /// the Pike VM.)
1495    PikeVM,
1496}
1497
1498/// `ProgramCache` maintains reusable allocations for each matching engine
1499/// available to a particular program.
1500pub type ProgramCache = RefCell<ProgramCacheInner>;
1501
1502#[derive(Debug)]
1503pub struct ProgramCacheInner {
1504    pub pikevm: pikevm::Cache,
1505    pub backtrack: backtrack::Cache,
1506    #[cfg(feature = "perf-dfa")]
1507    pub dfa: dfa::Cache,
1508    #[cfg(feature = "perf-dfa")]
1509    pub dfa_reverse: dfa::Cache,
1510}
1511
1512impl ProgramCacheInner {
1513    fn new(ro: &ExecReadOnly) -> Self {
1514        ProgramCacheInner {
1515            pikevm: pikevm::Cache::new(&ro.nfa),
1516            backtrack: backtrack::Cache::new(&ro.nfa),
1517            #[cfg(feature = "perf-dfa")]
1518            dfa: dfa::Cache::new(&ro.dfa),
1519            #[cfg(feature = "perf-dfa")]
1520            dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
1521        }
1522    }
1523}
1524
1525/// Alternation literals checks if the given HIR is a simple alternation of
1526/// literals, and if so, returns them. Otherwise, this returns None.
1527#[cfg(feature = "perf-literal")]
1528fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1529    use syntax::hir::{HirKind, Literal};
1530
1531    // This is pretty hacky, but basically, if `is_alternation_literal` is
1532    // true, then we can make several assumptions about the structure of our
1533    // HIR. This is what justifies the `unreachable!` statements below.
1534    //
1535    // This code should be refactored once we overhaul this crate's
1536    // optimization pipeline, because this is a terribly inflexible way to go
1537    // about things.
1538
1539    if !expr.is_alternation_literal() {
1540        return None;
1541    }
1542    let alts = match *expr.kind() {
1543        HirKind::Alternation(ref alts) => alts,
1544        _ => return None, // one literal isn't worth it
1545    };
1546
1547    let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
1548        Literal::Unicode(c) => {
1549            let mut buf = [0; 4];
1550            dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1551        }
1552        Literal::Byte(b) => {
1553            dst.push(b);
1554        }
1555    };
1556
1557    let mut lits = vec![];
1558    for alt in alts {
1559        let mut lit = vec![];
1560        match *alt.kind() {
1561            HirKind::Literal(ref x) => extendlit(x, &mut lit),
1562            HirKind::Concat(ref exprs) => {
1563                for e in exprs {
1564                    match *e.kind() {
1565                        HirKind::Literal(ref x) => extendlit(x, &mut lit),
1566                        _ => unreachable!("expected literal, got {:?}", e),
1567                    }
1568                }
1569            }
1570            _ => unreachable!("expected literal or concat, got {:?}", alt),
1571        }
1572        lits.push(lit);
1573    }
1574    Some(lits)
1575}
1576
1577#[cfg(test)]
1578mod test {
1579    #[test]
1580    fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
1581        use internal::ExecBuilder;
1582
1583        let backtrack_bytes_re = ExecBuilder::new("^S")
1584            .bounded_backtracking()
1585            .only_utf8(false)
1586            .build()
1587            .map(|exec| exec.into_byte_regex())
1588            .map_err(|err| format!("{}", err))
1589            .unwrap();
1590
1591        let default_bytes_re = ExecBuilder::new("^S")
1592            .only_utf8(false)
1593            .build()
1594            .map(|exec| exec.into_byte_regex())
1595            .map_err(|err| format!("{}", err))
1596            .unwrap();
1597
1598        let input = vec![83, 83];
1599
1600        let s1 = backtrack_bytes_re.split(&input);
1601        let s2 = default_bytes_re.split(&input);
1602        for (chunk1, chunk2) in s1.zip(s2) {
1603            assert_eq!(chunk1, chunk2);
1604        }
1605    }
1606
1607    #[test]
1608    fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
1609        use internal::ExecBuilder;
1610
1611        let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1612            .bounded_backtracking()
1613            .bytes(true)
1614            .build()
1615            .map(|exec| exec.into_regex())
1616            .map_err(|err| format!("{}", err))
1617            .unwrap();
1618
1619        let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
1620            .bytes(true)
1621            .build()
1622            .map(|exec| exec.into_regex())
1623            .map_err(|err| format!("{}", err))
1624            .unwrap();
1625
1626        let input = "**";
1627
1628        let s1 = backtrack_bytes_re.split(input);
1629        let s2 = default_bytes_re.split(input);
1630        for (chunk1, chunk2) in s1.zip(s2) {
1631            assert_eq!(chunk1, chunk2);
1632        }
1633    }
1634}