regex/literal/imp.rs

use std::cmp;
use std::mem;

use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
use memchr::{memchr, memchr2, memchr3};
use syntax::hir::literal::{Literal, Literals};

use freqs::BYTE_FREQUENCIES;

/// A prefix extracted from a compiled regular expression.
///
/// A regex prefix is a set of literal strings that *must* be matched at the
/// beginning of a regex in order for the entire regex to match. Similarly
/// for a regex suffix.
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
    complete: bool,
    lcp: FreqyPacked,
    lcs: FreqyPacked,
    matcher: Matcher,
}

#[derive(Clone, Debug)]
enum Matcher {
    /// No literals. (Never advances through the input.)
    Empty,
    /// A set of four or more single byte literals.
    Bytes(SingleByteSet),
    /// A single substring, found using memchr and frequency analysis.
    FreqyPacked(FreqyPacked),
    /// A single substring, found using Boyer-Moore.
    BoyerMoore(BoyerMooreSearch),
    /// An Aho-Corasick automaton.
    AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
    /// A packed multiple substring searcher, using SIMD.
    ///
    /// Note that Aho-Corasick will actually use this packed searcher
    /// internally and automatically. However, there is some overhead
    /// associated with going through the Aho-Corasick machinery, so using
    /// the packed searcher directly results in some gains.
    Packed { s: packed::Searcher, lits: Vec<Literal> },
}

impl LiteralSearcher {
    /// Returns a matcher that never matches and never advances the input.
    pub fn empty() -> Self {
        Self::new(Literals::empty(), Matcher::Empty)
    }

    /// Returns a matcher for literal prefixes from the given set.
    pub fn prefixes(lits: Literals) -> Self {
        let matcher = Matcher::prefixes(&lits);
        Self::new(lits, matcher)
    }

    /// Returns a matcher for literal suffixes from the given set.
    pub fn suffixes(lits: Literals) -> Self {
        let matcher = Matcher::suffixes(&lits);
        Self::new(lits, matcher)
    }

    fn new(lits: Literals, matcher: Matcher) -> Self {
        let complete = lits.all_complete();
        LiteralSearcher {
            complete: complete,
            lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
            lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
            matcher: matcher,
        }
    }

    /// Returns true if all matches comprise the entire regular expression.
    ///
    /// This does not necessarily mean that a literal match implies a match
    /// of the regular expression. For example, the regular expression `^a`
    /// is comprised of a single complete literal `a`, but the regular
    /// expression demands that it only match at the beginning of a string.
    pub fn complete(&self) -> bool {
        self.complete && !self.is_empty()
    }

    /// Find the position of a literal in `haystack` if it exists.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        use self::Matcher::*;
        match self.matcher {
            Empty => Some((0, 0)),
            Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
            FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            AC { ref ac, .. } => {
                ac.find(haystack).map(|m| (m.start(), m.end()))
            }
            Packed { ref s, .. } => {
                s.find(haystack).map(|m| (m.start(), m.end()))
            }
        }
    }

    /// Like find, except matches must start at index `0`.
    pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[0..lit.len()] {
                return Some((0, lit.len()));
            }
        }
        None
    }

    /// Like find, except matches must end at index `haystack.len()`.
    pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[haystack.len() - lit.len()..] {
                return Some((haystack.len() - lit.len(), haystack.len()));
            }
        }
        None
    }

    /// Returns an iterator over all literals to be matched.
    pub fn iter(&self) -> LiteralIter {
        match self.matcher {
            Matcher::Empty => LiteralIter::Empty,
            Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
            Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
            Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
            Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
            Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
        }
    }

    /// Returns a matcher for the longest common prefix of this matcher.
    pub fn lcp(&self) -> &FreqyPacked {
        &self.lcp
    }

    /// Returns a matcher for the longest common suffix of this matcher.
    pub fn lcs(&self) -> &FreqyPacked {
        &self.lcs
    }

    /// Returns true iff this prefix is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the number of prefixes in this machine.
    pub fn len(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.dense.len(),
            FreqyPacked(_) => 1,
            BoyerMoore(_) => 1,
            AC { ref ac, .. } => ac.pattern_count(),
            Packed { ref lits, .. } => lits.len(),
        }
    }

    /// Return the approximate heap usage of literals in bytes.
    pub fn approximate_size(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.approximate_size(),
            FreqyPacked(ref single) => single.approximate_size(),
            BoyerMoore(ref single) => single.approximate_size(),
            AC { ref ac, .. } => ac.heap_bytes(),
            Packed { ref s, .. } => s.heap_bytes(),
        }
    }
}
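
// A hedged, test-only illustration of the `Matcher::Empty` case above (an
// added example, not part of the original test suite): an empty searcher
// holds no literals, reports a zero-width "match" at offset 0 from `find`,
// and finds nothing via `find_start`/`find_end` since there are no literals
// to anchor against.
#[test]
fn empty_literal_searcher_sketch() {
    let searcher = LiteralSearcher::empty();
    assert!(searcher.is_empty());
    assert_eq!(Some((0, 0)), searcher.find(b"abc"));
    assert_eq!(None, searcher.find_start(b"abc"));
    assert_eq!(None, searcher.find_end(b"abc"));
}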

impl Matcher {
    fn prefixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::prefixes(lits);
        Matcher::new(lits, sset)
    }

    fn suffixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::suffixes(lits);
        Matcher::new(lits, sset)
    }

    fn new(lits: &Literals, sset: SingleByteSet) -> Self {
        if lits.literals().is_empty() {
            return Matcher::Empty;
        }
        if sset.dense.len() >= 26 {
            // Avoid trying to match a large number of single bytes.
            // This is *very* sensitive to a frequency analysis comparison
            // between the bytes in sset and the composition of the haystack.
            // No matter the size of sset, if its members all are rare in the
            // haystack, then it'd be worth using it. How to tune this... IDK.
            // ---AG
            return Matcher::Empty;
        }
        if sset.complete {
            return Matcher::Bytes(sset);
        }
        if lits.literals().len() == 1 {
            let lit = lits.literals()[0].to_vec();
            if BoyerMooreSearch::should_use(lit.as_slice()) {
                return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
            } else {
                return Matcher::FreqyPacked(FreqyPacked::new(lit));
            }
        }

        let pats = lits.literals().to_owned();
        let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
        if lits.literals().len() <= 100 && !is_aho_corasick_fast {
            let mut builder = packed::Config::new()
                .match_kind(packed::MatchKind::LeftmostFirst)
                .builder();
            if let Some(s) = builder.extend(&pats).build() {
                return Matcher::Packed { s, lits: pats };
            }
        }
        let ac = AhoCorasickBuilder::new()
            .match_kind(aho_corasick::MatchKind::LeftmostFirst)
            .dfa(true)
            .build_with_size::<u32, _, _>(&pats)
            .unwrap();
        Matcher::AC { ac, lits: pats }
    }
}

pub enum LiteralIter<'a> {
    Empty,
    Bytes(&'a [u8]),
    Single(&'a [u8]),
    AC(&'a [Literal]),
    Packed(&'a [Literal]),
}

impl<'a> Iterator for LiteralIter<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            LiteralIter::Empty => None,
            LiteralIter::Bytes(ref mut many) => {
                if many.is_empty() {
                    None
                } else {
                    let next = &many[0..1];
                    *many = &many[1..];
                    Some(next)
                }
            }
            LiteralIter::Single(ref mut one) => {
                if one.is_empty() {
                    None
                } else {
                    let next = &one[..];
                    *one = &[];
                    Some(next)
                }
            }
            LiteralIter::AC(ref mut lits) => {
                if lits.is_empty() {
                    None
                } else {
                    let next = &lits[0];
                    *lits = &lits[1..];
                    Some(&**next)
                }
            }
            LiteralIter::Packed(ref mut lits) => {
                if lits.is_empty() {
                    None
                } else {
                    let next = &lits[0];
                    *lits = &lits[1..];
                    Some(&**next)
                }
            }
        }
    }
}

#[derive(Clone, Debug)]
struct SingleByteSet {
    sparse: Vec<bool>,
    dense: Vec<u8>,
    complete: bool,
    all_ascii: bool,
}

impl SingleByteSet {
    fn new() -> SingleByteSet {
        SingleByteSet {
            sparse: vec![false; 256],
            dense: vec![],
            complete: true,
            all_ascii: true,
        }
    }

    fn prefixes(lits: &Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            if let Some(&b) = lit.get(0) {
                if !sset.sparse[b as usize] {
                    if b > 0x7F {
                        sset.all_ascii = false;
                    }
                    sset.dense.push(b);
                    sset.sparse[b as usize] = true;
                }
            }
        }
        sset
    }

    fn suffixes(lits: &Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
                if !sset.sparse[b as usize] {
                    if b > 0x7F {
                        sset.all_ascii = false;
                    }
                    sset.dense.push(b);
                    sset.sparse[b as usize] = true;
                }
            }
        }
        sset
    }

    /// Faster find that special cases certain sizes to use memchr.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn find(&self, text: &[u8]) -> Option<usize> {
        match self.dense.len() {
            0 => None,
            1 => memchr(self.dense[0], text),
            2 => memchr2(self.dense[0], self.dense[1], text),
            3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
            _ => self._find(text),
        }
    }

    /// Generic find that works on any sized set.
    fn _find(&self, haystack: &[u8]) -> Option<usize> {
        for (i, &b) in haystack.iter().enumerate() {
            if self.sparse[b as usize] {
                return Some(i);
            }
        }
        None
    }

    fn approximate_size(&self) -> usize {
        (self.dense.len() * mem::size_of::<u8>())
            + (self.sparse.len() * mem::size_of::<bool>())
    }
}

/// Provides an implementation of fast substring search using frequency
/// analysis.
///
/// memchr is so fast that we do everything we can to keep the loop in memchr
/// for as long as possible. The easiest way to do this is to intelligently
/// pick the byte to send to memchr. The best byte is the byte that occurs
/// least frequently in the haystack. Since doing frequency analysis on the
/// haystack is far too expensive, we compute a set of fixed frequencies up
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
/// scripts/frequencies.py.
#[derive(Clone, Debug)]
pub struct FreqyPacked {
    /// The pattern.
    pat: Vec<u8>,
    /// The number of Unicode characters in the pattern. This is useful for
    /// determining the effective length of a pattern when deciding which
    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
    /// as one character.
    char_len: usize,
    /// The rarest byte in the pattern, according to pre-computed frequency
    /// analysis.
    rare1: u8,
    /// The offset of the rarest byte in `pat`.
    rare1i: usize,
    /// The second rarest byte in the pattern, according to pre-computed
    /// frequency analysis. (This may be equivalent to the rarest byte.)
    ///
    /// The second rarest byte is used as a type of guard for quickly detecting
    /// a mismatch after memchr locates an instance of the rarest byte. This
    /// is a hedge against pathological cases where the pre-computed frequency
    /// analysis may be off. (But of course, does not prevent *all*
    /// pathological cases.)
    rare2: u8,
    /// The offset of the second rarest byte in `pat`.
    rare2i: usize,
}
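
// A hedged, test-only sketch of the construction invariants described above
// (an added example, not part of the original test suite): `rare1`/`rare2`
// are bytes of the pattern chosen via the precomputed frequency table, and
// `rare1i`/`rare2i` point at their last occurrences in `pat`. The assertions
// below only check properties that hold by construction; they do not assume
// anything about which particular bytes the table considers rare.
#[test]
fn freqy_packed_rare_byte_sketch() {
    let fp = FreqyPacked::new(b"pattern".to_vec());
    // The recorded offsets really do point at the recorded rare bytes.
    assert_eq!(fp.rare1, fp.pat[fp.rare1i]);
    assert_eq!(fp.rare2, fp.pat[fp.rare2i]);
    // The rarest byte has a frequency rank no higher than any other byte.
    assert!(fp.pat.iter().all(|&b| freq_rank(fp.rare1) <= freq_rank(b)));
    // `char_len` counts characters, not bytes (all ASCII here, so 7).
    assert_eq!(7, fp.char_len());
}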

impl FreqyPacked {
    fn new(pat: Vec<u8>) -> FreqyPacked {
        if pat.is_empty() {
            return FreqyPacked::empty();
        }

        // Find the rarest two bytes. Try to make them distinct (but it's not
        // required).
        let mut rare1 = pat[0];
        let mut rare2 = pat[0];
        for b in pat[1..].iter().cloned() {
            if freq_rank(b) < freq_rank(rare1) {
                rare1 = b;
            }
        }
        for &b in &pat {
            if rare1 == rare2 {
                rare2 = b
            } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
                rare2 = b;
            }
        }

        // And find the offsets of their last occurrences.
        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();

        let char_len = char_len_lossy(&pat);
        FreqyPacked {
            pat: pat,
            char_len: char_len,
            rare1: rare1,
            rare1i: rare1i,
            rare2: rare2,
            rare2i: rare2i,
        }
    }

    fn empty() -> FreqyPacked {
        FreqyPacked {
            pat: vec![],
            char_len: 0,
            rare1: 0,
            rare1i: 0,
            rare2: 0,
            rare2i: 0,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        let pat = &*self.pat;
        if haystack.len() < pat.len() || pat.is_empty() {
            return None;
        }
        let mut i = self.rare1i;
        while i < haystack.len() {
            i += match memchr(self.rare1, &haystack[i..]) {
                None => return None,
                Some(i) => i,
            };
            let start = i - self.rare1i;
            let end = start + pat.len();
            if end > haystack.len() {
                return None;
            }
            let aligned = &haystack[start..end];
            if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
                return Some(start);
            }
            i += 1;
        }
        None
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn is_suffix(&self, text: &[u8]) -> bool {
        if text.len() < self.len() {
            return false;
        }
        text[text.len() - self.len()..] == *self.pat
    }

    pub fn len(&self) -> usize {
        self.pat.len()
    }

    pub fn char_len(&self) -> usize {
        self.char_len
    }

    fn approximate_size(&self) -> usize {
        self.pat.len() * mem::size_of::<u8>()
    }
}

fn char_len_lossy(bytes: &[u8]) -> usize {
    String::from_utf8_lossy(bytes).chars().count()
}

/// An implementation of Tuned Boyer-Moore as laid out by
/// Andrew Hume and Daniel Sunday in "Fast String Searching".
/// O(n) in the size of the input.
///
/// Fast string searching algorithms come in many variations,
/// but they can generally be described in terms of three main
/// components.
///
/// The skip loop is where the string searcher wants to spend
/// as much time as possible. Exactly which character in the
/// pattern the skip loop examines varies from algorithm to
/// algorithm, but in the simplest case this loop repeatedly
/// looks at the last character in the pattern and jumps
/// forward in the input if it is not in the pattern.
/// Robert Boyer and J Moore called this the "fast" loop in
/// their original paper.
///
/// The match loop is responsible for actually examining the
/// whole potentially matching substring. In order to fail
/// faster, the match loop sometimes has a guard test attached.
/// The guard test uses frequency analysis of the different
/// characters in the pattern to choose the least frequently
/// occurring character and use it to find match failures
/// as quickly as possible.
///
/// The shift rule governs how the algorithm will shuffle its
/// test window in the event of a failure during the match loop.
/// Certain shift rules allow the worst-case run time of the
/// algorithm to be shown to be O(n) in the size of the input
/// rather than O(nm) in the size of the input and the size
/// of the pattern (as naive Boyer-Moore is).
///
/// "Fast String Searching", in addition to presenting a tuned
/// algorithm, provides a comprehensive taxonomy of the many
/// different flavors of string searchers. Under that taxonomy
/// TBM, the algorithm implemented here, uses an unrolled fast
/// skip loop with memchr fallback, a forward match loop with guard,
/// and Sunday's mini delta shift rule. To unpack that you'll have to
/// read the paper.
#[derive(Clone, Debug)]
pub struct BoyerMooreSearch {
    /// The pattern we are going to look for in the haystack.
    pattern: Vec<u8>,

    /// The skip table for the skip loop.
    ///
    /// Maps the character at the end of the input
    /// to a shift.
    skip_table: Vec<usize>,

    /// The guard character (least frequently occurring char).
    guard: u8,
    /// The reverse-index of the guard character in the pattern.
    guard_reverse_idx: usize,

    /// Daniel Sunday's mini generalized delta2 shift table.
    ///
    /// We use a skip loop, so we only have to provide a shift
    /// for the skip char (last char). This is why it is a mini
    /// shift rule.
    md2_shift: usize,
}
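
// A hedged, test-only illustration of the skip table described above (an
// added example; the pattern and expected values are hand-worked from
// `compile_skip_table` below): every byte maps to the distance from its
// rightmost occurrence in the pattern to the last pattern position, bytes
// absent from the pattern map to the full pattern length, and the final
// pattern byte maps to the 0 sentinel that forces the guarded match loop to
// run.
#[test]
fn skip_table_sketch() {
    let tab = BoyerMooreSearch::compile_skip_table(b"abca");
    assert_eq!(4, tab[b'z' as usize]); // absent byte: skip a whole window
    assert_eq!(2, tab[b'b' as usize]); // rightmost 'b' is 2 from the end
    assert_eq!(1, tab[b'c' as usize]); // rightmost 'c' is 1 from the end
    assert_eq!(0, tab[b'a' as usize]); // last pattern byte: run the match loop
}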

impl BoyerMooreSearch {
    /// Create a new string searcher, performing whatever
    /// compilation steps are required.
    fn new(pattern: Vec<u8>) -> Self {
        debug_assert!(pattern.len() > 0);

        let (g, gi) = Self::select_guard(pattern.as_slice());
        let skip_table = Self::compile_skip_table(pattern.as_slice());
        let md2_shift = Self::compile_md2_shift(pattern.as_slice());
        BoyerMooreSearch {
            pattern: pattern,
            skip_table: skip_table,
            guard: g,
            guard_reverse_idx: gi,
            md2_shift: md2_shift,
        }
    }

    /// Find the pattern in `haystack`, returning the offset
    /// of the start of the first occurrence of the pattern
    /// in `haystack`.
    #[inline]
    fn find(&self, haystack: &[u8]) -> Option<usize> {
        if haystack.len() < self.pattern.len() {
            return None;
        }

        let mut window_end = self.pattern.len() - 1;

        // Inspired by the grep source. It is a way
        // to do correct loop unrolling without having to place
        // a crashpad of terminating characters at the end in
        // the way described in the Fast String Searching paper.
        const NUM_UNROLL: usize = 10;
        // 1 for the initial position, and 1 for the md2 shift
        let short_circuit = (NUM_UNROLL + 2) * self.pattern.len();

        if haystack.len() > short_circuit {
            // just 1 for the md2 shift
            let backstop =
                haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
            loop {
                window_end =
                    match self.skip_loop(haystack, window_end, backstop) {
                        Some(i) => i,
                        None => return None,
                    };
                if window_end >= backstop {
                    break;
                }

                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    let skip = self.skip_table[haystack[window_end] as usize];
                    window_end +=
                        if skip == 0 { self.md2_shift } else { skip };
                    continue;
                }
            }
        }

        // now process the input after the backstop
        while window_end < haystack.len() {
            let mut skip = self.skip_table[haystack[window_end] as usize];
            if skip == 0 {
                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    skip = self.md2_shift;
                }
            }
            window_end += skip;
        }

        None
    }

    fn len(&self) -> usize {
        self.pattern.len()
    }

    /// The key heuristic behind which the BoyerMooreSearch lives.
    ///
    /// See `rust-lang/regex/issues/408`.
    ///
    /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
    /// platform-specific memchr routine with a bit of frequency
    /// analysis sprinkled on top actually wins most of the time.
    /// However, there are a few cases where Tuned Boyer-Moore still
    /// wins.
    ///
    /// If the haystack is random, frequency analysis doesn't help us,
    /// so Boyer-Moore will win for sufficiently large needles.
    /// Unfortunately, there is no obvious way to determine this
    /// ahead of time.
    ///
    /// If the pattern itself consists of very common characters,
    /// frequency analysis won't get us anywhere. The most extreme
    /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
    /// this case is wholly determined by the pattern, so we can actually
    /// implement the heuristic.
    ///
    /// A third case is if the pattern is sufficiently long. The idea
    /// here is that once the pattern gets long enough the Tuned
    /// Boyer-Moore skip loop will start making strides long enough
    /// to beat the asm deep magic that is memchr.
    fn should_use(pattern: &[u8]) -> bool {
        // The minimum pattern length required to use TBM.
        const MIN_LEN: usize = 9;
        // The minimum frequency rank (lower is rarer) that every byte in the
        // pattern must have in order to use TBM. That is, if the pattern
        // contains _any_ byte with a lower rank, then TBM won't be used.
        const MIN_CUTOFF: usize = 150;
        // The maximum frequency rank for any byte.
        const MAX_CUTOFF: usize = 255;
        // The scaling factor used to determine the actual cutoff frequency
        // to use (keeping in mind that the minimum frequency rank is bounded
        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
        // likely to be used as the pattern grows longer. That is, longer
        // patterns permit somewhat less frequent bytes than shorter patterns,
        // under the assumption that TBM gets better as the pattern gets
        // longer.
        const LEN_CUTOFF_PROPORTION: usize = 4;

        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
        let cutoff = cmp::max(
            MIN_CUTOFF,
            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
        );
        // The pattern must be long enough to be worthwhile. e.g., memchr will
        // be faster on `e` because it is short even though e is quite common.
        pattern.len() > MIN_LEN
            // all the bytes must be more common than the cutoff.
            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
    }

    /// Check to see if there is a match at the given position
    #[inline]
    fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
        // guard test
        if haystack[window_end - self.guard_reverse_idx] != self.guard {
            return false;
        }

        // match loop
        let window_start = window_end - (self.pattern.len() - 1);
        for i in 0..self.pattern.len() {
            if self.pattern[i] != haystack[window_start + i] {
                return false;
            }
        }

        true
    }

    /// Skip forward according to the shift table.
    ///
    /// Returns the offset of the next occurrence
    /// of the last char in the pattern, or None
    /// if it never reappears. If `skip_loop` hits the backstop
    /// it will leave early.
    #[inline]
    fn skip_loop(
        &self,
        haystack: &[u8],
        mut window_end: usize,
        backstop: usize,
    ) -> Option<usize> {
        let window_end_snapshot = window_end;
        let skip_of = |we: usize| -> usize {
            // Unsafe might make this faster, but the benchmarks
            // were hard to interpret.
            self.skip_table[haystack[we] as usize]
        };

        loop {
            let mut skip = skip_of(window_end);
            window_end += skip;
            skip = skip_of(window_end);
            window_end += skip;
            if skip != 0 {
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                if skip != 0 {
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    if skip != 0 {
                        skip = skip_of(window_end);
                        window_end += skip;
                        skip = skip_of(window_end);
                        window_end += skip;

                        // If ten iterations did not make at least 16 words
                        // worth of progress, we just fall back on memchr.
                        if window_end - window_end_snapshot
                            > 16 * mem::size_of::<usize>()
                        {
                            // Returning a window_end >= backstop will
                            // immediately break us out of the inner loop in
                            // `find`.
                            if window_end >= backstop {
                                return Some(window_end);
                            }

                            continue; // we made enough progress
                        } else {
                            // In case we are already there, and so that
                            // we will catch the guard char.
                            window_end = window_end
                                .checked_sub(1 + self.guard_reverse_idx)
                                .unwrap_or(0);

                            match memchr(self.guard, &haystack[window_end..]) {
                                None => return None,
                                Some(g_idx) => {
                                    return Some(
                                        window_end
                                            + g_idx
                                            + self.guard_reverse_idx,
                                    );
                                }
                            }
                        }
                    }
                }
            }

            return Some(window_end);
        }
    }

    /// Compute the ufast skip table.
    fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
        let mut tab = vec![pattern.len(); 256];

        // For every char in the pattern, we write a skip
        // that will line us up with the rightmost occurrence.
        //
        // N.B. the sentinel (0) is written by the last
        // loop iteration.
        for (i, c) in pattern.iter().enumerate() {
            tab[*c as usize] = (pattern.len() - 1) - i;
        }

        tab
    }

    /// Select the guard character based off of the precomputed
    /// frequency table.
    fn select_guard(pattern: &[u8]) -> (u8, usize) {
        let mut rarest = pattern[0];
        let mut rarest_rev_idx = pattern.len() - 1;
        for (i, c) in pattern.iter().enumerate() {
            if freq_rank(*c) < freq_rank(rarest) {
                rarest = *c;
                rarest_rev_idx = (pattern.len() - 1) - i;
            }
        }

        (rarest, rarest_rev_idx)
    }

    /// If there is another occurrence of the skip
    /// char, shift to it, otherwise just shift to
    /// the next window.
    fn compile_md2_shift(pattern: &[u8]) -> usize {
        let shiftc = *pattern.last().unwrap();

        // For a pattern of length 1 we will never apply the
        // shift rule, so we use a poison value on the principle
        // that failing fast is a good thing.
        if pattern.len() == 1 {
            return 0xDEADBEAF;
        }

        let mut i = pattern.len() - 2;
        while i > 0 {
            if pattern[i] == shiftc {
                return (pattern.len() - 1) - i;
            }
            i -= 1;
        }

        // The skip char never re-occurs in the pattern, so
        // we can just shift the whole window length.
        pattern.len() - 1
    }

    fn approximate_size(&self) -> usize {
        (self.pattern.len() * mem::size_of::<u8>())
            + (256 * mem::size_of::<usize>()) // skip table
    }
}
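
// A hedged, test-only illustration of the `should_use` heuristic above (an
// added example, not part of the original test suite). The negative cases are
// purely length-based and hold regardless of the frequency table; the
// positive case assumes the precomputed table ranks `e` as a very common
// byte, which is the premise of the heuristic's own `eeeeeeeeeeeeeeee`
// example.
#[test]
fn should_use_heuristic_sketch() {
    // Too short to be worthwhile, no matter how common its bytes are.
    assert!(!BoyerMooreSearch::should_use(b"pattern"));
    assert!(!BoyerMooreSearch::should_use(b"e"));
    // Long and made entirely of one very common byte: the situation where
    // frequency analysis fails and Tuned Boyer-Moore is expected to win.
    assert!(BoyerMooreSearch::should_use(&[b'e'; 30]));
}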

fn freq_rank(b: u8) -> usize {
    BYTE_FREQUENCIES[b as usize] as usize
}

#[cfg(test)]
mod tests {
    use super::{BoyerMooreSearch, FreqyPacked};

    //
    // Unit Tests
    //

    // The "hello, world" of string searching
    #[test]
    fn bm_find_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing patterns in this text";
        assert_eq!(14, searcher.find(haystack).unwrap());
    }

    #[test]
    fn bm_find_no_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing needles in this text";
        assert_eq!(None, searcher.find(haystack));
    }
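
    // Added illustrative checks in the same spirit as the Boyer-Moore
    // "hello, world" tests above, exercising the memchr/frequency-analysis
    // path implemented by `FreqyPacked::find` on the same inputs.
    #[test]
    fn freqy_find_subs() {
        let searcher = FreqyPacked::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing patterns in this text";
        assert_eq!(Some(14), searcher.find(haystack));
    }

    #[test]
    fn freqy_find_no_subs() {
        let searcher = FreqyPacked::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing needles in this text";
        assert_eq!(None, searcher.find(haystack));
    }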

    //
    // Regression Tests
    //

    #[test]
    fn bm_skip_reset_bug() {
        let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
        let needle = vec![0, 1, 1, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(4, offset);
    }

    #[test]
    fn bm_backstop_underflow_bug() {
        let haystack = vec![0, 0];
        let needle = vec![0, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(0, offset);
    }

    #[test]
    fn bm_naive_off_by_one_bug() {
        let haystack = vec![91];
        let needle = vec![91];

        let naive_offset = naive_find(&needle, &haystack).unwrap();
        assert_eq!(0, naive_offset);
    }

    #[test]
    fn bm_memchr_fallback_indexing_bug() {
        let mut haystack = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 32, 32, 87];
        let needle_start = haystack.len();
        haystack.extend(needle.clone());

        let searcher = BoyerMooreSearch::new(needle);
        assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
    }

    #[test]
    fn bm_backstop_boundary() {
        let haystack = b"\
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
e_data.clone_created(entity_id, entity_to_add.entity_id);
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
"
        .to_vec();
        let needle = b"clone_created".to_vec();

        let searcher = BoyerMooreSearch::new(needle);
        let result = searcher.find(&haystack);
        assert_eq!(Some(43), result);
    }

    #[test]
    fn bm_win_gnu_indexing_bug() {
        let haystack_raw = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 1, 1, 1];
        let haystack = haystack_raw.as_slice();

        BoyerMooreSearch::new(needle.clone()).find(haystack);
    }

    //
    // QuickCheck Properties
    //

    use quickcheck::TestResult;

    fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
        assert!(needle.len() <= haystack.len());

        for i in 0..(haystack.len() - (needle.len() - 1)) {
            if haystack[i] == needle[0]
                && &haystack[i..(i + needle.len())] == needle
            {
                return Some(i);
            }
        }

        None
    }

    quickcheck! {
        fn qc_bm_equals_naive_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let searcher = BoyerMooreSearch::new(needle.clone());
            TestResult::from_bool(
                searcher.find(haystack) == naive_find(&needle, haystack))
        }

        fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let bm_searcher = BoyerMooreSearch::new(needle.clone());
            let freqy_memchr = FreqyPacked::new(needle);
            TestResult::from_bool(
                bm_searcher.find(haystack) == freqy_memchr.find(haystack))
        }

        fn qc_bm_finds_trailing_needle(
            haystack_pre: Vec<u8>,
            needle: Vec<u8>
        ) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = haystack_pre.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());

            if haystack.len() >= needle.len() &&
                searcher.find(haystack.as_slice()).is_some() {
                return TestResult::discard();
            }

            haystack.extend(needle.clone());

            // What if the tail of the haystack can start the
            // needle?
            let start = haystack_pre.len()
                .checked_sub(needle.len())
                .unwrap_or(0);
            for i in 0..(needle.len() - 1) {
                if searcher.find(&haystack[(i + start)..]).is_some() {
                    return TestResult::discard();
                }
            }

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                        .map(|x| x == haystack_pre.len())
                        .unwrap_or(false))
        }

        // qc_bm_equals_* is only testing the negative case as @burntsushi
        // pointed out in https://github.com/rust-lang/regex/issues/446.
        // This quickcheck prop represents an effort to force testing of
        // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
        // already check some of the positive cases, but they don't cover
        // cases where the needle is in the middle of haystack. This prop
        // fills that hole.
        fn qc_bm_finds_subslice(
            haystack: Vec<u8>,
            needle_start: usize,
            needle_length: usize
        ) -> TestResult {
            if haystack.len() == 0 {
                return TestResult::discard();
            }

            let needle_start = needle_start % haystack.len();
            let needle_length = needle_length % (haystack.len() - needle_start);

            if needle_length == 0 {
                return TestResult::discard();
            }

            let needle = &haystack[needle_start..(needle_start + needle_length)];

            let bm_searcher = BoyerMooreSearch::new(needle.to_vec());

            let start = naive_find(&needle, &haystack);
            match start {
                None => TestResult::from_bool(false),
                Some(nf_start) =>
                    TestResult::from_bool(
                        nf_start <= needle_start
                            && bm_searcher.find(&haystack) == start
                    )
            }
        }

        fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = needle.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());
            haystack.extend(needle);

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                        .map(|x| x == 0)
                        .unwrap_or(false))
        }
    }
}