regex/
backtrack.rs

1// This is the backtracking matching engine. It has the same exact capability
2// as the full NFA simulation, except it is artificially restricted to small
3// regexes on small inputs because of its memory requirements.
4//
5// In particular, this is a *bounded* backtracking engine. It retains worst
6// case linear time by keeping track of the states that it has visited (using a
7// bitmap). Namely, once a state is visited, it is never visited again. Since a
8// state is keyed by `(instruction index, input index)`, we have that its time
9// complexity is `O(mn)` (i.e., linear in the size of the search text).
10//
11// The backtracking engine can beat out the NFA simulation on small
12// regexes/inputs because it doesn't have to keep track of multiple copies of
13// the capture groups. In benchmarks, the backtracking engine is roughly twice
14// as fast as the full NFA simulation. Note though that its performance doesn't
15// scale, even if you're willing to live with the memory requirements. Namely,
16// the bitset has to be zeroed on each execution, which becomes quite expensive
17// on large bitsets.
18
19use exec::ProgramCache;
20use input::{Input, InputAt};
21use prog::{InstPtr, Program};
22use re_trait::Slot;
23
24type Bits = u32;
25
26const BIT_SIZE: usize = 32;
27const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
28
29/// Returns true iff the given regex and input should be executed by this
30/// engine with reasonable memory usage.
31pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
32    // Total memory usage in bytes is determined by:
33    //
34    //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
35    //
36    // The actual limit picked is pretty much a heuristic.
37    // See: https://github.com/rust-lang/regex/issues/215
38    let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
39    size <= MAX_SIZE_BYTES
40}
41
42/// A backtracking matching engine.
43#[derive(Debug)]
44pub struct Bounded<'a, 'm, 'r, 's, I> {
45    prog: &'r Program,
46    input: I,
47    matches: &'m mut [bool],
48    slots: &'s mut [Slot],
49    m: &'a mut Cache,
50}
51
52/// Shared cached state between multiple invocations of a backtracking engine
53/// in the same thread.
54#[derive(Clone, Debug)]
55pub struct Cache {
56    jobs: Vec<Job>,
57    visited: Vec<Bits>,
58}
59
60impl Cache {
61    /// Create new empty cache for the backtracking engine.
62    pub fn new(_prog: &Program) -> Self {
63        Cache { jobs: vec![], visited: vec![] }
64    }
65}
66
67/// A job is an explicit unit of stack space in the backtracking engine.
68///
69/// The "normal" representation is a single state transition, which corresponds
70/// to an NFA state and a character in the input. However, the backtracking
71/// engine must keep track of old capture group values. We use the explicit
72/// stack to do it.
73#[derive(Clone, Copy, Debug)]
74enum Job {
75    Inst { ip: InstPtr, at: InputAt },
76    SaveRestore { slot: usize, old_pos: Option<usize> },
77}
78
79impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
80    /// Execute the backtracking matching engine.
81    ///
82    /// If there's a match, `exec` returns `true` and populates the given
83    /// captures accordingly.
84    pub fn exec(
85        prog: &'r Program,
86        cache: &ProgramCache,
87        matches: &'m mut [bool],
88        slots: &'s mut [Slot],
89        input: I,
90        start: usize,
91        end: usize,
92    ) -> bool {
93        let mut cache = cache.borrow_mut();
94        let cache = &mut cache.backtrack;
95        let start = input.at(start);
96        let mut b = Bounded {
97            prog: prog,
98            input: input,
99            matches: matches,
100            slots: slots,
101            m: cache,
102        };
103        b.exec_(start, end)
104    }
105
106    /// Clears the cache such that the backtracking engine can be executed
107    /// on some input of fixed length.
108    fn clear(&mut self) {
109        // Reset the job memory so that we start fresh.
110        self.m.jobs.clear();
111
112        // Now we need to clear the bit state set.
113        // We do this by figuring out how much space we need to keep track
114        // of the states we've visited.
115        // Then we reset all existing allocated space to 0.
116        // Finally, we request more space if we need it.
117        //
118        // This is all a little circuitous, but doing this unsafely
119        // doesn't seem to have a measurable impact on performance.
120        // (Probably because backtracking is limited to such small
121        // inputs/regexes in the first place.)
122        let visited_len =
123            (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
124                / BIT_SIZE;
125        self.m.visited.truncate(visited_len);
126        for v in &mut self.m.visited {
127            *v = 0;
128        }
129        if visited_len > self.m.visited.len() {
130            let len = self.m.visited.len();
131            self.m.visited.reserve_exact(visited_len - len);
132            for _ in 0..(visited_len - len) {
133                self.m.visited.push(0);
134            }
135        }
136    }
137
138    /// Start backtracking at the given position in the input, but also look
139    /// for literal prefixes.
140    fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
141        self.clear();
142        // If this is an anchored regex at the beginning of the input, then
143        // we're either already done or we only need to try backtracking once.
144        if self.prog.is_anchored_start {
145            return if !at.is_start() { false } else { self.backtrack(at) };
146        }
147        let mut matched = false;
148        loop {
149            if !self.prog.prefixes.is_empty() {
150                at = match self.input.prefix_at(&self.prog.prefixes, at) {
151                    None => break,
152                    Some(at) => at,
153                };
154            }
155            matched = self.backtrack(at) || matched;
156            if matched && self.prog.matches.len() == 1 {
157                return true;
158            }
159            if at.pos() >= end {
160                break;
161            }
162            at = self.input.at(at.next_pos());
163        }
164        matched
165    }
166
167    /// The main backtracking loop starting at the given input position.
168    fn backtrack(&mut self, start: InputAt) -> bool {
169        // N.B. We use an explicit stack to avoid recursion.
170        // To avoid excessive pushing and popping, most transitions are handled
171        // in the `step` helper function, which only pushes to the stack when
172        // there's a capture or a branch.
173        let mut matched = false;
174        self.m.jobs.push(Job::Inst { ip: 0, at: start });
175        while let Some(job) = self.m.jobs.pop() {
176            match job {
177                Job::Inst { ip, at } => {
178                    if self.step(ip, at) {
179                        // Only quit if we're matching one regex.
180                        // If we're matching a regex set, then mush on and
181                        // try to find other matches (if we want them).
182                        if self.prog.matches.len() == 1 {
183                            return true;
184                        }
185                        matched = true;
186                    }
187                }
188                Job::SaveRestore { slot, old_pos } => {
189                    if slot < self.slots.len() {
190                        self.slots[slot] = old_pos;
191                    }
192                }
193            }
194        }
195        matched
196    }
197
198    fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
199        use prog::Inst::*;
200        loop {
201            // This loop is an optimization to avoid constantly pushing/popping
202            // from the stack. Namely, if we're pushing a job only to run it
203            // next, avoid the push and just mutate `ip` (and possibly `at`)
204            // in place.
205            if self.has_visited(ip, at) {
206                return false;
207            }
208            match self.prog[ip] {
209                Match(slot) => {
210                    if slot < self.matches.len() {
211                        self.matches[slot] = true;
212                    }
213                    return true;
214                }
215                Save(ref inst) => {
216                    if let Some(&old_pos) = self.slots.get(inst.slot) {
217                        // If this path doesn't work out, then we save the old
218                        // capture index (if one exists) in an alternate
219                        // job. If the next path fails, then the alternate
220                        // job is popped and the old capture index is restored.
221                        self.m.jobs.push(Job::SaveRestore {
222                            slot: inst.slot,
223                            old_pos: old_pos,
224                        });
225                        self.slots[inst.slot] = Some(at.pos());
226                    }
227                    ip = inst.goto;
228                }
229                Split(ref inst) => {
230                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
231                    ip = inst.goto1;
232                }
233                EmptyLook(ref inst) => {
234                    if self.input.is_empty_match(at, inst) {
235                        ip = inst.goto;
236                    } else {
237                        return false;
238                    }
239                }
240                Char(ref inst) => {
241                    if inst.c == at.char() {
242                        ip = inst.goto;
243                        at = self.input.at(at.next_pos());
244                    } else {
245                        return false;
246                    }
247                }
248                Ranges(ref inst) => {
249                    if inst.matches(at.char()) {
250                        ip = inst.goto;
251                        at = self.input.at(at.next_pos());
252                    } else {
253                        return false;
254                    }
255                }
256                Bytes(ref inst) => {
257                    if let Some(b) = at.byte() {
258                        if inst.matches(b) {
259                            ip = inst.goto;
260                            at = self.input.at(at.next_pos());
261                            continue;
262                        }
263                    }
264                    return false;
265                }
266            }
267        }
268    }
269
270    fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
271        let k = ip * (self.input.len() + 1) + at.pos();
272        let k1 = k / BIT_SIZE;
273        let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
274        if self.m.visited[k1] & k2 == 0 {
275            self.m.visited[k1] |= k2;
276            false
277        } else {
278            true
279        }
280    }
281}
282
283fn usize_to_u32(n: usize) -> u32 {
284    if (n as u64) > (::std::u32::MAX as u64) {
285        panic!("BUG: {} is too big to fit into u32", n)
286    }
287    n as u32
288}