1use std::collections::HashMap;
2use std::iter;
3use std::result;
4use std::sync::Arc;
5
6use syntax::hir::{self, Hir};
7use syntax::is_word_byte;
8use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
9
10use prog::{
11 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
12 InstSave, InstSplit, Program,
13};
14
15use Error;
16
17type Result = result::Result<Patch, Error>;
18
19#[derive(Debug)]
20struct Patch {
21 hole: Hole,
22 entry: InstPtr,
23}
24
25pub struct Compiler {
28 insts: Vec<MaybeInst>,
29 compiled: Program,
30 capture_name_idx: HashMap<String, usize>,
31 num_exprs: usize,
32 size_limit: usize,
33 suffix_cache: SuffixCache,
34 utf8_seqs: Option<Utf8Sequences>,
35 byte_classes: ByteClassSet,
36}
37
38impl Compiler {
39 pub fn new() -> Self {
43 Compiler {
44 insts: vec![],
45 compiled: Program::new(),
46 capture_name_idx: HashMap::new(),
47 num_exprs: 0,
48 size_limit: 10 * (1 << 20),
49 suffix_cache: SuffixCache::new(1000),
50 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
51 byte_classes: ByteClassSet::new(),
52 }
53 }
54
55 pub fn size_limit(mut self, size_limit: usize) -> Self {
59 self.size_limit = size_limit;
60 self
61 }
62
63 pub fn bytes(mut self, yes: bool) -> Self {
75 self.compiled.is_bytes = yes;
76 self
77 }
78
79 pub fn only_utf8(mut self, yes: bool) -> Self {
84 self.compiled.only_utf8 = yes;
85 self
86 }
87
88 pub fn dfa(mut self, yes: bool) -> Self {
96 self.compiled.is_dfa = yes;
97 self
98 }
99
100 pub fn reverse(mut self, yes: bool) -> Self {
103 self.compiled.is_reverse = yes;
104 self
105 }
106
107 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
113 debug_assert!(exprs.len() >= 1);
114 self.num_exprs = exprs.len();
115 if exprs.len() == 1 {
116 self.compile_one(&exprs[0])
117 } else {
118 self.compile_many(exprs)
119 }
120 }
121
122 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
123 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
128 self.compiled.is_anchored_start = expr.is_anchored_start();
129 self.compiled.is_anchored_end = expr.is_anchored_end();
130 if self.compiled.needs_dotstar() {
131 dotstar_patch = self.c_dotstar()?;
132 self.compiled.start = dotstar_patch.entry;
133 }
134 self.compiled.captures = vec![None];
135 let patch = self.c_capture(0, expr)?;
136 if self.compiled.needs_dotstar() {
137 self.fill(dotstar_patch.hole, patch.entry);
138 } else {
139 self.compiled.start = patch.entry;
140 }
141 self.fill_to_next(patch.hole);
142 self.compiled.matches = vec![self.insts.len()];
143 self.push_compiled(Inst::Match(0));
144 self.compile_finish()
145 }
146
147 fn compile_many(
148 mut self,
149 exprs: &[Hir],
150 ) -> result::Result<Program, Error> {
151 debug_assert!(exprs.len() > 1);
152
153 self.compiled.is_anchored_start =
154 exprs.iter().all(|e| e.is_anchored_start());
155 self.compiled.is_anchored_end =
156 exprs.iter().all(|e| e.is_anchored_end());
157 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
158 if self.compiled.needs_dotstar() {
159 dotstar_patch = self.c_dotstar()?;
160 self.compiled.start = dotstar_patch.entry;
161 } else {
162 self.compiled.start = 0; }
164 self.fill_to_next(dotstar_patch.hole);
165
166 let mut prev_hole = Hole::None;
167 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
168 self.fill_to_next(prev_hole);
169 let split = self.push_split_hole();
170 let Patch { hole, entry } = self.c_capture(0, expr)?;
171 self.fill_to_next(hole);
172 self.compiled.matches.push(self.insts.len());
173 self.push_compiled(Inst::Match(i));
174 prev_hole = self.fill_split(split, Some(entry), None);
175 }
176 let i = exprs.len() - 1;
177 let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
178 self.fill(prev_hole, entry);
179 self.fill_to_next(hole);
180 self.compiled.matches.push(self.insts.len());
181 self.push_compiled(Inst::Match(i));
182 self.compile_finish()
183 }
184
185 fn compile_finish(mut self) -> result::Result<Program, Error> {
186 self.compiled.insts =
187 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
188 self.compiled.byte_classes = self.byte_classes.byte_classes();
189 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
190 Ok(self.compiled)
191 }
192
193 fn c(&mut self, expr: &Hir) -> Result {
246 use prog;
247 use syntax::hir::HirKind::*;
248
249 self.check_size()?;
250 match *expr.kind() {
251 Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
252 Literal(hir::Literal::Unicode(c)) => self.c_char(c),
253 Literal(hir::Literal::Byte(b)) => {
254 assert!(self.compiled.uses_bytes());
255 self.c_byte(b)
256 }
257 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
258 Class(hir::Class::Bytes(ref cls)) => {
259 if self.compiled.uses_bytes() {
260 self.c_class_bytes(cls.ranges())
261 } else {
262 assert!(cls.is_all_ascii());
263 let mut char_ranges = vec![];
264 for r in cls.iter() {
265 let (s, e) = (r.start() as char, r.end() as char);
266 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
267 }
268 self.c_class(&char_ranges)
269 }
270 }
271 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
272 self.byte_classes.set_range(b'\n', b'\n');
273 self.c_empty_look(prog::EmptyLook::EndLine)
274 }
275 Anchor(hir::Anchor::StartLine) => {
276 self.byte_classes.set_range(b'\n', b'\n');
277 self.c_empty_look(prog::EmptyLook::StartLine)
278 }
279 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
280 self.byte_classes.set_range(b'\n', b'\n');
281 self.c_empty_look(prog::EmptyLook::StartLine)
282 }
283 Anchor(hir::Anchor::EndLine) => {
284 self.byte_classes.set_range(b'\n', b'\n');
285 self.c_empty_look(prog::EmptyLook::EndLine)
286 }
287 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
288 self.c_empty_look(prog::EmptyLook::EndText)
289 }
290 Anchor(hir::Anchor::StartText) => {
291 self.c_empty_look(prog::EmptyLook::StartText)
292 }
293 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
294 self.c_empty_look(prog::EmptyLook::StartText)
295 }
296 Anchor(hir::Anchor::EndText) => {
297 self.c_empty_look(prog::EmptyLook::EndText)
298 }
299 WordBoundary(hir::WordBoundary::Unicode) => {
300 if !cfg!(feature = "unicode-perl") {
301 return Err(Error::Syntax(
302 "Unicode word boundaries are unavailable when \
303 the unicode-perl feature is disabled"
304 .to_string(),
305 ));
306 }
307 self.compiled.has_unicode_word_boundary = true;
308 self.byte_classes.set_word_boundary();
309 self.c_empty_look(prog::EmptyLook::WordBoundary)
310 }
311 WordBoundary(hir::WordBoundary::UnicodeNegate) => {
312 if !cfg!(feature = "unicode-perl") {
313 return Err(Error::Syntax(
314 "Unicode word boundaries are unavailable when \
315 the unicode-perl feature is disabled"
316 .to_string(),
317 ));
318 }
319 self.compiled.has_unicode_word_boundary = true;
320 self.byte_classes.set_word_boundary();
321 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
322 }
323 WordBoundary(hir::WordBoundary::Ascii) => {
324 self.byte_classes.set_word_boundary();
325 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
326 }
327 WordBoundary(hir::WordBoundary::AsciiNegate) => {
328 self.byte_classes.set_word_boundary();
329 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
330 }
331 Group(ref g) => match g.kind {
332 hir::GroupKind::NonCapturing => self.c(&g.hir),
333 hir::GroupKind::CaptureIndex(index) => {
334 if index as usize >= self.compiled.captures.len() {
335 self.compiled.captures.push(None);
336 }
337 self.c_capture(2 * index as usize, &g.hir)
338 }
339 hir::GroupKind::CaptureName { index, ref name } => {
340 if index as usize >= self.compiled.captures.len() {
341 let n = name.to_string();
342 self.compiled.captures.push(Some(n.clone()));
343 self.capture_name_idx.insert(n, index as usize);
344 }
345 self.c_capture(2 * index as usize, &g.hir)
346 }
347 },
348 Concat(ref es) => {
349 if self.compiled.is_reverse {
350 self.c_concat(es.iter().rev())
351 } else {
352 self.c_concat(es)
353 }
354 }
355 Alternation(ref es) => self.c_alternate(&**es),
356 Repetition(ref rep) => self.c_repeat(rep),
357 }
358 }
359
360 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
361 if self.num_exprs > 1 || self.compiled.is_dfa {
362 self.c(expr)
366 } else {
367 let entry = self.insts.len();
368 let hole = self.push_hole(InstHole::Save { slot: first_slot });
369 let patch = self.c(expr)?;
370 self.fill(hole, patch.entry);
371 self.fill_to_next(patch.hole);
372 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
373 Ok(Patch { hole: hole, entry: entry })
374 }
375 }
376
377 fn c_dotstar(&mut self) -> Result {
378 Ok(if !self.compiled.only_utf8() {
379 self.c(&Hir::repetition(hir::Repetition {
380 kind: hir::RepetitionKind::ZeroOrMore,
381 greedy: false,
382 hir: Box::new(Hir::any(true)),
383 }))?
384 } else {
385 self.c(&Hir::repetition(hir::Repetition {
386 kind: hir::RepetitionKind::ZeroOrMore,
387 greedy: false,
388 hir: Box::new(Hir::any(false)),
389 }))?
390 })
391 }
392
393 fn c_char(&mut self, c: char) -> Result {
394 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
395 }
396
397 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
398 assert!(!ranges.is_empty());
399 if self.compiled.uses_bytes() {
400 CompileClass { c: self, ranges: ranges }.compile()
401 } else {
402 let ranges: Vec<(char, char)> =
403 ranges.iter().map(|r| (r.start(), r.end())).collect();
404 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
405 self.push_hole(InstHole::Char { c: ranges[0].0 })
406 } else {
407 self.push_hole(InstHole::Ranges { ranges: ranges })
408 };
409 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
410 }
411 }
412
413 fn c_byte(&mut self, b: u8) -> Result {
414 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
415 }
416
417 fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
418 debug_assert!(!ranges.is_empty());
419
420 let first_split_entry = self.insts.len();
421 let mut holes = vec![];
422 let mut prev_hole = Hole::None;
423 for r in &ranges[0..ranges.len() - 1] {
424 self.fill_to_next(prev_hole);
425 let split = self.push_split_hole();
426 let next = self.insts.len();
427 self.byte_classes.set_range(r.start(), r.end());
428 holes.push(self.push_hole(InstHole::Bytes {
429 start: r.start(),
430 end: r.end(),
431 }));
432 prev_hole = self.fill_split(split, Some(next), None);
433 }
434 let next = self.insts.len();
435 let r = &ranges[ranges.len() - 1];
436 self.byte_classes.set_range(r.start(), r.end());
437 holes.push(
438 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
439 );
440 self.fill(prev_hole, next);
441 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
442 }
443
444 fn c_empty_look(&mut self, look: EmptyLook) -> Result {
445 let hole = self.push_hole(InstHole::EmptyLook { look: look });
446 Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
447 }
448
449 fn c_concat<'a, I>(&mut self, exprs: I) -> Result
450 where
451 I: IntoIterator<Item = &'a Hir>,
452 {
453 let mut exprs = exprs.into_iter();
454 let first = match exprs.next() {
455 Some(expr) => expr,
456 None => {
457 return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
458 }
459 };
460 let Patch { mut hole, entry } = self.c(first)?;
461 for e in exprs {
462 let p = self.c(e)?;
463 self.fill(hole, p.entry);
464 hole = p.hole;
465 }
466 Ok(Patch { hole: hole, entry: entry })
467 }
468
469 fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
470 debug_assert!(
471 exprs.len() >= 2,
472 "alternates must have at least 2 exprs"
473 );
474
475 let first_split_entry = self.insts.len();
477
478 let mut holes = vec![];
481
482 let mut prev_hole = Hole::None;
483 for e in &exprs[0..exprs.len() - 1] {
484 self.fill_to_next(prev_hole);
485 let split = self.push_split_hole();
486 let prev_entry = self.insts.len();
487 let Patch { hole, entry } = self.c(e)?;
488 if prev_entry == self.insts.len() {
489 return Err(Error::Syntax(
495 "alternations cannot currently contain \
496 empty sub-expressions"
497 .to_string(),
498 ));
499 }
500 holes.push(hole);
501 prev_hole = self.fill_split(split, Some(entry), None);
502 }
503 let prev_entry = self.insts.len();
504 let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
505 if prev_entry == self.insts.len() {
506 return Err(Error::Syntax(
508 "alternations cannot currently contain \
509 empty sub-expressions"
510 .to_string(),
511 ));
512 }
513 holes.push(hole);
514 self.fill(prev_hole, entry);
515 Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
516 }
517
518 fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
519 use syntax::hir::RepetitionKind::*;
520 match rep.kind {
521 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
522 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
523 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
524 Range(hir::RepetitionRange::Exactly(min_max)) => {
525 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
526 }
527 Range(hir::RepetitionRange::AtLeast(min)) => {
528 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
529 }
530 Range(hir::RepetitionRange::Bounded(min, max)) => {
531 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
532 }
533 }
534 }
535
536 fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
537 let split_entry = self.insts.len();
538 let split = self.push_split_hole();
539 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
540
541 let split_hole = if greedy {
542 self.fill_split(split, Some(entry_rep), None)
543 } else {
544 self.fill_split(split, None, Some(entry_rep))
545 };
546 let holes = vec![hole_rep, split_hole];
547 Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
548 }
549
550 fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
551 let split_entry = self.insts.len();
552 let split = self.push_split_hole();
553 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
554
555 self.fill(hole_rep, split_entry);
556 let split_hole = if greedy {
557 self.fill_split(split, Some(entry_rep), None)
558 } else {
559 self.fill_split(split, None, Some(entry_rep))
560 };
561 Ok(Patch { hole: split_hole, entry: split_entry })
562 }
563
564 fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
565 let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
566 self.fill_to_next(hole_rep);
567 let split = self.push_split_hole();
568
569 let split_hole = if greedy {
570 self.fill_split(split, Some(entry_rep), None)
571 } else {
572 self.fill_split(split, None, Some(entry_rep))
573 };
574 Ok(Patch { hole: split_hole, entry: entry_rep })
575 }
576
577 fn c_repeat_range_min_or_more(
578 &mut self,
579 expr: &Hir,
580 greedy: bool,
581 min: u32,
582 ) -> Result {
583 let min = u32_to_usize(min);
584 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
585 let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
586 self.fill(patch_concat.hole, patch_rep.entry);
587 Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
588 }
589
590 fn c_repeat_range(
591 &mut self,
592 expr: &Hir,
593 greedy: bool,
594 min: u32,
595 max: u32,
596 ) -> Result {
597 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
598 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
599 let initial_entry = patch_concat.entry;
600 if min == max {
601 return Ok(patch_concat);
602 }
603 let mut holes = vec![];
623 let mut prev_hole = patch_concat.hole;
624 for _ in min..max {
625 self.fill_to_next(prev_hole);
626 let split = self.push_split_hole();
627 let Patch { hole, entry } = self.c(expr)?;
628 prev_hole = hole;
629 if greedy {
630 holes.push(self.fill_split(split, Some(entry), None));
631 } else {
632 holes.push(self.fill_split(split, None, Some(entry)));
633 }
634 }
635 holes.push(prev_hole);
636 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
637 }
638
639 fn fill(&mut self, hole: Hole, goto: InstPtr) {
640 match hole {
641 Hole::None => {}
642 Hole::One(pc) => {
643 self.insts[pc].fill(goto);
644 }
645 Hole::Many(holes) => {
646 for hole in holes {
647 self.fill(hole, goto);
648 }
649 }
650 }
651 }
652
653 fn fill_to_next(&mut self, hole: Hole) {
654 let next = self.insts.len();
655 self.fill(hole, next);
656 }
657
658 fn fill_split(
659 &mut self,
660 hole: Hole,
661 goto1: Option<InstPtr>,
662 goto2: Option<InstPtr>,
663 ) -> Hole {
664 match hole {
665 Hole::None => Hole::None,
666 Hole::One(pc) => match (goto1, goto2) {
667 (Some(goto1), Some(goto2)) => {
668 self.insts[pc].fill_split(goto1, goto2);
669 Hole::None
670 }
671 (Some(goto1), None) => {
672 self.insts[pc].half_fill_split_goto1(goto1);
673 Hole::One(pc)
674 }
675 (None, Some(goto2)) => {
676 self.insts[pc].half_fill_split_goto2(goto2);
677 Hole::One(pc)
678 }
679 (None, None) => unreachable!(
680 "at least one of the split \
681 holes must be filled"
682 ),
683 },
684 Hole::Many(holes) => {
685 let mut new_holes = vec![];
686 for hole in holes {
687 new_holes.push(self.fill_split(hole, goto1, goto2));
688 }
689 if new_holes.is_empty() {
690 Hole::None
691 } else if new_holes.len() == 1 {
692 new_holes.pop().unwrap()
693 } else {
694 Hole::Many(new_holes)
695 }
696 }
697 }
698 }
699
700 fn push_compiled(&mut self, inst: Inst) {
701 self.insts.push(MaybeInst::Compiled(inst));
702 }
703
704 fn push_hole(&mut self, inst: InstHole) -> Hole {
705 let hole = self.insts.len();
706 self.insts.push(MaybeInst::Uncompiled(inst));
707 Hole::One(hole)
708 }
709
710 fn push_split_hole(&mut self) -> Hole {
711 let hole = self.insts.len();
712 self.insts.push(MaybeInst::Split);
713 Hole::One(hole)
714 }
715
716 fn check_size(&self) -> result::Result<(), Error> {
717 use std::mem::size_of;
718
719 if self.insts.len() * size_of::<Inst>() > self.size_limit {
720 Err(Error::CompiledTooBig(self.size_limit))
721 } else {
722 Ok(())
723 }
724 }
725}
726
727#[derive(Debug)]
728enum Hole {
729 None,
730 One(InstPtr),
731 Many(Vec<Hole>),
732}
733
734#[derive(Clone, Debug)]
735enum MaybeInst {
736 Compiled(Inst),
737 Uncompiled(InstHole),
738 Split,
739 Split1(InstPtr),
740 Split2(InstPtr),
741}
742
743impl MaybeInst {
744 fn fill(&mut self, goto: InstPtr) {
745 let filled = match *self {
746 MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
747 MaybeInst::Split1(goto1) => {
748 Inst::Split(InstSplit { goto1: goto1, goto2: goto })
749 }
750 MaybeInst::Split2(goto2) => {
751 Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
752 }
753 _ => unreachable!(
754 "not all instructions were compiled! \
755 found uncompiled instruction: {:?}",
756 self
757 ),
758 };
759 *self = MaybeInst::Compiled(filled);
760 }
761
762 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
763 let filled = match *self {
764 MaybeInst::Split => {
765 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
766 }
767 _ => unreachable!(
768 "must be called on Split instruction, \
769 instead it was called on: {:?}",
770 self
771 ),
772 };
773 *self = MaybeInst::Compiled(filled);
774 }
775
776 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
777 let half_filled = match *self {
778 MaybeInst::Split => goto1,
779 _ => unreachable!(
780 "must be called on Split instruction, \
781 instead it was called on: {:?}",
782 self
783 ),
784 };
785 *self = MaybeInst::Split1(half_filled);
786 }
787
788 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
789 let half_filled = match *self {
790 MaybeInst::Split => goto2,
791 _ => unreachable!(
792 "must be called on Split instruction, \
793 instead it was called on: {:?}",
794 self
795 ),
796 };
797 *self = MaybeInst::Split2(half_filled);
798 }
799
800 fn unwrap(self) -> Inst {
801 match self {
802 MaybeInst::Compiled(inst) => inst,
803 _ => unreachable!(
804 "must be called on a compiled instruction, \
805 instead it was called on: {:?}",
806 self
807 ),
808 }
809 }
810}
811
812#[derive(Clone, Debug)]
813enum InstHole {
814 Save { slot: usize },
815 EmptyLook { look: EmptyLook },
816 Char { c: char },
817 Ranges { ranges: Vec<(char, char)> },
818 Bytes { start: u8, end: u8 },
819}
820
821impl InstHole {
822 fn fill(&self, goto: InstPtr) -> Inst {
823 match *self {
824 InstHole::Save { slot } => {
825 Inst::Save(InstSave { goto: goto, slot: slot })
826 }
827 InstHole::EmptyLook { look } => {
828 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
829 }
830 InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
831 InstHole::Ranges { ref ranges } => {
832 Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
833 }
834 InstHole::Bytes { start, end } => {
835 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
836 }
837 }
838 }
839}
840
841struct CompileClass<'a, 'b> {
842 c: &'a mut Compiler,
843 ranges: &'b [hir::ClassUnicodeRange],
844}
845
846impl<'a, 'b> CompileClass<'a, 'b> {
847 fn compile(mut self) -> Result {
848 let mut holes = vec![];
849 let mut initial_entry = None;
850 let mut last_split = Hole::None;
851 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
852 self.c.suffix_cache.clear();
853
854 for (i, range) in self.ranges.iter().enumerate() {
855 let is_last_range = i + 1 == self.ranges.len();
856 utf8_seqs.reset(range.start(), range.end());
857 let mut it = (&mut utf8_seqs).peekable();
858 loop {
859 let utf8_seq = match it.next() {
860 None => break,
861 Some(utf8_seq) => utf8_seq,
862 };
863 if is_last_range && it.peek().is_none() {
864 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
865 holes.push(hole);
866 self.c.fill(last_split, entry);
867 last_split = Hole::None;
868 if initial_entry.is_none() {
869 initial_entry = Some(entry);
870 }
871 } else {
872 if initial_entry.is_none() {
873 initial_entry = Some(self.c.insts.len());
874 }
875 self.c.fill_to_next(last_split);
876 last_split = self.c.push_split_hole();
877 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
878 holes.push(hole);
879 last_split =
880 self.c.fill_split(last_split, Some(entry), None);
881 }
882 }
883 }
884 self.c.utf8_seqs = Some(utf8_seqs);
885 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
886 }
887
888 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
889 if self.c.compiled.is_reverse {
890 self.c_utf8_seq_(seq)
891 } else {
892 self.c_utf8_seq_(seq.into_iter().rev())
893 }
894 }
895
896 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
897 where
898 I: IntoIterator<Item = &'r Utf8Range>,
899 {
900 let mut from_inst = ::std::usize::MAX;
902 let mut last_hole = Hole::None;
903 for byte_range in seq {
904 let key = SuffixCacheKey {
905 from_inst: from_inst,
906 start: byte_range.start,
907 end: byte_range.end,
908 };
909 {
910 let pc = self.c.insts.len();
911 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
912 from_inst = cached_pc;
913 continue;
914 }
915 }
916 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
917 if from_inst == ::std::usize::MAX {
918 last_hole = self.c.push_hole(InstHole::Bytes {
919 start: byte_range.start,
920 end: byte_range.end,
921 });
922 } else {
923 self.c.push_compiled(Inst::Bytes(InstBytes {
924 goto: from_inst,
925 start: byte_range.start,
926 end: byte_range.end,
927 }));
928 }
929 from_inst = self.c.insts.len().checked_sub(1).unwrap();
930 debug_assert!(from_inst < ::std::usize::MAX);
931 }
932 debug_assert!(from_inst < ::std::usize::MAX);
933 Ok(Patch { hole: last_hole, entry: from_inst })
934 }
935}
936
937struct SuffixCache {
960 sparse: Box<[usize]>,
961 dense: Vec<SuffixCacheEntry>,
962}
963
964#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
965struct SuffixCacheEntry {
966 key: SuffixCacheKey,
967 pc: InstPtr,
968}
969
970#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
971struct SuffixCacheKey {
972 from_inst: InstPtr,
973 start: u8,
974 end: u8,
975}
976
977impl SuffixCache {
978 fn new(size: usize) -> Self {
979 SuffixCache {
980 sparse: vec![0usize; size].into(),
981 dense: Vec::with_capacity(size),
982 }
983 }
984
985 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
986 let hash = self.hash(&key);
987 let pos = &mut self.sparse[hash];
988 if let Some(entry) = self.dense.get(*pos) {
989 if entry.key == key {
990 return Some(entry.pc);
991 }
992 }
993 *pos = self.dense.len();
994 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
995 None
996 }
997
998 fn clear(&mut self) {
999 self.dense.clear();
1000 }
1001
1002 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1003 const FNV_PRIME: u64 = 1099511628211;
1006 let mut h = 14695981039346656037;
1007 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1008 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1009 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1010 (h as usize) % self.sparse.len()
1011 }
1012}
1013
1014struct ByteClassSet([bool; 256]);
1015
1016impl ByteClassSet {
1017 fn new() -> Self {
1018 ByteClassSet([false; 256])
1019 }
1020
1021 fn set_range(&mut self, start: u8, end: u8) {
1022 debug_assert!(start <= end);
1023 if start > 0 {
1024 self.0[start as usize - 1] = true;
1025 }
1026 self.0[end as usize] = true;
1027 }
1028
1029 fn set_word_boundary(&mut self) {
1030 let iswb = is_word_byte;
1033 let mut b1: u16 = 0;
1034 let mut b2: u16;
1035 while b1 <= 255 {
1036 b2 = b1 + 1;
1037 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1038 b2 += 1;
1039 }
1040 self.set_range(b1 as u8, (b2 - 1) as u8);
1041 b1 = b2;
1042 }
1043 }
1044
1045 fn byte_classes(&self) -> Vec<u8> {
1046 let mut byte_classes = vec![0; 256];
1051 let mut class = 0u8;
1052 let mut i = 0;
1053 loop {
1054 byte_classes[i] = class as u8;
1055 if i >= 255 {
1056 break;
1057 }
1058 if self.0[i] {
1059 class = class.checked_add(1).unwrap();
1060 }
1061 i += 1;
1062 }
1063 byte_classes
1064 }
1065}
1066
1067fn u32_to_usize(n: u32) -> usize {
1068 if (n as u64) > (::std::usize::MAX as u64) {
1072 panic!("BUG: {} is too big to be pointer sized", n)
1073 }
1074 n as usize
1075}
1076
1077#[cfg(test)]
1078mod tests {
1079 use super::ByteClassSet;
1080
1081 #[test]
1082 fn byte_classes() {
1083 let mut set = ByteClassSet::new();
1084 set.set_range(b'a', b'z');
1085 let classes = set.byte_classes();
1086 assert_eq!(classes[0], 0);
1087 assert_eq!(classes[1], 0);
1088 assert_eq!(classes[2], 0);
1089 assert_eq!(classes[b'a' as usize - 1], 0);
1090 assert_eq!(classes[b'a' as usize], 1);
1091 assert_eq!(classes[b'm' as usize], 1);
1092 assert_eq!(classes[b'z' as usize], 1);
1093 assert_eq!(classes[b'z' as usize + 1], 2);
1094 assert_eq!(classes[254], 2);
1095 assert_eq!(classes[255], 2);
1096
1097 let mut set = ByteClassSet::new();
1098 set.set_range(0, 2);
1099 set.set_range(4, 6);
1100 let classes = set.byte_classes();
1101 assert_eq!(classes[0], 0);
1102 assert_eq!(classes[1], 0);
1103 assert_eq!(classes[2], 0);
1104 assert_eq!(classes[3], 1);
1105 assert_eq!(classes[4], 2);
1106 assert_eq!(classes[5], 2);
1107 assert_eq!(classes[6], 2);
1108 assert_eq!(classes[7], 3);
1109 assert_eq!(classes[255], 3);
1110 }
1111
1112 #[test]
1113 fn full_byte_classes() {
1114 let mut set = ByteClassSet::new();
1115 for i in 0..256u16 {
1116 set.set_range(i as u8, i as u8);
1117 }
1118 assert_eq!(set.byte_classes().len(), 256);
1119 }
1120}