1use core::{fmt, ops};
6
7use crate::{
8 ast::{CountedRepetition, GroupName, Node, Span, Spanned, Syntax},
9 utils::{
10 ceil_char_boundary, is_char_boundary, is_escapable_char, is_meta_char, split_first_char,
11 Stack,
12 },
13 Error, ErrorKind,
14};
15
16#[cfg(test)]
17mod tests;
18
19#[derive(Debug, Clone)]
21#[non_exhaustive]
22pub struct ValidationOutput {
23 pub node_count: usize,
25}
26
27#[derive(Debug, Default)]
52pub struct RegexOptions {
53 ignore_whitespace: bool,
54}
55
56impl RegexOptions {
57 pub const DEFAULT: Self = Self {
59 ignore_whitespace: false,
60 };
61
62 #[must_use]
65 pub const fn ignore_whitespace(mut self, yes: bool) -> Self {
66 self.ignore_whitespace = yes;
67 self
68 }
69
70 pub const fn try_validate(&self, regex: &str) -> Result<ValidationOutput, Error> {
76 let mut state = <ParseState>::new(regex, self);
77 loop {
78 match state.step() {
79 Err(err) => return Err(err),
80 Ok(ops::ControlFlow::Break(())) => break,
81 Ok(ops::ControlFlow::Continue(())) => { }
82 }
83 }
84
85 Ok(ValidationOutput {
86 node_count: state.syntax.len(),
87 })
88 }
89
90 #[track_caller]
97 pub const fn validate(&self, regex: &str) -> ValidationOutput {
98 match self.try_validate(regex) {
99 Err(err) => err.compile_panic(regex),
100 Ok(output) => output,
101 }
102 }
103
104 pub const fn try_parse<const CAP: usize>(&self, regex: &str) -> Result<Syntax<CAP>, Error> {
116 let mut state = ParseState::custom(regex, self, true);
117 loop {
118 match state.step() {
119 Err(err) => return Err(err),
120 Ok(ops::ControlFlow::Break(())) => return Ok(state.into_syntax()),
121 Ok(ops::ControlFlow::Continue(())) => { }
122 }
123 }
124 }
125
126 #[track_caller]
133 pub const fn parse<const CAP: usize>(&self, regex: &str) -> Syntax<CAP> {
134 match self.try_parse(regex) {
135 Ok(spans) => spans,
136 Err(err) => err.compile_panic(regex),
137 }
138 }
139
140 #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
146 #[cfg(feature = "alloc")]
147 pub fn try_parse_to_vec(&self, regex: &str) -> Result<crate::alloc::Vec<Spanned>, Error> {
148 use core::mem;
149
150 use crate::alloc::Vec;
151
152 const STEP_CAP: usize = 8;
156
157 let mut state = ParseState::<STEP_CAP>::custom(regex, self, true);
158 let mut syntax = Vec::new();
159 loop {
160 let step_result = state.step()?;
161 #[allow(clippy::missing_panics_doc)] let SyntaxOrLen::Syntax(new_syntax) = &mut state.syntax
165 else {
166 panic!("cannot happen");
167 };
168 let new_syntax = mem::replace(new_syntax, Syntax::new(Spanned::DUMMY));
169 syntax.extend_from_slice(new_syntax.as_slice());
170
171 if step_result.is_break() {
172 return Ok(syntax);
173 }
174 }
175 }
176}
177
178#[derive(Debug)]
179enum PrimitiveKind {
180 Literal(char),
181 PerlClass,
182 Other,
183}
184
185impl PrimitiveKind {
186 const fn is_valid_set_member(&self) -> bool {
187 matches!(self, Self::Literal(_) | Self::PerlClass)
188 }
189}
190
191#[derive(Debug, Clone, Copy)]
192struct Flags {
193 is_empty: bool,
194 ignore_whitespace: Option<bool>,
195}
196
197#[derive(Debug)]
199struct Backup {
200 pos: usize,
201 span_count: usize,
202}
203
204const GROUP_DEPTH: usize = 16;
206const MAX_NAMED_GROUPS: usize = 64;
208
209#[derive(Debug, Clone, Copy)]
210struct GroupFrame {
211 prev_ignore_whitespace: bool,
214}
215
216impl GroupFrame {
217 const DUMMY: Self = Self {
218 prev_ignore_whitespace: false,
219 };
220}
221
222struct GroupNames<const CAP: usize> {
223 items: [Span; CAP],
224 len: usize,
225}
226
227impl<const CAP: usize> fmt::Debug for GroupNames<CAP> {
228 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
229 let (items, _) = self.items.split_at(self.len);
230 formatter.debug_list().entries(items).finish()
231 }
232}
233
234impl<const CAP: usize> GroupNames<CAP> {
235 const fn new() -> Self {
236 Self {
237 items: [Span::new(0, 0); CAP],
238 len: 0,
239 }
240 }
241
242 const fn insert(&mut self, item: Span, regex_bytes: &[u8]) -> Result<(), Error> {
243 const fn bytes_eq(bytes: &[u8], x: Span, y: Span) -> bool {
244 let mut i = 0;
245 while i < x.len() {
246 if bytes[x.start + i] != bytes[y.start + i] {
247 return false;
248 }
249 i += 1;
250 }
251 true
252 }
253
254 if self.len == CAP {
255 return Err(ErrorKind::NamedGroupOverflow.with_position(item.start..item.end));
256 }
257
258 let item_len = item.len();
259 let mut i = 0;
260 while i < self.len {
261 let prev_pos = self.items[i];
262 if prev_pos.len() == item_len && bytes_eq(regex_bytes, prev_pos, item) {
263 let err = ErrorKind::DuplicateCaptureName {
264 prev_pos: prev_pos.start..prev_pos.end,
265 };
266 return Err(err.with_position(item.start..item.end));
267 }
268 i += 1;
269 }
270
271 self.items[self.len] = item;
272 self.len += 1;
273 Ok(())
274 }
275}
276
277#[derive(Debug)]
278enum SyntaxOrLen<const CAP: usize> {
279 Syntax(Syntax<CAP>),
280 Len(usize),
281}
282
283impl<const CAP: usize> SyntaxOrLen<CAP> {
284 const fn new(with_syntax: bool) -> Self {
285 if with_syntax {
286 Self::Syntax(Syntax::new(Spanned::DUMMY))
287 } else {
288 Self::Len(0)
289 }
290 }
291
292 const fn len(&self) -> usize {
293 match self {
294 Self::Syntax(syntax) => syntax.len(),
295 Self::Len(len) => *len,
296 }
297 }
298
299 const fn trim(&mut self, new_len: usize) {
300 debug_assert!(self.len() >= new_len);
301 match self {
302 Self::Syntax(syntax) => syntax.trim(new_len),
303 Self::Len(len) => *len = new_len,
304 }
305 }
306}
307
308#[derive(Debug)]
309pub(crate) struct ParseState<'a, const CAP: usize = 0> {
310 regex_bytes: &'a [u8],
312 pos: usize,
314 groups: Stack<GroupFrame, GROUP_DEPTH>,
315 group_names: GroupNames<MAX_NAMED_GROUPS>,
316 set_depth: usize,
317 is_empty_last_item: bool,
318 ignore_whitespace: bool,
319 syntax: SyntaxOrLen<CAP>,
320}
321
322impl<'a> ParseState<'a> {
323 pub(crate) const fn new(regex: &'a str, options: &RegexOptions) -> Self {
324 Self::custom(regex, options, false)
325 }
326}
327
328impl<'a, const CAP: usize> ParseState<'a, CAP> {
329 pub(crate) const fn custom(regex: &'a str, options: &RegexOptions, with_syntax: bool) -> Self {
330 Self {
331 regex_bytes: regex.as_bytes(),
332 pos: 0,
333 groups: Stack::new(GroupFrame::DUMMY),
334 group_names: GroupNames::new(),
335 set_depth: 0,
336 is_empty_last_item: true,
337 ignore_whitespace: options.ignore_whitespace,
338 syntax: SyntaxOrLen::new(with_syntax),
339 }
340 }
341
342 const fn backup(&self) -> Backup {
343 Backup {
344 pos: self.pos,
345 span_count: self.syntax.len(),
346 }
347 }
348
349 const fn restore(&mut self, backup: &Backup) {
350 debug_assert!(self.pos >= backup.pos);
351
352 self.pos = backup.pos;
353 self.syntax.trim(backup.span_count);
354 }
355
356 const fn ascii_char_at(&self, pos: usize) -> Option<u8> {
357 if pos >= self.regex_bytes.len() {
358 None
359 } else {
360 let ch = self.regex_bytes[pos];
361 if ch <= 0x7f {
362 Some(ch)
363 } else {
364 None
365 }
366 }
367 }
368
369 const fn ascii_char(&self) -> Option<u8> {
370 self.ascii_char_at(self.pos)
371 }
372
373 const fn error(&self, start: usize, kind: ErrorKind) -> Error {
374 debug_assert!(start <= self.pos);
375 debug_assert!(start <= self.regex_bytes.len());
376 if start < self.regex_bytes.len() {
377 debug_assert!(is_char_boundary(self.regex_bytes[start]));
378 }
379
380 let end = if self.pos <= self.regex_bytes.len() {
381 ceil_char_boundary(self.regex_bytes, self.pos)
382 } else {
383 self.regex_bytes.len()
384 };
385 kind.with_position(start..end)
386 }
387
388 const fn push_ast(&mut self, start_pos: usize, node: Node) -> Result<(), Error> {
389 self.push_custom_ast(Span::new(start_pos, self.pos), node)
390 }
391
392 const fn push_custom_ast(&mut self, range: Span, node: Node) -> Result<(), Error> {
393 match &mut self.syntax {
394 SyntaxOrLen::Syntax(syntax) => {
395 let span = Spanned { node, span: range };
396 if syntax.push(span).is_err() {
397 return Err(self.error(range.start, ErrorKind::AstOverflow));
398 }
399 }
400 SyntaxOrLen::Len(len) => *len += 1,
401 }
402 Ok(())
403 }
404
405 const fn gobble(&mut self, bytes: &[u8]) -> bool {
407 let mut i = 0;
408 while i < bytes.len()
409 && self.pos + i < self.regex_bytes.len()
410 && bytes[i] == self.regex_bytes[self.pos + i]
411 {
412 i += 1;
413 }
414 if i == bytes.len() {
415 self.pos += bytes.len();
417 true
418 } else {
419 false
420 }
421 }
422
423 const fn gobble_any(&mut self, needles: &[&[u8]]) -> bool {
424 let mut i = 0;
425 while i < needles.len() {
426 let matched = self.gobble(needles[i]);
427 if matched {
428 return true;
429 }
430 i += 1;
431 }
432 false
433 }
434
435 const fn matches(&self, range: ops::Range<usize>, needle: &[u8]) -> bool {
436 if range.end - range.start != needle.len() {
437 return false;
438 }
439 if range.end > self.regex_bytes.len() {
440 return false;
441 }
442
443 let mut i = 0;
444 while i < needle.len() && self.regex_bytes[range.start + i] == needle[i] {
445 i += 1;
446 }
447 i == needle.len()
448 }
449
450 const fn matches_any(&self, range: ops::Range<usize>, needles: &[&[u8]]) -> bool {
451 let mut i = 0;
452 while i < needles.len() {
453 if self.matches(range.start..range.end, needles[i]) {
454 return true;
455 }
456 i += 1;
457 }
458 false
459 }
460
461 const fn gobble_whitespace_and_comments(&mut self) -> Result<(), Error> {
462 if !self.ignore_whitespace {
463 return Ok(());
464 }
465
466 let mut comment_range = None::<Span>;
468 let mut is_in_comment = false;
469 while !self.is_eof() {
470 let Some((ch, next_pos)) = split_first_char(self.regex_bytes, self.pos) else {
471 break; };
473
474 if is_in_comment {
475 if ch == '\n' {
476 is_in_comment = false;
477 if let Some(range) = &mut comment_range {
478 range.end = self.pos;
479 }
480 }
481 self.pos = next_pos;
482 } else if ch.is_ascii_whitespace() {
483 self.pos = next_pos;
485 } else if ch == '#' {
486 is_in_comment = true;
487 if comment_range.is_none() {
488 comment_range = Some(Span::new(self.pos, self.pos + 1));
489 }
490 self.pos = next_pos;
491 } else {
492 break;
493 }
494 }
495
496 if let Some(mut range) = comment_range {
497 if is_in_comment {
498 range.end = self.pos;
500 }
501 const_try!(self.push_custom_ast(range, Node::Comment));
502 }
503 Ok(())
504 }
505
506 const fn disallowed_error(&self, pos: usize) -> Error {
508 debug_assert!(self.ignore_whitespace);
509
510 let err = if self.regex_bytes[pos] == b'#' {
511 ErrorKind::DisallowedComment
512 } else {
513 debug_assert!(self.regex_bytes[pos].is_ascii_whitespace());
514 ErrorKind::DisallowedWhitespace
515 };
516 err.with_position(pos..pos + 1)
517 }
518
519 const fn peek_whitespace(&self, mut pos: usize) -> Option<char> {
521 let mut is_in_comment = false;
522 while !self.is_eof() {
523 let Some((ch, next_pos)) = split_first_char(self.regex_bytes, pos) else {
524 break; };
526
527 if is_in_comment {
528 if ch == '\n' {
529 is_in_comment = false;
530 }
531 pos = next_pos;
532 } else if self.ignore_whitespace && ch.is_ascii_whitespace() {
533 pos = next_pos;
535 } else if self.ignore_whitespace && ch == '#' {
536 is_in_comment = true;
537 pos = next_pos;
538 } else {
539 return Some(ch);
540 }
541 }
542 None
543 }
544
545 const fn parse_uncounted_repetition(&mut self) -> Result<(), Error> {
547 let start_pos = self.pos;
548 self.pos += 1;
550 if self.is_empty_last_item {
551 return Err(self.error(self.pos - 1, ErrorKind::MissingRepetition));
552 }
553 self.gobble(b"?");
555
556 const_try!(self.push_ast(start_pos, Node::UncountedRepetition));
557 Ok(())
558 }
559
560 const fn parse_counted_repetition(&mut self) -> Result<(), Error> {
562 let start_pos = self.pos;
563 debug_assert!(self.regex_bytes[self.pos] == b'{');
564 self.pos += 1; if self.is_empty_last_item {
567 return Err(self.error(start_pos, ErrorKind::MissingRepetition));
568 }
569 const_try!(self.gobble_whitespace_and_comments());
570
571 let (min_count, min_count_span) = const_try!(self.parse_decimal());
573 let Some(min_count) = min_count else {
574 return Err(
575 ErrorKind::EmptyDecimal.with_position(min_count_span.start..min_count_span.end)
576 );
577 };
578 const_try!(self.gobble_whitespace_and_comments());
579
580 let Some(current_char) = self.ascii_char() else {
581 return Err(self.error(start_pos, ErrorKind::UnfinishedRepetition));
582 };
583 let max_count_span = if current_char == b',' {
584 self.pos += 1;
585 const_try!(self.gobble_whitespace_and_comments());
586
587 let max_count_start = self.pos;
588 let (max_count, max_count_span) = const_try!(self.parse_decimal());
590 if let Some(count) = max_count {
591 if count < min_count {
592 return Err(self.error(start_pos, ErrorKind::InvalidRepetitionRange));
593 }
594 } else if max_count_start != self.pos {
595 return Err(self.error(max_count_start, ErrorKind::EmptyDecimal));
598 }
599 Some(max_count_span)
600 } else {
601 None
602 };
603
604 const_try!(self.gobble_whitespace_and_comments());
605 if matches!(self.ascii_char(), Some(b'}')) {
606 self.pos += 1;
607 const_try!(self.gobble_whitespace_and_comments());
608
609 self.gobble(b"?");
611 const_try!(self.push_ast(
612 start_pos,
613 Node::CountedRepetition(match max_count_span {
614 Some(span) if span.is_empty() => CountedRepetition::AtLeast(min_count_span),
615 Some(span) => CountedRepetition::Between(min_count_span, span),
616 None => CountedRepetition::Exactly(min_count_span),
617 })
618 ));
619 Ok(())
620 } else {
621 Err(self.error(start_pos, ErrorKind::UnfinishedRepetition))
622 }
623 }
624
625 const fn parse_decimal(&mut self) -> Result<(Option<u32>, Span), Error> {
627 while !self.is_eof() && self.regex_bytes[self.pos].is_ascii_whitespace() {
628 self.pos += 1;
629 }
630
631 let start_pos = self.pos;
632 let mut pos = self.pos;
633 let mut decimal = 0_u32;
634 while pos < self.regex_bytes.len() && self.regex_bytes[pos].is_ascii_digit() {
635 let Some(new_decimal) = decimal.checked_mul(10) else {
636 return Err(ErrorKind::InvalidDecimal.with_position(start_pos..pos + 1));
637 };
638 decimal = match new_decimal.checked_add((self.regex_bytes[pos] - b'0') as u32) {
639 Some(dec) => dec,
640 None => return Err(ErrorKind::InvalidDecimal.with_position(start_pos..pos + 1)),
641 };
642 pos += 1;
643 }
644
645 let decimal = if pos == self.pos {
646 None
647 } else {
648 self.pos = pos;
649 Some(decimal)
650 };
651
652 while !self.is_eof() && self.regex_bytes[self.pos].is_ascii_whitespace() {
653 self.pos += 1;
654 }
655
656 if self.ignore_whitespace {
657 if let Some(ch) = self.peek_whitespace(pos) {
661 if ch.is_ascii_digit() {
662 return Err(self.disallowed_error(pos));
663 }
664 }
665 }
666
667 Ok((decimal, Span::new(start_pos, pos)))
668 }
669
670 const fn parse_primitive(&mut self, ch: char, next_pos: usize) -> Result<(), Error> {
671 match ch {
672 '\\' => {
673 const_try!(self.parse_escape());
674 Ok(())
675 }
676 '.' => {
677 self.pos += 1;
678 const_try!(self.push_ast(self.pos - 1, Node::Dot));
679 Ok(())
680 }
681 '^' | '$' => {
682 self.pos += 1;
683 const_try!(self.push_ast(self.pos - 1, Node::LineAssertion));
684 Ok(())
685 }
686 _ => {
687 self.pos = next_pos;
688 Ok(())
689 }
690 }
691 }
692
693 const fn parse_escape(&mut self) -> Result<PrimitiveKind, Error> {
694 let start_pos = self.pos;
695 debug_assert!(self.regex_bytes[self.pos] == b'\\');
696 self.pos += 1;
697
698 let Some(current_char) = self.ascii_char() else {
699 return Err(self.error(start_pos, ErrorKind::UnfinishedEscape));
700 };
701 self.pos += 1;
703
704 match current_char {
705 b'0'..=b'9' => Err(self.error(start_pos, ErrorKind::UnsupportedBackref)),
706 b'x' | b'u' | b'U' => {
707 let ch = const_try!(self.parse_hex_escape(start_pos, current_char));
708 const_try!(self.push_ast(start_pos, Node::HexEscape));
709 Ok(PrimitiveKind::Literal(ch))
710 }
711 b'p' | b'P' => Err(self.error(start_pos, ErrorKind::UnicodeClassesNotSupported)),
712 b'd' | b's' | b'w' | b'D' | b'S' | b'W' => {
713 const_try!(self.push_ast(start_pos, Node::PerlClass));
714 Ok(PrimitiveKind::PerlClass)
715 }
716
717 b'n' => {
718 const_try!(self.push_ast(start_pos, Node::EscapedLiteral));
719 Ok(PrimitiveKind::Literal('\n'))
720 }
721 b't' => {
722 const_try!(self.push_ast(start_pos, Node::EscapedLiteral));
723 Ok(PrimitiveKind::Literal('\t'))
724 }
725 b'r' => {
726 const_try!(self.push_ast(start_pos, Node::EscapedLiteral));
727 Ok(PrimitiveKind::Literal('\r'))
728 }
729 b'a' => {
730 const_try!(self.push_ast(start_pos, Node::EscapedLiteral));
731 Ok(PrimitiveKind::Literal('\x07'))
732 }
733 b'f' => {
734 const_try!(self.push_ast(start_pos, Node::EscapedLiteral));
735 Ok(PrimitiveKind::Literal('\x0C'))
736 }
737 b'v' => {
738 const_try!(self.push_ast(start_pos, Node::EscapedLiteral));
739 Ok(PrimitiveKind::Literal('\x0B'))
740 }
741
742 b'A' | b'z' | b'B' | b'<' | b'>' => {
743 const_try!(self.push_ast(start_pos, Node::StdAssertion));
744 Ok(PrimitiveKind::Other)
745 }
746 b'b' => {
747 if !matches!(self.ascii_char(), Some(b'{'))
748 || !const_try!(self.try_parse_word_boundary(start_pos))
749 {
750 const_try!(self.push_ast(start_pos, Node::StdAssertion));
751 }
752 Ok(PrimitiveKind::Other)
753 }
754 ch if is_meta_char(ch) => {
755 const_try!(self.push_ast(start_pos, Node::EscapedChar { meta: true }));
756 Ok(PrimitiveKind::Literal(ch as char))
757 }
758 ch if is_escapable_char(ch) => {
759 const_try!(self.push_ast(start_pos, Node::EscapedChar { meta: false }));
760 Ok(PrimitiveKind::Literal(ch as char))
761 }
762 _ => Err(self.error(start_pos, ErrorKind::UnsupportedEscape)),
763 }
764 }
765
766 const fn try_parse_word_boundary(&mut self, escape_start: usize) -> Result<bool, Error> {
767 const fn is_valid_char(ch: u8) -> bool {
768 matches!(ch, b'A'..=b'Z' | b'a'..=b'z' | b'-')
769 }
770
771 const KNOWN_BOUNDARIES: &[&[u8]] = &[b"start", b"end", b"start-half", b"end-half"];
772
773 let backup = self.backup();
774 debug_assert!(self.regex_bytes[self.pos] == b'{');
775 self.pos += 1;
776
777 const_try!(self.gobble_whitespace_and_comments());
778 let start_pos = self.pos;
779 if self.is_eof() {
780 return Err(self.error(start_pos, ErrorKind::UnfinishedWordBoundary));
781 }
782
783 if !is_valid_char(self.regex_bytes[self.pos]) {
784 self.restore(&backup);
785 return Ok(false); }
787
788 while !self.is_eof() && is_valid_char(self.regex_bytes[self.pos]) {
789 self.pos += 1;
790 }
791 let end_pos = self.pos;
792 const_try!(self.gobble_whitespace_and_comments());
793
794 if self.ignore_whitespace {
795 if let Some(ch) = self.peek_whitespace(self.pos) {
797 if ch.is_ascii() && is_valid_char(ch as u8) {
798 return Err(self.disallowed_error(end_pos));
799 }
800 }
801 }
802
803 if self.is_eof() || self.regex_bytes[self.pos] != b'}' {
804 return Err(self.error(start_pos, ErrorKind::UnfinishedWordBoundary));
805 }
806
807 if !self.matches_any(start_pos..end_pos, KNOWN_BOUNDARIES) {
809 return Err(ErrorKind::UnknownWordBoundary.with_position(start_pos..end_pos));
810 }
811
812 self.pos += 1; const_try!(self.push_ast(escape_start, Node::StdAssertion));
814 Ok(true)
815 }
816
817 const fn parse_hex_escape(&mut self, start_pos: usize, marker_ch: u8) -> Result<char, Error> {
828 let Some(current_char) = self.ascii_char() else {
829 return Err(self.error(start_pos, ErrorKind::UnfinishedEscape));
830 };
831 if current_char == b'{' {
832 self.parse_hex_brace(start_pos)
834 } else {
835 let expected_digits = match marker_ch {
836 b'x' => 2,
837 b'u' => 4,
838 b'U' => 8,
839 _ => unreachable!(),
840 };
841 self.parse_hex_digits(start_pos, expected_digits)
842 }
843 }
844
845 const fn parse_hex_brace(&mut self, start_pos: usize) -> Result<char, Error> {
846 self.pos += 1; let first_digit_pos = self.pos;
849 let mut hex = 0_u32;
850 while self.pos < self.regex_bytes.len() && self.regex_bytes[self.pos] != b'}' {
851 let digit = self.regex_bytes[self.pos];
852 self.pos += 1;
853 let digit = match digit {
854 b'0'..=b'9' => digit - b'0',
855 b'a'..=b'f' => digit - b'a' + 10,
856 b'A'..=b'F' => digit - b'A' + 10,
857 _ if self.ignore_whitespace && digit.is_ascii_whitespace() => {
858 return Err(self.error(self.pos - 1, ErrorKind::DisallowedWhitespace));
860 }
861 b'#' if self.ignore_whitespace => {
862 return Err(self.error(self.pos - 1, ErrorKind::DisallowedComment));
864 }
865 _ => return Err(self.error(start_pos, ErrorKind::InvalidHex)),
866 };
867 if self.pos >= first_digit_pos + 8 {
868 return Err(self.error(start_pos, ErrorKind::NonUnicodeHex));
869 }
870 hex = hex * 16 + digit as u32;
872 }
873
874 if self.is_eof() {
875 return Err(self.error(start_pos, ErrorKind::UnfinishedEscape));
876 }
877
878 debug_assert!(self.regex_bytes[self.pos] == b'}');
880 self.pos += 1;
881
882 if self.pos == first_digit_pos + 1 {
883 Err(self.error(start_pos, ErrorKind::EmptyHex))
884 } else {
885 match char::from_u32(hex) {
886 Some(ch) => Ok(ch),
887 None => Err(self.error(start_pos, ErrorKind::NonUnicodeHex)),
888 }
889 }
890 }
891
892 const fn parse_hex_digits(
893 &mut self,
894 start_pos: usize,
895 expected_digits: usize,
896 ) -> Result<char, Error> {
897 let first_digit_pos = self.pos;
898 let mut hex = 0_u32;
899 while self.pos - first_digit_pos < expected_digits {
900 if self.is_eof() {
901 return Err(self.error(start_pos, ErrorKind::UnfinishedEscape));
902 }
903 let digit = self.regex_bytes[self.pos];
904 self.pos += 1; let digit = match digit {
907 b'0'..=b'9' => digit - b'0',
908 b'a'..=b'f' => digit - b'a' + 10,
909 b'A'..=b'F' => digit - b'A' + 10,
910 _ if self.ignore_whitespace && digit.is_ascii_whitespace() => {
911 return Err(self.error(self.pos - 1, ErrorKind::DisallowedWhitespace));
913 }
914 b'#' if self.ignore_whitespace => {
915 return Err(self.error(self.pos - 1, ErrorKind::DisallowedComment));
917 }
918 _ => return Err(self.error(start_pos, ErrorKind::InvalidHex)),
919 };
920 hex = hex * 16 + digit as u32;
922 }
923
924 match char::from_u32(hex) {
925 Some(ch) => Ok(ch),
926 None => Err(self.error(start_pos, ErrorKind::NonUnicodeHex)),
927 }
928 }
929
930 const fn is_eof(&self) -> bool {
931 self.pos == self.regex_bytes.len()
932 }
933
934 const fn parse_group_start_or_flags(&mut self) -> Result<(), Error> {
935 const LOOKAROUND_PREFIXES: &[&[u8]] = &[b"?=", b"?!", b"?<=", b"?<!"];
936 const NAMED_PREFIXES: &[&[u8]] = &[b"?P<", b"?<"];
937
938 let start_pos = self.pos;
940 debug_assert!(self.regex_bytes[self.pos] == b'(');
941 self.pos += 1;
942
943 let start_ast_idx = self.syntax.len();
944 const_try!(self.push_ast(
945 start_pos,
946 Node::GroupStart {
947 name: None,
948 flags: None,
949 }
950 ));
951 const_try!(self.gobble_whitespace_and_comments());
952
953 if self.is_eof() {
954 return Err(self.error(start_pos, ErrorKind::UnfinishedGroup));
955 }
956
957 let is_lookaround = self.gobble_any(LOOKAROUND_PREFIXES);
958 if is_lookaround {
959 return Err(self.error(start_pos, ErrorKind::LookaroundNotSupported));
960 }
961
962 let name_start = self.pos;
963 let is_named = self.gobble_any(NAMED_PREFIXES);
964 if is_named {
965 let start = Span::new(name_start, self.pos);
966 let name_ast = const_try!(self.parse_capture_name(start));
967 if let SyntaxOrLen::Syntax(syntax) = &mut self.syntax {
968 if let Node::GroupStart { name, .. } = &mut syntax.index_mut(start_ast_idx).node {
969 *name = Some(name_ast);
970 }
971 }
972
973 const_try!(self.group_names.insert(name_ast.name, self.regex_bytes));
974 }
975
976 let mut spanned_flags = None;
977 let mut is_standalone_flags = false;
978 if !is_named && self.regex_bytes[self.pos] == b'?' {
979 let flags_start = self.pos;
980 let flags = const_try!(self.parse_flags());
981 if !flags.is_empty {
982 spanned_flags = Some((flags, Span::new(flags_start, self.pos)));
983 }
984
985 let ch = self.regex_bytes[self.pos];
986 debug_assert!(ch == b':' || ch == b')');
987
988 if ch == b')' {
989 if flags.is_empty {
991 self.pos += 1;
993 return Err(self.error(start_pos, ErrorKind::MissingRepetition));
995 }
996 is_standalone_flags = true;
998 } else {
999 self.pos += 1;
1000 }
1001 }
1002
1003 if let SyntaxOrLen::Syntax(syntax) = &mut self.syntax {
1004 if let (Node::GroupStart { flags, .. }, Some((_, span))) =
1005 (&mut syntax.index_mut(start_ast_idx).node, &spanned_flags)
1006 {
1007 *flags = Some(*span);
1008 }
1009 }
1010
1011 let new_ignore_whitespace = if let Some((
1012 Flags {
1013 ignore_whitespace: Some(ws),
1014 ..
1015 },
1016 _,
1017 )) = spanned_flags
1018 {
1019 ws
1020 } else {
1021 self.ignore_whitespace
1022 };
1023
1024 let push_result = self.groups.push(GroupFrame {
1025 prev_ignore_whitespace: if is_standalone_flags {
1026 new_ignore_whitespace
1029 } else {
1030 let prev = self.ignore_whitespace;
1031 self.ignore_whitespace = new_ignore_whitespace;
1032 prev
1033 },
1034 });
1035 if push_result.is_err() {
1036 return Err(self.error(start_pos, ErrorKind::GroupDepthOverflow));
1037 }
1038
1039 if is_standalone_flags {
1041 const_try!(self.end_group());
1042 }
1043 Ok(())
1044 }
1045
1046 const fn parse_capture_name(&mut self, start: Span) -> Result<GroupName, Error> {
1048 const fn is_capture_char(ch: u8, is_first: bool) -> bool {
1049 if is_first {
1050 ch == b'_' || ch.is_ascii_alphabetic()
1051 } else {
1052 ch == b'_' || ch == b'.' || ch == b'[' || ch == b']' || ch.is_ascii_alphanumeric()
1053 }
1054 }
1055
1056 let start_pos = self.pos;
1057 while self.pos < self.regex_bytes.len() && self.regex_bytes[self.pos] != b'>' {
1058 let ch = self.regex_bytes[self.pos];
1059 if ch > 0x7f {
1060 return Err(self.error(start_pos, ErrorKind::NonAsciiCaptureName));
1061 }
1062
1063 let is_first = start_pos == self.pos;
1064 self.pos += 1;
1065 if !is_capture_char(ch, is_first) {
1066 return Err(self.error(start_pos, ErrorKind::InvalidCaptureName));
1067 }
1068 }
1069
1070 if self.is_eof() {
1071 return Err(self.error(start_pos, ErrorKind::UnfinishedCaptureName));
1072 }
1073 if start_pos == self.pos {
1074 return Err(self.error(start_pos, ErrorKind::EmptyCaptureName));
1075 }
1076
1077 debug_assert!(self.regex_bytes[self.pos] == b'>');
1078 self.pos += 1;
1079
1080 Ok(GroupName {
1081 start,
1082 name: Span::new(start_pos, self.pos - 1),
1083 end: Span::new(self.pos - 1, self.pos),
1084 })
1085 }
1086
1087 const fn parse_flags(&mut self) -> Result<Flags, Error> {
1089 const KNOWN_FLAGS_COUNT: usize = 7;
1090 const KNOWN_FLAGS: [u8; KNOWN_FLAGS_COUNT] = *b"ximsUuR";
1091
1092 let start_pos = self.pos;
1093 debug_assert!(self.regex_bytes[self.pos] == b'?');
1094 self.pos += 1;
1095
1096 let mut negation = false;
1097 let mut flag_values = [None::<bool>; KNOWN_FLAGS_COUNT];
1098 let mut is_empty = true;
1099 let mut is_empty_negation = true;
1100 while !self.is_eof() {
1101 let ch = self.regex_bytes[self.pos];
1102 if ch == b':' || ch == b')' {
1103 break;
1104 }
1105 self.pos += 1;
1106
1107 if ch == b'-' {
1108 if negation {
1109 return Err(self.error(self.pos - 1, ErrorKind::RepeatedFlagNegation));
1110 }
1111 negation = true;
1112 continue;
1113 }
1114
1115 let mut i = 0;
1116 while i < KNOWN_FLAGS_COUNT {
1117 if ch == KNOWN_FLAGS[i] {
1118 break;
1119 }
1120 i += 1;
1121 }
1122 if i == KNOWN_FLAGS_COUNT {
1123 return Err(self.error(self.pos - 1, ErrorKind::UnsupportedFlag));
1124 }
1125
1126 if let Some(prev_value) = flag_values[i] {
1127 let err = ErrorKind::RepeatedFlag {
1128 contradicting: prev_value == negation,
1129 };
1130 return Err(self.error(self.pos - 1, err));
1131 }
1132 flag_values[i] = Some(!negation);
1133 is_empty = false;
1134 is_empty_negation = !negation;
1135 }
1136
1137 if self.is_eof() {
1138 return Err(self.error(start_pos, ErrorKind::UnfinishedFlags));
1139 }
1140 if negation && is_empty_negation {
1141 return Err(self.error(self.pos - 1, ErrorKind::UnfinishedFlagsNegation));
1142 }
1143
1144 Ok(Flags {
1145 is_empty,
1146 ignore_whitespace: flag_values[0],
1147 })
1148 }
1149
1150 const fn end_group(&mut self) -> Result<(), Error> {
1151 let start_pos = self.pos;
1152 debug_assert!(self.regex_bytes[self.pos] == b')');
1153 self.pos += 1;
1154
1155 if let Some(popped) = self.groups.pop() {
1156 self.ignore_whitespace = popped.prev_ignore_whitespace;
1157 } else {
1158 return Err(self.error(start_pos, ErrorKind::NonMatchingGroupEnd));
1159 }
1160 self.push_ast(start_pos, Node::GroupEnd)
1161 }
1162
1163 const fn parse_set_class_start(&mut self) -> Result<(), Error> {
1166 let start_pos = self.pos;
1167 debug_assert!(self.regex_bytes[self.pos] == b'[');
1168 self.pos += 1;
1169
1170 let ast_idx = self.syntax.len();
1171 const_try!(self.push_ast(start_pos, Node::SetStart { negation: None }));
1172
1173 const_try!(self.gobble_whitespace_and_comments());
1174 if self.is_eof() {
1175 return Err(self.error(start_pos, ErrorKind::UnfinishedSet));
1176 }
1177
1178 if self.regex_bytes[self.pos] == b'^' {
1179 self.pos += 1;
1180 let negation_range = Span::new(self.pos - 1, self.pos);
1181 const_try!(self.gobble_whitespace_and_comments());
1182 if self.is_eof() {
1183 return Err(self.error(start_pos, ErrorKind::UnfinishedSet));
1184 }
1185
1186 if let SyntaxOrLen::Syntax(syntax) = &mut self.syntax {
1187 if let Node::SetStart { negation } = &mut syntax.index_mut(ast_idx).node {
1188 *negation = Some(negation_range);
1189 }
1190 }
1191 }
1192
1193 if self.regex_bytes[self.pos] == b']' {
1194 self.pos += 1;
1196 const_try!(self.gobble_whitespace_and_comments());
1197 } else {
1198 while !self.is_eof() && self.regex_bytes[self.pos] == b'-' {
1200 self.pos += 1;
1201 const_try!(self.gobble_whitespace_and_comments());
1202 }
1203 }
1204 Ok(())
1205 }
1206
1207 const fn set_step(&mut self) -> Result<(), Error> {
1208 debug_assert!(self.set_depth > 0);
1209
1210 const_try!(self.gobble_whitespace_and_comments());
1211 if self.is_eof() {
1212 return Err(self.error(self.pos, ErrorKind::UnfinishedSet));
1213 }
1214
1215 let op_start = self.pos;
1216 if self.gobble_any(&[b"&&", b"--", b"~~"]) {
1217 const_try!(self.push_ast(op_start, Node::SetOp));
1218 return Ok(());
1219 }
1220
1221 let ch = self.regex_bytes[self.pos];
1222 match ch {
1223 b'[' => {
1224 if self.try_parse_ascii_class() {
1226 const_try!(self.push_ast(op_start, Node::AsciiClass));
1227 } else {
1228 self.pos = op_start; const_try!(self.parse_set_class_start());
1230 self.set_depth += 1;
1231 }
1232 }
1233 b']' => {
1234 self.pos += 1;
1235 const_try!(self.push_ast(op_start, Node::SetEnd));
1236 self.set_depth -= 1;
1237 }
1238 _ => const_try!(self.parse_set_class_range()),
1239 }
1240 Ok(())
1241 }
1242
1243 const fn try_parse_ascii_class(&mut self) -> bool {
1247 const CLASSES: &[&[u8]] = &[
1248 b"alnum", b"alpha", b"ascii", b"blank", b"cntrl", b"digit", b"graph", b"lower",
1249 b"print", b"punct", b"space", b"upper", b"word", b"xdigit",
1250 ];
1251
1252 debug_assert!(self.regex_bytes[self.pos] == b'[');
1253 self.pos += 1;
1254
1255 if self.is_eof() || self.regex_bytes[self.pos] != b':' {
1256 return false;
1257 }
1258
1259 self.pos += 1;
1260 if self.is_eof() {
1261 return false;
1262 }
1263 if self.regex_bytes[self.pos] == b'^' {
1264 self.pos += 1;
1265 if self.is_eof() {
1266 return false;
1267 }
1268 }
1269
1270 let is_known_class = self.gobble_any(CLASSES);
1271 if !is_known_class {
1273 return false;
1274 }
1275
1276 !self.is_eof() && self.gobble(b":]")
1277 }
1278
1279 const fn parse_set_class_range(&mut self) -> Result<(), Error> {
1280 let start_pos = self.pos;
1281 let start_item = const_try!(self.parse_set_class_item());
1282 const_try!(self.gobble_whitespace_and_comments());
1283 if self.is_eof() {
1284 return Err(self.error(self.pos, ErrorKind::UnfinishedSet));
1285 }
1286
1287 let ch = self.regex_bytes[self.pos];
1288 if ch != b'-' || {
1289 let next_ch = self.peek_whitespace(self.pos + 1);
1290 matches!(next_ch, Some(']' | '-'))
1291 } {
1292 if !start_item.is_valid_set_member() {
1294 return Err(self.error(start_pos, ErrorKind::InvalidEscapeInSet));
1295 }
1296 return Ok(());
1297 }
1298
1299 let PrimitiveKind::Literal(range_start) = start_item else {
1300 return Err(self.error(start_pos, ErrorKind::InvalidRangeStart));
1301 };
1302
1303 debug_assert!(ch == b'-');
1304 self.pos += 1;
1305 const_try!(self.push_ast(self.pos - 1, Node::SetRange));
1306 const_try!(self.gobble_whitespace_and_comments());
1307
1308 let end_start_pos = self.pos;
1309 let PrimitiveKind::Literal(range_end) = const_try!(self.parse_set_class_item()) else {
1310 return Err(self.error(end_start_pos, ErrorKind::InvalidRangeEnd));
1311 };
1312
1313 if range_start > range_end {
1314 return Err(self.error(start_pos, ErrorKind::InvalidRange));
1315 }
1316 Ok(())
1317 }
1318
1319 const fn parse_set_class_item(&mut self) -> Result<PrimitiveKind, Error> {
1320 let Some((ch, next_pos)) = split_first_char(self.regex_bytes, self.pos) else {
1321 return Err(self.error(self.pos, ErrorKind::UnfinishedSet));
1322 };
1323 if ch == '\\' {
1324 self.parse_escape()
1325 } else {
1326 self.pos = next_pos;
1327 Ok(PrimitiveKind::Literal(ch))
1328 }
1329 }
1330
1331 pub(crate) const fn step(&mut self) -> Result<ops::ControlFlow<()>, Error> {
1332 if self.set_depth > 0 {
1333 const_try!(self.set_step());
1334 return Ok(ops::ControlFlow::Continue(()));
1335 }
1336
1337 const_try!(self.gobble_whitespace_and_comments());
1338
1339 let Some((current_ch, next_pos)) = split_first_char(self.regex_bytes, self.pos) else {
1340 if !self.groups.is_empty() {
1341 return Err(self.error(self.regex_bytes.len(), ErrorKind::UnfinishedGroup));
1342 }
1343 return Ok(ops::ControlFlow::Break(()));
1344 };
1345 match current_ch {
1346 '(' => {
1347 const_try!(self.parse_group_start_or_flags());
1348 self.is_empty_last_item = true;
1350 }
1351 ')' => {
1352 const_try!(self.end_group());
1353 self.is_empty_last_item = false;
1354 }
1355 '|' => {
1356 self.pos += 1;
1358 const_try!(self.push_ast(self.pos - 1, Node::Alteration));
1359 self.is_empty_last_item = true;
1360 }
1361 '[' => {
1362 const_try!(self.parse_set_class_start());
1363 self.set_depth = 1;
1364 self.is_empty_last_item = false;
1365 }
1366 '?' | '*' | '+' => {
1367 const_try!(self.parse_uncounted_repetition());
1368 self.is_empty_last_item = false;
1369 }
1370 '{' => {
1371 const_try!(self.parse_counted_repetition());
1372 self.is_empty_last_item = false;
1373 }
1374 _ => {
1375 const_try!(self.parse_primitive(current_ch, next_pos));
1376 self.is_empty_last_item = false;
1377 }
1378 }
1379 Ok(ops::ControlFlow::Continue(()))
1380 }
1381
1382 pub(crate) const fn into_syntax(self) -> Syntax<CAP> {
1383 match self.syntax {
1384 SyntaxOrLen::Syntax(syntax) => syntax,
1385 SyntaxOrLen::Len(_) => Syntax::new(Spanned::DUMMY),
1386 }
1387 }
1388}