compile_regex/lib.rs
1//! Compile-time regular expression validation and parsing.
2//!
3//! This library provides compile-time validation and parsing for regular expressions.
4//! It has only a lightweight [`compile-fmt`] dependency (to produce better panic messages)
5//! and is no-std / no-alloc compatible. Unlike some alternatives, it does not wrap a proc macro.
6//!
7//! The library strives to be compatible with [`regex`] / [`regex-syntax`] crates; it applies
8//! the same approach to parsing as the latter. It only implements parsing / validation; i.e.,
9//! it **does not** produce automata for matching against a regex. On the other hand, **almost all** of
10//! `regex` syntax is supported:
11//!
12//! - Whitespace control (`x` / `-x` flags) is correctly accounted for during parsing.
13//! - Duplicate capture names are correctly checked for, so e.g. `(?<t>.)(?<t>.)` is invalid.
14//! - Counted repetition ranges are checked, so e.g. `.{2,1}` is invalid.
15//! - Char ranges in char sets are checked, so e.g. `[9-0]` is invalid.
16//!
17//! # Why?
18//!
19//! The main use case is checking whether a particular string constitutes a valid regex so that
20//! it can be supplied to a `Regex` constructor, e.g. via a [`LazyLock`](std::sync::LazyLock).
21//!
22//! Ultimately, it's a benchmark of how far one can take compile-time computations in Rust just by using
23//! a bunch of `const fn`s. As it turns out, it can get you pretty far.
24//!
25//! # Limitations
26//!
27//! - Unicode classes (`\p` and `\P` escapes) are not supported since it's almost impossible to check these
28//!   at compile time.
29//! - The depth of group nesting is limited to 8. (Nesting is used internally for whitespace control, i.e., the `x` flag.)
30//! - The number of named captures is limited to 16.
31//!
32//! # Alternatives / similar tools
33//!
34//! - Use [`regex`] or [`regex-syntax`] if you don't need compile-time validation / parsing.
35//! - There are a couple of crates that use `regex` + proc macro to move regex validation to compile time,
36//! for example, [`regex_static`](https://docs.rs/regex_static/).
37//! - [`ere`](https://docs.rs/ere/) parses and compiles regular expressions at compile time.
38//! It supports POSIX extended regexes (i.e., a strict subset of what `regex` supports), and still uses proc macros.
39//!
40//! # Crate features
41//!
42//! ## `alloc`
43//!
44//! *(On by default)*
45//!
46//! Enables support of alloc types, such as [`Vec`] in [`RegexOptions::try_parse_to_vec()`].
47//!
48//! ## `std`
49//!
50//! *(On by default)*
51//!
52//! Enables support of the standard library types, e.g. the [`Error`](std::error::Error) trait implementation
53//! for [`Error`].
54//!
55//! # Examples
56//!
57//! ```
58//! use compile_regex::{ast, parse, validate};
59//!
60//! // Validate a regex for phone numbers.
61//! const _: () = validate(r"(?<code>\+1\s*)?\(\d{3}\)\d{3}-\d{4}");
62//! // Parse the same regex with whitespace and additional named captures
63//! const PHONE_REGEX: &str = r"(?x)
64//! (?<intl> \+1\s*)? # International prefix
65//! (?<city> \( \d{3} \)) # City code
66//! \s*
67//! (?<num> \d{3}-\d{4})";
68//! const SYNTAX: &[ast::Spanned] = parse!(PHONE_REGEX);
69//!
70//! println!("{SYNTAX:#?}");
71//!
72//! // Get all named groups in the regex.
73//! let group_names = SYNTAX.iter().filter_map(|spanned| {
74//! if let ast::Node::GroupStart { name: Some(name), .. } = &spanned.node {
75//! return Some(&PHONE_REGEX[name.name]);
76//! }
77//! None
78//! });
79//! let group_names: Vec<_> = group_names.collect();
80//! assert_eq!(group_names, ["intl", "city", "num"]);
81//! ```
82//!
83//! ## Errors
84//!
85//! If the [`validate()`] function or the [`parse!`] macro fails, it raises a compile-time error:
86//!
87//! ```compile_fail
88//! # use compile_regex::validate;
89//! // Fails because '+' is not escaped and is thus treated
90//! // as a one-or-more quantifier
91//! const _: () = validate(r"(?<code>+1\s*)?");
92//! ```
93//!
94//! Getting information about an error:
95//!
96//! ```
97//! use compile_regex::{try_validate, Error, ErrorKind};
98//! # use assert_matches::assert_matches;
99//!
100//! const ERR: Error = match try_validate(r"(?<code>+1\s*)?") {
101//! Ok(_) => panic!("validation succeeded"),
102//! Err(err) => err,
103//! };
104//!
105//! assert_matches!(ERR.kind(), ErrorKind::MissingRepetition);
106//! assert_eq!(ERR.pos(), 8..9);
107//! ```
108//!
109//! ## See also
110//!
111//! See [`RegexOptions`] docs for more advanced use cases.
112//!
113//! [`compile-fmt`]: https://docs.rs/compile-fmt/
114//! [`regex`]: https://docs.rs/regex/
115//! [`regex-syntax`]: https://docs.rs/regex-syntax/
116
117// Conditional compilation
118#![cfg_attr(not(feature = "std"), no_std)]
119// Documentation settings
120#![doc(html_root_url = "https://docs.rs/compile-regex/0.1.0")]
121#![cfg_attr(docsrs, feature(doc_cfg))]
122
123pub use crate::{
124 errors::{Error, ErrorKind},
125 parse::{RegexOptions, ValidationOutput},
126 utils::Stack,
127};
128
129#[macro_use]
130mod utils;
131pub mod ast;
132mod errors;
133mod parse;
134#[cfg(test)]
135mod tests;
136
137/// `alloc` re-exports: `Vec` comes from `std` when that feature is on, and from the `alloc` crate otherwise.
138#[cfg(feature = "alloc")]
139mod alloc {
140    #[cfg(not(feature = "std"))]
141    extern crate alloc as std; // without `std`, alias `alloc` under the name `std` so the `use` below resolves
142
143    pub(crate) use std::vec::Vec;
144}
145
146/// Checks `regex` for validity using the default [options](RegexOptions).
147///
148/// # Errors
149///
150/// Returns an [`Error`] if `regex` is not a valid regular expression.
151pub const fn try_validate(regex: &str) -> Result<(), Error> {
152    if let Err(err) = RegexOptions::DEFAULT.try_validate(regex) {
153        return Err(err);
154    }
155    Ok(()) // the successful validation output is not needed here, only the verdict
156}
157
158/// Checks that `regex` is a valid regular expression under the default [options](RegexOptions),
159/// panicking otherwise. Equivalent to [`try_validate()`]`.unwrap()`.
160///
161/// # Panics
162///
163/// Panics if `regex` fails validation.
164#[track_caller]
165pub const fn validate(regex: &str) {
166    let _ = RegexOptions::validate(&RegexOptions::DEFAULT, regex);
167}
168
169/// Produces [spanned syntax nodes](ast::Spanned) for the provided regex. The regex must be a constant expression
170/// (but not necessarily a string literal).
171///
172/// This is the preferred way to define syntax nodes at compile time (as opposed to using [`RegexOptions::parse()`])
173/// because the latter can lead to unused [`Syntax`](ast::Syntax) capacity added to the data section
174/// of the built executable. This padding is inaccessible, but the Rust compiler isn't smart enough to realize this.
175/// This macro computes the exact necessary capacity to store syntax nodes.
176///
177/// # Examples
178///
179/// ```
180/// use compile_regex::{ast, parse};
181/// # use assert_matches::assert_matches;
182///
183/// const SYNTAX: &[ast::Spanned] = parse!(r"^\s*\d{3,5}?");
184///
185/// assert_eq!(SYNTAX.len(), 5);
186/// assert_matches!(SYNTAX[0].node, ast::Node::LineAssertion); // ^
187/// assert_matches!(SYNTAX[4].node, ast::Node::CountedRepetition(_)); // {3,5}?
188/// ```
189///
190/// ## Use with `RegexOptions`
191///
192/// The macro optionally accepts parsing options.
193///
194/// ```
195/// use compile_regex::{ast, parse, RegexOptions};
196///
197/// const SYNTAX: &[ast::Spanned] = parse!(
198/// options: RegexOptions::DEFAULT.ignore_whitespace(true),
199/// r"(?<digits> # This is a comment :) so the closing brace should be ignored
200/// [0- 9]+ # without ignoring whitespace, this range would be invalid
201/// )"
202/// );
203///
204/// assert!(SYNTAX
205/// .iter()
206/// .any(|spanned| matches!(spanned.node, ast::Node::SetRange)));
207/// ```
208#[macro_export]
209macro_rules! parse {
210    ($regex:expr) => { // regex only: delegate to the `options:` arm with the default options
211        $crate::parse!(options: $crate::RegexOptions::DEFAULT, $regex)
212    };
213    (options: $options:expr, $regex:expr) => {{ // note: `$options` and `$regex` are each expanded twice below
214        const CAP: usize = $crate::RegexOptions::validate(&$options, $regex).node_count; // pass 1: validate only, to obtain the node count
215        $crate::RegexOptions::parse::<CAP>(&$options, $regex).as_slice() // pass 2: parse with the capacity set to exactly that count
216    }};
217}
218
219#[cfg(doctest)] // compiled only while rustdoc is collecting doctests
220doc_comment::doctest!("../README.md"); // presumably registers the README's code examples as doctests (TODO confirm)