Skip to content

Commit

Permalink
progress: API and docs refresh
Browse files Browse the repository at this point in the history
  • Loading branch information
BurntSushi committed May 18, 2023
1 parent 9c4ce43 commit 5aa3eb5
Show file tree
Hide file tree
Showing 36 changed files with 5,348 additions and 1,366 deletions.
35 changes: 22 additions & 13 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ std = [
"regex-automata/std",
"regex-syntax/std",
]
# This feature enables the 'log' crate to emit messages. This is usually
# only useful for folks working on the regex crate itself, but can be useful
# if you're trying hard to do some performance hacking on regex patterns
# themselves. Note that you'll need to pair this with a crate like 'env_logger'
# to actually emit the log messages somewhere.
logging = [
"aho-corasick?/logging",
"regex-automata/logging",
]
# The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
# then, it is an alias for the 'std' feature.
use_std = ["std"]
Expand All @@ -64,11 +73,6 @@ perf = [
"perf-inline",
"perf-literal",
]
# Enables fast caching. (If disabled, caching is still used, but is slower.)
# Currently, this feature has no effect. It used to remove the thread_local
# dependency and use a slower internal cache, but now the default cache has
# been improved and thread_local is no longer a dependency at all.
perf-cache = []
# Enables use of a lazy DFA when possible.
perf-dfa = ["regex-automata/hybrid"]
# Enables use of a fully compiled DFA when possible.
Expand All @@ -86,6 +90,11 @@ perf-literal = [
"dep:memchr",
"regex-automata/perf-literal",
]
# Enables fast caching. (If disabled, caching is still used, but is slower.)
# Currently, this feature has no effect. It used to remove the thread_local
# dependency and use a slower internal cache, but now the default cache has
# been improved and thread_local is no longer a dependency at all.
perf-cache = []


# UNICODE DATA FEATURES
Expand Down Expand Up @@ -151,7 +160,7 @@ unstable = ["pattern"]
# by default if the unstable feature is enabled.
pattern = []

# For very fast prefix literal matching.
# For very fast multi-prefix literal matching.
[dependencies.aho-corasick]
version = "1.0.0"
optional = true
Expand All @@ -161,22 +170,22 @@ optional = true
version = "2.5.0"
optional = true

# For parsing regular expressions.
[dependencies.regex-syntax]
path = "regex-syntax"
version = "0.7.1"
default-features = false

# For the actual regex engines.
[dependencies.regex-automata]
path = "regex-automata"
version = "0.3.0"
default-features = false
features = ["alloc", "syntax", "meta", "nfa-pikevm"]

# For parsing regular expressions.
[dependencies.regex-syntax]
path = "regex-syntax"
version = "0.7.1"
default-features = false

[dev-dependencies]
# For examples.
lazy_static = "1"
once_cell = "1.17.1"
# For property based tests.
quickcheck = { version = "1.0.3", default-features = false }
# To check README's example
Expand Down
2 changes: 1 addition & 1 deletion regex-automata/src/hybrid/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1954,7 +1954,7 @@ impl Cache {
/// This panics if no search has been started by [`Cache::search_start`].
#[inline]
pub fn search_update(&mut self, at: usize) {
let mut p =
let p =
self.progress.as_mut().expect("no in-progress search to update");
p.at = at;
}
Expand Down
6 changes: 6 additions & 0 deletions regex-automata/src/meta/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2819,6 +2819,12 @@ impl Config {
///
/// By default, `\n` is the line terminator.
///
/// **Warning**: This does not change the behavior of `.`. To do that,
/// you'll need to configure the syntax option
/// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator)
/// in addition to this. Otherwise, `.` will continue to match any
/// character other than `\n`.
///
/// # Example
///
/// ```
Expand Down
34 changes: 34 additions & 0 deletions regex-automata/src/util/syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ pub struct Config {
multi_line: bool,
dot_matches_new_line: bool,
crlf: bool,
line_terminator: u8,
swap_greed: bool,
ignore_whitespace: bool,
unicode: bool,
Expand All @@ -164,6 +165,7 @@ impl Config {
multi_line: false,
dot_matches_new_line: false,
crlf: false,
line_terminator: b'\n',
swap_greed: false,
ignore_whitespace: false,
unicode: true,
Expand Down Expand Up @@ -239,6 +241,31 @@ impl Config {
self
}

/// Sets the line terminator used by `(?u-s:.)` and `(?-us:.)`.
///
/// By default, `.` matches everything except for `\n`. With this option
/// set, `.` instead matches everything except for the byte given.
///
/// Any ASCII byte is always an acceptable line terminator. A non-ASCII
/// byte may result in an error, depending on the modes in effect where
/// `.` is used:
///
/// * When Unicode mode is enabled, a non-ASCII byte results in an error.
/// * When Unicode mode is disabled, any byte is permitted, unless UTF-8
///   mode is enabled and the byte is non-ASCII, in which case an error is
///   returned.
///
/// Note that if `R` mode is enabled then it always takes precedence and
/// the line terminator will be treated as `\r` and `\n` simultaneously.
///
/// Note also that this *doesn't* impact the look-around assertions
/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
/// configuration in the regex engine itself.
pub fn line_terminator(self, byte: u8) -> Config {
    // Take ownership of the builder, update the one field and hand the
    // builder back so that calls can be chained.
    let mut config = self;
    config.line_terminator = byte;
    config
}

/// Enable or disable the "swap greed" flag by default.
///
/// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
Expand Down Expand Up @@ -377,6 +404,11 @@ impl Config {
self.crlf
}

/// Returns the line terminator byte set in this syntax configuration.
///
/// This is the byte most recently given to [`Config::line_terminator`],
/// or `\n` if it was never set.
pub fn get_line_terminator(&self) -> u8 {
self.line_terminator
}

/// Returns whether "swap greed" mode is enabled.
pub fn get_swap_greed(&self) -> bool {
self.swap_greed
Expand Down Expand Up @@ -410,6 +442,7 @@ impl Config {
.multi_line(self.multi_line)
.dot_matches_new_line(self.dot_matches_new_line)
.crlf(self.crlf)
.line_terminator(self.line_terminator)
.swap_greed(self.swap_greed)
.ignore_whitespace(self.ignore_whitespace)
.utf8(self.utf8)
Expand All @@ -436,6 +469,7 @@ impl Config {
.multi_line(self.multi_line)
.crlf(self.crlf)
.dot_matches_new_line(self.dot_matches_new_line)
.line_terminator(self.line_terminator)
.swap_greed(self.swap_greed)
.utf8(self.utf8);
}
Expand Down
1 change: 1 addition & 0 deletions regex-automata/tests/dfa/onepass/suite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}
1 change: 1 addition & 0 deletions regex-automata/tests/dfa/suite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}

/// Execute an overlapping search, and for each match found, also find its
Expand Down
1 change: 1 addition & 0 deletions regex-automata/tests/hybrid/suite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}

/// Execute an overlapping search, and for each match found, also find its
Expand Down
1 change: 1 addition & 0 deletions regex-automata/tests/meta/suite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,4 +196,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}
1 change: 1 addition & 0 deletions regex-automata/tests/nfa/thompson/backtrack/suite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,4 +209,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}
1 change: 1 addition & 0 deletions regex-automata/tests/nfa/thompson/pikevm/suite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
.line_terminator(test.line_terminator())
}
2 changes: 0 additions & 2 deletions regex-lite/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ mod fuzz;
mod string;

const BLACKLIST: &[&str] = &[
// CRLF-aware line anchors aren't supported in regex API yet.
"crlf",
// Custom line terminators aren't supported in regex-lite. We could add it,
// but it didn't seem worth it.
"line-terminator",
Expand Down
39 changes: 39 additions & 0 deletions regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ pub enum ErrorKind {
/// This error occurs when translating a pattern that could match a byte
/// sequence that isn't UTF-8 and `utf8` was enabled.
InvalidUtf8,
/// This error occurs when one uses a non-ASCII byte for a line terminator,
/// but where Unicode mode is enabled and UTF-8 mode is disabled.
InvalidLineTerminator,
/// This occurs when an unrecognized Unicode property name could not
/// be found.
UnicodePropertyNotFound,
Expand Down Expand Up @@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind {
let msg = match *self {
UnicodeNotAllowed => "Unicode not allowed here",
InvalidUtf8 => "pattern can match invalid UTF-8",
InvalidLineTerminator => "invalid line terminator, must be ASCII",
UnicodePropertyNotFound => "Unicode property not found",
UnicodePropertyValueNotFound => "Unicode property value not found",
UnicodePerlClassNotFound => {
Expand Down Expand Up @@ -648,6 +652,12 @@ impl Hir {
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
Hir::class(Class::Bytes(cls))
}
Dot::AnyCharExcept(ch) => {
let mut cls =
ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
cls.negate();
Hir::class(Class::Unicode(cls))
}
Dot::AnyCharExceptLF => {
let mut cls = ClassUnicode::empty();
cls.push(ClassUnicodeRange::new('\0', '\x09'));
Expand All @@ -661,6 +671,12 @@ impl Hir {
cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
Hir::class(Class::Unicode(cls))
}
Dot::AnyByteExcept(byte) => {
let mut cls =
ClassBytes::new([ClassBytesRange::new(byte, byte)]);
cls.negate();
Hir::class(Class::Bytes(cls))
}
Dot::AnyByteExceptLF => {
let mut cls = ClassBytes::empty();
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
Expand Down Expand Up @@ -1772,6 +1788,18 @@ pub enum Dot {
///
/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
AnyByte,
/// Matches the UTF-8 encoding of any Unicode scalar value except for the
/// `char` given.
///
/// This is equivalent to using `(?u-s:.)` with the line terminator set
/// to a particular ASCII byte. (Because of peculiarities in the regex
/// engines, a line terminator must be a single byte. It follows that when
/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
/// value. That is, it must be ASCII.)
///
/// (This and `AnyCharExceptLF` both exist because of legacy reasons.
/// `AnyCharExceptLF` will be dropped in the next breaking change release.)
AnyCharExcept(char),
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
///
/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
Expand All @@ -1781,6 +1809,17 @@ pub enum Dot {
///
/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
AnyCharExceptCRLF,
/// Matches any byte value except for the `u8` given.
///
/// This is equivalent to using `(?-us:.)` with the line terminator set
/// to a particular ASCII byte. (Because of peculiarities in the regex
/// engines, a line terminator must be a single byte. It follows that when
/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
/// value. That is, it must be ASCII.)
///
/// (This and `AnyByteExceptLF` both exist because of legacy reasons.
/// `AnyByteExceptLF` will be dropped in the next breaking change release.)
AnyByteExcept(u8),
/// Matches any byte value except for `\n`.
///
/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
Expand Down
Loading

0 comments on commit 5aa3eb5

Please sign in to comment.