progress: API and docs refresh

rust-lang · May 18, 2023 · 5aa3eb5 · 5aa3eb5
1 parent 9c4ce43
commit 5aa3eb5
Show file tree

Hide file tree

Showing 36 changed files with 5,348 additions and 1,366 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -45,6 +45,15 @@ std = [
  "regex-automata/std",
  "regex-syntax/std",
 ]
+# This feature enables the 'log' crate to emit messages. This is usually
+# only useful for folks working on the regex crate itself, but can be useful
+# if you're trying hard to do some performance hacking on regex patterns
+# themselves. Note that you'll need to pair this with a crate like 'env_logger'
+# to actually emit the log messages somewhere.
+logging = [
+ "aho-corasick?/logging",
+ "regex-automata/logging",
+]
 # The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until
 # then, it is an alias for the 'std' feature.
 use_std = ["std"]
@@ -64,11 +73,6 @@ perf = [
  "perf-inline",
  "perf-literal",
 ]
-# Enables fast caching. (If disabled, caching is still used, but is slower.)
-# Currently, this feature has no effect. It used to remove the thread_local
-# dependency and use a slower internal cache, but now the default cache has
-# been improved and thread_local is no longer a dependency at all.
-perf-cache = []
 # Enables use of a lazy DFA when possible.
 perf-dfa = ["regex-automata/hybrid"]
 # Enables use of a fully compiled DFA when possible.
@@ -86,6 +90,11 @@ perf-literal = [
  "dep:memchr",
  "regex-automata/perf-literal",
 ]
+# Enables fast caching. (If disabled, caching is still used, but is slower.)
+# Currently, this feature has no effect. It used to remove the thread_local
+# dependency and use a slower internal cache, but now the default cache has
+# been improved and thread_local is no longer a dependency at all.
+perf-cache = []
 
 
 # UNICODE DATA FEATURES
@@ -151,7 +160,7 @@ unstable = ["pattern"]
 # by default if the unstable feature is enabled.
 pattern = []
 
-# For very fast prefix literal matching.
+# For very fast multi-prefix literal matching.
 [dependencies.aho-corasick]
 version = "1.0.0"
 optional = true
@@ -161,22 +170,22 @@ optional = true
 version = "2.5.0"
 optional = true
 
-# For parsing regular expressions.
-[dependencies.regex-syntax]
-path = "regex-syntax"
-version = "0.7.1"
-default-features = false
-
 # For the actual regex engines.
 [dependencies.regex-automata]
 path = "regex-automata"
 version = "0.3.0"
 default-features = false
 features = ["alloc", "syntax", "meta", "nfa-pikevm"]
 
+# For parsing regular expressions.
+[dependencies.regex-syntax]
+path = "regex-syntax"
+version = "0.7.1"
+default-features = false
+
 [dev-dependencies]
 # For examples.
-lazy_static = "1"
+once_cell = "1.17.1"
 # For property based tests.
 quickcheck = { version = "1.0.3", default-features = false }
 # To check README's example

diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs
@@ -1954,7 +1954,7 @@ impl Cache {
  /// This panics if no search has been started by [`Cache::search_start`].
  #[inline]
  pub fn search_update(&mut self, at: usize) {
- let mut p =
+ let p =
  self.progress.as_mut().expect("no in-progress search to update");
  p.at = at;
  }

diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs
@@ -2819,6 +2819,12 @@ impl Config {
  ///
  /// By default, `\n` is the line terminator.
  ///
+ /// **Warning**: This does not change the behavior of `.`. To do that,
+ /// you'll need to configure the syntax option
+ /// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator)
+ /// in addition to this. Otherwise, `.` will continue to match any
+ /// character other than `\n`.
+ ///
  /// # Example
  ///
  /// ```

diff --git a/regex-automata/src/util/syntax.rs b/regex-automata/src/util/syntax.rs
@@ -147,6 +147,7 @@ pub struct Config {
  multi_line: bool,
  dot_matches_new_line: bool,
  crlf: bool,
+ line_terminator: u8,
  swap_greed: bool,
  ignore_whitespace: bool,
  unicode: bool,
@@ -164,6 +165,7 @@ impl Config {
  multi_line: false,
  dot_matches_new_line: false,
  crlf: false,
+ line_terminator: b'\n',
  swap_greed: false,
  ignore_whitespace: false,
  unicode: true,
@@ -239,6 +241,31 @@ impl Config {
  self
  }
 
+ /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
+ ///
+ /// Namely, instead of `.` (by default) matching everything except for `\n`,
+ /// this will cause `.` to match everything except for the byte given.
+ ///
+ /// If `.` is used in a context where Unicode mode is enabled and this byte
+ /// isn't ASCII, then an error will be returned. When Unicode mode is
+ /// disabled, then any byte is permitted, but will return an error if UTF-8
+ /// mode is enabled and it is a non-ASCII byte.
+ ///
+ /// In short, any ASCII value for a line terminator is always okay. But a
+ /// non-ASCII byte might result in an error depending on whether Unicode
+ /// mode or UTF-8 mode are enabled.
+ ///
+ /// Note that if `R` mode is enabled then it always takes precedence and
+ /// the line terminator will be treated as `\r` and `\n` simultaneously.
+ ///
+ /// Note also that this *doesn't* impact the look-around assertions
+ /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
+ /// configuration in the regex engine itself.
+ pub fn line_terminator(mut self, byte: u8) -> Config {
+ self.line_terminator = byte;
+ self
+ }
+
  /// Enable or disable the "swap greed" flag by default.
  ///
  /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
@@ -377,6 +404,11 @@ impl Config {
  self.crlf
  }
 
+ /// Returns the line terminator in this syntax configuration.
+ pub fn get_line_terminator(&self) -> u8 {
+ self.line_terminator
+ }
+
  /// Returns whether "swap greed" mode is enabled.
  pub fn get_swap_greed(&self) -> bool {
  self.swap_greed
@@ -410,6 +442,7 @@ impl Config {
  .multi_line(self.multi_line)
  .dot_matches_new_line(self.dot_matches_new_line)
  .crlf(self.crlf)
+ .line_terminator(self.line_terminator)
  .swap_greed(self.swap_greed)
  .ignore_whitespace(self.ignore_whitespace)
  .utf8(self.utf8)
@@ -436,6 +469,7 @@ impl Config {
  .multi_line(self.multi_line)
  .crlf(self.crlf)
  .dot_matches_new_line(self.dot_matches_new_line)
+ .line_terminator(self.line_terminator)
  .swap_greed(self.swap_greed)
  .utf8(self.utf8);
  }

diff --git a/regex-automata/tests/dfa/onepass/suite.rs b/regex-automata/tests/dfa/onepass/suite.rs
@@ -193,4 +193,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
  .case_insensitive(test.case_insensitive())
  .unicode(test.unicode())
  .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
 }
diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs
@@ -391,6 +391,7 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
  .case_insensitive(test.case_insensitive())
  .unicode(test.unicode())
  .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
 }
 
 /// Execute an overlapping search, and for each match found, also find its

diff --git a/regex-automata/tests/hybrid/suite.rs b/regex-automata/tests/hybrid/suite.rs
@@ -281,6 +281,7 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
  .case_insensitive(test.case_insensitive())
  .unicode(test.unicode())
  .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
 }
 
 /// Execute an overlapping search, and for each match found, also find its

diff --git a/regex-automata/tests/meta/suite.rs b/regex-automata/tests/meta/suite.rs
@@ -196,4 +196,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
  .case_insensitive(test.case_insensitive())
  .unicode(test.unicode())
  .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
 }
diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs
@@ -209,4 +209,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
  .case_insensitive(test.case_insensitive())
  .unicode(test.unicode())
  .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
 }
diff --git a/regex-automata/tests/nfa/thompson/pikevm/suite.rs b/regex-automata/tests/nfa/thompson/pikevm/suite.rs
@@ -158,4 +158,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config {
  .case_insensitive(test.case_insensitive())
  .unicode(test.unicode())
  .utf8(test.utf8())
+ .line_terminator(test.line_terminator())
 }
diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs
@@ -2,8 +2,6 @@ mod fuzz;
 mod string;
 
 const BLACKLIST: &[&str] = &[
- // CRLF-aware line anchors aren't supported in regex API yet.
- "crlf",
  // Custom line terminators aren't supported in regex-lite. We could add it,
  // but it didn't seem worth it.
  "line-terminator",

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -88,6 +88,9 @@ pub enum ErrorKind {
  /// This error occurs when translating a pattern that could match a byte
  /// sequence that isn't UTF-8 and `utf8` was enabled.
  InvalidUtf8,
+ /// This error occurs when one uses a non-ASCII byte for a line terminator,
+ /// but where Unicode mode is enabled and UTF-8 mode is disabled.
+ InvalidLineTerminator,
  /// This occurs when an unrecognized Unicode property name could not
  /// be found.
  UnicodePropertyNotFound,
@@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind {
  let msg = match *self {
  UnicodeNotAllowed => "Unicode not allowed here",
  InvalidUtf8 => "pattern can match invalid UTF-8",
+ InvalidLineTerminator => "invalid line terminator, must be ASCII",
  UnicodePropertyNotFound => "Unicode property not found",
  UnicodePropertyValueNotFound => "Unicode property value not found",
  UnicodePerlClassNotFound => {
@@ -648,6 +652,12 @@ impl Hir {
  cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
  Hir::class(Class::Bytes(cls))
  }
+ Dot::AnyCharExcept(ch) => {
+ let mut cls =
+ ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
+ cls.negate();
+ Hir::class(Class::Unicode(cls))
+ }
  Dot::AnyCharExceptLF => {
  let mut cls = ClassUnicode::empty();
  cls.push(ClassUnicodeRange::new('\0', '\x09'));
@@ -661,6 +671,12 @@ impl Hir {
  cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
  Hir::class(Class::Unicode(cls))
  }
+ Dot::AnyByteExcept(byte) => {
+ let mut cls =
+ ClassBytes::new([ClassBytesRange::new(byte, byte)]);
+ cls.negate();
+ Hir::class(Class::Bytes(cls))
+ }
  Dot::AnyByteExceptLF => {
  let mut cls = ClassBytes::empty();
  cls.push(ClassBytesRange::new(b'\0', b'\x09'));
@@ -1772,6 +1788,18 @@ pub enum Dot {
  ///
  /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
  AnyByte,
+ /// Matches the UTF-8 encoding of any Unicode scalar value except for the
+ /// `char` given.
+ ///
+ /// This is equivalent to using `(?u-s:.)` with the line terminator set
+ /// to a particular ASCII byte. (Because of peculiarities in the regex
+ /// engines, a line terminator must be a single byte. It follows that when
+ /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+ /// value. That is, it must be ASCII.)
+ ///
+ /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
+ /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
+ AnyCharExcept(char),
  /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
  ///
  /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
@@ -1781,6 +1809,17 @@ pub enum Dot {
  ///
  /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
  AnyCharExceptCRLF,
+ /// Matches any byte value except for the `u8` given.
+ ///
+ /// This is equivalent to using `(?-us:.)` with the line terminator set
+ /// to a particular ASCII byte. (Because of peculiarities in the regex
+ /// engines, a line terminator must be a single byte. It follows that when
+ /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
+ /// value. That is, it must be ASCII.)
+ ///
+ /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
+ /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
+ AnyByteExcept(u8),
  /// Matches any byte value except for `\n`.
  ///
  /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.