fuzz: improve fuzz testing

Fuzz testing is still not as good as it could be, but this adds fuzz targets for regex-lite and for DFA deserialization in regex-automata.
rust-lang · Apr 30, 2023 · a8b2fc9 · a8b2fc9
1 parent 59cb003
commit a8b2fc9
Show file tree

Hide file tree

Showing 6 changed files with 136 additions and 20 deletions.
diff --git a/.vim/coc-settings.json b/.vim/coc-settings.json
@@ -0,0 +1,6 @@
+{
+ "rust-analyzer.linkedProjects": [
+ "fuzz/Cargo.toml",
+ "Cargo.toml"
+ ]
+}
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -3,16 +3,16 @@ name = "regex-fuzz"
 version = "0.0.0"
 authors = ["David Korczynski <[email protected]>"]
 publish = false
-edition = "2018"
+edition = "2021"
 
 [package.metadata]
 cargo-fuzz = true
 
 [dependencies]
 libfuzzer-sys = "0.4.1"
-
-[dependencies.regex]
-path = ".."
+regex = { path = ".." }
+regex-automata = { path = "../regex-automata" }
+regex-lite = { path = "../regex-lite" }
 
 # Prevent this from interfering with workspaces
 [workspace]
@@ -22,6 +22,18 @@ members = ["."]
 name = "fuzz_regex_match"
 path = "fuzz_targets/fuzz_regex_match.rs"
 
+[[bin]]
+name = "fuzz_regex_lite_match"
+path = "fuzz_targets/fuzz_regex_lite_match.rs"
+
+[[bin]]
+name = "fuzz_regex_automata_deserialize_dense_dfa"
+path = "fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs"
+
+[[bin]]
+name = "fuzz_regex_automata_deserialize_sparse_dfa"
+path = "fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs"
+
 [profile.release]
 opt-level = 3
 debug = true

diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs
@@ -0,0 +1,37 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+ let _ = run(data);
+});
+
+fn run(given_data: &[u8]) -> Option<()> {
+ use regex_automata::dfa::Automaton;
+
+ if given_data.len() < 2 {
+ return None;
+ }
+ let haystack_len = usize::from(given_data[0]);
+ let haystack = given_data.get(1..1 + haystack_len)?;
+ let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
+
+ // We help the fuzzer along by prepending a preamble so that at least the
+ // leading part of the input is valid. Deserialization expects a very
+ // specific sequence of bytes there, so it makes sense to just force it.
+ let label = "rust-regex-automata-dfa-dense\x00\x00\x00";
+ assert_eq!(0, label.len() % 4);
+ let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
+ let version_check = 2u32.to_ne_bytes().to_vec();
+ let mut dfa_bytes: Vec<u8> = vec![];
+ dfa_bytes.extend(label.as_bytes());
+ dfa_bytes.extend(&endianness_check);
+ dfa_bytes.extend(&version_check);
+ dfa_bytes.extend(given_dfa_bytes);
+ // This is the real test: checking that any input we give to
+ // DFA::from_bytes will never result in a panic.
+ let (dfa, _) =
+ regex_automata::dfa::dense::DFA::from_bytes(&dfa_bytes).ok()?;
+ let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
+ Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs
@@ -0,0 +1,37 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+ let _ = run(data);
+});
+
+fn run(given_data: &[u8]) -> Option<()> {
+ use regex_automata::dfa::Automaton;
+
+ if given_data.len() < 2 {
+ return None;
+ }
+ let haystack_len = usize::from(given_data[0]);
+ let haystack = given_data.get(1..1 + haystack_len)?;
+ let given_dfa_bytes = given_data.get(1 + haystack_len..)?;
+
+ // We help the fuzzer along by prepending a preamble so that at least the
+ // leading part of the input is valid. Deserialization expects a very
+ // specific sequence of bytes there, so it makes sense to just force it.
+ let label = "rust-regex-automata-dfa-sparse\x00\x00";
+ assert_eq!(0, label.len() % 4);
+ let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec();
+ let version_check = 2u32.to_ne_bytes().to_vec();
+ let mut dfa_bytes: Vec<u8> = vec![];
+ dfa_bytes.extend(label.as_bytes());
+ dfa_bytes.extend(&endianness_check);
+ dfa_bytes.extend(&version_check);
+ dfa_bytes.extend(given_dfa_bytes);
+ // This is the real test: checking that any input we give to
+ // DFA::from_bytes will never result in a panic.
+ let (dfa, _) =
+ regex_automata::dfa::sparse::DFA::from_bytes(&dfa_bytes).ok()?;
+ let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
+ Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs
@@ -0,0 +1,23 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+ let _ = run(data);
+});
+
+fn run(data: &[u8]) -> Option<()> {
+ if data.len() < 2 {
+ return None;
+ }
+ let mut split_at = usize::from(data[0]);
+ let data = std::str::from_utf8(&data[1..]).ok()?;
+ // Split data into a regex and haystack to search.
+ let len = usize::try_from(data.chars().count()).ok()?;
+ split_at = std::cmp::max(split_at, 1) % len;
+ let char_index = data.char_indices().nth(split_at)?.0;
+ let (pattern, input) = data.split_at(char_index);
+ let re = regex_lite::Regex::new(pattern).ok()?;
+ re.is_match(input);
+ Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs
@@ -1,22 +1,23 @@
 #![no_main]
+
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
+ let _ = run(data);
+});
+
+fn run(data: &[u8]) -> Option<()> {
  if data.len() < 2 {
- return;
+ return None;
  }
- let split_point = data[0] as usize;
- if let Ok(data) = std::str::from_utf8(&data[1..]) {
- use std::cmp::max;
- // split data into regular expression and actual input to search through
- let len = data.chars().count();
- let split_off_point = max(split_point, 1) % len as usize;
- let char_index = data.char_indices().nth(split_off_point);
- if let Some((char_index, _)) = char_index {
- let (pattern, input) = data.split_at(char_index);
- if let Ok(re) = regex::Regex::new(pattern) {
- re.is_match(input);
- }
- }
- }
-});
+ let mut split_at = usize::from(data[0]);
+ let data = std::str::from_utf8(&data[1..]).ok()?;
+ // Split data into a regex and haystack to search.
+ let len = usize::try_from(data.chars().count()).ok()?;
+ split_at = std::cmp::max(split_at, 1) % len;
+ let char_index = data.char_indices().nth(split_at)?.0;
+ let (pattern, input) = data.split_at(char_index);
+ let re = regex::Regex::new(pattern).ok()?;
+ re.is_match(input);
+ Some(())
+}