From a8b2fc9740ee788b05ed65ed37a53c66318fbf5b Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sun, 30 Apr 2023 09:24:46 -0400
Subject: [PATCH] fuzz: improve fuzz testing

It's still not as good as it could be, but we add fuzz targets for
regex-lite and DFA deserialization in regex-automata.
---
 .vim/coc-settings.json                        |  6 +++
 fuzz/Cargo.toml                               | 20 ++++++++--
 ...zz_regex_automata_deserialize_dense_dfa.rs | 37 +++++++++++++++++++
 ...z_regex_automata_deserialize_sparse_dfa.rs | 37 +++++++++++++++++++
 fuzz/fuzz_targets/fuzz_regex_lite_match.rs    | 23 ++++++++++++
 fuzz/fuzz_targets/fuzz_regex_match.rs         | 33 +++++++++--------
 6 files changed, 136 insertions(+), 20 deletions(-)
 create mode 100644 .vim/coc-settings.json
 create mode 100644 fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs
 create mode 100644 fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs
 create mode 100644 fuzz/fuzz_targets/fuzz_regex_lite_match.rs
diff --git a/.vim/coc-settings.json b/.vim/coc-settings.json
new file mode 100644
index 0000000000..d756767509
--- /dev/null
+++ b/.vim/coc-settings.json
@@ -0,0 +1,6 @@
+{
+  "rust-analyzer.linkedProjects": [
+    "fuzz/Cargo.toml",
+    "Cargo.toml"
+  ]
+}
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index c1e2776348..2b742f0873 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -3,16 +3,16 @@ name = "regex-fuzz"
 version = "0.0.0"
 authors = ["David Korczynski <david@adalogics.com>"]
 publish = false
-edition = "2018"
+edition = "2021"
 
 [package.metadata]
 cargo-fuzz = true
 
 [dependencies]
 libfuzzer-sys = "0.4.1"
-
-[dependencies.regex]
-path = ".."
+regex = { path = ".." }
+regex-automata = { path = "../regex-automata" }
+regex-lite = { path = "../regex-lite" }
 
 # Prevent this from interfering with workspaces
 [workspace]
@@ -22,6 +22,18 @@ members = ["."]
 name = "fuzz_regex_match"
 path = "fuzz_targets/fuzz_regex_match.rs"
 
+[[bin]]
+name = "fuzz_regex_lite_match"
+path = "fuzz_targets/fuzz_regex_lite_match.rs"
+
+[[bin]]
+name = "fuzz_regex_automata_deserialize_dense_dfa"
+path = "fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs"
+
+[[bin]]
+name = "fuzz_regex_automata_deserialize_sparse_dfa"
+path = "fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs"
+
 [profile.release]
 opt-level = 3
 debug = true
diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs
new file mode 100644
index 0000000000..88f94082b8
--- /dev/null
+++ b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs
@@ -0,0 +1,37 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+// Fuzzer entry point: the `Option` from `run` is deliberately discarded.
+fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+/// Reads `given_data` as `[haystack_len, haystack.., dfa_bytes..]`, tries to
+/// deserialize a dense DFA from a fixed preamble plus `dfa_bytes`, and then
+/// searches the haystack. Returns `None` when any of those steps bails out.
+fn run(given_data: &[u8]) -> Option<()> {
+    use regex_automata::dfa::{dense, Automaton};
+
+    if given_data.len() < 2 {
+        return None;
+    }
+    let haystack_end = 1 + usize::from(given_data[0]);
+    let haystack = given_data.get(1..haystack_end)?;
+    let given_dfa_bytes = given_data.get(haystack_end..)?;
+
+    // We help the fuzzer along by adding a preamble to the bytes that should
+    // at least make these first parts valid. The preamble expects a very
+    // specific sequence of bytes, so it makes sense to just force this.
+    let label = "rust-regex-automata-dfa-dense\x00\x00\x00";
+    assert_eq!(0, label.len() % 4);
+    let mut dfa_bytes = label.as_bytes().to_vec();
+    dfa_bytes.extend_from_slice(&0xFEFFu32.to_ne_bytes()); // endianness check
+    dfa_bytes.extend_from_slice(&2u32.to_ne_bytes()); // version check
+    dfa_bytes.extend_from_slice(given_dfa_bytes);
+    // This is the real test: checking that any input we give to
+    // DFA::from_bytes will never result in a panic.
+    let (dfa, _) = dense::DFA::from_bytes(&dfa_bytes).ok()?;
+    let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
+    Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs
new file mode 100644
index 0000000000..e70b5156b9
--- /dev/null
+++ b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs
@@ -0,0 +1,37 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+// Fuzzer entry point: the `Option` from `run` is deliberately discarded.
+fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+/// Reads `given_data` as `[haystack_len, haystack.., dfa_bytes..]`, tries to
+/// deserialize a sparse DFA from a fixed preamble plus `dfa_bytes`, and then
+/// searches the haystack. Returns `None` when any of those steps bails out.
+fn run(given_data: &[u8]) -> Option<()> {
+    use regex_automata::dfa::{sparse, Automaton};
+
+    if given_data.len() < 2 {
+        return None;
+    }
+    let haystack_end = 1 + usize::from(given_data[0]);
+    let haystack = given_data.get(1..haystack_end)?;
+    let given_dfa_bytes = given_data.get(haystack_end..)?;
+
+    // We help the fuzzer along by adding a preamble to the bytes that should
+    // at least make these first parts valid. The preamble expects a very
+    // specific sequence of bytes, so it makes sense to just force this.
+    let label = "rust-regex-automata-dfa-sparse\x00\x00";
+    assert_eq!(0, label.len() % 4);
+    let mut dfa_bytes = label.as_bytes().to_vec();
+    dfa_bytes.extend_from_slice(&0xFEFFu32.to_ne_bytes()); // endianness check
+    dfa_bytes.extend_from_slice(&2u32.to_ne_bytes()); // version check
+    dfa_bytes.extend_from_slice(given_dfa_bytes);
+    // This is the real test: checking that any input we give to
+    // DFA::from_bytes will never result in a panic.
+    let (dfa, _) = sparse::DFA::from_bytes(&dfa_bytes).ok()?;
+    let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack));
+    Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs
new file mode 100644
index 0000000000..c4e61ccd7b
--- /dev/null
+++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs
@@ -0,0 +1,23 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+/// Uses `data[0]` to split the UTF-8 remainder of `data` into a pattern and a
+/// haystack, then matches one against the other. `None` marks unusable input.
+fn run(data: &[u8]) -> Option<()> {
+    if data.len() < 2 {
+        return None;
+    }
+    let text = std::str::from_utf8(&data[1..]).ok()?;
+    // Split at a char boundary; `text` has at least one char, so `%` is safe.
+    let split_at = usize::from(data[0]).max(1) % text.chars().count();
+    let (char_index, _) = text.char_indices().nth(split_at)?;
+    let (pattern, input) = text.split_at(char_index);
+    let re = regex_lite::Regex::new(pattern).ok()?;
+    re.is_match(input);
+    Some(())
+}
diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs
index bd9eefad54..5e9333f461 100644
--- a/fuzz/fuzz_targets/fuzz_regex_match.rs
+++ b/fuzz/fuzz_targets/fuzz_regex_match.rs
@@ -1,22 +1,23 @@
 #![no_main]
+
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
+    let _ = run(data);
+});
+
+/// Uses `data[0]` to split the UTF-8 remainder of `data` into a pattern and a
+/// haystack, then matches one against the other. `None` marks unusable input.
+fn run(data: &[u8]) -> Option<()> {
     if data.len() < 2 {
-        return;
+        return None;
     }
-    let split_point = data[0] as usize;
-    if let Ok(data) = std::str::from_utf8(&data[1..]) {
-        use std::cmp::max;
-        // split data into regular expression and actual input to search through
-        let len = data.chars().count();
-        let split_off_point = max(split_point, 1) % len as usize;
-        let char_index = data.char_indices().nth(split_off_point);
-        if let Some((char_index, _)) = char_index {
-            let (pattern, input) = data.split_at(char_index);
-            if let Ok(re) = regex::Regex::new(pattern) {
-                re.is_match(input);
-            }
-        }
-    }
-});
+    let text = std::str::from_utf8(&data[1..]).ok()?;
+    // Split at a char boundary; `text` has at least one char, so `%` is safe.
+    let split_at = usize::from(data[0]).max(1) % text.chars().count();
+    let (char_index, _) = text.char_indices().nth(split_at)?;
+    let (pattern, input) = text.split_at(char_index);
+    let re = regex::Regex::new(pattern).ok()?;
+    re.is_match(input);
+    Some(())
+}