From a8b2fc9740ee788b05ed65ed37a53c66318fbf5b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 30 Apr 2023 09:24:46 -0400 Subject: [PATCH] fuzz: improve fuzz testing It's still not as good as it could be, but we add fuzz targets for regex-lite and DFA deserialization in regex-automata. --- .vim/coc-settings.json | 6 +++ fuzz/Cargo.toml | 20 ++++++++-- ...zz_regex_automata_deserialize_dense_dfa.rs | 37 +++++++++++++++++++ ...z_regex_automata_deserialize_sparse_dfa.rs | 37 +++++++++++++++++++ fuzz/fuzz_targets/fuzz_regex_lite_match.rs | 23 ++++++++++++ fuzz/fuzz_targets/fuzz_regex_match.rs | 33 +++++++++-------- 6 files changed, 136 insertions(+), 20 deletions(-) create mode 100644 .vim/coc-settings.json create mode 100644 fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs create mode 100644 fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs create mode 100644 fuzz/fuzz_targets/fuzz_regex_lite_match.rs diff --git a/.vim/coc-settings.json b/.vim/coc-settings.json new file mode 100644 index 0000000000..d756767509 --- /dev/null +++ b/.vim/coc-settings.json @@ -0,0 +1,6 @@ +{ + "rust-analyzer.linkedProjects": [ + "fuzz/Cargo.toml", + "Cargo.toml" + ] +} diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index c1e2776348..2b742f0873 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -3,16 +3,16 @@ name = "regex-fuzz" version = "0.0.0" authors = ["David Korczynski "] publish = false -edition = "2018" +edition = "2021" [package.metadata] cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4.1" - -[dependencies.regex] -path = ".." +regex = { path = ".." 
} +regex-automata = { path = "../regex-automata" } +regex-lite = { path = "../regex-lite" } # Prevent this from interfering with workspaces [workspace] @@ -22,6 +22,18 @@ members = ["."] name = "fuzz_regex_match" path = "fuzz_targets/fuzz_regex_match.rs" +[[bin]] +name = "fuzz_regex_lite_match" +path = "fuzz_targets/fuzz_regex_lite_match.rs" + +[[bin]] +name = "fuzz_regex_automata_deserialize_dense_dfa" +path = "fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs" + +[[bin]] +name = "fuzz_regex_automata_deserialize_sparse_dfa" +path = "fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs" + [profile.release] opt-level = 3 debug = true diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs new file mode 100644 index 0000000000..88f94082b8 --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs @@ -0,0 +1,37 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let _ = run(data); +}); + +fn run(given_data: &[u8]) -> Option<()> { + use regex_automata::dfa::Automaton; + + if given_data.len() < 2 { + return None; + } + let haystack_len = usize::from(given_data[0]); + let haystack = given_data.get(1..1 + haystack_len)?; + let given_dfa_bytes = given_data.get(1 + haystack_len..)?; + + // We help the fuzzer along by adding a preamble to the bytes that should + // at least make these first parts valid. The preamble expects a very + // specific sequence of bytes, so it makes sense to just force this. 
+ let label = "rust-regex-automata-dfa-dense\x00\x00\x00"; + assert_eq!(0, label.len() % 4); + let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec(); + let version_check = 2u32.to_ne_bytes().to_vec(); + let mut dfa_bytes: Vec<u8> = vec![]; + dfa_bytes.extend(label.as_bytes()); + dfa_bytes.extend(&endianness_check); + dfa_bytes.extend(&version_check); + dfa_bytes.extend(given_dfa_bytes); + // This is the real test: checking that any input we give to + // DFA::from_bytes will never result in a panic. + let (dfa, _) = + regex_automata::dfa::dense::DFA::from_bytes(&dfa_bytes).ok()?; + let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack)); + Some(()) +} diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs new file mode 100644 index 0000000000..e70b5156b9 --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs @@ -0,0 +1,37 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let _ = run(data); +}); + +fn run(given_data: &[u8]) -> Option<()> { + use regex_automata::dfa::Automaton; + + if given_data.len() < 2 { + return None; + } + let haystack_len = usize::from(given_data[0]); + let haystack = given_data.get(1..1 + haystack_len)?; + let given_dfa_bytes = given_data.get(1 + haystack_len..)?; + + // We help the fuzzer along by adding a preamble to the bytes that should + // at least make these first parts valid. The preamble expects a very + // specific sequence of bytes, so it makes sense to just force this. 
+ let label = "rust-regex-automata-dfa-sparse\x00\x00"; + assert_eq!(0, label.len() % 4); + let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec(); + let version_check = 2u32.to_ne_bytes().to_vec(); + let mut dfa_bytes: Vec<u8> = vec![]; + dfa_bytes.extend(label.as_bytes()); + dfa_bytes.extend(&endianness_check); + dfa_bytes.extend(&version_check); + dfa_bytes.extend(given_dfa_bytes); + // This is the real test: checking that any input we give to + // DFA::from_bytes will never result in a panic. + let (dfa, _) = + regex_automata::dfa::sparse::DFA::from_bytes(&dfa_bytes).ok()?; + let _ = dfa.try_search_fwd(&regex_automata::Input::new(haystack)); + Some(()) +} diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs new file mode 100644 index 0000000000..c4e61ccd7b --- /dev/null +++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs @@ -0,0 +1,23 @@ +#![no_main] + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let _ = run(data); +}); + +fn run(data: &[u8]) -> Option<()> { + if data.len() < 2 { + return None; + } + let mut split_at = usize::from(data[0]); + let data = std::str::from_utf8(&data[1..]).ok()?; + // Split data into a regex and haystack to search. 
+ let len = usize::try_from(data.chars().count()).ok()?; + split_at = std::cmp::max(split_at, 1) % len; + let char_index = data.char_indices().nth(split_at)?.0; + let (pattern, input) = data.split_at(char_index); + let re = regex_lite::Regex::new(pattern).ok()?; + re.is_match(input); + Some(()) +} diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs index bd9eefad54..5e9333f461 100644 --- a/fuzz/fuzz_targets/fuzz_regex_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_match.rs @@ -1,22 +1,23 @@ #![no_main] + use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { + let _ = run(data); +}); + +fn run(data: &[u8]) -> Option<()> { if data.len() < 2 { - return; + return None; } - let split_point = data[0] as usize; - if let Ok(data) = std::str::from_utf8(&data[1..]) { - use std::cmp::max; - // split data into regular expression and actual input to search through - let len = data.chars().count(); - let split_off_point = max(split_point, 1) % len as usize; - let char_index = data.char_indices().nth(split_off_point); - if let Some((char_index, _)) = char_index { - let (pattern, input) = data.split_at(char_index); - if let Ok(re) = regex::Regex::new(pattern) { - re.is_match(input); - } - } - } -}); + let mut split_at = usize::from(data[0]); + let data = std::str::from_utf8(&data[1..]).ok()?; + // Split data into a regex and haystack to search. + let len = usize::try_from(data.chars().count()).ok()?; + split_at = std::cmp::max(split_at, 1) % len; + let char_index = data.char_indices().nth(split_at)?.0; + let (pattern, input) = data.split_at(char_index); + let re = regex::Regex::new(pattern).ok()?; + re.is_match(input); + Some(()) +}