feat(lexer): add SIMD optimization to the lexer #2338

Draft
dyxushuai wants to merge 58 commits into main from feat/simd_in_lexer
58 commits
5799392
Add dependencies and modules for SIMD processing
dyxushuai Feb 7, 2024
4448c42
Fix string lexer bug
dyxushuai Feb 7, 2024
c7dcf4d
Fix trailing ones count in LookupTable
dyxushuai Feb 7, 2024
c1e6e81
Refactor SIMD string literal matching
dyxushuai Feb 7, 2024
0de2eea
Refactor LookupTable struct in avx2.rs
dyxushuai Feb 7, 2024
cb4c7d1
Add inline attribute to tabulate function
dyxushuai Feb 7, 2024
61d772d
Refactor string literal matching for SIMD optimization
dyxushuai Feb 7, 2024
7c7d851
Refactor SIMD lookup table and match function
dyxushuai Feb 7, 2024
fab52e1
Refactor string literal reading loop in lexer.rs
dyxushuai Feb 7, 2024
669a36b
Refactor SIMD string literal lookup
dyxushuai Feb 7, 2024
a79cae4
Refactor SIMD alignment constants
dyxushuai Feb 10, 2024
c685fb8
Refactor SIMD lookup table implementation
dyxushuai Feb 10, 2024
1e1b649
feat: add swar support&refactor API
dyxushuai Feb 11, 2024
e90fe65
refactor: avx2 with new match API
dyxushuai Feb 11, 2024
cd94169
Refactor uniform_segment function to use usize::from_ne_bytes
dyxushuai Feb 11, 2024
bd45550
Remove itertools dependency and update SIMD code
dyxushuai Feb 11, 2024
c5b2ffd
Update SIMD implementations and constants
dyxushuai Feb 11, 2024
8aa2ab0
Refactor SIMD delimiter matching
dyxushuai Feb 11, 2024
12c6b5c
feat: add NEON support
dyxushuai Feb 11, 2024
6681376
Refactor Source struct in lexer module
dyxushuai Feb 11, 2024
4ba6a84
Merge branch 'main' of github.com:oxc-project/oxc into feat/simd_in_l…
dyxushuai Feb 11, 2024
068543e
doc: improve comments
dyxushuai Feb 11, 2024
6d12c6f
Refactor lexer string literal handling
dyxushuai Feb 11, 2024
319f449
Remove unused imports in string lexer
dyxushuai Feb 11, 2024
6033931
Merge branch 'main' of github.com:oxc-project/oxc into feat/simd_in_l…
dyxushuai Feb 11, 2024
1bff09b
fix: revert the change
dyxushuai Feb 11, 2024
f614d1b
doc: improve comments
dyxushuai Feb 11, 2024
104fd7b
Add bitwise OR operation to combine rows in tabulate function
dyxushuai Feb 11, 2024
da7107d
Fix incorrect comment in SIMD implementation
dyxushuai Feb 12, 2024
56b9f2c
Refactor tabulate function in lexer/simd module
dyxushuai Feb 12, 2024
1067f42
feat: export a uniform API
dyxushuai Feb 12, 2024
6e2458c
feat: add methods in the SourcePosition
dyxushuai Feb 12, 2024
2aae26d
Remove unnecessary debug assertions in SIMD implementations
dyxushuai Feb 12, 2024
6aa9507
Refactor SIMD implementations in lexer module
dyxushuai Feb 12, 2024
ebf1c24
Update byte match table implementation
dyxushuai Feb 12, 2024
4b9b722
Add new dependencies and update match tables
dyxushuai Feb 12, 2024
c9c7b9f
Remove unused code
dyxushuai Feb 12, 2024
94d35a4
Enable SWAR SIMD implementation
dyxushuai Feb 12, 2024
ef35c4d
Fix typos and update match tables in lexer
dyxushuai Feb 12, 2024
ae2a11d
Refactor SIMD matching logic in lexer
dyxushuai Feb 12, 2024
d577d75
Refactor lexer code to improve performance
dyxushuai Feb 12, 2024
cea6130
Update lexer module
dyxushuai Feb 12, 2024
f71b943
Refactor SIMD match_vectored method
dyxushuai Feb 13, 2024
8bfe41b
Merge branch 'main' of github.com:oxc-project/oxc into feat/simd_in_l…
dyxushuai Feb 13, 2024
207213d
Refactor matches() method signature in lexer/search.rs and lexer/stri…
dyxushuai Feb 13, 2024
a61d7e0
Fix identifier matching bug and optimize SIMD implementation
dyxushuai Feb 13, 2024
afcd883
Add debug information and handle out-of-bounds case in match_vectored
dyxushuai Feb 13, 2024
bd06245
Refactor match_vectored to matches in lexer search.rs and simd/mod.rs
dyxushuai Feb 13, 2024
1cc3818
Refactor SIMD match table generation
dyxushuai Feb 13, 2024
850dcbb
Merge branch 'feat/simd_in_lexer' of gh_personal:dyxushuai/oxc into f…
dyxushuai Feb 13, 2024
ddf2ce5
Refactor MatchTable::new to remove const fn
dyxushuai Feb 13, 2024
065584c
Update dependencies and remove unused code
dyxushuai Feb 13, 2024
586aa9f
Update SEARCH_BATCH_SIZE constant and ALIGNMENT values
dyxushuai Feb 13, 2024
d6c6e15
Fix SIMD byte match table logic
dyxushuai Feb 13, 2024
e75942b
Update SIMD match table for SSE4.2 support
dyxushuai Feb 13, 2024
03ee45d
Update SIMD match table for SSE42
dyxushuai Feb 13, 2024
7e1597e
improve: use the iterator for caching the matched bytes
dyxushuai Feb 23, 2024
006e776
Refactor SIMD matching functions
dyxushuai Feb 23, 2024
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"rust-analyzer.showUnlinkedFileNotification": false
}
1 change: 1 addition & 0 deletions Cargo.toml
@@ -137,6 +137,7 @@ tsify = { version = "0.4.5" }
wasm-bindgen = { version = "0.2" }
serde-wasm-bindgen = { version = "0.6.3" }


[profile.release.package.oxc_wasm]
opt-level = 'z'

30 changes: 19 additions & 11 deletions crates/oxc_parser/src/lexer/identifier.rs
@@ -1,29 +1,36 @@
use super::{
cold_branch,
search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
search::{byte_search, simd_byte_match_table, SimdByteMatchTable, SEARCH_BATCH_SIZE},
Kind, Lexer, SourcePosition,
};
use crate::diagnostics;

use std::cmp::max;

use oxc_allocator::String;
use oxc_span::Span;
use oxc_syntax::identifier::{
is_identifier_part, is_identifier_part_unicode, is_identifier_start_unicode,
};
use std::{borrow::Cow, cmp::max};

const MIN_ESCAPED_STR_LEN: usize = 16;

static ASCII_ID_START_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| b.is_ascii_alphabetic() || b == b'_' || b == b'$');
static ASCII_ID_START_TABLE: SimdByteMatchTable =
simd_byte_match_table!(|b| b.is_ascii_alphabetic() || b == b'_' || b == b'$', false);

static NOT_ASCII_ID_CONTINUE_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'));
static NOT_ASCII_ID_CONTINUE_TABLE: SimdByteMatchTable =
simd_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'), true);

#[inline]
fn is_identifier_start_ascii_byte(byte: u8) -> bool {
ASCII_ID_START_TABLE.matches(byte)
fn is_identifier_start_ascii_byte(data: Option<(Cow<[u8; SEARCH_BATCH_SIZE]>, usize)>) -> bool {
let data = match data {
Some(data) => data,
None => return false,
};
let mut iter = ASCII_ID_START_TABLE.matches(data.0.as_ref(), data.1);
match iter.next() {
Some((offset, _)) => offset == 0,
None => false,
}
}

impl<'a> Lexer<'a> {
@@ -224,10 +231,11 @@ impl<'a> Lexer<'a> {
});
}

let pos = self.source.position();
// Handle if not an ASCII identifier byte.
// SAFETY: Not at EOF, so safe to read a byte.
let b = unsafe { start_pos.read() };
if !is_identifier_start_ascii_byte(b) {
let data = unsafe { pos.peek_n_with_padding::<SEARCH_BATCH_SIZE>(self.source.end_addr()) };
if !is_identifier_start_ascii_byte(data) {
return self.private_identifier_not_ascii_id();
}

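The new identifier-start check relies on `SourcePosition::peek_n_with_padding`, whose definition (in `source.rs`) is not part of this excerpt. Below is a minimal sketch of its assumed semantics, written as a free function over a byte slice rather than the PR's raw-pointer `SourcePosition` (which appears to return a `Cow` of the batch, borrowing when a full batch is available and owning a zero-padded copy near EOF):

// Sketch of the assumed `peek_n_with_padding::<N>` behaviour: read up to `N` bytes
// from the current offset, zero-pad past the end of the source, and return the
// padded batch plus the number of real bytes, or `None` when already at EOF.
fn peek_n_with_padding<const N: usize>(src: &[u8], offset: usize) -> Option<([u8; N], usize)> {
    if offset >= src.len() {
        return None; // already at EOF
    }
    let remaining = &src[offset..];
    let actual_len = remaining.len().min(N);
    let mut batch = [0u8; N]; // zero padding past the end of the source
    batch[..actual_len].copy_from_slice(&remaining[..actual_len]);
    Some((batch, actual_len))
}

With those semantics, `is_identifier_start_ascii_byte` above only needs to check whether the first match reported by `ASCII_ID_START_TABLE` sits at offset 0 of the peeked batch.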
1 change: 1 addition & 0 deletions crates/oxc_parser/src/lexer/mod.rs
@@ -17,6 +17,7 @@ mod numeric;
mod punctuation;
mod regex;
mod search;
mod simd;
mod source;
mod string;
mod string_builder;
260 changes: 176 additions & 84 deletions crates/oxc_parser/src/lexer/search.rs
@@ -4,8 +4,52 @@
//! * `byte_match_table!` and `safe_byte_match_table!` macros create those tables at compile time.
//! * `byte_search!` macro searches source text for first byte matching a byte table.

use super::simd;

/// Batch size for searching
pub const SEARCH_BATCH_SIZE: usize = 32;
pub const SEARCH_BATCH_SIZE: usize = simd::ALIGNMENT;

pub struct SimdByteMatchTable(simd::MatchTable);

#[allow(dead_code)]
impl SimdByteMatchTable {
/// Create a new `SimdByteMatchTable`.
pub const fn new(bytes: [bool; 256], reverse: bool) -> Self {
Self(simd::MatchTable::new(bytes, reverse))
}

/// Declare intent to use this table for searching.
/// `byte_search!` macro calls `.use_table()` on whatever table it's provided.
/// For `ByteMatchTable` that call is unsafe, but for `SafeByteMatchTable` and
/// `SimdByteMatchTable` it is a safe no-op, so the macro is safe to use with them.
#[allow(clippy::unused_self)]
#[inline]
pub const fn use_table(&self) {}

/// Test a batch of bytes against this `SimdByteMatchTable`.
/// Returns an iterator over `(offset, byte)` pairs for matching bytes.
#[inline]
pub fn matches<'a>(
&'a self,
data: &'a [u8; SEARCH_BATCH_SIZE],
actual_len: usize,
) -> impl Iterator<Item = (usize, u8)> + 'a {
self.0.matches(data, actual_len)
}
}

macro_rules! simd_byte_match_table {
(|$byte:ident| $res:expr, $reverse:expr) => {{
use crate::lexer::search::SimdByteMatchTable;
// Clippy creates warnings because e.g. `simd_byte_match_table!(|b| b == 0, false)`
// is expanded to `SimdByteMatchTable::new([(0 == 0), ... ], false)`
#[allow(clippy::eq_op)]
const TABLE: SimdByteMatchTable = seq_macro::seq!($byte in 0u8..=255 {
SimdByteMatchTable::new([ #($res,)* ], $reverse)
});
TABLE
}};
}
pub(crate) use simd_byte_match_table;
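For reference, here is a small self-contained sketch of the compile-time table trick this macro relies on (it requires the `seq-macro` crate; the `Table` type and predicate below are illustrative, not the PR's): `seq_macro::seq!` expands the predicate once per byte value, producing a `[bool; 256]` array literal that is evaluated in a `const` context.

// Standalone illustration of building a 256-entry byte lookup table at compile time,
// the same pattern `simd_byte_match_table!` expands to.
struct Table([bool; 256]);

const QUOTE_OR_NEWLINE: Table = seq_macro::seq!(b in 0u8..=255 {
    Table([ #(matches!(b, b'"' | b'\'' | b'\r' | b'\n'),)* ])
});

fn main() {
    assert!(QUOTE_OR_NEWLINE.0[b'"' as usize]);
    assert!(!QUOTE_OR_NEWLINE.0[b'a' as usize]);
}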

/// Byte matcher lookup table.
///
@@ -158,6 +202,7 @@ pub(crate) use byte_match_table;
/// }
/// }
/// ```
#[derive(Debug)]
#[repr(C, align(64))]
pub struct SafeByteMatchTable([bool; 256]);

@@ -207,10 +252,96 @@ impl SafeByteMatchTable {
#[inline]
pub const fn use_table(&self) {}

/// Test a batch of bytes against this `SafeByteMatchTable`.
/// Returns an iterator over `(offset, byte)` pairs for matching bytes, where
/// `offset` is relative to the byte after the previous match.
#[inline]
pub const fn matches(&self, b: u8) -> bool {
self.0[b as usize]
pub fn matches<'a>(
&'a self,
data: &'a [u8; SEARCH_BATCH_SIZE],
actual_len: usize,
) -> impl Iterator<Item = (usize, u8)> + 'a {
SafeByteMatchTableIter { table: self, data, actual_len, offset: 0 }
}
}

struct SafeByteMatchTableIter<'a> {
table: &'a SafeByteMatchTable,
data: &'a [u8; SEARCH_BATCH_SIZE],
actual_len: usize,
offset: usize,
}

impl Iterator for SafeByteMatchTableIter<'_> {
type Item = (usize, u8);

fn next(&mut self) -> Option<Self::Item> {
for (i, &b) in self.data[self.offset..self.actual_len].iter().enumerate() {
self.offset += 1;
if self.table.0[b as usize] {
return Some((i, b));
}
}
None
}
}

#[cfg(test)]
mod tests {
use super::SafeByteMatchTable;
use crate::lexer::{source::Source, UniquePromise};

const SEARCH_BATCH_SIZE: usize = 16;
#[test]
fn neon_find_non_ascii() {
let table = seq_macro::seq!(b in 0u8..=255 {
SafeByteMatchTable::new([#(!(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'),)*])
});
let data = [
"AAAAAAAA\"\rAAAAAA",
"AAAAAAAAAAAAAAA\"",
"AAAAAAAAAAAAAAAA",
"AAAAAAAA",
"AAAAAAAA\r",
"AAAAAAAAAAAAAAA\r",
]
.map(|x| Source::new(x, UniquePromise::new_for_tests()));
let expected = [
(vec![Some((8, b'"')), Some((0, b'\r')), None], SEARCH_BATCH_SIZE),
(vec![Some((15, b'"')), None], SEARCH_BATCH_SIZE),
(vec![None], SEARCH_BATCH_SIZE),
(vec![None], 8),
(vec![Some((8, b'\r')), None], 9),
(vec![Some((15, b'\r')), None], SEARCH_BATCH_SIZE),
];

for (idx, d) in data.into_iter().enumerate() {
let pos = d.position();
let (data, actual_len) =
unsafe { pos.peek_n_with_padding::<SEARCH_BATCH_SIZE>(d.end_addr()) }.unwrap();
let mut result = table.matches(&data, actual_len);
for val in &expected[idx].0 {
assert_eq!(result.next(), *val);
}
assert_eq!(actual_len, expected[idx].1);
}
}

#[test]
fn neon_find_single_quote_string() {
let table = seq_macro::seq!(b in 0u8..=255 {
// match single-quote string delimiters and escapes
SafeByteMatchTable::new([#(matches!(b, b'\'' | b'\r' | b'\n' | b'\\'),)*])
});
let s1 = String::from(138u8 as char);
let data = [&s1].map(|x| Source::new(x, UniquePromise::new_for_tests()));
let expected = [(None, 2)];

for (idx, d) in data.into_iter().enumerate() {
let pos = d.position();
let (data, actual_len) =
unsafe { pos.peek_n_with_padding::<SEARCH_BATCH_SIZE>(d.end_addr()) }.unwrap();
let mut result = table.matches(&data, actual_len);
assert_eq!((result.next(), actual_len), expected[idx]);
}
}
}

@@ -495,93 +626,54 @@ macro_rules! byte_search {

let mut $pos = $start;
#[allow(unused_unsafe)] // Silence warnings if macro called in unsafe code
loop {
if $pos.addr() <= $lexer.source.end_for_batch_search_addr() {
// Search a batch of `SEARCH_BATCH_SIZE` bytes.
// The compiler unrolls this loop.
// SAFETY:
// `$pos.addr() > lexer.source.end_for_batch_search_addr()` check above ensures there are
// at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
// So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds.
for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE {
// SAFETY: `$pos` cannot go out of bounds in this loop (see above).
let $match_byte = unsafe { $pos.read() };
if $table.matches($match_byte) {
// Found match.
// Check if should continue.
{
let $continue_byte = $match_byte;
if $should_continue {
// Not a match after all - continue searching.
// SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
// See above about UTF-8 character boundaries invariant.
$pos = unsafe { $pos.add(1) };
continue;
}
}

// Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes.
// SAFETY: See above about UTF-8 character boundaries invariant.
$lexer.source.set_position($pos);

let $match_start = $start;
return $match_handler;
}

// No match - continue searching
// SAFETY: `$pos` cannot go out of bounds in this loop (see above).
// Also see above about UTF-8 character boundaries invariant.
$pos = unsafe { $pos.add(1) };
}
// No match in batch - loop round and searching next batch
} else {
// Not enough bytes remaining to process as a batch.
// This branch marked `#[cold]` as should be very uncommon in normal-length JS files.
// Very short JS files will be penalized, but they'll be very fast to parse anyway.
// TODO: Could extend very short files with padding during parser initialization
// to remove that problem.
return crate::lexer::cold_branch(|| {
let end_addr = $lexer.source.end_addr();
while $pos.addr() < end_addr {
// SAFETY: `pos` is not at end of source, so safe to read a byte
let $match_byte = unsafe { $pos.read() };
if $table.matches($match_byte) {
// Found match.
// Check if should continue.
{
let $continue_byte = $match_byte;
if $should_continue {
// Not a match after all - continue searching.
// SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
// See above about UTF-8 character boundaries invariant.
$pos = unsafe { $pos.add(1) };
continue;
}
}

// Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes.
// SAFETY: See above about UTF-8 character boundaries invariant.
$lexer.source.set_position($pos);

let $match_start = $start;
return $match_handler;
}

// No match - continue searching
while let Some((data, actual_len)) = unsafe {
$pos.peek_n_with_padding::<{ crate::lexer::search::SEARCH_BATCH_SIZE }>(
$lexer.source.end_addr(),
)
} {
let mut iter = $table.matches(&data, actual_len);
let mut remaining = actual_len;
while let Some((offset, b)) = iter.next() {
// Advance `$pos` by the offset of the match within the batch.
// SAFETY: The match lies within the peeked batch, so it's safe to advance `offset` bytes.
// See above about UTF-8 character boundaries invariant.
$pos = unsafe { $pos.add(offset) };
remaining -= offset;
// SAFETY: `$pos` cannot go out of bounds in this loop (see above).
let $match_byte = b;
// Found match.
// Check if should continue.
{
let $continue_byte = $match_byte;
if $should_continue {
// Not a match after all - continue searching.
// SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
// See above about UTF-8 character boundaries invariant.
$pos = unsafe { $pos.add(1) };
remaining -= 1;
continue;
}
}
// Advance `lexer.source`'s position up to `$pos`, consuming unmatched bytes.
// SAFETY: See above about UTF-8 character boundaries invariant.
$lexer.source.set_position($pos);

// EOF.
// Advance `lexer.source`'s position to end of file.
$lexer.source.set_position($pos);

let $eof_start = $start;
$eof_handler
});
let $match_start = $start;
return $match_handler;
}
// No match in batch - loop round and search the next batch

// No match - continue searching
// SAFETY: `$pos` cannot go out of bounds in this loop (see above).
// Also see above about UTF-8 character boundaries invariant.
$pos = unsafe { $pos.add(remaining) };
}

// EOF.
// Advance `lexer.source`'s position to end of file.
$lexer.source.set_position($pos);
let $eof_start = $start;
return $eof_handler;
}};
}
pub(crate) use byte_search;
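For context, here is a minimal, dependency-free sketch (plain scalar Rust, no SIMD/SWAR) of the batch-search flow the rewritten `byte_search!` follows: peek a fixed-size batch, ask the table for the first delimiter inside it, advance by the reported offset, and fall through to the EOF handler once peeking returns nothing. All names below are illustrative; the PR's `matches()` iterator additionally reports offsets relative to the byte after the previous match, which the macro uses to resume after a `$should_continue` byte.

const SEARCH_BATCH_SIZE: usize = 16;

// Scalar stand-in for the SIMD/SWAR table: report the first matching byte in a
// batch as `(offset, byte)`, with `offset` relative to the start of the batch.
fn first_match(batch: &[u8], is_delim: impl Fn(u8) -> bool) -> Option<(usize, u8)> {
    batch.iter().copied().enumerate().find(|&(_, b)| is_delim(b))
}

// Batch-at-a-time search over the whole source, mirroring the macro's outer loop.
fn find_delimiter(src: &[u8], is_delim: impl Fn(u8) -> bool) -> Option<usize> {
    let mut pos = 0;
    while pos < src.len() {
        let actual_len = (src.len() - pos).min(SEARCH_BATCH_SIZE);
        let batch = &src[pos..pos + actual_len];
        if let Some((offset, _byte)) = first_match(batch, &is_delim) {
            return Some(pos + offset); // `$match_handler` would run here
        }
        pos += actual_len; // no match in this batch - peek the next one
    }
    None // EOF - `$eof_handler` would run here
}

fn main() {
    let src = b"let s = 'hello world';";
    let idx = find_delimiter(src, |b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\'));
    assert_eq!(idx, Some(8)); // the opening single quote
}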