Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: oxc regex parser #2030

Draft
wants to merge 19 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
46 changes: 46 additions & 0 deletions '
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use phf::phf_set;

/// The characters of the ECMAScript `SyntaxCharacter` production:
/// `^ $ \ . * + ? ( ) [ ] { } |`.
///
/// `-` is deliberately absent: it is not a `SyntaxCharacter` (it only has special
/// meaning inside character classes).
const SYNTAX_CHARACTERS: phf::Set<char> = phf_set! {
    '^', '$', '\\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|',
};

/// Characters that make up a `ClassSetReservedDoublePunctuator` when written twice in a
/// row (`&&`, `!!`, `##`, ...).
///
/// This is a set, so entries are bare keys; the trailing comments keep the character
/// names for readability.
const CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER: phf::Set<char> = phf_set! {
    '&', // AMPERSAND
    '!', // EXCLAMATION_MARK
    '#', // NUMBER_SIGN
    '$', // DOLLAR_SIGN
    '%', // PERCENT_SIGN
    '*', // ASTERISK
    '+', // PLUS_SIGN
    ',', // COMMA
    '.', // FULL_STOP
    ':', // COLON
    ';', // SEMICOLON
    '<', // LESS_THAN_SIGN
    '=', // EQUALS_SIGN
    '>', // GREATER_THAN_SIGN
    '?', // QUESTION_MARK
    '@', // COMMERCIAL_AT
    '^', // CIRCUMFLEX_ACCENT
    '`', // GRAVE_ACCENT
    '~', // TILDE
};

/// Reports whether `cp` is a member of the `SYNTAX_CHARACTERS` set.
#[inline]
pub fn is_syntax_character(cp: char) -> bool {
    let candidate = &cp;
    SYNTAX_CHARACTERS.contains(candidate)
}

/// Reports whether `code` lies in the lead (high) surrogate range `0xD800..=0xDBFF`.
pub fn is_lead_surrogate(code: u32) -> bool {
    matches!(code, 0xd800..=0xdbff)
}

/// Reports whether `code` lies in the trail (low) surrogate range `0xDC00..=0xDFFF`.
pub fn is_trail_surrogate(code: u32) -> bool {
    matches!(code, 0xdc00..=0xdfff)
}

/// Combines a surrogate pair into the single code point it encodes.
///
/// Computes `(lead - 0xD800) * 0x400 + (trail - 0xDC00) + 0x10000`. No range checking is
/// done here: `lead` below `0xD800` or `trail` below `0xDC00` makes the subtraction
/// underflow.
pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 {
    let high_part = (lead - 0xd800) * 0x400;
    let low_part = trail - 0xdc00;
    high_part + low_part + 0x10000
}

/// Reports whether `cp` is a member of the
/// `CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER` set.
pub fn is_class_set_reserved_double_punctuator_character(cp: char) -> bool {
    let candidate = &cp;
    CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER.contains(candidate)
}
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions crates/oxc_js_regex/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,8 @@ workspace = true
doctest = false

[dependencies]
oxc_allocator = { workspace = true }
oxc_span = { workspace = true }
phf = { workspace = true }
oxc_allocator = { workspace = true }
oxc_span = { workspace = true }
oxc_diagnostics = { workspace = true }
oxc_syntax.workspace = true
62 changes: 33 additions & 29 deletions crates/oxc_js_regex/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
use oxc_allocator::{Box, Vec};
use oxc_span::{Atom, Span};

use crate::ast_kind::AstKind;

/// The type which includes all nodes.
#[derive(Debug)]
pub enum Node<'a> {
Expand Down Expand Up @@ -42,46 +44,46 @@ pub enum Leaf<'a> {
/// The type which includes all atom nodes.
#[derive(Debug)]
pub enum Element<'a> {
Assertion(Box<'a, Assertion<'a>>),
QuantifiableElement(Box<'a, QuantifiableElement<'a>>),
Assertion(Assertion<'a>),
QuantifiableElement(QuantifiableElement<'a>),
Quantifier(Box<'a, Quantifier<'a>>),
}

/// The type which includes all atom nodes that Quantifier node can have as children.
#[derive(Debug)]
pub enum QuantifiableElement<'a> {
Backreference(Box<'a, Backreference<'a>>),
CapturingGroup(Box<'a, CapturingGroup<'a>>),
Character(Box<'a, Character>),
CharacterClass(Box<'a, CharacterClass<'a>>),
CharacterSet(Box<'a, CharacterSet<'a>>),
ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>),
Group(Box<'a, Group<'a>>),
LookaheadAssertion(Box<'a, LookaheadAssertion<'a>>),
Backreference(Backreference<'a>),
CapturingGroup(CapturingGroup<'a>),
Character(Character),
CharacterClass(CharacterClass<'a>),
CharacterSet(CharacterSet<'a>),
ExpressionCharacterClass(ExpressionCharacterClass<'a>),
Group(Group<'a>),
LookaheadAssertion(LookaheadAssertion<'a>),
}

/// The type which includes all character class atom nodes.
#[derive(Debug)]
pub enum CharacterClassElement<'a> {
ClassRangesCharacterClassElement(Box<'a, ClassRangesCharacterClassElement<'a>>),
UnicodeSetsCharacterClassElement(Box<'a, UnicodeSetsCharacterClassElement<'a>>),
ClassRangesCharacterClassElement(ClassRangesCharacterClassElement),
UnicodeSetsCharacterClassElement(UnicodeSetsCharacterClassElement<'a>),
}
#[derive(Debug)]
pub enum ClassRangesCharacterClassElement<'a> {
Character(Box<'a, Character>),
CharacterClassRange(Box<'a, CharacterClassRange>),
CharacterUnicodePropertyCharacterSet(Box<'a, CharacterUnicodePropertyCharacterSet>),
EscapeCharacterSet(Box<'a, EscapeCharacterSet>),
pub enum ClassRangesCharacterClassElement {
Character(Character),
CharacterClassRange(CharacterClassRange),
CharacterUnicodePropertyCharacterSet(CharacterUnicodePropertyCharacterSet),
EscapeCharacterSet(EscapeCharacterSet),
}
#[derive(Debug)]
pub enum UnicodeSetsCharacterClassElement<'a> {
Character(Box<'a, Character>),
CharacterClassRange(Box<'a, CharacterClassRange>),
ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>),
EscapeCharacterSet(Box<'a, EscapeCharacterSet>),
ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>),
UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>),
UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>),
Character(Character),
CharacterClassRange(CharacterClassRange),
ClassStringDisjunction(ClassStringDisjunction<'a>),
EscapeCharacterSet(EscapeCharacterSet),
ExpressionCharacterClass(ExpressionCharacterClass<'a>),
UnicodePropertyCharacterSet(UnicodePropertyCharacterSet<'a>),
UnicodeSetsCharacterClass(UnicodeSetsCharacterClass<'a>),
}

/// The root node.
Expand Down Expand Up @@ -117,7 +119,7 @@ pub struct Group<'a> {

/// The capturing group.
/// E.g. `(ab)`, `(?<name>ab)`
#[derive(Debug)]
#[derive(Debug, Default)]
pub struct CapturingGroup<'a> {
pub span: Span,
pub name: Option<Atom>,
Expand Down Expand Up @@ -155,8 +157,10 @@ pub struct LookbehindAssertion<'a> {
#[derive(Debug)]
pub struct Quantifier<'a> {
pub span: Span,
pub min: f64,
pub max: f64, // can be f64::INFINITY
/// https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/validator.ts#L384-L398
/// both `min` and `max` are integer
pub min: usize,
pub max: usize,
pub greedy: bool,
pub element: QuantifiableElement<'a>,
}
Expand All @@ -176,7 +180,7 @@ pub enum CharacterClass<'a> {
pub struct ClassRangesCharacterClass<'a> {
pub span: Span,
pub unicode_sets: bool,
pub elements: Vec<'a, ClassRangesCharacterClassElement<'a>>,
pub elements: Vec<'a, ClassRangesCharacterClassElement>,
}

/// The character class used in Unicode sets mode (`v` flag).
Expand Down Expand Up @@ -354,7 +358,7 @@ pub struct StringAlternative<'a> {
#[derive(Debug)]
pub struct Character {
pub span: Span,
pub value: u16, // UTF-16 code point
pub value: char, // UTF-16 code point
}

#[derive(Debug)]
Expand Down
73 changes: 73 additions & 0 deletions crates/oxc_js_regex/src/ast_builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
use oxc_allocator::{Allocator, Box, String, Vec};
use oxc_span::{Atom, GetSpan, SourceType, Span};

#[allow(clippy::wildcard_imports)]
use crate::ast::*;

/// AST builder for creating AST nodes.
///
/// Holds a borrowed [`Allocator`]; the builder's helper methods allocate boxes, vectors
/// and strings through it, so everything they return is tied to the lifetime `'a`.
pub struct AstBuilder<'a> {
/// Allocator used by every allocation helper on this builder.
pub allocator: &'a Allocator,
}

impl<'a> AstBuilder<'a> {
pub fn new(allocator: &'a Allocator) -> Self {
Self { allocator }
}

#[inline]
pub fn alloc<T>(&self, value: T) -> Box<'a, T> {
Box(self.allocator.alloc(value))
}

#[inline]
pub fn new_vec<T>(&self) -> Vec<'a, T> {
Vec::new_in(self.allocator)
}

#[inline]
pub fn new_vec_with_capacity<T>(&self, capacity: usize) -> Vec<'a, T> {
Vec::with_capacity_in(capacity, self.allocator)
}

#[inline]
pub fn new_vec_single<T>(&self, value: T) -> Vec<'a, T> {
let mut vec = self.new_vec_with_capacity(1);
vec.push(value);
vec
}

#[inline]
pub fn new_str(&self, value: &str) -> &'a str {
String::from_str_in(value, self.allocator).into_bump_str()
}

pub fn copy<T>(&self, src: &T) -> T {
// SAFETY:
// This should be safe as long as `src` is an reference from the allocator.
// But honestly, I'm not really sure if this is safe.
unsafe { std::mem::transmute_copy(src) }
}

pub fn alternative(&mut self, span: Span, elements: Vec<'a, Element<'a>>) -> Branch<'a> {
Branch::Alternative(self.alloc(Alternative { span, elements }))
}

pub fn capturing_group(
&mut self,
span: Span,
name: Option<Atom>,
alternatives: Vec<'a, Alternative<'a>>,
references: Vec<'a, Backreference<'a>>,
) -> Branch<'a> {
Branch::CapturingGroup(self.alloc(CapturingGroup { span, name, alternatives, references }))
}

pub fn reg_exp_literal(
&mut self,
span: Span,
flags: Flags,
pattern: Pattern<'a>,
) -> RegExpLiteral<'a> {
RegExpLiteral { span, pattern, flags }
}
}
25 changes: 25 additions & 0 deletions crates/oxc_js_regex/src/ast_kind.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
use super::ast::*;

/// A borrowed reference to a regex AST node, tagged with the kind of node it points at.
///
/// Each variant wraps a shared reference to the node type of the same name.
// NOTE(review): `allow(unused)` is presumably here because nothing constructs these
// variants yet — confirm and drop the attribute once the enum is in use.
#[allow(unused)]
#[derive(Debug)]
pub enum AstKind<'a> {
// Nodes whose types carry the `'a` lifetime, plus `CharacterClassRange`.
Alternative(&'a Alternative<'a>),
CapturingGroup(&'a CapturingGroup<'a>),
CharacterClass(&'a CharacterClass<'a>),
CharacterClassRange(&'a CharacterClassRange),
ClassIntersection(&'a ClassIntersection<'a>),
ClassStringDisjunction(&'a ClassStringDisjunction<'a>),
ClassSubtraction(&'a ClassSubtraction<'a>),
ExpressionCharacterClass(&'a ExpressionCharacterClass<'a>),
Group(&'a Group<'a>),
LookaroundAssertion(&'a LookaroundAssertion<'a>),
Pattern(&'a Pattern<'a>),
Quantifier(&'a Quantifier<'a>),
RegExpLiteral(&'a RegExpLiteral<'a>),
StringAlternative(&'a StringAlternative<'a>),
Backreference(&'a Backreference<'a>),
BoundaryAssertion(&'a BoundaryAssertion<'a>),
// `Character` and `Flags` take no lifetime parameter.
Character(&'a Character),
CharacterSet(&'a CharacterSet<'a>),
Flags(&'a Flags),
}
22 changes: 22 additions & 0 deletions crates/oxc_js_regex/src/ecma_version.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/// An ECMAScript edition.
///
/// Variants are declared oldest-first, so the derived ordering compares editions
/// chronologically (`V5 < V2015 < ... < V2024`). The default is [`EcmaVersion::V5`].
// NOTE(review): `allow(unused)` is presumably here because not every edition is
// referenced yet — confirm and drop the attribute once they are.
#[allow(unused)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)]
pub enum EcmaVersion {
    #[default]
    V5,
    V2015,
    V2016,
    V2017,
    V2018,
    V2019,
    V2020,
    V2021,
    V2022,
    V2023,
    V2024,
}

// NOTE(review): same `allow(unused)` caveat as on the enum above.
#[allow(unused)]
impl EcmaVersion {
    /// Returns the newest edition this enum declares.
    pub fn latest_ecma_version() -> Self {
        Self::V2024
    }
}
4 changes: 4 additions & 0 deletions crates/oxc_js_regex/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
pub mod ast;
mod ast_builder;
mod ast_kind;
mod ecma_version;
mod lexer;
pub mod parser;
mod util;
pub mod validator;
pub mod visitor;