Skip to content

Commit

Permalink
feat: implemented non-capturing groups
Browse files Browse the repository at this point in the history
  • Loading branch information
ColinEberhardt committed Feb 24, 2021
1 parent 8a039f4 commit 6fec3ef
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 40 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Based on the classfication within the [MDN cheatsheet](https://developer.mozilla
- [x] (x) capturing group
- [ ] \n back reference
- [ ] (?<Name>x) named capturing group
- [ ] (?:x) Non-capturing group
- [x] (?:x) Non-capturing group

**Quantifiers**

Expand Down
53 changes: 43 additions & 10 deletions assembly/__spec_tests__/generated.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1076,7 +1076,22 @@ it("line: 207 - matches ^(a(b(c)))(d(e(f)))(h(i(j)))(k(l(m)))$ against 'abcdefhi
expect(match.matches[11]).toBe("abcdefhijklm".substring(10, 12));
expect(match.matches[12]).toBe("abcdefhijklm".substring(11, 12));
});
xit("line: 208 - non capturing groups not supported", () => {});
// Non-capturing groups must not occupy slots in `matches`: only the eight
// nested capturing groups are reported, at indices 1 through 8.
it("line: 208 - matches ^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$ against 'abcdefhijklm'", () => {
  const subject = "abcdefhijklm";
  const match = exec(
    "^(?:a(b(c)))(?:d(e(f)))(?:h(i(j)))(?:k(l(m)))$",
    subject,
    "ms"
  );
  // index 0 is the whole match
  expect(match.matches[0]).toBe(subject.substring(0, 12));
  // (b(c)) and (c) inside the first non-capturing group
  expect(match.matches[1]).toBe(subject.substring(1, 3));
  expect(match.matches[2]).toBe(subject.substring(2, 3));
  // (e(f)) and (f)
  expect(match.matches[3]).toBe(subject.substring(4, 6));
  expect(match.matches[4]).toBe(subject.substring(5, 6));
  // (i(j)) and (j)
  expect(match.matches[5]).toBe(subject.substring(7, 9));
  expect(match.matches[6]).toBe(subject.substring(8, 9));
  // (l(m)) and (m)
  expect(match.matches[7]).toBe(subject.substring(10, 12));
  expect(match.matches[8]).toBe(subject.substring(11, 12));
});
xit("line: 209 - back references are not supported", () => {});
it("line: 210 - matches ^[.^$|()*+?{,}]+ against '.^$(*+)|{?,?}'", () => {
const match = exec("^[.^$|()*+?{,}]+", ".^$(*+)|{?,?}", "ms");
Expand Down Expand Up @@ -1305,10 +1320,10 @@ it("line: 266 - matches ^12.34 against '12\r34'", () => {
});
xit("line: 267 - lookaheads not supported", () => {});
xit("line: 268 - lookaheads not supported", () => {});
xit("line: 269 - non capturing groups not supported", () => {});
xit("line: 270 - non capturing groups not supported", () => {});
xit("line: 271 - non capturing groups not supported", () => {});
xit("line: 272 - non capturing groups not supported", () => {});
xit("line: 269 - lookaheads not supported", () => {});
xit("line: 270 - lookaheads not supported", () => {});
xit("line: 271 - lookaheads not supported", () => {});
xit("line: 272 - lookaheads not supported", () => {});
xit("line: 273 - lookaheads not supported", () => {});
xit("line: 274 - lookaheads not supported", () => {});
xit("line: 281 - test regex contains syntax not supported in JS", () => {});
Expand Down Expand Up @@ -1564,8 +1579,14 @@ it("line: 1162 - matches \\Aabc\\Z against 'qqq\nabc\nzzz'", () => {
});
xit("line: 1163 - JS does not support the A Z syntax for start and end of string", () => {});
xit("line: 1164 - JS does not support the A Z syntax for start and end of string", () => {});
xit("line: 1165 - non capturing groups not supported", () => {});
xit("line: 1166 - non capturing groups not supported", () => {});
// Alternation of two non-capturing groups: the first alternative matches
// the leading "b" of the subject.
it("line: 1165 - matches (?:b)|(?::+) against 'b::c'", () => {
  const subject = "b::c";
  const match = exec("(?:b)|(?::+)", subject, "ms");
  expect(match.matches[0]).toBe(subject.substring(0, 1));
});
// Alternation of two non-capturing groups: the second alternative matches
// the "::" run that precedes the "b" in the subject.
it("line: 1166 - matches (?:b)|(?::+) against 'c::b'", () => {
  const subject = "c::b";
  const match = exec("(?:b)|(?::+)", subject, "ms");
  expect(match.matches[0]).toBe(subject.substring(1, 3));
});
it("line: 1167 - matches [-az]+ against 'az-'", () => {
const match = exec("[-az]+", "az-", "ms");
expect(match.matches[0]).toBe("az-".substring(0, 3));
Expand Down Expand Up @@ -1954,9 +1975,21 @@ it("line: 1311 - matches \\d\\d\\/\\d\\d\\/\\d\\d\\d\\d against '01/01/2000'", (
const match = exec("\\d\\d\\/\\d\\d\\/\\d\\d\\d\\d", "01/01/2000", "ms");
expect(match.matches[0]).toBe("01/01/2000".substring(0, 10));
});
xit("line: 1312 - non capturing groups not supported", () => {});
xit("line: 1313 - non capturing groups not supported", () => {});
xit("line: 1314 - non capturing groups not supported", () => {});
// A range-quantified non-capturing group: ten "word + space" repetitions
// sit between the literal prefix and suffix, and the whole subject matches.
it("line: 1312 - matches word (?:[a-zA-Z0-9]+ ){0,10}otherword against 'word cat dog elephant mussel cow horse canary baboon snake shark otherword'", () => {
  const subject =
    "word cat dog elephant mussel cow horse canary baboon snake shark otherword";
  const match = exec("word (?:[a-zA-Z0-9]+ ){0,10}otherword", subject, "ms");
  expect(match.matches[0]).toBe(subject.substring(0, 74));
});
xit("line: 1313 - peformance issue", () => {});
xit("line: 1314 - peformance issue", () => {});
it("line: 1315 - matches ^(a){0,0} against 'bcd'", () => {
const match = exec("^(a){0,0}", "bcd", "ms");
expect(match.matches[0]).toBe("bcd".substring(0, 0));
Expand Down
6 changes: 6 additions & 0 deletions assembly/__tests__/capture-group.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,9 @@ it("range repitition capture groups should return the last match", () => {
expect(match.matches[0]).toBe("ac");
expect(match.matches[1]).toBe("c");
});

// "(?:foo)" takes part in matching but yields no capture, so the first
// capture slot (index 1) belongs to "(baz)".
it("non-capturing groups should not capture", () => {
  const result = exec("(?:foo)bar(baz)", "foobarbaz");
  const captured = result.matches;
  expect(captured[0]).toBe("foobarbaz");
  expect(captured[1]).toBe("baz");
});
1 change: 1 addition & 0 deletions assembly/char.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export const enum Char {
Dot = 0x2e, // "."
Zero = 0x30,
Nine = 0x39,
Colon = 0x3a,
Question = 0x3f, // "?"
A = 0x41,
D = 0x44,
Expand Down
22 changes: 14 additions & 8 deletions assembly/nfa/nfa.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ export class GroupStartMarkerState extends State {
// captures from the path through the NFA that reaches the end are flagged
flagged: bool = false;

constructor(next: State, public groupId: i32) {
constructor(next: State, public capturing: bool, public groupId: i32) {
super();
this.transitions.push(next);
}
Expand All @@ -60,10 +60,12 @@ export class GroupEndMarkerState extends State {
}

matches(input: string, position: u32): MatchResult {
this.startMarker.capture = input.substring(
this.startMarker.location,
position
);
if (this.startMarker.capturing) {
this.startMarker.capture = input.substring(
this.startMarker.location,
position
);
}
return MatchResult.Ignore;
}
}
Expand Down Expand Up @@ -164,10 +166,10 @@ function oneOrMore(nfa: Automata, greedy: bool): Automata {
return new Automata(start, end);
}

function group(nfa: Automata, id: i32): Automata {
function group(nfa: Automata, capturing: bool, id: i32): Automata {
// groups are implemented by wrapping the automata with
// a pair of markers that record matches
const startMarker = new GroupStartMarkerState(nfa.start, id);
const startMarker = new GroupStartMarkerState(nfa.start, capturing, id);
const end = new State();
const endMarker = new GroupEndMarkerState(end, startMarker);
nfa.end.transitions.push(endMarker);
Expand Down Expand Up @@ -238,7 +240,11 @@ class AutomataFactor {
);
case NodeType.Group: {
const node = expression as GroupNode;
return group(this.automataForNode(node.expression), node.id);
return group(
this.automataForNode(node.expression),
node.capturing,
node.id
);
}
case NodeType.Assertion:
return Automata.fromEpsilon();
Expand Down
8 changes: 6 additions & 2 deletions assembly/parser/node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,11 @@ export class AlternationNode extends Node {
let _id = 0;

export class GroupNode extends Node {
constructor(public expression: Node, public id: i32 = -1) {
constructor(
public expression: Node,
public capturing: bool,
public id: i32 = -1
) {
super(NodeType.Group);
if (id == -1) {
this.id = _id++;
Expand All @@ -221,7 +225,7 @@ export class GroupNode extends Node {
}

clone(): Node {
return new GroupNode(this.expression.clone(), this.id);
return new GroupNode(this.expression.clone(), this.capturing, this.id);
}

replace(node: Node, replacement: Node): void {
Expand Down
15 changes: 14 additions & 1 deletion assembly/parser/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,18 @@ export class Parser {
return true;
}

// Reports whether the group currently being parsed is a capturing group.
// NOTE: despite the predicate-style name this method consumes input: when
// the next two characters are "?:" (the non-capturing prefix) both are eaten
// and false is returned; otherwise nothing is consumed and true is returned.
private isCapturing(): bool {
  const hasNonCapturingPrefix =
    this.iterator.current == Char.Question &&
    this.iterator.lookahead(1) == Char.Colon;
  if (!hasNonCapturingPrefix) {
    return true;
  }
  this.eatToken(Char.Question);
  this.eatToken(Char.Colon);
  return false;
}

// parses a sequence of chars
private parseSequence(): Node {
let nodes = new Array<Node>();
Expand All @@ -218,7 +230,8 @@ export class Parser {
// @ts-ignore
} else if (token == Char.LeftParenthesis) {
this.eatToken(Char.LeftParenthesis);
nodes.push(new GroupNode(this.parseSequence()));
const capturing = this.isCapturing();
nodes.push(new GroupNode(this.parseSequence(), capturing));
this.eatToken(Char.RightParenthesis);
// @ts-ignore
} else if (token == Char.LeftCurlyBrace) {
Expand Down
11 changes: 7 additions & 4 deletions assembly/regexp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ export class Flags {

// capture groups are implemented as GroupStart / GroupEnd states that record (capture)
// the value of the current state of the string being matched.
// Repeated capture groups, via rage repetitions (e.g. {2,3}) share the same 'id'. The
// Repeated capture groups, via range repetitions (e.g. {2,3}) share the same 'id'. The
// returned regex should only return the value of the final repetition.
function filterCaptures(groupMarkers: GroupStartMarkerState[]): string[] {
function lastCapturesForGroup(groupMarkers: GroupStartMarkerState[]): string[] {
if (!groupMarkers.length) {
return [];
}
Expand Down Expand Up @@ -139,7 +139,10 @@ export class RegExp {
gm = new Array<GroupStartMarkerState>();
nfaWalker(this.nfa.start, (state) => {
if (state instanceof GroupStartMarkerState) {
gm.push(state as GroupStartMarkerState);
const startMarker = state as GroupStartMarkerState;
if (startMarker.capturing) {
gm.push(state as GroupStartMarkerState);
}
}
});
this.groupMarkers = gm;
Expand Down Expand Up @@ -181,7 +184,7 @@ export class RegExp {
});

const match = new Match(
[matchStr!].concat(filterCaptures(groupMarkers)),
[matchStr!].concat(lastCapturesForGroup(groupMarkers)),
matchIndex,
str
);
Expand Down
6 changes: 1 addition & 5 deletions spec/test-generator.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const knownIssues = {
...range(141, 143),
1288,
],
"peformance issue": [1313, 1314],

/* -------- issues with the tests ------------ */
"test appears to be incorrect?": [203, 204],
Expand Down Expand Up @@ -108,11 +109,6 @@ lines.forEach((line, index) => {
return;
}

if (["(?:"].some((f) => regex.includes(f))) {
testCase += `xit("line: ${index} - non capturing groups not supported", () => {});`;
return;
}

if (["(?!", "(?="].some((f) => regex.includes(f))) {
testCase += `xit("line: ${index} - lookaheads not supported", () => {});`;
return;
Expand Down
13 changes: 4 additions & 9 deletions ts/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,8 @@ globalAny.log = console.log;

import { RegExp } from "../assembly/regexp";

const regexObj = new RegExp("abc$", "m");
let match = regexObj.exec("abc\n");
const regexObj = new RegExp("word (?:[a-zA-Z0-9]+ ){0,300}otherword", "");
let match = regexObj.exec(
"word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope"
);
console.log(JSON.stringify(match, null, 2));
// match = regexObj.exec("f1\nbar\nbaz\nf2");
// console.log(JSON.stringify(match, null, 2));

// const regex = new RegExp("^f\\d{1}$", "gm");

// let match = regex.exec("f1\nbar\nbaz\nf2");
// expect(match!.matches[0]).toBe("f1");

0 comments on commit 6fec3ef

Please sign in to comment.