From af35a00821c31a3a20df49c67dc18e14720152cd Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 17 May 2026 10:46:33 +0000 Subject: [PATCH] perf: eliminate double-tokenization of selector and prelude ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the token-scan loops in parse_selector() and parse_atrule() with raw character scans (scan_to_open_brace / scan_to_block_or_semi). The main parser previously had to tokenize selector and prelude content once just to find the '{' / ';' boundary, and then SelectorParser / AtRulePreludeParser would re-tokenize the same range in full detail — every token processed twice. The new raw scans handle only what's needed to find a boundary safely: quoted strings, /* comments */, backslash escapes, and (for preludes) paren depth to skip semicolons inside url(...). They track newlines so the main Lexer can be repositioned exactly at the boundary character afterward. SelectorParser and AtRulePreludeParser are now the sole tokenizers of their ranges, cutting the tokenization work for selector/prelude content roughly in half. https://claude.ai/code/session_01CQeKNnXidD5EQVJY4xBMMp --- src/parse-utils.ts | 232 +++++++++++++++++++++++++++++++++++++++++++- src/parse.ts | 75 ++++++++------ src/string-utils.ts | 5 + 3 files changed, 280 insertions(+), 32 deletions(-) diff --git a/src/parse-utils.ts b/src/parse-utils.ts index aa3b14c..ba724fa 100644 --- a/src/parse-utils.ts +++ b/src/parse-utils.ts @@ -1,4 +1,17 @@ -import { CHAR_ASTERISK, CHAR_FORWARD_SLASH, is_whitespace } from './string-utils' +import { + CHAR_ASTERISK, + CHAR_BACKSLASH, + CHAR_CARRIAGE_RETURN, + CHAR_DOUBLE_QUOTE, + CHAR_FORWARD_SLASH, + CHAR_LEFT_BRACE, + CHAR_LEFT_PAREN, + CHAR_NEWLINE, + CHAR_RIGHT_PAREN, + CHAR_SEMICOLON, + CHAR_SINGLE_QUOTE, + is_whitespace, +} from './string-utils' /** * Skip whitespace forward from a position @@ -134,3 +147,220 @@ export function trim_boundaries( if (start >= end) return null return [start, end] } + +/** + * Raw character scan to locate the opening `{` of a CSS block, without full + * tokenization. Handles strings and comments to avoid false matches, and + * tracks newlines so the caller can reposition a Lexer accurately afterward. + * + * Returns `[pos, line, line_offset]` where `pos` is the position of `{` + * (or `source.length` if not found), and `line`/`line_offset` are the + * line-tracking state at that position. + * @internal + */ +export function scan_to_open_brace( + source: string, + pos: number, + line: number, + line_offset: number, +): [number, number, number] { + const len = source.length + let i = pos + + while (i < len) { + const ch = source.charCodeAt(i) + + if (ch === CHAR_LEFT_BRACE) return [i, line, line_offset] + + // Comments: /* ... */ + if (ch === CHAR_FORWARD_SLASH && i + 1 < len && source.charCodeAt(i + 1) === CHAR_ASTERISK) { + i += 2 + while (i < len) { + const c = source.charCodeAt(i) + if (c === CHAR_ASTERISK && i + 1 < len && source.charCodeAt(i + 1) === CHAR_FORWARD_SLASH) { + i += 2 + break + } + if (c === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (c === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + i++ + } + continue + } + + // Strings: '...' or "..." + if (ch === CHAR_SINGLE_QUOTE || ch === CHAR_DOUBLE_QUOTE) { + const quote = ch + i++ + while (i < len) { + const c = source.charCodeAt(i) + if (c === quote) { + i++ + break + } + if (c === CHAR_BACKSLASH) { + i++ // skip escaped char + } else if (c === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (c === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + i++ + } + continue + } + + // Backslash escape outside strings (e.g. \{ in a selector) + if (ch === CHAR_BACKSLASH && i + 1 < len) { + i++ + const next = source.charCodeAt(i) + if (next === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (next === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + i++ + continue + } + + if (ch === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (ch === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + + i++ + } + + return [i, line, line_offset] +} + +/** + * Raw character scan to locate the end of an at-rule prelude: `{` or `;` + * (whichever comes first at parenthesis depth 0). Handles strings, comments, + * paren depth (so semicolons inside `url(data:...;...)` are skipped), and + * tracks newlines so the caller can reposition a Lexer accurately afterward. + * + * Returns `[pos, line, line_offset]` at the position of the boundary character. + * @internal + */ +export function scan_to_block_or_semi( + source: string, + pos: number, + line: number, + line_offset: number, +): [number, number, number] { + const len = source.length + let i = pos + let depth = 0 + + while (i < len) { + const ch = source.charCodeAt(i) + + if (depth === 0 && (ch === CHAR_LEFT_BRACE || ch === CHAR_SEMICOLON)) { + return [i, line, line_offset] + } + + if (ch === CHAR_LEFT_PAREN) { + depth++ + i++ + continue + } + if (ch === CHAR_RIGHT_PAREN) { + if (depth > 0) depth-- + i++ + continue + } + + // Comments + if (ch === CHAR_FORWARD_SLASH && i + 1 < len && source.charCodeAt(i + 1) === CHAR_ASTERISK) { + i += 2 + while (i < len) { + const c = source.charCodeAt(i) + if (c === CHAR_ASTERISK && i + 1 < len && source.charCodeAt(i + 1) === CHAR_FORWARD_SLASH) { + i += 2 + break + } + if (c === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (c === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + i++ + } + continue + } + + // Strings + if (ch === CHAR_SINGLE_QUOTE || ch === CHAR_DOUBLE_QUOTE) { + const quote = ch + i++ + while (i < len) { + const c = source.charCodeAt(i) + if (c === quote) { + i++ + break + } + if (c === CHAR_BACKSLASH) { + i++ + } else if (c === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (c === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + i++ + } + continue + } + + // Backslash escape + if (ch === CHAR_BACKSLASH && i + 1 < len) { + i++ + const next = source.charCodeAt(i) + if (next === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (next === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + i++ + continue + } + + if (ch === CHAR_NEWLINE) { + line++ + line_offset = i + 1 + } else if (ch === CHAR_CARRIAGE_RETURN) { + line++ + if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++ + line_offset = i + 1 + } + + i++ + } + + return [i, line, line_offset] +} diff --git a/src/parse.ts b/src/parse.ts index 0bb4603..2644ceb 100644 --- a/src/parse.ts +++ b/src/parse.ts @@ -32,9 +32,13 @@ import { TOKEN_RIGHT_BRACKET, TOKEN_COMMA, TOKEN_COLON, - TOKEN_FUNCTION, } from './token-types' -import { trim_boundaries } from './parse-utils' +import { + trim_boundaries, + scan_to_open_brace, + scan_to_block_or_semi, + skip_whitespace_and_comments_backward, +} from './parse-utils' import { CHAR_PERIOD, CHAR_GREATER_THAN, @@ -275,12 +279,22 @@ export class Parser { let selector_line = this.lexer.token_line let selector_column = this.lexer.token_column - // Consume tokens until we hit '{' - let last_end = this.lexer.token_end - while (!this.is_eof() && this.peek_type() !== TOKEN_LEFT_BRACE) { - last_end = this.lexer.token_end - this.next_token() - } + // Raw scan to find '{' without tokenizing the selector range. + // SelectorParser will be the sole tokenizer of this range. + let [brace_pos, brace_line, brace_line_offset] = scan_to_open_brace( + this.source, + selector_start, + selector_line, + selector_start - selector_column + 1, + ) + + // Trim trailing whitespace/comments before '{' to get the true selector end, + // matching what the old token-scan loop produced via next_token_fast(true). + let last_end = skip_whitespace_and_comments_backward(this.source, brace_pos, selector_start) + + // Reposition main lexer at '{' and read it as a token + this.lexer.seek(brace_pos, brace_line, brace_pos - brace_line_offset + 1) + this.lexer.next_token_fast(false) // If detailed selector parsing is enabled, use SelectorParser if (this.parse_selectors_enabled && this.selector_parser) { @@ -297,14 +311,13 @@ export class Parser { // Create node: RAW when parsing disabled, SELECTOR_LIST as error fallback let node_type = this.parse_selectors_enabled ? SELECTOR_LIST : RAW - let selector_node = this.arena.create_node( + return this.arena.create_node( node_type, selector_start, last_end - selector_start, selector_line, selector_column, ) - return selector_node } // Parse a declaration: property: value; @@ -383,28 +396,28 @@ export class Parser { // Track prelude start and end let prelude_start = this.lexer.token_start - let prelude_end = prelude_start - // Track parenthesis depth to handle semicolons inside functions (e.g., url(data:image/png;base64,...)) - // NOTE: Same pattern exists in parse-declaration.ts for value parsing - keep in sync - let paren_depth = 0 - - // Parse prelude (everything before '{' or ';') - while (!this.is_eof()) { - let token_type = this.peek_type() - - // Track parenthesis depth - if (token_type === TOKEN_LEFT_PAREN || token_type === TOKEN_FUNCTION) { - paren_depth++ - } else if (token_type === TOKEN_RIGHT_PAREN) { - paren_depth-- - } + let prelude_line = this.lexer.token_line + let prelude_column = this.lexer.token_column + + // Raw scan to find '{' or ';' without tokenizing the prelude range. + // AtRulePreludeParser will be the sole tokenizer of this range. + // Paren depth is tracked to correctly skip semicolons inside url(...). + let [boundary_pos, boundary_line, boundary_line_offset] = scan_to_block_or_semi( + this.source, + prelude_start, + prelude_line, + prelude_start - prelude_column + 1, + ) + // Trim trailing whitespace/comments before the boundary to match the old loop + let prelude_end = skip_whitespace_and_comments_backward( + this.source, + boundary_pos, + prelude_start, + ) - // Only break on '{' or ';' when outside all parentheses - if (token_type === TOKEN_LEFT_BRACE && paren_depth === 0) break - if (token_type === TOKEN_SEMICOLON && paren_depth === 0) break - prelude_end = this.lexer.token_end - this.next_token() - } + // Reposition main lexer at '{' or ';' and read it as a token + this.lexer.seek(boundary_pos, boundary_line, boundary_pos - boundary_line_offset + 1) + this.lexer.next_token_fast(false) // Store prelude position (trimmed) let trimmed = trim_boundaries(this.source, prelude_start, prelude_end) diff --git a/src/string-utils.ts b/src/string-utils.ts index e2d7675..71c559c 100644 --- a/src/string-utils.ts +++ b/src/string-utils.ts @@ -22,6 +22,11 @@ export const CHAR_DOLLAR = 0x24 // $ export const CHAR_CARET = 0x5e // ^ export const CHAR_COLON = 0x3a // : export const CHAR_LESS_THAN = 0x3c // < +export const CHAR_SEMICOLON = 0x3b // ; +export const CHAR_LEFT_BRACE = 0x7b // { +export const CHAR_LEFT_PAREN = 0x28 // ( +export const CHAR_RIGHT_PAREN = 0x29 // ) +export const CHAR_BACKSLASH = 0x5c // \ /** * Check if a character code is whitespace (space, tab, newline, CR, or FF)