/**
 * Raw character scan to locate the opening `{` of a CSS block, without full
 * tokenization. Comments (`/* ... *\/`), quoted strings and backslash escapes
 * are skipped so that a `{` inside any of them is not reported, and line
 * breaks (LF, CR, CRLF) are counted so the caller can reposition a Lexer
 * accurately afterward.
 *
 * @param source - Full source text being scanned
 * @param pos - Index at which to start scanning
 * @param line - Line number that `pos` is on
 * @param line_offset - Index of the first character of that line
 * @returns `[pos, line, line_offset]` where `pos` is the index of `{`
 * (or `source.length` if not found), and `line`/`line_offset` are the
 * line-tracking state at that position.
 * @internal
 */
export function scan_to_open_brace(
	source: string,
	pos: number,
	line: number,
	line_offset: number,
): [number, number, number] {
	const len = source.length
	let i = pos

	while (i < len) {
		const ch = source.charCodeAt(i)

		if (ch === CHAR_LEFT_BRACE) return [i, line, line_offset]

		// Comments: /* ... */
		if (ch === CHAR_FORWARD_SLASH && i + 1 < len && source.charCodeAt(i + 1) === CHAR_ASTERISK) {
			i += 2
			while (i < len) {
				const c = source.charCodeAt(i)
				if (c === CHAR_ASTERISK && i + 1 < len && source.charCodeAt(i + 1) === CHAR_FORWARD_SLASH) {
					i += 2
					break
				}
				if (c === CHAR_NEWLINE) {
					line++
					line_offset = i + 1
				} else if (c === CHAR_CARRIAGE_RETURN) {
					line++
					// Treat CRLF as a single line break
					if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
					line_offset = i + 1
				}
				i++
			}
			continue
		}

		// Strings: '...' or "..."
		// NOTE(review): an unescaped line break does not end the string here; the
		// scan continues to the matching quote. Confirm this matches the Lexer.
		if (ch === CHAR_SINGLE_QUOTE || ch === CHAR_DOUBLE_QUOTE) {
			const quote = ch
			i++
			while (i < len) {
				let c = source.charCodeAt(i)
				if (c === quote) {
					i++
					break
				}
				if (c === CHAR_BACKSLASH) {
					// A dangling backslash at end of input has nothing to escape;
					// stop at `len` so the returned position never exceeds it.
					if (i + 1 >= len) {
						i = len
						break
					}
					// Step onto the escaped character. It can never close the string,
					// but an escaped line break must still be counted below.
					i++
					c = source.charCodeAt(i)
				}
				if (c === CHAR_NEWLINE) {
					line++
					line_offset = i + 1
				} else if (c === CHAR_CARRIAGE_RETURN) {
					line++
					if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
					line_offset = i + 1
				}
				i++
			}
			continue
		}

		// Backslash escape outside strings (e.g. \{ in a selector)
		if (ch === CHAR_BACKSLASH && i + 1 < len) {
			i++
			const next = source.charCodeAt(i)
			if (next === CHAR_NEWLINE) {
				line++
				line_offset = i + 1
			} else if (next === CHAR_CARRIAGE_RETURN) {
				line++
				if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
				line_offset = i + 1
			}
			i++
			continue
		}

		if (ch === CHAR_NEWLINE) {
			line++
			line_offset = i + 1
		} else if (ch === CHAR_CARRIAGE_RETURN) {
			line++
			if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
			line_offset = i + 1
		}

		i++
	}

	return [i, line, line_offset]
}
/**
 * Raw character scan to locate the end of an at-rule prelude: `{` or `;`
 * (whichever comes first at parenthesis depth 0). Comments, quoted strings
 * and backslash escapes are skipped, parenthesis depth is tracked (so
 * semicolons inside `url(data:...;...)` are ignored), and line breaks
 * (LF, CR, CRLF) are counted so the caller can reposition a Lexer accurately
 * afterward.
 *
 * @param source - Full source text being scanned
 * @param pos - Index at which to start scanning
 * @param line - Line number that `pos` is on
 * @param line_offset - Index of the first character of that line
 * @returns `[pos, line, line_offset]` at the position of the boundary
 * character, or at `source.length` when no boundary is found.
 * @internal
 */
export function scan_to_block_or_semi(
	source: string,
	pos: number,
	line: number,
	line_offset: number,
): [number, number, number] {
	const len = source.length
	let i = pos
	let depth = 0

	while (i < len) {
		const ch = source.charCodeAt(i)

		// Only a boundary outside all parentheses ends the prelude
		if (depth === 0 && (ch === CHAR_LEFT_BRACE || ch === CHAR_SEMICOLON)) {
			return [i, line, line_offset]
		}

		if (ch === CHAR_LEFT_PAREN) {
			depth++
			i++
			continue
		}
		if (ch === CHAR_RIGHT_PAREN) {
			// Ignore unbalanced `)` rather than letting depth go negative
			if (depth > 0) depth--
			i++
			continue
		}

		// Comments: /* ... */
		if (ch === CHAR_FORWARD_SLASH && i + 1 < len && source.charCodeAt(i + 1) === CHAR_ASTERISK) {
			i += 2
			while (i < len) {
				const c = source.charCodeAt(i)
				if (c === CHAR_ASTERISK && i + 1 < len && source.charCodeAt(i + 1) === CHAR_FORWARD_SLASH) {
					i += 2
					break
				}
				if (c === CHAR_NEWLINE) {
					line++
					line_offset = i + 1
				} else if (c === CHAR_CARRIAGE_RETURN) {
					line++
					// Treat CRLF as a single line break
					if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
					line_offset = i + 1
				}
				i++
			}
			continue
		}

		// Strings: '...' or "..."
		// NOTE(review): an unescaped line break does not end the string here; the
		// scan continues to the matching quote. Confirm this matches the Lexer.
		if (ch === CHAR_SINGLE_QUOTE || ch === CHAR_DOUBLE_QUOTE) {
			const quote = ch
			i++
			while (i < len) {
				let c = source.charCodeAt(i)
				if (c === quote) {
					i++
					break
				}
				if (c === CHAR_BACKSLASH) {
					// A dangling backslash at end of input has nothing to escape;
					// stop at `len` so the returned position never exceeds it.
					if (i + 1 >= len) {
						i = len
						break
					}
					// Step onto the escaped character. It can never close the string,
					// but an escaped line break must still be counted below.
					i++
					c = source.charCodeAt(i)
				}
				if (c === CHAR_NEWLINE) {
					line++
					line_offset = i + 1
				} else if (c === CHAR_CARRIAGE_RETURN) {
					line++
					if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
					line_offset = i + 1
				}
				i++
			}
			continue
		}

		// Backslash escape outside strings
		if (ch === CHAR_BACKSLASH && i + 1 < len) {
			i++
			const next = source.charCodeAt(i)
			if (next === CHAR_NEWLINE) {
				line++
				line_offset = i + 1
			} else if (next === CHAR_CARRIAGE_RETURN) {
				line++
				if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
				line_offset = i + 1
			}
			i++
			continue
		}

		if (ch === CHAR_NEWLINE) {
			line++
			line_offset = i + 1
		} else if (ch === CHAR_CARRIAGE_RETURN) {
			line++
			if (i + 1 < len && source.charCodeAt(i + 1) === CHAR_NEWLINE) i++
			line_offset = i + 1
		}

		i++
	}

	return [i, line, line_offset]
}
SELECTOR_LIST : RAW - let selector_node = this.arena.create_node( + return this.arena.create_node( node_type, selector_start, last_end - selector_start, selector_line, selector_column, ) - return selector_node } // Parse a declaration: property: value; @@ -383,28 +396,28 @@ export class Parser { // Track prelude start and end let prelude_start = this.lexer.token_start - let prelude_end = prelude_start - // Track parenthesis depth to handle semicolons inside functions (e.g., url(data:image/png;base64,...)) - // NOTE: Same pattern exists in parse-declaration.ts for value parsing - keep in sync - let paren_depth = 0 - - // Parse prelude (everything before '{' or ';') - while (!this.is_eof()) { - let token_type = this.peek_type() - - // Track parenthesis depth - if (token_type === TOKEN_LEFT_PAREN || token_type === TOKEN_FUNCTION) { - paren_depth++ - } else if (token_type === TOKEN_RIGHT_PAREN) { - paren_depth-- - } + let prelude_line = this.lexer.token_line + let prelude_column = this.lexer.token_column + + // Raw scan to find '{' or ';' without tokenizing the prelude range. + // AtRulePreludeParser will be the sole tokenizer of this range. + // Paren depth is tracked to correctly skip semicolons inside url(...). 
+ let [boundary_pos, boundary_line, boundary_line_offset] = scan_to_block_or_semi( + this.source, + prelude_start, + prelude_line, + prelude_start - prelude_column + 1, + ) + // Trim trailing whitespace/comments before the boundary to match the old loop + let prelude_end = skip_whitespace_and_comments_backward( + this.source, + boundary_pos, + prelude_start, + ) - // Only break on '{' or ';' when outside all parentheses - if (token_type === TOKEN_LEFT_BRACE && paren_depth === 0) break - if (token_type === TOKEN_SEMICOLON && paren_depth === 0) break - prelude_end = this.lexer.token_end - this.next_token() - } + // Reposition main lexer at '{' or ';' and read it as a token + this.lexer.seek(boundary_pos, boundary_line, boundary_pos - boundary_line_offset + 1) + this.lexer.next_token_fast(false) // Store prelude position (trimmed) let trimmed = trim_boundaries(this.source, prelude_start, prelude_end) diff --git a/src/string-utils.ts b/src/string-utils.ts index e2d7675..71c559c 100644 --- a/src/string-utils.ts +++ b/src/string-utils.ts @@ -22,6 +22,11 @@ export const CHAR_DOLLAR = 0x24 // $ export const CHAR_CARET = 0x5e // ^ export const CHAR_COLON = 0x3a // : export const CHAR_LESS_THAN = 0x3c // < +export const CHAR_SEMICOLON = 0x3b // ; +export const CHAR_LEFT_BRACE = 0x7b // { +export const CHAR_LEFT_PAREN = 0x28 // ( +export const CHAR_RIGHT_PAREN = 0x29 // ) +export const CHAR_BACKSLASH = 0x5c // \ /** * Check if a character code is whitespace (space, tab, newline, CR, or FF)