diff --git a/.trellis/spec/flowscope-app/frontend/ui-change-protocol.md b/.trellis/spec/flowscope-app/frontend/ui-change-protocol.md index 55812831..a1976736 100644 --- a/.trellis/spec/flowscope-app/frontend/ui-change-protocol.md +++ b/.trellis/spec/flowscope-app/frontend/ui-change-protocol.md @@ -100,4 +100,11 @@ cargo build -p flowscope-cli --features serve # 2. embed 进 binary pkill -f 'flowscope.*--serve'; ./target/debug/flowscope --serve ... # 3. 重启 ``` -**agent-browser 的限制**:Radix UI DropdownMenu 内部的 button 无法被 `click @ref` 可靠点击(DOM 会被提前删除)。需要通过手动 Chrome 操作或 JS 注入验证。 +**浏览器自动化分两档**: + +| 工具 | Radix DropdownMenu / Popover 内的 button | 备注 | +|------|------------------------------------------|------| +| Cursor IDE 浏览器 MCP(`cursor-ide-browser`,基于 Playwright) | ✅ 可靠 | snapshot 拿 ref → click,DOM detach 前 Playwright 会等待事件冒泡完成 | +| 外部 `agent-browser` CLI | ❌ 不可靠 | DOM 会在 click 落地前被 Radix 卸载,需手动 Chrome 或 JS 注入 | + +优先用 Cursor 浏览器 MCP 做端到端验证;仅在没有 Cursor 上下文(独立 CI、远程 SSH)时退化到 agent-browser CLI + 手动验证。 diff --git a/.trellis/tasks/05-14-fix-complex-sql-parse/check.jsonl b/.trellis/tasks/05-14-fix-complex-sql-parse/check.jsonl new file mode 100644 index 00000000..9dd3234a --- /dev/null +++ b/.trellis/tasks/05-14-fix-complex-sql-parse/check.jsonl @@ -0,0 +1 @@ +{"_example": "Fill with {\"file\": \"\", \"reason\": \"\"}. Put spec/research files only — no code paths. Run `python3 .trellis/scripts/get_context.py --mode packages` to list available specs. Delete this line once real entries are added."} diff --git a/.trellis/tasks/05-14-fix-complex-sql-parse/implement.jsonl b/.trellis/tasks/05-14-fix-complex-sql-parse/implement.jsonl new file mode 100644 index 00000000..9dd3234a --- /dev/null +++ b/.trellis/tasks/05-14-fix-complex-sql-parse/implement.jsonl @@ -0,0 +1 @@ +{"_example": "Fill with {\"file\": \"\", \"reason\": \"\"}. Put spec/research files only — no code paths. Run `python3 .trellis/scripts/get_context.py --mode packages` to list available specs. 
Delete this line once real entries are added."} diff --git a/.trellis/tasks/05-14-fix-complex-sql-parse/prd.md b/.trellis/tasks/05-14-fix-complex-sql-parse/prd.md new file mode 100644 index 00000000..f2c5f93a --- /dev/null +++ b/.trellis/tasks/05-14-fix-complex-sql-parse/prd.md @@ -0,0 +1,39 @@ +# PRD: 修复 6 个 Hive SQL 复杂语法解析失败 + +## 目标 + +定位并修复 `auditId ∈ {2479, 2482, 2497, 2550, 2568, 2571}` 共 6 条 SQL 解析失败的根因,分门别类提交 PR。 + +## 最终归类(已全部最小复现验证) + +用户决定:只修 flowscope 引擎、按根因分组(每个根因 1 个 PR)。SQL 写错类(id=2550 CTE 缺 as)略过。 + +### 3 个根因 → 3 个 PR + +| PR# | 根因 | 影响 auditId | 最小复现 | 修复方案 | +|-----|------|--------------|----------|----------| +| PR1 | **STRUCT 命名字段 AS**:`STRUCT(a AS w, b AS l)` Spark/Hive 语法 | 2479, 2482, 2571 | `SELECT STRUCT(a AS w) FROM t` → FAIL | 新增 `sanitize_hive_struct_named_fields()`:剥离 `AS xxx`,保留位置 | +| PR2 | **DIV 整除运算符**:sqlparser MySQL 方言支持但 Hive 方言不支持 | 2568 | `SELECT x DIV 1000 FROM t` → FAIL | 新增 `sanitize_hive_div_operator()`:`a DIV b` → `CAST(a/b AS BIGINT)` | +| PR3 | **INSERT 带外层括号的 SELECT**:`INSERT ... PARTITION (...) (SELECT ...)` | 2497 | `INSERT ... PARTITION(dt='x') (SELECT ...)` → FAIL | 新增 `sanitize_hive_parenthesized_insert_select()`:去掉 SELECT 外层括号 | + +### 略过项 + +| auditId | 原因 | +|---------|------| +| 2550 | 用户 SQL bug:CTE `account_info_tencent (SELECT...)` 缺 `as` 关键字,是 SQL 真错误,引擎层不应纵容 | + +## 实现规范 + +- 修改文件:`crates/flowscope-core/src/parser/mod.rs` +- 注册触发条件:`dialect == Dialect::Hive` 且关键字预检测命中 +- 接入位置:`parse_sql_with_dialect_output()` 的 fallback 链路 +- 每个 PR 必须包含: + - 单元测试(覆盖正例 + 反例) + - 最小失败 SQL 加入 `tests/fixtures/` 作回归保护 + - PR 描述:含失败 SQL 最小复现 + 修复前后对比 + +## 验收标准 + +1. 每个 PR 独立通过 `just test-core` +2. 合并 3 个 PR 后,对 6 条失败 SQL 重跑 `flowscope-cli` 至少 5 条变 `success=1` +3. 
不引入对 931 个文件中其它 SQL 的回归(重跑 `batch_parse_sql_final.py` 失败数 ≤ 1) diff --git a/.trellis/tasks/05-14-fix-complex-sql-parse/task.json b/.trellis/tasks/05-14-fix-complex-sql-parse/task.json new file mode 100644 index 00000000..ccf91b2c --- /dev/null +++ b/.trellis/tasks/05-14-fix-complex-sql-parse/task.json @@ -0,0 +1,26 @@ +{ + "id": "fix-complex-sql-parse", + "name": "fix-complex-sql-parse", + "title": "修复 6 个 Hive SQL 复杂语法解析失败", + "description": "", + "status": "in_progress", + "dev_type": null, + "scope": null, + "package": null, + "priority": "P2", + "creator": "wangliang", + "assignee": "wangliang", + "createdAt": "2026-05-14", + "completedAt": null, + "branch": null, + "base_branch": "master", + "worktree_path": null, + "commit": null, + "pr_url": null, + "subtasks": [], + "children": [], + "parent": null, + "relatedFiles": [], + "notes": "", + "meta": {} +} \ No newline at end of file diff --git a/.trellis/tasks/05-14-matrix-hide-cte/check.jsonl b/.trellis/tasks/05-14-matrix-hide-cte/check.jsonl new file mode 100644 index 00000000..9dd3234a --- /dev/null +++ b/.trellis/tasks/05-14-matrix-hide-cte/check.jsonl @@ -0,0 +1 @@ +{"_example": "Fill with {\"file\": \"\", \"reason\": \"\"}. Put spec/research files only — no code paths. Run `python3 .trellis/scripts/get_context.py --mode packages` to list available specs. Delete this line once real entries are added."} diff --git a/.trellis/tasks/05-14-matrix-hide-cte/implement.jsonl b/.trellis/tasks/05-14-matrix-hide-cte/implement.jsonl new file mode 100644 index 00000000..9dd3234a --- /dev/null +++ b/.trellis/tasks/05-14-matrix-hide-cte/implement.jsonl @@ -0,0 +1 @@ +{"_example": "Fill with {\"file\": \"\", \"reason\": \"\"}. Put spec/research files only — no code paths. Run `python3 .trellis/scripts/get_context.py --mode packages` to list available specs. 
Delete this line once real entries are added."} diff --git a/.trellis/tasks/05-14-matrix-hide-cte/prd.md b/.trellis/tasks/05-14-matrix-hide-cte/prd.md new file mode 100644 index 00000000..5fcca859 --- /dev/null +++ b/.trellis/tasks/05-14-matrix-hide-cte/prd.md @@ -0,0 +1,61 @@ +# Matrix 隐藏 CTE 开关与传递性依赖补全 + +## Goal + +在 Matrix 视图的 Tables 子模式下,**默认隐藏 CTE 节点**,并提供工具栏开关供用户按需展开。隐藏时通过 BFS 传递性补全,避免「仅通过 CTE 链相连」的物理表对依赖在矩阵中消失。 + +## Background + +- Matrix 当前用 `isTableLikeType()` 收集表节点,把 `'table' | 'view' | 'cte'` 一锅端到矩阵里(`packages/react/src/utils/matrixUtils.ts:76`)。 +- 截图实测:典型多 CTE 脚本里 CTE 节点能占矩阵 ≥30% 的行/列,把真物理表标签挤变形、稀释信号。 +- FlowScope 数据模型已经承认 CTE 不是物理对象(`audit-api-spec.md` 中 `table_count` 只计 `Table | View`)。 +- CTE 在跨脚本对比、聚类、heatmap 这些 Matrix 核心场景里没有语义价值。 + +## Requirements + +### 数据层(matrixUtils) + +- `MatrixData` 维持现接口,不破坏 Worker 协议。 +- Worker payload 新增 `cteItemSet: string[]`(序列化用 array,主线程转 Set),用于主线程识别哪些 item 是 CTE。 +- `extractTableDependenciesWithDetails` 已经会经由 column-level 路径建立 `cte→cte`、`cte→table`、`table→cte` 三种依赖;不修改这部分逻辑。 +- 新增主线程工具函数 `collapseCteFromMatrix(matrix, cteSet) → MatrixData`: + - 过滤掉 CTE item 行/列; + - 对每对剩余的物理表 `(A, B)`,若原 cells 中 `A→...→B` 仅经由 CTE 链可达(且原本无直接 write/read),则在新矩阵中补一条 `write` 边,details 用 transitive 链上首尾任一段的 details(标记 `indirect: true` 字段供后续 tooltip 区分)。 + - 已存在的直接边保留原样。 + +### 状态管理 + +- `MatrixViewControlledState` 新增 `hideCte: boolean`,默认 `true`。 +- 走 `useImmediateControlledMatrixState` pipeline,可被外部 controlled。 + +### UI + +- Tables 工具栏(`subMode === 'tables'`)新增一个开关按钮(lucide `Layers` 图标),位置紧挨 Heatmap / Cluster / Complexity 这一组。 +- Scripts 子模式下不显示该按钮。 +- 按下时切换 `hideCte`,开启状态视觉与其他 toggle 一致(`bg-cyan-100 text-cyan-600 ring-1 ring-cyan-500`)。 +- Tooltip 文案中文 + 英文: + - 标题:`Hide CTE Aliases` + - 说明:`Collapse CTE rows/columns and connect physical tables transitively.` + +### 渲染层 + +- `MatrixView` 在 `fullMatrixData` 之后新增一个 useMemo 衍生的 `displayMatrixData`,根据 `hideCte && subMode === 'tables'` 决定是否调用 `collapseCteFromMatrix`。 +- 后续所有 
`sortedItems` / 过滤 / 渲染均基于 `displayMatrixData`。 +- legend 区在 `hideCte` 开启时显示一个 `CTE Hidden` 状态标识。 + +## Acceptance Criteria + +- [ ] 默认进入 Matrix-Tables 视图,CTE 节点(`a / before / t1` 这类)不出现在行列里。 +- [ ] 工具栏 `Layers` 按钮可以一键切换显示/隐藏 CTE。 +- [ ] 隐藏 CTE 时,原本通过 CTE 链相连的物理表之间能在矩阵里看到 `write` 箭头。 +- [ ] Scripts 子模式工具栏不显示该按钮。 +- [ ] 单测覆盖:`collapseCteFromMatrix` 至少覆盖 3 种场景: + - 物理表 → CTE → 物理表(间接,应补全); + - 物理表 → CTE → CTE → 物理表(多跳间接); + - 物理表 → 物理表 同时也途经 CTE(直接边优先,不重复补)。 +- [ ] `yarn workspace @pondpilot/flowscope-react lint && typecheck && test` 全部通过。 + +## Notes + +- 不动 Rust 引擎、不动 worker 内 build 逻辑,纯前端聚合 + 渲染层改造。 +- `details.indirect` 字段为可选扩展,TS 类型在 `TableDependencyWithDetails` 上标记为 `indirect?: true`。 diff --git a/.trellis/tasks/05-14-matrix-hide-cte/task.json b/.trellis/tasks/05-14-matrix-hide-cte/task.json new file mode 100644 index 00000000..620028fb --- /dev/null +++ b/.trellis/tasks/05-14-matrix-hide-cte/task.json @@ -0,0 +1,26 @@ +{ + "id": "matrix-hide-cte", + "name": "matrix-hide-cte", + "title": "Matrix 隐藏 CTE 开关与传递性依赖补全", + "description": "", + "status": "planning", + "dev_type": null, + "scope": null, + "package": null, + "priority": "P2", + "creator": "wangliang", + "assignee": "wangliang", + "createdAt": "2026-05-14", + "completedAt": null, + "branch": null, + "base_branch": "master", + "worktree_path": null, + "commit": null, + "pr_url": null, + "subtasks": [], + "children": [], + "parent": null, + "relatedFiles": [], + "notes": "", + "meta": {} +} \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index f735209b..1ee982ac 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,7 +35,7 @@ This file is intentionally short. `AGENTS.md` is the canonical source of build, 1. **组件文件定位**:精确到文件路径 + 行号(不是"某个地方")。 2. **API 数据先验证**:血缘/数据逻辑问题先用 curl 验证 API 返回;API 不对 → **先修 Rust 引擎**;API 对、图不对 → 才改前端渲染。 -3. **验证方式**:`agent-browser` 无法可靠点击 Radix UI DropdownMenu 内的 button,需手动 Chrome 验证或 JS 注入。 +3. 
**验证方式**:Cursor IDE 浏览器 MCP(基于 Playwright)可以可靠点击 Radix UI DropdownMenu / Popover 内的 button,无需手动 Chrome;外部 `agent-browser` CLI 在该场景仍不可靠,需走手动 Chrome 或 JS 注入。 完整规范、curl 模板、验证 SOP 见 `.trellis/spec/flowscope-app/frontend/ui-change-protocol.md`。 diff --git a/crates/flowscope-core/src/dialect_ext/flowscope_hive.rs b/crates/flowscope-core/src/dialect_ext/flowscope_hive.rs new file mode 100644 index 00000000..16bd5c15 --- /dev/null +++ b/crates/flowscope-core/src/dialect_ext/flowscope_hive.rs @@ -0,0 +1,222 @@ +//! Custom Hive dialect that fixes gaps in `sqlparser::dialect::HiveDialect`. +//! +//! sqlparser-rs 0.61's `HiveDialect` is missing several syntax features that +//! real-world Hive / Spark SQL relies on. This module composes the upstream +//! `HiveDialect` and overrides individual trait methods to close those gaps. +//! +//! Currently overridden: +//! +//! | Feature | Reason | +//! |-------------------------------|------------------------------------------| +//! | `supports_struct_literal` | `STRUCT(col AS name)` field naming | +//! | `parse_infix` (DIV operator) | `a DIV b` integer division | +//! | `is_table_alias` (OFFSET) | Hive has no OFFSET keyword, so `offset` | +//! | | can legally name a table (`(...) offset`)| +//! +//! Anything not explicitly overridden is delegated to the upstream +//! `HiveDialect` so behavior stays in sync with sqlparser-rs. + +use sqlparser::ast::{BinaryOperator, Expr}; +use sqlparser::dialect::{Dialect, HiveDialect}; +use sqlparser::keywords::Keyword; +use sqlparser::parser::{Parser, ParserError}; + +/// FlowScope's enhanced Hive dialect. +/// +/// Wraps `sqlparser::dialect::HiveDialect` and selectively overrides trait +/// methods to enable Hive features that the upstream dialect is missing. 
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct FlowscopeHiveDialect { + inner: HiveDialect, +} + +impl Dialect for FlowscopeHiveDialect { + fn is_delimited_identifier_start(&self, ch: char) -> bool { + self.inner.is_delimited_identifier_start(ch) + } + + fn is_identifier_start(&self, ch: char) -> bool { + self.inner.is_identifier_start(ch) + } + + fn is_identifier_part(&self, ch: char) -> bool { + self.inner.is_identifier_part(ch) + } + + fn supports_filter_during_aggregation(&self) -> bool { + self.inner.supports_filter_during_aggregation() + } + + fn supports_numeric_prefix(&self) -> bool { + self.inner.supports_numeric_prefix() + } + + fn require_interval_qualifier(&self) -> bool { + self.inner.require_interval_qualifier() + } + + fn supports_bang_not_operator(&self) -> bool { + self.inner.supports_bang_not_operator() + } + + fn supports_load_data(&self) -> bool { + self.inner.supports_load_data() + } + + fn supports_table_sample_before_alias(&self) -> bool { + self.inner.supports_table_sample_before_alias() + } + + fn supports_group_by_with_modifier(&self) -> bool { + self.inner.supports_group_by_with_modifier() + } + + // ── PR1 override ──────────────────────────────────────────────────────── + // + // Hive's `struct(field1 AS name1, field2 AS name2, ...)` literal is widely + // used in `collect_list(struct(...))`-style patterns. sqlparser-rs's + // HiveDialect inherits the default `false`, which causes `STRUCT(a AS b)` + // to fail with "Expected: ), found: AS". BigQuery / Databricks / Generic + // all override this to `true`; we do the same for Hive. + fn supports_struct_literal(&self) -> bool { + true + } + + // ── PR3 override ──────────────────────────────────────────────────────── + // + // Hive's SELECT grammar does NOT have an OFFSET clause (see + // ), + // so writing `CROSS JOIN (...) offset` to name a derived table `offset` is + // perfectly legal Hive syntax. 
sqlparser-rs, however, keeps OFFSET in its + // global `RESERVED_FOR_TABLE_ALIAS` list and HiveDialect inherits that + // default, which makes the construct fail with "Expected: ), found: ". We loosen the rule here. + // + // Anything that's truly required to disambiguate Hive grammar (SELECT, + // FROM, WHERE, etc.) is still rejected because we only remove OFFSET from + // the reserved set. + fn is_table_alias(&self, kw: &Keyword, parser: &mut Parser) -> bool { + if matches!(kw, Keyword::OFFSET) { + return true; + } + self.inner.is_table_alias(kw, parser) + } + + // ── PR2 override ──────────────────────────────────────────────────────── + // + // Hive's `DIV` operator (BIGINT integer division) is documented at + // + // but sqlparser-rs only registers `DIV` parsing inside MySqlDialect. We + // mirror that implementation here so `a DIV b` parses as a BinaryOp + // identical to MySQL's `MyIntegerDivide`. + fn parse_infix( + &self, + parser: &mut Parser, + expr: &Expr, + _precedence: u8, + ) -> Option> { + if parser.parse_keyword(Keyword::DIV) { + let rhs = match parser.parse_expr() { + Ok(e) => e, + Err(e) => return Some(Err(e)), + }; + return Some(Ok(Expr::BinaryOp { + left: Box::new(expr.clone()), + op: BinaryOperator::MyIntegerDivide, + right: Box::new(rhs), + })); + } + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + use sqlparser::parser::Parser; + + fn parse_ok(sql: &str) -> bool { + Parser::parse_sql(&FlowscopeHiveDialect::default(), sql).is_ok() + } + + #[test] + fn struct_with_named_fields_parses() { + // The exact pattern that triggered auditId=2479 / 2482 / 2571. + assert!(parse_ok("SELECT STRUCT(a AS w, b AS l) FROM t")); + } + + #[test] + fn collect_list_with_struct_parses() { + // collect_list(struct(...)) is the real-world usage from + // dwd_conan_course_lifecycle_detail_da.sql (id=2571). 
+ assert!(parse_ok( + "SELECT collect_list(struct(ldap, cast(d AS INT) AS duration, c)) FROM t" + )); + } + + #[test] + fn upstream_hive_still_fails_struct_named() { + // Sanity guard: the upstream dialect must still fail; if this assertion + // breaks, the override has become redundant and can be removed. + assert!(Parser::parse_sql(&HiveDialect {}, "SELECT STRUCT(a AS w) FROM t").is_err()); + } + + #[test] + fn nested_struct_with_function_parses() { + // From id=2479 / 2482: + // TRANSFORM(ARRAY_SORT(COLLECT_LIST(STRUCT(week AS w, label AS l))), x -> x.l) + assert!(parse_ok( + "SELECT TRANSFORM(\ + ARRAY_SORT(COLLECT_LIST(STRUCT(week AS w, label AS l))),\ + x -> x.l\ + ) FROM t" + )); + } + + // ── PR2 (DIV operator) tests ───────────────────────────────────────────── + + #[test] + fn integer_division_with_div_operator_parses() { + // From id=2568: + // CAST(shumei_timestamp AS BIGINT) DIV 1000 + assert!(parse_ok("SELECT x DIV 1000 FROM t")); + assert!(parse_ok("SELECT CAST(x AS BIGINT) DIV 1000 AS r FROM t")); + } + + #[test] + fn upstream_hive_still_fails_div() { + assert!(Parser::parse_sql(&HiveDialect {}, "SELECT x DIV 1000 FROM t").is_err()); + } + + // ── PR3 (OFFSET as table alias) tests ──────────────────────────────────── + + #[test] + fn offset_keyword_as_table_alias_parses() { + // From id=2482: + // CROSS JOIN (SELECT 0 AS offset_val UNION ALL SELECT 1) offset + assert!(parse_ok( + "SELECT * FROM t \ + CROSS JOIN (SELECT 0 AS v UNION ALL SELECT 1) offset \ + WHERE t.x = offset.v" + )); + } + + #[test] + fn upstream_hive_still_fails_offset_alias() { + assert!(Parser::parse_sql( + &HiveDialect {}, + "SELECT * FROM t CROSS JOIN (SELECT 0 AS v) offset WHERE t.x = offset.v" + ) + .is_err()); + } + + #[test] + fn select_keyword_still_rejected_as_table_alias() { + // Sanity guard: the override only loosens OFFSET, not everything. 
+ assert!(Parser::parse_sql( + &FlowscopeHiveDialect::default(), + "SELECT * FROM t CROSS JOIN (SELECT 1) select" + ) + .is_err()); + } +} diff --git a/crates/flowscope-core/src/dialect_ext/mod.rs b/crates/flowscope-core/src/dialect_ext/mod.rs new file mode 100644 index 00000000..87775492 --- /dev/null +++ b/crates/flowscope-core/src/dialect_ext/mod.rs @@ -0,0 +1,16 @@ +//! FlowScope-specific extensions to sqlparser-rs dialects. +//! +//! sqlparser-rs ships official dialect implementations that we generally rely +//! on as-is. When we discover real-world SQL that fails to parse due to a +//! missing trait override, we add a thin wrapper here that delegates to the +//! upstream dialect but enables the additional behavior. +//! +//! Each wrapper must: +//! - Keep upstream behavior by delegating non-overridden methods. +//! - Document each override with a comment + spec reference. +//! - Provide a unit test demonstrating both the failure on upstream and the +//! success on the wrapper. + +pub mod flowscope_hive; + +pub use flowscope_hive::FlowscopeHiveDialect; diff --git a/crates/flowscope-core/src/lib.rs b/crates/flowscope-core/src/lib.rs index eb5298ed..861ad4c5 100644 --- a/crates/flowscope-core/src/lib.rs +++ b/crates/flowscope-core/src/lib.rs @@ -1,5 +1,6 @@ pub mod analyzer; pub mod completion; +pub mod dialect_ext; pub mod error; pub mod extractors; pub mod generated; diff --git a/crates/flowscope-core/src/types/request.rs b/crates/flowscope-core/src/types/request.rs index b423c230..c4a862ac 100644 --- a/crates/flowscope-core/src/types/request.rs +++ b/crates/flowscope-core/src/types/request.rs @@ -109,10 +109,11 @@ pub enum Dialect { impl Dialect { pub fn to_sqlparser_dialect(&self) -> Box { + use crate::dialect_ext::FlowscopeHiveDialect; use sqlparser::dialect::{ AnsiDialect, BigQueryDialect, ClickHouseDialect, DatabricksDialect, DuckDbDialect, - GenericDialect, HiveDialect, MsSqlDialect, MySqlDialect, OracleDialect, - PostgreSqlDialect, RedshiftSqlDialect, 
SQLiteDialect, SnowflakeDialect, + GenericDialect, MsSqlDialect, MySqlDialect, OracleDialect, PostgreSqlDialect, + RedshiftSqlDialect, SQLiteDialect, SnowflakeDialect, }; match self { Self::Generic => Box::new(GenericDialect {}), @@ -121,7 +122,10 @@ impl Dialect { Self::Clickhouse => Box::new(ClickHouseDialect {}), Self::Databricks => Box::new(DatabricksDialect {}), Self::Duckdb => Box::new(DuckDbDialect {}), - Self::Hive => Box::new(HiveDialect {}), + // Use FlowScope's enhanced Hive dialect that fixes gaps in + // sqlparser-rs's upstream HiveDialect (STRUCT named fields, + // DIV operator, ...). See dialect_ext/flowscope_hive.rs. + Self::Hive => Box::new(FlowscopeHiveDialect::default()), Self::Mssql => Box::new(MsSqlDialect {}), Self::Mysql => Box::new(MySqlDialect {}), Self::Oracle => Box::new(OracleDialect {}), diff --git a/packages/react/src/components/MatrixView.tsx b/packages/react/src/components/MatrixView.tsx index 7ab7b1f9..b7d639aa 100644 --- a/packages/react/src/components/MatrixView.tsx +++ b/packages/react/src/components/MatrixView.tsx @@ -30,6 +30,7 @@ import { BarChart2, ScanLine, Loader2, + Layers, } from 'lucide-react'; import { useLineage } from '../store'; import type { MatrixSubMode } from '../types'; @@ -49,6 +50,9 @@ import { type ScriptDependency, type MatrixCellData, type MatrixData, + type MatrixMetrics, + collapseCteFromMatrix, + computeMatrixMetrics, } from '../utils/matrixUtils'; import { buildMatrixInWorker, cancelPendingMatrixBuilds } from '../utils/matrixWorkerService'; @@ -62,14 +66,7 @@ interface MatrixWorkerPayload { tableItemsRendered: number; scriptItemCount: number; scriptItemsRendered: number; -} - -interface MatrixMetrics { - rowCounts: Map; - colCounts: Map; - maxRow: number; - maxCol: number; - maxIntensity: number; + cteItemKeys: string[]; } const EMPTY_MATRIX: MatrixData = { items: [], cells: new Map() }; @@ -248,6 +245,10 @@ const MatrixCell = memo( return { backgroundColor: color }; }, [heatmapMode, hasDependency, 
intensity, cellData.type]); + const isIndirect = + (cellData.type === 'write' || cellData.type === 'read') && + (cellData.details as TableDependencyWithDetails | undefined)?.indirect === true; + const content = useMemo(() => { switch (cellData.type) { case 'self': @@ -255,19 +256,30 @@ const MatrixCell = memo( case 'write': return ( ); case 'read': return ( - + ); case 'none': default: return null; } - }, [cellData.type]); + }, [cellData.type, isIndirect]); const tooltipContent = useMemo(() => { const displayRowName = getShortName(rowName); @@ -351,6 +363,8 @@ const MatrixCell = memo( // Standard Tooltip if (subMode === 'tables') { const details = cellData.details as TableDependencyWithDetails | undefined; + const indirect = details?.indirect === true; + const viaCtes = details?.viaCtes ?? []; return (
@@ -358,7 +372,21 @@ const MatrixCell = memo( {isWrite ? displayColName : displayRowName}
- {details && details.columnCount > 0 && ( + {indirect && ( +
+ Indirect + {viaCtes.length > 0 && ( + + {' '} + via {viaCtes.length} CTE hop{viaCtes.length > 1 ? 's' : ''}:{' '} + + {viaCtes.map(getShortName).join(' → ')} + + + )} +
+ )} + {details && details.columnCount > 0 && !indirect && (
{details.columnCount} column {details.columnCount > 1 ? 's' : ''} mapped @@ -449,6 +477,13 @@ export interface MatrixViewControlledState { focusedNode: string | null; firstColumnWidth: number; headerHeight: number; + /** + * Hide CTE rows/columns in the Tables sub-mode and reconstruct physical-to-physical + * dependencies via transitive closure. Defaults to true because CTE aliases are + * scope-local and dilute the matrix signal for cross-script comparison. + * No effect in Scripts sub-mode. + */ + hideCte: boolean; } interface MatrixViewProps { @@ -652,6 +687,12 @@ export function MatrixView({ onStateChange, DEFAULT_HEADER_HEIGHT ); + const [hideCte, setHideCte] = useImmediateControlledMatrixState( + 'hideCte', + controlledState, + onStateChange, + true + ); const debouncedFilterText = useDebounce(filterText, SEARCH_DEBOUNCE_DELAY); const [hoveredCell, setHoveredCell] = useState<{ row: string; col: string } | null>(null); @@ -769,11 +810,39 @@ export function MatrixView({ }; }, [result]); - const fullMatrixData = useMemo(() => { + const rawMatrixData = useMemo(() => { if (!matrixPayload) return EMPTY_MATRIX; return matrixSubMode === 'tables' ? matrixPayload.tableMatrix : matrixPayload.scriptMatrix; }, [matrixSubMode, matrixPayload]); + const cteItemSet = useMemo(() => { + if (!matrixPayload) return null; + if (matrixPayload.cteItemKeys.length === 0) return null; + return new Set(matrixPayload.cteItemKeys); + }, [matrixPayload]); + + // Apply CTE collapse only in Tables sub-mode; Scripts items are file paths and + // never overlap with CTE keys, so the toggle has no effect there. + const fullMatrixData = useMemo(() => { + if (matrixSubMode !== 'tables' || !hideCte || !cteItemSet) { + return rawMatrixData; + } + const start = MATRIX_DEBUG ? 
performance.now() : 0; + const collapsed = collapseCteFromMatrix(rawMatrixData, cteItemSet); + if (MATRIX_DEBUG) { + const duration = performance.now() - start; + if (duration > 8) { + console.log(`[MatrixView] collapseCteFromMatrix: ${duration.toFixed(1)}ms`); + } + } + return collapsed; + }, [rawMatrixData, cteItemSet, hideCte, matrixSubMode]); + + const hiddenCteCount = useMemo(() => { + if (matrixSubMode !== 'tables' || !hideCte || !cteItemSet) return 0; + return rawMatrixData.items.filter((item) => cteItemSet.has(item)).length; + }, [rawMatrixData, cteItemSet, hideCte, matrixSubMode]); + const allColumnNames = matrixPayload?.allColumnNames ?? []; const limitInfo = useMemo(() => { @@ -816,8 +885,13 @@ export function MatrixView({ maxIntensity: 1, }; } + // When CTE collapse changed the matrix structure, the worker-computed metrics + // refer to a different item set; recompute on the (already small) collapsed view. + if (matrixSubMode === 'tables' && fullMatrixData !== rawMatrixData) { + return computeMatrixMetrics(fullMatrixData, 'tables'); + } return matrixSubMode === 'tables' ? matrixPayload.tableMetrics : matrixPayload.scriptMetrics; - }, [matrixPayload, matrixSubMode]); + }, [matrixPayload, matrixSubMode, fullMatrixData, rawMatrixData]); // Clustering Logic const sortedItems = useMemo(() => { @@ -1321,6 +1395,44 @@ export function MatrixView({ + {/* Hide CTE Toggle (Tables sub-mode only) */} + {matrixSubMode === 'tables' && ( + + + + + + +
Hide CTE Aliases
+
+ Collapse CTE rows/columns and connect physical tables transitively. +
+ {hideCte && hiddenCteCount > 0 && ( +
+ {hiddenCteCount} CTE{hiddenCteCount > 1 ? 's' : ''} hidden, dotted arrows = + indirect dependency. +
+ )} +
+
+
+ )} + {/* Legend Toggle */} {!showLegend && ( @@ -1758,6 +1870,9 @@ export function MatrixView({ {heatmapMode && Heatmap Active} {clusterMode && Sorted by Clusters} {complexityMode && Complexity Margins} + {matrixSubMode === 'tables' && hideCte && hiddenCteCount > 0 && ( + CTE Hidden ({hiddenCteCount}) + )}