From b221b52cfb231f72d377deec344a332fa835b56c Mon Sep 17 00:00:00 2001 From: limit_yan Date: Sat, 9 May 2026 16:31:39 +0800 Subject: [PATCH 1/6] feat(deep-review): enforce adaptive review runtime --- src/apps/cli/src/agent/core_adapter.rs | 1 + src/apps/desktop/src/api/agentic_api.rs | 66 + src/apps/desktop/src/api/git_api.rs | 29 +- src/apps/desktop/src/lib.rs | 22 +- src/apps/server/src/rpc_dispatcher.rs | 33 + src/crates/acp/src/runtime/prompt.rs | 2 + .../agents/prompts/deep_review_agent.md | 37 +- .../prompts/review_architecture_agent.md | 4 + .../prompts/review_business_logic_agent.md | 4 + .../agents/prompts/review_frontend_agent.md | 4 + .../prompts/review_performance_agent.md | 4 + .../prompts/review_quality_gate_agent.md | 4 + .../agents/prompts/review_security_agent.md | 4 + .../core/src/agentic/agents/registry.rs | 13 + .../core/src/agentic/context_profile.rs | 344 ++ .../src/agentic/coordination/coordinator.rs | 344 +- .../src/agentic/coordination/scheduler.rs | 6 + src/crates/core/src/agentic/core/message.rs | 72 + src/crates/core/src/agentic/core/mod.rs | 5 +- .../core/src/agentic/deep_review_policy.rs | 3158 ++++++++++++++++- src/crates/core/src/agentic/events/types.rs | 3 +- .../src/agentic/execution/execution_engine.rs | 455 ++- .../src/agentic/execution/round_executor.rs | 7 +- .../src/agentic/execution/stream_processor.rs | 83 +- .../core/src/agentic/execution/types.rs | 4 + src/crates/core/src/agentic/mod.rs | 2 + .../core/src/agentic/persistence/manager.rs | 3 + .../agentic/session/compression/compressor.rs | 126 +- .../session/compression/fallback/mod.rs | 14 +- .../session/compression/fallback/payload.rs | 35 +- .../session/compression/fallback/render.rs | 19 +- .../session/compression/fallback/tests.rs | 65 +- .../session/compression/fallback/types.rs | 7 +- .../session/compression/microcompact.rs | 341 +- .../src/agentic/session/evidence_ledger.rs | 540 +++ src/crates/core/src/agentic/session/mod.rs | 2 + 
.../src/agentic/session/session_manager.rs | 130 +- .../core/src/agentic/tools/framework.rs | 270 +- .../tools/implementations/bash_tool.rs | 48 + .../tools/implementations/code_review_tool.rs | 1446 +++++++- .../tools/implementations/delete_file_tool.rs | 19 +- .../tools/implementations/file_edit_tool.rs | 7 + .../tools/implementations/file_write_tool.rs | 7 + .../agentic/tools/implementations/git_tool.rs | 43 +- .../implementations/session_message_tool.rs | 1 + .../tools/implementations/task_tool.rs | 1811 +++++++++- .../agentic/tools/pipeline/tool_pipeline.rs | 45 + src/crates/core/src/service/config/types.rs | 11 + src/crates/core/src/service/cron/service.rs | 1 + .../core/src/service/git/git_service.rs | 117 + src/crates/core/src/service/git/git_types.rs | 25 + .../service/remote_connect/remote_server.rs | 1 + src/crates/core/src/service/session/types.rs | 63 + src/crates/core/tests/context_profile.rs | 172 + src/crates/events/src/agentic.rs | 103 +- src/crates/events/src/lib.rs | 3 +- src/crates/transport/src/adapters/tauri.rs | 29 + .../transport/src/adapters/websocket.rs | 27 + 58 files changed, 9984 insertions(+), 257 deletions(-) create mode 100644 src/crates/core/src/agentic/context_profile.rs create mode 100644 src/crates/core/src/agentic/session/evidence_ledger.rs create mode 100644 src/crates/core/tests/context_profile.rs diff --git a/src/apps/cli/src/agent/core_adapter.rs b/src/apps/cli/src/agent/core_adapter.rs index d271853b7..d0f4de04a 100644 --- a/src/apps/cli/src/agent/core_adapter.rs +++ b/src/apps/cli/src/agent/core_adapter.rs @@ -110,6 +110,7 @@ impl Agent for CoreAgentAdapter { self.agent_type.clone(), None, DialogSubmissionPolicy::for_source(DialogTriggerSource::Cli), + None, ) .await?; diff --git a/src/apps/desktop/src/api/agentic_api.rs b/src/apps/desktop/src/api/agentic_api.rs index a9956285b..094ee99a1 100644 --- a/src/apps/desktop/src/api/agentic_api.rs +++ b/src/apps/desktop/src/api/agentic_api.rs @@ -13,6 +13,10 @@ use 
bitfun_core::agentic::coordination::{ SubagentTimeoutAction, }; use bitfun_core::agentic::core::*; +use bitfun_core::agentic::deep_review_policy::{ + apply_deep_review_queue_control, default_review_team_definition, DeepReviewQueueControlAction, + ReviewTeamDefinition, +}; use bitfun_core::agentic::image_analysis::ImageContextData; use bitfun_core::agentic::tools::image_context::get_image_context; #[derive(Debug, Deserialize)] @@ -84,6 +88,8 @@ pub struct StartDialogTurnRequest { pub turn_id: Option, #[serde(default)] pub image_contexts: Option>, + #[serde(default)] + pub user_message_metadata: Option, } #[derive(Debug, Serialize)] @@ -176,6 +182,37 @@ pub struct SteerDialogTurnResponse { pub steering_id: String, } +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ControlDeepReviewQueueRequest { + pub session_id: String, + pub dialog_turn_id: String, + pub tool_id: String, + pub action: ControlDeepReviewQueueActionDTO, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ControlDeepReviewQueueActionDTO { + Pause, + Continue, + Cancel, + SkipOptional, +} + +impl From for DeepReviewQueueControlAction { + fn from(value: ControlDeepReviewQueueActionDTO) -> Self { + match value { + ControlDeepReviewQueueActionDTO::Pause => DeepReviewQueueControlAction::Pause, + ControlDeepReviewQueueActionDTO::Continue => DeepReviewQueueControlAction::Continue, + ControlDeepReviewQueueActionDTO::Cancel => DeepReviewQueueControlAction::Cancel, + ControlDeepReviewQueueActionDTO::SkipOptional => { + DeepReviewQueueControlAction::SkipOptional + } + } + } +} + #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct CancelSessionRequest { @@ -416,6 +453,7 @@ pub async fn start_dialog_turn( workspace_path, turn_id, image_contexts, + user_message_metadata, } = request; let policy = DialogSubmissionPolicy::for_source(DialogTriggerSource::DesktopUi); @@ -439,6 +477,7 @@ pub async fn start_dialog_turn( workspace_path, 
policy, None, + user_message_metadata, resolved_images, ) .await @@ -669,6 +708,28 @@ pub async fn steer_dialog_turn( }) } +#[tauri::command] +pub async fn control_deep_review_queue( + request: ControlDeepReviewQueueRequest, +) -> Result<(), String> { + if request.session_id.trim().is_empty() { + return Err("Missing session_id".to_string()); + } + if request.dialog_turn_id.trim().is_empty() { + return Err("Missing dialog_turn_id".to_string()); + } + if request.tool_id.trim().is_empty() { + return Err("Missing tool_id".to_string()); + } + + apply_deep_review_queue_control( + &request.dialog_turn_id, + &request.tool_id, + request.action.into(), + ); + Ok(()) +} + #[tauri::command] pub async fn cancel_session( coordinator: State<'_, Arc>, @@ -896,6 +957,11 @@ pub async fn get_available_modes(state: State<'_, AppState>) -> Result Result { + Ok(default_review_team_definition()) +} + #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] pub struct ModeInfoDTO { diff --git a/src/apps/desktop/src/api/git_api.rs b/src/apps/desktop/src/api/git_api.rs index 3db1b4b24..024f9351d 100644 --- a/src/apps/desktop/src/api/git_api.rs +++ b/src/apps/desktop/src/api/git_api.rs @@ -3,8 +3,8 @@ use crate::api::app_state::AppState; use bitfun_core::infrastructure::storage::StorageOptions; use bitfun_core::service::git::{ - GitAddParams, GitCommitParams, GitDiffParams, GitLogParams, GitPullParams, GitPushParams, - GitService, + GitAddParams, GitChangedFile, GitChangedFilesParams, GitCommitParams, GitDiffParams, + GitLogParams, GitPullParams, GitPushParams, GitService, }; use bitfun_core::service::git::{ GitBranch, GitCommit, GitOperationResult, GitRepository, GitStatus, @@ -91,6 +91,13 @@ pub struct GitDiffRequest { pub params: GitDiffParams, } +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct GitChangedFilesRequest { + pub repository_path: String, + pub params: GitChangedFilesParams, +} + #[derive(Debug, Deserialize)] #[serde(rename_all = 
"camelCase")] pub struct GitResetFilesRequest { @@ -371,6 +378,24 @@ pub async fn git_get_diff( }) } +#[tauri::command] +pub async fn git_get_changed_files( + _state: State<'_, AppState>, + request: GitChangedFilesRequest, +) -> Result, String> { + info!( + "Getting changed Git files for repository: {}", + request.repository_path + ); + + GitService::get_changed_files(&request.repository_path, &request.params) + .await + .map_err(|e| { + error!("Failed to get changed Git files: {}", e); + e.to_string() + }) +} + #[tauri::command] pub async fn git_reset_files( _state: State<'_, AppState>, diff --git a/src/apps/desktop/src/lib.rs b/src/apps/desktop/src/lib.rs index 72b1dd105..6e8b836ee 100644 --- a/src/apps/desktop/src/lib.rs +++ b/src/apps/desktop/src/lib.rs @@ -546,6 +546,7 @@ pub async fn run() { api::agentic_api::ensure_assistant_bootstrap, api::agentic_api::cancel_dialog_turn, api::agentic_api::steer_dialog_turn, + api::agentic_api::control_deep_review_queue, api::agentic_api::cancel_session, api::agentic_api::set_subagent_timeout, api::agentic_api::delete_session, @@ -557,6 +558,7 @@ pub async fn run() { api::agentic_api::cancel_tool, api::agentic_api::generate_session_title, api::agentic_api::get_available_modes, + api::agentic_api::get_default_review_team_definition, api::btw_api::btw_ask_stream, api::btw_api::btw_cancel, api::editor_ai_api::editor_ai_stream, @@ -677,6 +679,7 @@ pub async fn run() { git_create_branch, git_delete_branch, git_get_diff, + git_get_changed_files, git_reset_files, git_reset_to_commit, git_get_file_content, @@ -1072,12 +1075,29 @@ async fn init_agentic_system() -> anyhow::Result<( tool_pipeline.clone(), )); + // Get execution config from global settings + let exec_config = match bitfun_core::service::config::get_global_config_service().await { + Ok(config_service) => { + match config_service + .get_config::(None) + .await + { + Ok(global_config) => execution::ExecutionEngineConfig { + max_rounds: global_config.ai.max_rounds, + 
..Default::default() + }, + Err(_) => Default::default(), + } + } + Err(_) => Default::default(), + }; + let execution_engine = Arc::new(execution::ExecutionEngine::new( round_executor, event_queue.clone(), session_manager.clone(), context_compressor, - execution::ExecutionEngineConfig::default(), + exec_config, )); let coordinator = Arc::new(coordination::ConversationCoordinator::new( diff --git a/src/apps/server/src/rpc_dispatcher.rs b/src/apps/server/src/rpc_dispatcher.rs index 068f4f6e7..cc71a040c 100644 --- a/src/apps/server/src/rpc_dispatcher.rs +++ b/src/apps/server/src/rpc_dispatcher.rs @@ -9,6 +9,9 @@ use anyhow::{Result, anyhow}; use bitfun_core::agentic::agents::SubAgentSource; use bitfun_core::agentic::coordination::{DialogSubmissionPolicy, DialogTriggerSource}; use bitfun_core::agentic::core::SessionConfig; +use bitfun_core::agentic::deep_review_policy::{ + DeepReviewQueueControlAction, apply_deep_review_queue_control, +}; use bitfun_core::service::config::types::SubAgentConfig; use bitfun_core::service::i18n::{LocaleId, LocaleMetadata, sync_global_i18n_service_locale}; use std::collections::HashMap; @@ -380,6 +383,36 @@ pub async fn dispatch( .map_err(|e| anyhow!("{}", e))?; Ok(serde_json::json!({ "success": true })) } + "control_deep_review_queue" => { + let request = extract_request(¶ms)?; + let session_id = get_string(&request, "sessionId")?; + let dialog_turn_id = get_string(&request, "dialogTurnId")?; + let tool_id = get_string(&request, "toolId")?; + let action_raw = get_string(&request, "action")?; + let action = match action_raw.as_str() { + "pause" => DeepReviewQueueControlAction::Pause, + "continue" => DeepReviewQueueControlAction::Continue, + "cancel" => DeepReviewQueueControlAction::Cancel, + "skip_optional" => DeepReviewQueueControlAction::SkipOptional, + other => { + return Err(anyhow!( + "Invalid DeepReview queue control action: {}", + other + )); + } + }; + if session_id.trim().is_empty() { + return Err(anyhow!("Missing sessionId")); + 
} + if dialog_turn_id.trim().is_empty() { + return Err(anyhow!("Missing dialogTurnId")); + } + if tool_id.trim().is_empty() { + return Err(anyhow!("Missing toolId")); + } + apply_deep_review_queue_control(&dialog_turn_id, &tool_id, action); + Ok(serde_json::json!({ "success": true })) + } "cancel_session" => { let request = extract_request(¶ms)?; let session_id = get_string(&request, "sessionId")?; diff --git a/src/crates/acp/src/runtime/prompt.rs b/src/crates/acp/src/runtime/prompt.rs index cdea66fa9..9dba30b15 100644 --- a/src/crates/acp/src/runtime/prompt.rs +++ b/src/crates/acp/src/runtime/prompt.rs @@ -51,6 +51,7 @@ impl BitfunAcpRuntime { acp_session.mode_id.clone(), Some(acp_session.cwd.clone()), DialogSubmissionPolicy::for_source(DialogTriggerSource::Cli), + None, ) .await .map_err(Self::internal_error)?; @@ -66,6 +67,7 @@ impl BitfunAcpRuntime { acp_session.mode_id.clone(), Some(acp_session.cwd.clone()), DialogSubmissionPolicy::for_source(DialogTriggerSource::Cli), + None, ) .await .map_err(Self::internal_error)?; diff --git a/src/crates/core/src/agentic/agents/prompts/deep_review_agent.md b/src/crates/core/src/agentic/agents/prompts/deep_review_agent.md index ea2338336..1b2058591 100644 --- a/src/crates/core/src/agentic/agents/prompts/deep_review_agent.md +++ b/src/crates/core/src/agentic/agents/prompts/deep_review_agent.md @@ -27,6 +27,8 @@ The user request may also include a **configured team manifest** with additional The configured manifest may also include an **execution policy** with reviewer timeout, judge timeout, a team review strategy, per-reviewer strategy overrides, preferred reviewer `model_id` values, prompt directives, and file-split parameters. Treat that policy and roster as authoritative. +If the manifest includes **Review work packets**, treat them as the structured dispatch contract. 
Each packet defines the reviewer, assigned scope, allowed tools, timeout, required output fields, model, and prompt directive for one reviewer or judge task. Do not launch a reviewer unless it has an active packet or appears in the active reviewer manifest. + ### File splitting for large review targets When the review target contains many files, running a single reviewer instance per role may cause timeouts or shallow coverage. The execution policy provides two fields to control this: @@ -38,7 +40,7 @@ When the file count exceeds `reviewer_file_split_threshold` and `max_same_role_i 1. Divide the file list into roughly equal groups (one group per same-role instance, up to `max_same_role_instances`). 2. Launch multiple Task calls with the **same `subagent_type`** in the **same parallel message**, each assigned a distinct file group. -3. In each Task `description`, include a group identifier so the user can track them in the UI (e.g. "Security review [group 1/3]", "Security review [group 2/3]"). +3. In each Task `description`, include a group identifier and packet id so the user and judge can track them in the UI (e.g. "Security review [group 1/3] [packet reviewer:ReviewSecurity:group-1-of-3]", "Security review [group 2/3] [packet reviewer:ReviewSecurity:group-2-of-3]"). 4. In each reviewer Task `prompt`, clearly state which files this instance is responsible for and that it should **not** inspect files outside its assigned group unless a cross-file dependency is strongly suspected. All same-role instances from a single split must be launched in the **same assistant message** to maximize parallelism. @@ -83,6 +85,7 @@ You MUST NOT: Track one reviewer record for every reviewer that was scheduled. 
Use these status labels conservatively: - `completed` +- `partial_timeout` - `timed_out` - `cancelled_by_user` - `failed` @@ -92,6 +95,11 @@ If a reviewer or the judge fails, times out, or is cancelled: - keep going with the remaining evidence - record the status in `reviewers` +- if the Task result reports `partial_timeout`, copy the useful partial text into `reviewers[].partial_output` and summarize the confidence impact in `report_sections.coverage_notes` +- if the reviewer reports its packet id, copy it into `reviewers[].packet_id` and set `reviewers[].packet_status_source = "reported"` +- if the reviewer omits `packet_id` but the Task was launched from a work packet, infer `reviewers[].packet_id` from the Task description or the matching work packet and set `reviewers[].packet_status_source = "inferred"` +- if no packet id can be reported or inferred, set `reviewers[].packet_status_source = "missing"` and summarize the confidence impact in `report_sections.coverage_notes` +- retry a failed or timed-out reviewer only when useful evidence is missing, and only within the configured retry budget; retry the same `subagent_type` with `retry = true`, a reduced scope, a downgraded strategy when possible, and a shorter timeout - lower confidence as needed - never drop the final report just because one subagent stopped @@ -124,8 +132,11 @@ If a configured reviewer entry provides `model_id`, pass `model_id` with that va If the configured team manifest provides a preferred display label or nickname for a reviewer, reuse that nickname in the Task `description` so the user can easily track each reviewer in the session UI. +Every reviewer Task `description` should also include the work packet id in square brackets, for example `Security review [packet reviewer:ReviewSecurity]` or `Security review [group 1/3] [packet reviewer:ReviewSecurity:group-1-of-3]`. This gives the judge a deterministic fallback when the reviewer forgets to echo `packet_id`. 
+ Each reviewer Task prompt must include: +- the matching work packet verbatim, including `packet_id`, `assigned_scope`, `allowed_tools`, `timeout_seconds`, and `required_output_fields` - the exact review target (for split instances: the assigned file group only) - any user-provided focus text - the reviewer-specific strategy from the configured manifest (`quick`, `normal`, or `deep`) and its exact `prompt_directive` @@ -133,7 +144,9 @@ Each reviewer Task prompt must include: - a request for concrete findings only - a strict output format that is easy to verify later - for split instances: an explicit list of the files this instance is responsible for, and an instruction not to review files outside the assigned group unless a cross-file dependency is critical -- if `reviewer_timeout_seconds > 0`, a time-awareness reminder: "You have a strict timeout. Prioritize: (1) Inspect the diff first, then read only files the diff directly references. (2) Confirm or dismiss each hypothesis before opening a new investigation path. (3) Write your findings early — a partial report with confirmed findings is more valuable than no report at all." +- an instruction to echo the work packet `packet_id` and set `status` in the response +- an instruction that missing `packet_id` will be inferred by the parent only as a lower-confidence fallback, not treated as a successful reported packet +- if `reviewer_timeout_seconds > 0`, a time-awareness reminder: "You have a strict timeout. Prioritize: (1) Inspect the diff first, then read only files the diff directly references. (2) Confirm or dismiss each hypothesis before opening a new investigation path. (3) Write your findings early; a partial report with confirmed findings is more valuable than no report at all." 
Strategy guidance (fallback only; the configured `prompt_directive` is the source of truth): @@ -161,6 +174,7 @@ Role-specific strategy amplification (append to the reviewer Task prompt when th After the reviewer batch finishes, launch `ReviewJudge` with: +- the matching judge work packet verbatim - the same review target - the full reviewer outputs from every reviewer that ran, including timeout/cancel/failure notes - if file splitting was used, include outputs from **all** same-role instances and label each by group (e.g. "Security Reviewer [group 1/3]") @@ -179,6 +193,8 @@ The judge must explicitly call out: - likely false positives - optimization advice that is too risky or directionally wrong - findings where the reviewer's evidence does not support their conclusion +- reviewer outputs that are missing `packet_id` or `status`; treat those as lower confidence rather than discarding the whole review +- reviewer outputs whose packet id was inferred from scheduling metadata rather than reported by the reviewer - which findings should survive into the final report ### Phase 4: Report and wait for user approval @@ -187,7 +203,14 @@ After the quality gate finishes: 1. Submit the final structured report via `submit_code_review`. 2. Include all validated findings, unresolved items, and concrete next steps in `remediation_plan`. -3. When enough information exists, also populate `report_sections` so the UI can present a compact, multi-dimensional report: +3. For each `reviewers[]` entry, include `packet_id` when reported or inferable and set `packet_status_source` to `reported`, `inferred`, or `missing`. +4. Populate `reliability_signals` with structured status signals when relevant: + - `context_pressure`: large target, constrained token budget, or reduced fan-out affected coverage. + - `compression_preserved`: compression or compaction preserved key facts used in the final decision. 
+ - `partial_reviewer`: one or more reviewers timed out or were cancelled after producing useful partial evidence. + - `user_decision`: an item needs user/product judgment before remediation. + Use `severity = "info" | "warning" | "action"`, include `count` when useful, and set `source = "runtime" | "manifest" | "report" | "inferred"`. +5. When enough information exists, also populate `report_sections` so the UI can present a compact, multi-dimensional report: - `executive_summary`: 1-3 concise bullets with the final decision and most important risk. - `remediation_groups.must_fix`: required correctness/security/regression fixes. - `remediation_groups.should_improve`: non-blocking cleanup or quality improvements. @@ -200,8 +223,8 @@ After the quality gate finishes: - `remediation_groups.verification`: focused verification or follow-up review steps. - `strength_groups`: positive observations grouped under `architecture`, `maintainability`, `tests`, `security`, `performance`, `user_experience`, or `other`. - `coverage_notes`: confidence, timeout/cancel/failure, scope, or manual follow-up notes. -4. Do **not** modify any files during the review phase. -5. Wait for explicit user approval before starting any remediation work. +6. Do **not** modify any files during the review phase. +7. Wait for explicit user approval before starting any remediation work. 
### Phase 5: Remediation (only when explicitly instructed) @@ -224,7 +247,11 @@ Your structured result MUST include: - `review_mode = "deep"` - `review_scope` - `reviewers` with one entry for every reviewer that was scheduled, including optional extra reviewers and the judge when relevant +- `reviewers[].packet_id` when reported by the reviewer or inferable from the scheduled packet +- `reviewers[].packet_status_source` as `reported`, `inferred`, or `missing` +- for a timed-out reviewer with captured output, set `status = "partial_timeout"` and include the captured evidence in `partial_output` - `remediation_plan` with concrete next steps, including unresolved items or manual follow-up when needed +- `reliability_signals` with structured context pressure, compression preservation, partial reviewer, and user decision signals when any of those apply - `report_sections` when the final report has enough content to split remediation, strengths, and coverage into the dimensions above Issue writing rules: diff --git a/src/crates/core/src/agentic/agents/prompts/review_architecture_agent.md b/src/crates/core/src/agentic/agents/prompts/review_architecture_agent.md index adb873d7d..7ee303213 100644 --- a/src/crates/core/src/agentic/agents/prompts/review_architecture_agent.md +++ b/src/crates/core/src/agentic/agents/prompts/review_architecture_agent.md @@ -59,6 +59,10 @@ Never modify files or git state. Return markdown only, using this exact structure: +## Packet +packet_id: +status: completed + ## Reviewer Architecture Reviewer diff --git a/src/crates/core/src/agentic/agents/prompts/review_business_logic_agent.md b/src/crates/core/src/agentic/agents/prompts/review_business_logic_agent.md index c05c7e1b7..0669ed784 100644 --- a/src/crates/core/src/agentic/agents/prompts/review_business_logic_agent.md +++ b/src/crates/core/src/agentic/agents/prompts/review_business_logic_agent.md @@ -58,6 +58,10 @@ Never modify files or git state. 
Return markdown only, using this exact structure: +## Packet +packet_id: +status: completed + ## Reviewer Business Logic Reviewer diff --git a/src/crates/core/src/agentic/agents/prompts/review_frontend_agent.md b/src/crates/core/src/agentic/agents/prompts/review_frontend_agent.md index 4e868efa3..e3a180a1c 100644 --- a/src/crates/core/src/agentic/agents/prompts/review_frontend_agent.md +++ b/src/crates/core/src/agentic/agents/prompts/review_frontend_agent.md @@ -64,6 +64,10 @@ Never modify files or git state. Return markdown only, using this exact structure: +## Packet +packet_id: +status: completed + ## Reviewer Frontend Reviewer diff --git a/src/crates/core/src/agentic/agents/prompts/review_performance_agent.md b/src/crates/core/src/agentic/agents/prompts/review_performance_agent.md index 0cfb81f63..719e29fd7 100644 --- a/src/crates/core/src/agentic/agents/prompts/review_performance_agent.md +++ b/src/crates/core/src/agentic/agents/prompts/review_performance_agent.md @@ -59,6 +59,10 @@ Never modify files or git state. Return markdown only, using this exact structure: +## Packet +packet_id: +status: completed + ## Reviewer Performance Reviewer diff --git a/src/crates/core/src/agentic/agents/prompts/review_quality_gate_agent.md b/src/crates/core/src/agentic/agents/prompts/review_quality_gate_agent.md index 2668d243e..d95cd6893 100644 --- a/src/crates/core/src/agentic/agents/prompts/review_quality_gate_agent.md +++ b/src/crates/core/src/agentic/agents/prompts/review_quality_gate_agent.md @@ -70,6 +70,10 @@ Never modify files or git state. 
Return markdown only, using this exact structure: +## Packet +packet_id: +status: completed + ## Reviewer Review Quality Inspector diff --git a/src/crates/core/src/agentic/agents/prompts/review_security_agent.md b/src/crates/core/src/agentic/agents/prompts/review_security_agent.md index 02b111374..3cf7b2e5d 100644 --- a/src/crates/core/src/agentic/agents/prompts/review_security_agent.md +++ b/src/crates/core/src/agentic/agents/prompts/review_security_agent.md @@ -59,6 +59,10 @@ Never modify files or git state. Return markdown only, using this exact structure: +## Packet +packet_id: +status: completed + ## Reviewer Security Reviewer diff --git a/src/crates/core/src/agentic/agents/registry.rs b/src/crates/core/src/agentic/agents/registry.rs index 7e44df435..c80fce4ab 100644 --- a/src/crates/core/src/agentic/agents/registry.rs +++ b/src/crates/core/src/agentic/agents/registry.rs @@ -1362,6 +1362,19 @@ mod tests { } } + #[tokio::test] + async fn frontend_reviewer_is_registered_as_review_subagent() { + let registry = AgentRegistry::new(); + let subagents = registry.get_subagents_info(None).await; + let frontend = subagents + .iter() + .find(|agent| agent.id == "ReviewFrontend") + .expect("ReviewFrontend should be registered as a subagent"); + + assert!(frontend.is_review); + assert!(frontend.is_readonly); + } + #[test] fn built_in_deep_review_reviewers_are_marked_as_review_agents() { let registry = AgentRegistry::new(); diff --git a/src/crates/core/src/agentic/context_profile.rs b/src/crates/core/src/agentic/context_profile.rs new file mode 100644 index 000000000..4a98e2d09 --- /dev/null +++ b/src/crates/core/src/agentic/context_profile.rs @@ -0,0 +1,344 @@ +//! Adaptive context profile policy. +//! +//! Profiles keep context behavior aligned with the shape of the agent workload +//! without exposing more knobs to the UI. 
+ +use crate::agentic::session::compression::microcompact::MicrocompactConfig; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ContextProfile { + LongTask, + Conversation, +} + +impl ContextProfile { + pub fn for_agent_type(agent_type: &str) -> Self { + Self::for_agent_context(agent_type, false) + } + + pub fn for_agent_context(agent_type: &str, is_review_subagent: bool) -> Self { + if is_review_subagent || is_long_task_agent(agent_type) { + Self::LongTask + } else { + Self::Conversation + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ModelCapabilityProfile { + Standard, + Weak, +} + +impl ModelCapabilityProfile { + pub fn from_model_id(model_id: Option<&str>) -> Self { + let Some(model_id) = model_id.map(str::trim).filter(|id| !id.is_empty()) else { + return Self::Standard; + }; + let normalized = model_id.to_ascii_lowercase(); + if matches!(normalized.as_str(), "auto" | "fast" | "primary") { + return Self::Standard; + } + + // Weak model detection: match suffix-based markers (e.g., "gpt-4o-mini", + // "gemini-1.5-flash") and exact markers (e.g., "haiku", "mini"). + // Avoid false positives from substring matches (e.g., "gemini-pro" should + // NOT match "mini" inside "gemini"). + let weak_suffixes = ["-haiku", "-mini", "-small", "-lite", "-flash", "-nano"]; + let weak_exact = ["haiku", "mini", "small", "lite", "flash", "nano"]; + // Also match known weak model name patterns where the marker appears + // mid-string but is a genuine weak model (e.g., "claude-3-haiku-20240307"). 
+ let weak_mid_patterns = [ + "-haiku-", "-mini-", "-small-", "-lite-", "-flash-", "-nano-", + ]; + if weak_suffixes.iter().any(|s| normalized.ends_with(s)) + || weak_exact.iter().any(|e| normalized == *e) + || weak_mid_patterns.iter().any(|p| normalized.contains(p)) + { + Self::Weak + } else { + Self::Standard + } + } + + pub fn from_resolved_model(resolved_model_id: &str, provider_model_name: &str) -> Self { + let resolved = Self::from_model_id(Some(resolved_model_id)); + if resolved == Self::Weak { + resolved + } else { + Self::from_model_id(Some(provider_model_name)) + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct ContextProfilePolicy { + pub profile: ContextProfile, + pub microcompact_keep_recent: usize, + pub microcompact_trigger_ratio: f32, + pub compression_contract_limit: usize, + pub subagent_concurrency_cap: usize, + pub repeated_tool_signature_threshold: usize, + pub consecutive_failed_command_threshold: usize, +} + +impl ContextProfilePolicy { + pub fn for_agent_context( + agent_type: &str, + is_review_subagent: bool, + model_capability: ModelCapabilityProfile, + ) -> Self { + let profile = ContextProfile::for_agent_context(agent_type, is_review_subagent); + let mut policy = match profile { + ContextProfile::LongTask => Self::long_task(), + ContextProfile::Conversation => Self::conversation(), + }; + + if model_capability == ModelCapabilityProfile::Weak { + policy.apply_weak_model_override(); + } + + policy + } + + pub fn for_agent_context_and_model( + agent_type: &str, + is_review_subagent: bool, + resolved_model_id: &str, + provider_model_name: &str, + ) -> Self { + Self::for_agent_context( + agent_type, + is_review_subagent, + ModelCapabilityProfile::from_resolved_model(resolved_model_id, provider_model_name), + ) + } + + pub fn for_subagent_context_and_models( + agent_type: &str, + is_review_subagent: bool, + subagent_model_id: Option<&str>, + parent_agent_type: Option<&str>, + parent_is_review_subagent: bool, + parent_model_id: 
Option<&str>, + ) -> Self { + let child_profile = ContextProfile::for_agent_context(agent_type, is_review_subagent); + let parent_profile = parent_agent_type + .map(|agent_type| { + ContextProfile::for_agent_context(agent_type, parent_is_review_subagent) + }) + .unwrap_or(ContextProfile::Conversation); + let profile = if child_profile == ContextProfile::LongTask + || parent_profile == ContextProfile::LongTask + { + ContextProfile::LongTask + } else { + ContextProfile::Conversation + }; + let model_capability = subagent_model_id + .map(str::trim) + .filter(|model_id| !model_id.is_empty()) + .map(|model_id| ModelCapabilityProfile::from_model_id(Some(model_id))) + .or_else(|| { + parent_model_id + .map(str::trim) + .filter(|model_id| !model_id.is_empty()) + .map(|model_id| ModelCapabilityProfile::from_model_id(Some(model_id))) + }) + .unwrap_or(ModelCapabilityProfile::Standard); + + let mut policy = match profile { + ContextProfile::LongTask => Self::long_task(), + ContextProfile::Conversation => Self::conversation(), + }; + if model_capability == ModelCapabilityProfile::Weak { + policy.apply_weak_model_override(); + } + policy + } + + pub fn microcompact_config(&self) -> MicrocompactConfig { + MicrocompactConfig { + keep_recent: self.microcompact_keep_recent, + trigger_ratio: self.microcompact_trigger_ratio, + } + } + + pub fn effective_subagent_max_concurrency(&self, configured: usize) -> usize { + configured.clamp(1, self.subagent_concurrency_cap) + } + + pub fn effective_loop_threshold(&self, configured: usize) -> usize { + configured + .max(1) + .min(self.repeated_tool_signature_threshold.max(1)) + } + + pub fn has_repeated_tool_loop(&self, repeated_tool_signature_count: usize) -> bool { + repeated_tool_signature_count >= self.repeated_tool_signature_threshold.max(1) + } + + pub fn has_consecutive_command_failure_loop(&self, consecutive_failed_commands: usize) -> bool { + consecutive_failed_commands >= self.consecutive_failed_command_threshold.max(1) + } + + fn 
long_task() -> Self { + let default_microcompact = MicrocompactConfig::default(); + Self { + profile: ContextProfile::LongTask, + microcompact_keep_recent: default_microcompact.keep_recent, + microcompact_trigger_ratio: default_microcompact.trigger_ratio, + compression_contract_limit: 8, + subagent_concurrency_cap: 5, + repeated_tool_signature_threshold: 3, + consecutive_failed_command_threshold: 2, + } + } + + fn conversation() -> Self { + Self { + profile: ContextProfile::Conversation, + microcompact_keep_recent: 12, + microcompact_trigger_ratio: 0.65, + compression_contract_limit: 4, + subagent_concurrency_cap: 2, + repeated_tool_signature_threshold: 4, + consecutive_failed_command_threshold: 3, + } + } + + fn apply_weak_model_override(&mut self) { + self.microcompact_keep_recent = self.microcompact_keep_recent.min(8); + self.compression_contract_limit = self.compression_contract_limit.min(4); + self.subagent_concurrency_cap = self.subagent_concurrency_cap.min(2); + self.repeated_tool_signature_threshold = self.repeated_tool_signature_threshold.min(2); + self.consecutive_failed_command_threshold = + self.consecutive_failed_command_threshold.min(2); + } +} + +fn is_long_task_agent(agent_type: &str) -> bool { + matches!( + agent_type, + "agentic" | "DeepReview" | "DeepResearch" | "ComputerUse" | "Team" + ) || agent_type.starts_with("Review") +} + +#[cfg(test)] +mod tests { + use super::ModelCapabilityProfile; + + #[test] + fn model_capability_standard_for_empty_or_none() { + assert_eq!( + ModelCapabilityProfile::from_model_id(None), + ModelCapabilityProfile::Standard + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("")), + ModelCapabilityProfile::Standard + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some(" ")), + ModelCapabilityProfile::Standard + ); + } + + #[test] + fn model_capability_standard_for_strong_models() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("gpt-4o")), + ModelCapabilityProfile::Standard + ); + 
assert_eq!( + ModelCapabilityProfile::from_model_id(Some("claude-sonnet-4")), + ModelCapabilityProfile::Standard + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("gemini-pro")), + ModelCapabilityProfile::Standard + ); + } + + #[test] + fn model_capability_weak_for_haiku() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("claude-3-haiku-20240307")), + ModelCapabilityProfile::Weak + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("anthropic/claude-3-haiku")), + ModelCapabilityProfile::Weak + ); + } + + #[test] + fn model_capability_weak_for_mini() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("gpt-4o-mini")), + ModelCapabilityProfile::Weak + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("openai/gpt-4o-mini")), + ModelCapabilityProfile::Weak + ); + } + + #[test] + fn model_capability_weak_for_flash() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("gemini-1.5-flash")), + ModelCapabilityProfile::Weak + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("google/gemini-flash")), + ModelCapabilityProfile::Weak + ); + } + + #[test] + fn model_capability_weak_for_lite() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("qwen-lite")), + ModelCapabilityProfile::Weak + ); + } + + #[test] + fn model_capability_weak_for_small() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("llama-small")), + ModelCapabilityProfile::Weak + ); + } + + #[test] + fn model_capability_weak_for_nano() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("gemini-nano")), + ModelCapabilityProfile::Weak + ); + } + + #[test] + fn model_capability_from_resolved_model_prefers_resolved() { + // resolved is weak → returns weak regardless of provider name + assert_eq!( + ModelCapabilityProfile::from_resolved_model("gpt-4o-mini", "gpt-4o"), + ModelCapabilityProfile::Weak + ); + // resolved is standard, provider is weak → returns weak + assert_eq!( + 
ModelCapabilityProfile::from_resolved_model("gpt-4o", "gpt-4o-mini"), + ModelCapabilityProfile::Weak + ); + // both standard → returns standard + assert_eq!( + ModelCapabilityProfile::from_resolved_model("gpt-4o", "claude-sonnet"), + ModelCapabilityProfile::Standard + ); + } +} diff --git a/src/crates/core/src/agentic/coordination/coordinator.rs b/src/crates/core/src/agentic/coordination/coordinator.rs index eba79b464..924ba0104 100644 --- a/src/crates/core/src/agentic/coordination/coordinator.rs +++ b/src/crates/core/src/agentic/coordination/coordinator.rs @@ -4,12 +4,13 @@ use super::{scheduler::DialogSubmissionPolicy, turn_outcome::TurnOutcome}; use crate::agentic::agents::get_agent_registry; +use crate::agentic::context_profile::ContextProfilePolicy; use crate::agentic::core::{ has_prompt_markup, Message, MessageContent, ProcessingPhase, PromptEnvelope, Session, SessionConfig, SessionKind, SessionState, SessionSummary, TurnStats, }; use crate::agentic::events::{ - AgenticEvent, EventPriority, EventQueue, EventRouter, EventSubscriber, + AgenticEvent, DeepReviewQueueState, EventPriority, EventQueue, EventRouter, EventSubscriber, }; use crate::agentic::execution::{ContextCompactionOutcome, ExecutionContext, ExecutionEngine}; use crate::agentic::fork_agent::{ @@ -51,6 +52,48 @@ const SUBAGENT_TIMEOUT_GRACE_PERIOD: Duration = Duration::from_secs(10); pub struct SubagentResult { /// AI text response pub text: String, + pub status: SubagentResultStatus, + pub reason: Option, + pub ledger_event_id: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SubagentResultStatus { + Completed, + PartialTimeout, +} + +impl SubagentResult { + fn completed(text: String) -> Self { + Self { + text, + status: SubagentResultStatus::Completed, + reason: None, + ledger_event_id: None, + } + } + + fn partial_timeout(text: String, reason: String) -> Self { + Self { + text, + status: SubagentResultStatus::PartialTimeout, + reason: Some(reason), + ledger_event_id: None, + } 
+ } + + fn with_ledger_event_id(mut self, event_id: String) -> Self { + self.ledger_event_id = Some(event_id); + self + } + + pub fn is_partial_timeout(&self) -> bool { + self.status == SubagentResultStatus::PartialTimeout + } + + pub fn ledger_event_id(&self) -> Option<&str> { + self.ledger_event_id.as_deref() + } } struct HiddenSubagentExecutionRequest { @@ -132,20 +175,17 @@ struct SubagentConcurrencyLimiter { } struct SubagentConcurrencyPermitGuard { - permit: Option, - limiter: SubagentConcurrencyLimiter, + permits: Vec<(OwnedSemaphorePermit, SubagentConcurrencyLimiter)>, agent_type: String, } impl SubagentConcurrencyPermitGuard { fn new( - permit: OwnedSemaphorePermit, - limiter: SubagentConcurrencyLimiter, + permits: Vec<(OwnedSemaphorePermit, SubagentConcurrencyLimiter)>, agent_type: String, ) -> Self { Self { - permit: Some(permit), - limiter, + permits, agent_type, } } @@ -153,20 +193,17 @@ impl SubagentConcurrencyPermitGuard { impl Drop for SubagentConcurrencyPermitGuard { fn drop(&mut self) { - let Some(permit) = self.permit.take() else { - return; - }; - - drop(permit); + for (permit, limiter) in std::mem::take(&mut self.permits) { + drop(permit); - let active_subagents = self - .limiter - .max_concurrency - .saturating_sub(self.limiter.semaphore.available_permits()); - debug!( - "Released subagent concurrency permit: agent_type={}, active_subagents={}, max_concurrency={}", - self.agent_type, active_subagents, self.limiter.max_concurrency - ); + let active_subagents = limiter + .max_concurrency + .saturating_sub(limiter.semaphore.available_permits()); + debug!( + "Released subagent concurrency permit: agent_type={}, active_subagents={}, max_concurrency={}", + self.agent_type, active_subagents, limiter.max_concurrency + ); + } } } @@ -256,6 +293,7 @@ pub struct ConversationCoordinator { event_queue: Arc, event_router: Arc, subagent_concurrency_limiter: Arc>>, + subagent_profile_concurrency_limiters: Arc>>, /// Registry for dynamically adjusting subagent 
timeouts. subagent_timeout_registry: Arc>>>, /// Notifies DialogScheduler of turn outcomes; injected after construction @@ -599,6 +637,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet event_queue, event_router, subagent_concurrency_limiter: Arc::new(RwLock::new(None)), + subagent_profile_concurrency_limiters: Arc::new(RwLock::new(HashMap::new())), subagent_timeout_registry: Arc::new(RwLock::new(HashMap::new())), scheduler_notify_tx: OnceLock::new(), round_preempt_source: OnceLock::new(), @@ -855,6 +894,8 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet tags: Vec::new(), custom_metadata: None, todos: None, + deep_review_run_manifest: None, + deep_review_cache: None, workspace_path: Some(workspace_path.to_string()), workspace_hostname: None, unread_completion: None, @@ -1119,6 +1160,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet agent_type: String, workspace_path: Option, submission_policy: DialogSubmissionPolicy, + user_message_metadata: Option, ) -> BitFunResult<()> { self.start_dialog_turn_internal( session_id, @@ -1129,7 +1171,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet agent_type, workspace_path, submission_policy, - None, + user_message_metadata, false, ) .await @@ -1146,6 +1188,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet agent_type: String, workspace_path: Option, submission_policy: DialogSubmissionPolicy, + user_message_metadata: Option, ) -> BitFunResult<()> { self.start_dialog_turn_internal( session_id, @@ -1156,7 +1199,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet agent_type, workspace_path, submission_policy, - None, + user_message_metadata, false, ) .await @@ -1692,6 +1735,16 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet // Pass turn_index (for operation history/rollback) 
context_vars.insert("turn_index".to_string(), turn_index.to_string()); + if let Some(run_manifest) = user_message_metadata.as_ref().and_then(|metadata| { + metadata + .get("deepReviewRunManifest") + .or_else(|| metadata.get("deep_review_run_manifest")) + }) { + context_vars.insert( + "deep_review_run_manifest".to_string(), + run_manifest.to_string(), + ); + } let session_workspace_path = session_workspace .as_ref() .map(|workspace| workspace.root_path_string()); @@ -1715,6 +1768,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet workspace_services, round_preempt: self.round_preempt_source.get().cloned(), round_steering: self.round_steering_source.get().cloned(), + recover_partial_on_cancel: false, }; // Auto-generate session title on first message @@ -2261,16 +2315,41 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet limiter } - async fn acquire_subagent_concurrency_permit( + async fn get_subagent_profile_concurrency_limiter( &self, + max_concurrency: usize, + ) -> SubagentConcurrencyLimiter { + let max_concurrency = normalize_subagent_max_concurrency(max_concurrency); + + { + let limiter_guard = self.subagent_profile_concurrency_limiters.read().await; + if let Some(limiter) = limiter_guard.get(&max_concurrency) { + return limiter.clone(); + } + } + + let mut limiter_guard = self.subagent_profile_concurrency_limiters.write().await; + if let Some(limiter) = limiter_guard.get(&max_concurrency) { + return limiter.clone(); + } + + let limiter = SubagentConcurrencyLimiter { + semaphore: Arc::new(Semaphore::new(max_concurrency)), + max_concurrency, + }; + limiter_guard.insert(max_concurrency, limiter.clone()); + limiter + } + + async fn acquire_permit_from_limiter( + &self, + limiter: &SubagentConcurrencyLimiter, agent_type: &str, cancel_token: Option<&CancellationToken>, deadline: Option, - ) -> BitFunResult<(OwnedSemaphorePermit, SubagentConcurrencyLimiter, u128)> { - let limiter = 
self.get_subagent_concurrency_limiter().await; - let started_waiting = Instant::now(); + label: &str, + ) -> BitFunResult { let semaphore = limiter.semaphore.clone(); - let permit = match (cancel_token, deadline) { (Some(token), Some(deadline)) => { tokio::select! { @@ -2282,8 +2361,8 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet } _ = tokio::time::sleep_until(deadline) => { return Err(BitFunError::Timeout(format!( - "Timed out while waiting for a concurrency slot for subagent '{}'", - agent_type + "Timed out while waiting for a {} concurrency slot for subagent '{}'", + label, agent_type ))); } } @@ -2303,8 +2382,8 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet result = semaphore.acquire_owned() => result?, _ = tokio::time::sleep_until(deadline) => { return Err(BitFunError::Timeout(format!( - "Timed out while waiting for a concurrency slot for subagent '{}'", - agent_type + "Timed out while waiting for a {} concurrency slot for subagent '{}'", + label, agent_type ))); } } @@ -2312,16 +2391,104 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet (None, None) => semaphore.acquire_owned().await?, }; - let wait_ms = started_waiting.elapsed().as_millis(); let active_subagents = limiter .max_concurrency .saturating_sub(limiter.semaphore.available_permits()); debug!( - "Acquired subagent concurrency permit: agent_type={}, wait_ms={}, active_subagents={}, max_concurrency={}", - agent_type, wait_ms, active_subagents, limiter.max_concurrency + "Acquired subagent {} concurrency permit: agent_type={}, active_subagents={}, max_concurrency={}", + label, agent_type, active_subagents, limiter.max_concurrency + ); + + Ok(permit) + } + + async fn acquire_subagent_concurrency_permit( + &self, + agent_type: &str, + profile_concurrency_cap: usize, + cancel_token: Option<&CancellationToken>, + deadline: Option, + ) -> BitFunResult<( + Vec<(OwnedSemaphorePermit, 
SubagentConcurrencyLimiter)>, + u128, + )> { + let started_waiting = Instant::now(); + + let profile_limiter = self + .get_subagent_profile_concurrency_limiter(profile_concurrency_cap) + .await; + let profile_permit = self + .acquire_permit_from_limiter( + &profile_limiter, + agent_type, + cancel_token, + deadline, + "profile", + ) + .await?; + + let global_limiter = self.get_subagent_concurrency_limiter().await; + let global_permit = self + .acquire_permit_from_limiter( + &global_limiter, + agent_type, + cancel_token, + deadline, + "global", + ) + .await?; + + let wait_ms = started_waiting.elapsed().as_millis(); + debug!( + "Acquired subagent concurrency permits: agent_type={}, wait_ms={}, profile_max_concurrency={}, global_max_concurrency={}", + agent_type, wait_ms, profile_limiter.max_concurrency, global_limiter.max_concurrency ); - Ok((permit, limiter, wait_ms)) + Ok(( + vec![ + (profile_permit, profile_limiter), + (global_permit, global_limiter), + ], + wait_ms, + )) + } + + fn context_profile_policy_for_subagent( + &self, + agent_type: &str, + session_config: &SessionConfig, + subagent_parent_info: Option<&SubagentParentInfo>, + ) -> ContextProfilePolicy { + if let Some(parent_info) = subagent_parent_info { + if let Some(parent_session) = self.session_manager.get_session(&parent_info.session_id) + { + let parent_is_review_subagent = get_agent_registry() + .get_subagent_is_review(&parent_session.agent_type) + .unwrap_or(false); + let is_review_subagent = get_agent_registry() + .get_subagent_is_review(agent_type) + .unwrap_or(false); + return ContextProfilePolicy::for_subagent_context_and_models( + agent_type, + is_review_subagent, + session_config.model_id.as_deref(), + Some(&parent_session.agent_type), + parent_is_review_subagent, + parent_session.config.model_id.as_deref(), + ); + } + } + + let is_review_subagent = get_agent_registry() + .get_subagent_is_review(agent_type) + .unwrap_or(false); + let model_id = 
session_config.model_id.as_deref().unwrap_or_default(); + ContextProfilePolicy::for_agent_context_and_model( + agent_type, + is_review_subagent, + model_id, + model_id, + ) } async fn execute_hidden_subagent_internal( @@ -2355,6 +2522,18 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet timeout_seconds.map(|seconds| Instant::now() + Duration::from_secs(seconds)); let (deadline_tx, mut deadline_rx) = watch::channel(initial_deadline); + let context_profile_policy = self.context_profile_policy_for_subagent( + &agent_type, + &session_config, + subagent_parent_info.as_ref(), + ); + debug!( + "Subagent context profile policy selected: agent_type={}, profile={:?}, profile_concurrency_cap={}", + agent_type, + context_profile_policy.profile, + context_profile_policy.subagent_concurrency_cap + ); + // Check cancel token (before creating session) if let Some(token) = cancel_token { if token.is_cancelled() { @@ -2369,11 +2548,15 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet // Use create_subagent_session (not create_session) so that no SessionCreated // event is emitted to the transport layer — subagent sessions are internal // implementation details and must not appear in the UI session list. - let (permit, limiter, wait_ms) = self - .acquire_subagent_concurrency_permit(&agent_type, cancel_token, initial_deadline) + let (permits, wait_ms) = self + .acquire_subagent_concurrency_permit( + &agent_type, + context_profile_policy.subagent_concurrency_cap, + cancel_token, + initial_deadline, + ) .await?; - let _permit_guard = - SubagentConcurrencyPermitGuard::new(permit, limiter, agent_type.clone()); + let _permit_guard = SubagentConcurrencyPermitGuard::new(permits, agent_type.clone()); if let Some(token) = cancel_token { if token.is_cancelled() { @@ -2520,6 +2703,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet // dialog turns only. 
Leave None so we don't intercept buffer entries // that belong to a different (parent) session/turn. round_steering: None, + recover_partial_on_cancel: true, }; let execution_engine = self.execution_engine.clone(); @@ -2702,15 +2886,41 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet ); } - match tokio::time::timeout(SUBAGENT_TIMEOUT_GRACE_PERIOD, &mut execution_task).await + let partial_timeout_result = match tokio::time::timeout( + SUBAGENT_TIMEOUT_GRACE_PERIOD, + &mut execution_task, + ) + .await { - Ok(Ok(Ok(_))) | Ok(Ok(Err(_))) => {} + Ok(Ok(Ok(exec_result))) => { + let response_text = match exec_result.final_message.content { + MessageContent::Mixed { text, .. } => text, + MessageContent::Text(text) => text, + _ => String::new(), + }; + if response_text.trim().is_empty() { + None + } else { + Some(SubagentResult::partial_timeout( + response_text, + timeout_error_message.clone(), + )) + } + } + Ok(Ok(Err(error))) => { + debug!( + "Subagent returned error during timeout grace period: agent_type={}, session={}, error={}", + agent_type, session_id, error + ); + None + } Ok(Err(error)) => { warn!( "Subagent join failed during timeout grace period: agent_type={}, session={}, error={}", agent_type, session_id, error ); execution_task.abort(); + None } Err(_) => { warn!( @@ -2718,7 +2928,37 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet agent_type, session_id ); execution_task.abort(); + None } + }; + + if let Some(mut partial_result) = partial_timeout_result { + warn!( + "Subagent timed out with partial output: agent_type={}, session={}, text_len={}", + agent_type, + session_id, + partial_result.text.len() + ); + if let Some(parent_info) = subagent_parent_info.as_ref() { + let event = self.session_manager.record_subagent_partial_timeout( + &parent_info.session_id, + &parent_info.dialog_turn_id, + &agent_type, + &partial_result.text, + Some("timeout"), + ); + partial_result = 
partial_result.with_ledger_event_id(event.event_id); + } + if let Err(cleanup_err) = self.cleanup_subagent_resources(&session_id).await { + warn!( + "Failed to cleanup subagent resources after partial timeout: session={}, error={}", + session_id, cleanup_err + ); + } + let mut registry = self.subagent_timeout_registry.write().await; + registry.remove(&session_id); + + return Ok(partial_result); } if let Err(cleanup_err) = self.cleanup_subagent_resources(&session_id).await { @@ -2778,9 +3018,7 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet let mut registry = self.subagent_timeout_registry.write().await; registry.remove(&session_id); - Ok(SubagentResult { - text: response_text, - }) + Ok(SubagentResult::completed(response_text)) } pub async fn capture_fork_agent_context_snapshot( @@ -3167,6 +3405,24 @@ Update the persona files and delete BOOTSTRAP.md as soon as bootstrap is complet .await; } + pub async fn emit_deep_review_queue_state_changed( + &self, + session_id: &str, + turn_id: &str, + queue_state: DeepReviewQueueState, + ) { + let event = AgenticEvent::DeepReviewQueueStateChanged { + session_id: session_id.to_string(), + turn_id: turn_id.to_string(), + queue_state, + subagent_parent_info: None, + }; + let _ = self + .event_queue + .enqueue(event, Some(EventPriority::High)) + .await; + } + /// Get SessionManager reference (for advanced features like mode management) pub fn get_session_manager(&self) -> &Arc { &self.session_manager diff --git a/src/crates/core/src/agentic/coordination/scheduler.rs b/src/crates/core/src/agentic/coordination/scheduler.rs index 8f4258825..9f3cde55b 100644 --- a/src/crates/core/src/agentic/coordination/scheduler.rs +++ b/src/crates/core/src/agentic/coordination/scheduler.rs @@ -138,6 +138,7 @@ pub struct QueuedTurn { pub workspace_path: Option, pub policy: DialogSubmissionPolicy, pub reply_route: Option, + pub user_message_metadata: Option, pub image_contexts: Option>, #[allow(dead_code)] pub 
enqueued_at: SystemTime, @@ -320,6 +321,7 @@ impl DialogScheduler { workspace_path: Option, policy: DialogSubmissionPolicy, reply_route: Option, + user_message_metadata: Option, image_contexts: Option>, ) -> Result { let resolved_turn_id = turn_id.unwrap_or_else(|| Uuid::new_v4().to_string()); @@ -331,6 +333,7 @@ impl DialogScheduler { workspace_path, policy, reply_route, + user_message_metadata, image_contexts, enqueued_at: SystemTime::now(), }; @@ -575,6 +578,7 @@ impl DialogScheduler { queued_turn.agent_type.clone(), queued_turn.workspace_path.clone(), queued_turn.policy, + queued_turn.user_message_metadata.clone(), ) .await } @@ -588,6 +592,7 @@ impl DialogScheduler { queued_turn.agent_type.clone(), queued_turn.workspace_path.clone(), queued_turn.policy, + queued_turn.user_message_metadata.clone(), ) .await } @@ -652,6 +657,7 @@ impl DialogScheduler { DialogSubmissionPolicy::for_source(DialogTriggerSource::AgentSession), None, None, + None, ) .await { diff --git a/src/crates/core/src/agentic/core/message.rs b/src/crates/core/src/agentic/core/message.rs index bab90a12c..d45c64c22 100644 --- a/src/crates/core/src/agentic/core/message.rs +++ b/src/crates/core/src/agentic/core/message.rs @@ -88,6 +88,9 @@ pub struct CompressionPayload { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind", rename_all = "snake_case")] pub enum CompressionEntry { + Contract { + contract: CompressionContract, + }, ModelSummary { text: String, }, @@ -101,6 +104,75 @@ pub enum CompressionEntry { }, } +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct CompressionContract { + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub touched_files: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub verification_commands: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub blocking_failures: Vec, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub subagent_statuses: Vec, +} + 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct CompressionContractItem { + pub target: String, + pub status: String, + pub summary: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub error_kind: Option, +} + +impl CompressionContract { + pub fn is_empty(&self) -> bool { + self.touched_files.is_empty() + && self.verification_commands.is_empty() + && self.blocking_failures.is_empty() + && self.subagent_statuses.is_empty() + } + + pub fn render_for_model(&self) -> String { + let mut lines = vec![ + "Compaction contract: preserve these factual fields when continuing the task." + .to_string(), + ]; + + if !self.touched_files.is_empty() { + lines.push("Touched files:".to_string()); + for file in &self.touched_files { + lines.push(format!("- {}", file)); + } + } + + render_contract_items( + &mut lines, + "Verification commands:", + &self.verification_commands, + ); + render_contract_items(&mut lines, "Blocking failures:", &self.blocking_failures); + render_contract_items(&mut lines, "Subagent statuses:", &self.subagent_statuses); + + lines.join("\n") + } +} + +fn render_contract_items(lines: &mut Vec, title: &str, items: &[CompressionContractItem]) { + if items.is_empty() { + return; + } + + lines.push(title.to_string()); + for item in items { + let mut rendered = format!("- {} [{}]: {}", item.target, item.status, item.summary); + if let Some(error_kind) = item.error_kind.as_ref() { + rendered.push_str(&format!(" ({})", error_kind)); + } + lines.push(rendered); + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CompressedMessage { pub role: CompressedMessageRole, diff --git a/src/crates/core/src/agentic/core/mod.rs b/src/crates/core/src/agentic/core/mod.rs index e515fdd9e..a12a5a9c3 100644 --- a/src/crates/core/src/agentic/core/mod.rs +++ b/src/crates/core/src/agentic/core/mod.rs @@ -11,8 +11,9 @@ pub mod state; pub use dialog_turn::{new_turn_id, TurnStats}; pub use message::{ CompressedMessage, 
CompressedMessageRole, CompressedTodoItem, CompressedTodoSnapshot, - CompressedToolCall, CompressionEntry, CompressionPayload, Message, MessageContent, MessageRole, - MessageSemanticKind, ToolCall, ToolResult, + CompressedToolCall, CompressionContract, CompressionContractItem, CompressionEntry, + CompressionPayload, Message, MessageContent, MessageRole, MessageSemanticKind, ToolCall, + ToolResult, }; pub use messages_helper::{MessageHelper, RequestReasoningTokenPolicy}; pub use prompt_markup::{ diff --git a/src/crates/core/src/agentic/deep_review_policy.rs b/src/crates/core/src/agentic/deep_review_policy.rs index eff7468b0..cb78f9cce 100644 --- a/src/crates/core/src/agentic/deep_review_policy.rs +++ b/src/crates/core/src/agentic/deep_review_policy.rs @@ -1,8 +1,12 @@ use crate::service::config::global::GlobalConfigManager; use crate::util::errors::{BitFunError, BitFunResult}; +use dashmap::DashMap; use log::warn; +use serde::Serialize; use serde_json::{json, Value}; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::LazyLock; +use std::time::{Duration, Instant}; pub const DEEP_REVIEW_AGENT_TYPE: &str = "DeepReview"; pub const REVIEW_JUDGE_AGENT_TYPE: &str = "ReviewJudge"; @@ -12,19 +16,385 @@ pub const REVIEWER_PERFORMANCE_AGENT_TYPE: &str = "ReviewPerformance"; pub const REVIEWER_SECURITY_AGENT_TYPE: &str = "ReviewSecurity"; pub const REVIEWER_ARCHITECTURE_AGENT_TYPE: &str = "ReviewArchitecture"; pub const REVIEWER_FRONTEND_AGENT_TYPE: &str = "ReviewFrontend"; -pub const CORE_REVIEWER_AGENT_TYPES: [&str; 5] = [ +pub const CORE_REVIEWER_AGENT_TYPES: [&str; 4] = [ REVIEWER_BUSINESS_LOGIC_AGENT_TYPE, REVIEWER_PERFORMANCE_AGENT_TYPE, REVIEWER_SECURITY_AGENT_TYPE, REVIEWER_ARCHITECTURE_AGENT_TYPE, - REVIEWER_FRONTEND_AGENT_TYPE, ]; +pub const CONDITIONAL_REVIEWER_AGENT_TYPES: [&str; 1] = [REVIEWER_FRONTEND_AGENT_TYPE]; const DEFAULT_REVIEW_TEAM_CONFIG_PATH: &str = "ai.review_teams.default"; -const 
DEFAULT_REVIEWER_TIMEOUT_SECONDS: u64 = 0; -const DEFAULT_JUDGE_TIMEOUT_SECONDS: u64 = 0; +const DEFAULT_REVIEWER_TIMEOUT_SECONDS: u64 = 600; +const DEFAULT_JUDGE_TIMEOUT_SECONDS: u64 = 600; +const MAX_TIMEOUT_SECONDS: u64 = 3600; +const BASE_TIMEOUT_QUICK_SECONDS: u64 = 180; +const BASE_TIMEOUT_NORMAL_SECONDS: u64 = 300; +const BASE_TIMEOUT_DEEP_SECONDS: u64 = 600; +const TIMEOUT_PER_FILE_SECONDS: u64 = 15; +const TIMEOUT_PER_100_LINES_SECONDS: u64 = 30; const DEFAULT_REVIEWER_FILE_SPLIT_THRESHOLD: usize = 20; const DEFAULT_MAX_SAME_ROLE_INSTANCES: usize = 3; +const MAX_SAME_ROLE_INSTANCES: usize = 8; +const DEFAULT_MAX_RETRIES_PER_ROLE: usize = 1; +const MAX_RETRIES_PER_ROLE: usize = 3; +const DEFAULT_MAX_PARALLEL_INSTANCES: usize = 4; +const DEFAULT_MAX_QUEUE_WAIT_SECONDS: u64 = 60; +const MAX_QUEUE_WAIT_SECONDS: u64 = 600; +const EFFECTIVE_CONCURRENCY_RECOVERY_SUCCESS_WINDOW: usize = 3; +const BUDGET_TTL: Duration = Duration::from_secs(60 * 60); +const PRUNE_INTERVAL: Duration = Duration::from_secs(300); + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ReviewTeamRoleDefinition { + pub key: String, + pub subagent_id: String, + pub fun_name: String, + pub role_name: String, + pub description: String, + pub responsibilities: Vec, + pub accent_color: String, + pub conditional: bool, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ReviewStrategyManifestProfile { + pub level: String, + pub label: String, + pub summary: String, + pub token_impact: String, + pub runtime_impact: String, + pub default_model_slot: String, + pub prompt_directive: String, + pub role_directives: BTreeMap, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ReviewTeamExecutionPolicyDefinition { + pub reviewer_timeout_seconds: u64, + pub judge_timeout_seconds: u64, + pub reviewer_file_split_threshold: usize, + pub 
/// Serializable description of the built-in review team: its roles, per-level
/// strategy profiles, execution policy defaults, and the agent ids that must
/// stay hidden from (or disallowed as) user-selected extras.
// NOTE(review): generic parameters on `Vec`/`BTreeMap` were lost in patch
// transport; restored by inference from the builder below — confirm against
// the real file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ReviewTeamDefinition {
    pub id: String,
    pub name: String,
    pub description: String,
    pub warning: String,
    pub default_model: String,
    pub default_strategy_level: String,
    pub default_execution_policy: ReviewTeamExecutionPolicyDefinition,
    pub core_roles: Vec<ReviewTeamRoleDefinition>,
    pub strategy_profiles: BTreeMap<String, ReviewStrategyManifestProfile>,
    pub disallowed_extra_subagent_ids: Vec<String>,
    pub hidden_agent_ids: Vec<String>,
}

/// Builds one role entry from `&str` literals, converting to owned `String`s.
fn review_role(
    key: &str,
    subagent_id: &str,
    fun_name: &str,
    role_name: &str,
    description: &str,
    responsibilities: &[&str],
    accent_color: &str,
    conditional: bool,
) -> ReviewTeamRoleDefinition {
    ReviewTeamRoleDefinition {
        key: key.to_string(),
        subagent_id: subagent_id.to_string(),
        fun_name: fun_name.to_string(),
        role_name: role_name.to_string(),
        description: description.to_string(),
        responsibilities: responsibilities
            .iter()
            .map(|item| item.to_string())
            .collect(),
        accent_color: accent_color.to_string(),
        conditional,
    }
}

/// Converts `(role, directive)` literal pairs into an owned ordered map.
fn role_directives(entries: &[(&str, &str)]) -> BTreeMap<String, String> {
    entries
        .iter()
        .map(|(role, directive)| (role.to_string(), directive.to_string()))
        .collect()
}

/// Builds one strategy-level profile (label, cost estimates, model slot, and
/// the per-role prompt directives applied at that level).
fn strategy_profile(
    level: &str,
    label: &str,
    summary: &str,
    token_impact: &str,
    runtime_impact: &str,
    default_model_slot: &str,
    prompt_directive: &str,
    directives: &[(&str, &str)],
) -> ReviewStrategyManifestProfile {
    ReviewStrategyManifestProfile {
        level: level.to_string(),
        label: label.to_string(),
        summary: summary.to_string(),
        token_impact: token_impact.to_string(),
        runtime_impact: runtime_impact.to_string(),
        default_model_slot: default_model_slot.to_string(),
        prompt_directive: prompt_directive.to_string(),
        role_directives: role_directives(directives),
    }
}

/// Returns the built-in review team: five reviewer roles plus the judge,
/// three strategy profiles (quick/normal/deep), and the default execution
/// policy. All text here is user-visible or prompt material — do not edit
/// casually.
pub fn default_review_team_definition() -> ReviewTeamDefinition {
    // Core roles; `conditional == true` means the role only activates when the
    // change set warrants it (currently only the frontend reviewer).
    let core_roles = vec![
        review_role(
            "businessLogic",
            REVIEWER_BUSINESS_LOGIC_AGENT_TYPE,
            "Logic Reviewer",
            "Business Logic Reviewer",
            "A workflow sleuth that inspects business rules, state transitions, recovery paths, and real-user correctness.",
            &[
                "Verify workflows, state transitions, and domain rules still behave correctly.",
                "Check boundary cases, rollback paths, and data integrity assumptions.",
                "Focus on issues that can break user outcomes or product intent.",
            ],
            "#2563eb",
            false,
        ),
        review_role(
            "performance",
            REVIEWER_PERFORMANCE_AGENT_TYPE,
            "Performance Reviewer",
            "Performance Reviewer",
            "A speed-focused profiler that hunts hot paths, unnecessary work, blocking calls, and scale-sensitive regressions.",
            &[
                "Inspect hot paths, large loops, and unnecessary allocations or recomputation.",
                "Flag blocking work, N+1 patterns, and wasteful data movement.",
                "Keep performance advice practical and aligned with the existing architecture.",
            ],
            "#d97706",
            false,
        ),
        review_role(
            "security",
            REVIEWER_SECURITY_AGENT_TYPE,
            "Security Reviewer",
            "Security Reviewer",
            "A boundary guardian that scans for injection risks, trust leaks, privilege mistakes, and unsafe file or command handling.",
            &[
                "Review trust boundaries, auth assumptions, and sensitive data handling.",
                "Look for injection, unsafe command execution, and exposure risks.",
                "Highlight concrete fixes that reduce risk without broad rewrites.",
            ],
            "#dc2626",
            false,
        ),
        review_role(
            "architecture",
            REVIEWER_ARCHITECTURE_AGENT_TYPE,
            "Architecture Reviewer",
            "Architecture Reviewer",
            "A structural watchdog that checks module boundaries, dependency direction, API contract design, and abstraction integrity.",
            &[
                "Detect layer boundary violations and wrong-direction imports.",
                "Verify API contracts, tool schemas, and transport messages stay consistent.",
                "Ensure platform-agnostic code does not leak platform-specific details.",
            ],
            "#0891b2",
            false,
        ),
        review_role(
            "frontend",
            REVIEWER_FRONTEND_AGENT_TYPE,
            "Frontend Reviewer",
            "Frontend Reviewer",
            "A UI specialist that checks i18n synchronization, React performance patterns, accessibility, and frontend-backend contract alignment.",
            &[
                "Verify i18n key completeness across all locales.",
                "Check React performance patterns (memoization, virtualization, effect dependencies).",
                "Flag accessibility violations and frontend-backend API contract drift.",
            ],
            "#059669",
            true,
        ),
        review_role(
            "judge",
            REVIEW_JUDGE_AGENT_TYPE,
            "Review Arbiter",
            "Review Quality Inspector",
            "An independent third-party arbiter that validates reviewer reports for logical consistency and evidence quality. It spot-checks specific code locations only when a claim needs verification, rather than re-reviewing the codebase from scratch.",
            &[
                "Validate, merge, downgrade, or reject reviewer findings based on logical consistency and evidence quality.",
                "Filter out false positives and directionally-wrong optimization advice by examining reviewer reasoning.",
                "Spot-check specific code locations only when a reviewer claim needs verification.",
                "Ensure every surviving issue has an actionable fix or follow-up plan.",
            ],
            "#7c3aed",
            false,
        ),
    ];

    // Per-level prompt directives. Keys are agent-type constants so the
    // orchestrator can look up the directive for whichever role it launches.
    let strategy_profiles = BTreeMap::from([
        (
            "quick".to_string(),
            strategy_profile(
                "quick",
                "Quick",
                "Fast screening for high-confidence issues in the requested diff or scope.",
                "0.4-0.6x",
                "0.5-0.7x",
                "fast",
                "Prefer a concise diff-focused pass. Report only high-confidence correctness, security, or regression risks and avoid speculative design rewrites.",
                &[
                    (
                        REVIEWER_BUSINESS_LOGIC_AGENT_TYPE,
                        "Only trace logic paths directly changed by the diff. Do not follow call chains beyond one hop. Report only issues where the diff introduces a provably wrong behavior.",
                    ),
                    (
                        REVIEWER_PERFORMANCE_AGENT_TYPE,
                        "Scan the diff for known anti-patterns only: nested loops, repeated fetches, blocking calls on hot paths, unnecessary re-renders. Do not trace call chains or estimate impact beyond what the diff shows.",
                    ),
                    (
                        REVIEWER_SECURITY_AGENT_TYPE,
                        "Scan the diff for direct security risks only: injection, secret exposure, unsafe commands, missing auth. Do not trace data flows beyond one hop.",
                    ),
                    (
                        REVIEWER_ARCHITECTURE_AGENT_TYPE,
                        "Only check imports directly changed by the diff. Flag violations of documented layer boundaries.",
                    ),
                    (
                        REVIEWER_FRONTEND_AGENT_TYPE,
                        "Only check i18n key completeness and direct platform boundary violations in changed frontend files.",
                    ),
                    (
                        REVIEW_JUDGE_AGENT_TYPE,
                        "This was a quick review. Focus on confirming or rejecting each finding efficiently. If a finding's evidence is thin, reject it rather than spending time verifying.",
                    ),
                ],
            ),
        ),
        (
            "normal".to_string(),
            strategy_profile(
                "normal",
                "Normal",
                "Balanced review depth for day-to-day code review with practical evidence.",
                "1x",
                "1x",
                "fast",
                "Perform the standard role-specific review. Balance coverage with precision and include concrete evidence for each issue.",
                &[
                    (
                        REVIEWER_BUSINESS_LOGIC_AGENT_TYPE,
                        "Trace each changed function's direct callers and callees to verify business rules and state transitions. Stop investigating a path once you have enough evidence to confirm or dismiss it.",
                    ),
                    (
                        REVIEWER_PERFORMANCE_AGENT_TYPE,
                        "Inspect the diff for anti-patterns, then read surrounding code to confirm impact on hot paths. Report only issues likely to matter at realistic scale.",
                    ),
                    (
                        REVIEWER_SECURITY_AGENT_TYPE,
                        "Trace each changed input path from entry point to usage. Check trust boundaries, auth assumptions, and data sanitization. Report only issues with a realistic threat narrative.",
                    ),
                    (
                        REVIEWER_ARCHITECTURE_AGENT_TYPE,
                        "Check the diff's imports plus one level of dependency direction. Verify API contract consistency.",
                    ),
                    (
                        REVIEWER_FRONTEND_AGENT_TYPE,
                        "Check i18n, React performance patterns, and accessibility in changed components. Verify frontend-backend API contract alignment.",
                    ),
                    (
                        REVIEW_JUDGE_AGENT_TYPE,
                        "Validate each finding's logical consistency and evidence quality. Spot-check code only when a claim needs verification.",
                    ),
                ],
            ),
        ),
        (
            "deep".to_string(),
            strategy_profile(
                "deep",
                "Deep",
                "Thorough multi-pass review for risky, broad, or release-sensitive changes.",
                "1.8-2.5x",
                "1.5-2.5x",
                "primary",
                "Run a thorough role-specific pass. Inspect edge cases, cross-file interactions, failure modes, and remediation tradeoffs before finalizing findings.",
                &[
                    (
                        REVIEWER_BUSINESS_LOGIC_AGENT_TYPE,
                        "Map full call chains for changed functions. Verify state transitions end-to-end, check rollback and error-recovery paths, and test edge cases in data shape and lifecycle assumptions. Prioritize findings by user-facing impact.",
                    ),
                    (
                        REVIEWER_PERFORMANCE_AGENT_TYPE,
                        "In addition to the normal pass, check for latent scaling risks - data structures that degrade at volume, or algorithms that are correct but unnecessarily expensive. Only report if you can estimate the impact. Do not speculate about edge cases or failure modes unrelated to performance.",
                    ),
                    (
                        REVIEWER_SECURITY_AGENT_TYPE,
                        "In addition to the normal pass, trace data flows across trust boundaries end-to-end. Check for privilege escalation chains, indirect injection vectors, and failure modes that expose sensitive data. Report only issues with a complete threat narrative.",
                    ),
                    (
                        REVIEWER_ARCHITECTURE_AGENT_TYPE,
                        "Map the full dependency graph for changed modules. Check for structural anti-patterns, circular dependencies, and cross-cutting concerns.",
                    ),
                    (
                        REVIEWER_FRONTEND_AGENT_TYPE,
                        "Thorough React analysis: effect dependencies, memoization, virtualization. Full accessibility audit. State management pattern review. Cross-layer contract verification.",
                    ),
                    (
                        REVIEW_JUDGE_AGENT_TYPE,
                        "This was a deep review with potentially complex findings. Cross-validate findings across reviewers for consistency. For each finding, verify the evidence supports the conclusion and the suggested fix is safe. Pay extra attention to overlapping findings across reviewers or same-role instances.",
                    ),
                ],
            ),
        ),
    ]);

    // All review-team agents are hidden from generic agent pickers; sort+dedup
    // gives a stable, duplicate-free list.
    let mut hidden_agent_ids = vec![
        DEEP_REVIEW_AGENT_TYPE.to_string(),
        REVIEW_JUDGE_AGENT_TYPE.to_string(),
    ];
    hidden_agent_ids.extend(CORE_REVIEWER_AGENT_TYPES.iter().map(|id| id.to_string()));
    hidden_agent_ids.extend(
        CONDITIONAL_REVIEWER_AGENT_TYPES
            .iter()
            .map(|id| id.to_string()),
    );
    hidden_agent_ids.sort();
    hidden_agent_ids.dedup();

    // Everything hidden is also disallowed as an "extra" reviewer, plus the
    // fixer agent (which is launchable but never a reviewer extra).
    let mut disallowed_extra_subagent_ids = hidden_agent_ids.clone();
    disallowed_extra_subagent_ids.push(REVIEW_FIXER_AGENT_TYPE.to_string());
    disallowed_extra_subagent_ids.sort();
    disallowed_extra_subagent_ids.dedup();

    ReviewTeamDefinition {
        id: "default-review-team".to_string(),
        name: "Code Review Team".to_string(),
        description: "A multi-reviewer team for deep code review with mandatory logic, performance, security, architecture, conditional frontend, and quality-gate roles.".to_string(),
        warning: "Deep review may take longer and usually consumes more tokens than a standard review.".to_string(),
        default_model: "fast".to_string(),
        default_strategy_level: "normal".to_string(),
        default_execution_policy: ReviewTeamExecutionPolicyDefinition {
            reviewer_timeout_seconds: 300,
            judge_timeout_seconds: 240,
            reviewer_file_split_threshold: DEFAULT_REVIEWER_FILE_SPLIT_THRESHOLD,
            max_same_role_instances: DEFAULT_MAX_SAME_ROLE_INSTANCES,
            max_retries_per_role: DEFAULT_MAX_RETRIES_PER_ROLE,
        },
        core_roles,
        strategy_profiles,
        disallowed_extra_subagent_ids,
        hidden_agent_ids,
    }
}
/// Risk factors used for automatic strategy selection.
///
/// All counters start at zero, so the hand-written all-zero `Default` impl
/// was redundant — `#[derive(Default)]` produces the identical value
/// (`usize::default() == 0`) with less code to maintain.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ChangeRiskFactors {
    /// Number of files touched by the change set.
    pub file_count: usize,
    /// Total added + removed lines across the change set.
    pub total_lines_changed: usize,
    /// How many changed files live under security-sensitive paths.
    pub files_in_security_paths: usize,
    /// Largest per-function cyclomatic-complexity increase observed.
    pub max_cyclomatic_complexity_delta: usize,
    /// Number of distinct crates touched beyond the first.
    pub cross_crate_changes: usize,
}
+ pub max_retries_per_role: usize, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -80,7 +475,7 @@ pub struct DeepReviewPolicyViolation { } impl DeepReviewPolicyViolation { - fn new(code: &'static str, message: impl Into) -> Self { + pub(crate) fn new(code: &'static str, message: impl Into) -> Self { Self { code, message: message.into(), @@ -96,6 +491,86 @@ impl DeepReviewPolicyViolation { } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DeepReviewRunManifestGate { + active_subagent_ids: HashSet, + skipped_subagent_reasons: HashMap, +} + +impl DeepReviewRunManifestGate { + pub fn from_value(raw: &Value) -> Option { + let manifest = raw.as_object()?; + if manifest.get("reviewMode").and_then(Value::as_str) != Some("deep") { + return None; + } + + let mut active_subagent_ids = HashSet::new(); + collect_manifest_members(manifest.get("workPackets"), &mut active_subagent_ids); + collect_manifest_members(manifest.get("coreReviewers"), &mut active_subagent_ids); + collect_manifest_members( + manifest.get("enabledExtraReviewers"), + &mut active_subagent_ids, + ); + if let Some(id) = manifest + .get("qualityGateReviewer") + .and_then(manifest_member_subagent_id) + { + active_subagent_ids.insert(id); + } + + if active_subagent_ids.is_empty() { + return None; + } + + let mut skipped_subagent_reasons = HashMap::new(); + if let Some(skipped) = manifest.get("skippedReviewers").and_then(Value::as_array) { + for member in skipped { + let Some(id) = manifest_member_subagent_id(member) else { + continue; + }; + let reason = member + .get("reason") + .and_then(Value::as_str) + .unwrap_or("skipped") + .trim(); + skipped_subagent_reasons.insert( + id, + if reason.is_empty() { + "skipped".to_string() + } else { + reason.to_string() + }, + ); + } + } + + Some(Self { + active_subagent_ids, + skipped_subagent_reasons, + }) + } + + pub fn ensure_active(&self, subagent_type: &str) -> Result<(), DeepReviewPolicyViolation> { + if self.active_subagent_ids.contains(subagent_type) { + return 
Ok(()); + } + + let reason = self + .skipped_subagent_reasons + .get(subagent_type) + .map(String::as_str) + .unwrap_or("missing_from_manifest"); + + Err(DeepReviewPolicyViolation::new( + "deep_review_subagent_not_active_for_target", + format!( + "DeepReview subagent '{}' is not active for this review target (reason: {})", + subagent_type, reason + ), + )) + } +} + impl Default for DeepReviewExecutionPolicy { fn default() -> Self { Self { @@ -106,6 +581,7 @@ impl Default for DeepReviewExecutionPolicy { judge_timeout_seconds: DEFAULT_JUDGE_TIMEOUT_SECONDS, reviewer_file_split_threshold: DEFAULT_REVIEWER_FILE_SPLIT_THRESHOLD, max_same_role_instances: DEFAULT_MAX_SAME_ROLE_INSTANCES, + max_retries_per_role: DEFAULT_MAX_RETRIES_PER_ROLE, } } } @@ -126,13 +602,13 @@ impl DeepReviewExecutionPolicy { reviewer_timeout_seconds: clamp_u64( config.get("reviewer_timeout_seconds"), 0, - u64::MAX, + MAX_TIMEOUT_SECONDS, DEFAULT_REVIEWER_TIMEOUT_SECONDS, ), judge_timeout_seconds: clamp_u64( config.get("judge_timeout_seconds"), 0, - u64::MAX, + MAX_TIMEOUT_SECONDS, DEFAULT_JUDGE_TIMEOUT_SECONDS, ), reviewer_file_split_threshold: clamp_usize( @@ -147,6 +623,12 @@ impl DeepReviewExecutionPolicy { usize::MAX, DEFAULT_MAX_SAME_ROLE_INSTANCES, ), + max_retries_per_role: clamp_usize( + config.get("max_retries_per_role"), + 0, + MAX_RETRIES_PER_ROLE, + DEFAULT_MAX_RETRIES_PER_ROLE, + ), } } @@ -155,6 +637,7 @@ impl DeepReviewExecutionPolicy { subagent_type: &str, ) -> Result { if CORE_REVIEWER_AGENT_TYPES.contains(&subagent_type) + || CONDITIONAL_REVIEWER_AGENT_TYPES.contains(&subagent_type) || self .extra_subagent_ids .iter() @@ -204,6 +687,104 @@ impl DeepReviewExecutionPolicy { ) } + pub fn predictive_timeout( + &self, + role: DeepReviewSubagentRole, + strategy: DeepReviewStrategyLevel, + file_count: usize, + line_count: usize, + reviewer_count: usize, + ) -> u64 { + let base = match strategy { + DeepReviewStrategyLevel::Quick => BASE_TIMEOUT_QUICK_SECONDS, + 
DeepReviewStrategyLevel::Normal => BASE_TIMEOUT_NORMAL_SECONDS, + DeepReviewStrategyLevel::Deep => BASE_TIMEOUT_DEEP_SECONDS, + }; + let file_overhead = u64::try_from(file_count) + .unwrap_or(u64::MAX) + .saturating_mul(TIMEOUT_PER_FILE_SECONDS); + let line_overhead = u64::try_from(line_count / 100) + .unwrap_or(u64::MAX) + .saturating_mul(TIMEOUT_PER_100_LINES_SECONDS); + let raw = base + .saturating_add(file_overhead) + .saturating_add(line_overhead); + let multiplier = match role { + DeepReviewSubagentRole::Reviewer => 1, + DeepReviewSubagentRole::Judge => { + let reviewer_count = u64::try_from(reviewer_count.max(1)).unwrap_or(u64::MAX); + 1 + reviewer_count.saturating_sub(1) / 3 + } + }; + + raw.saturating_mul(multiplier).min(MAX_TIMEOUT_SECONDS) + } + + pub fn with_run_manifest_execution_policy(&self, raw_manifest: &Value) -> Self { + let Some(manifest) = raw_manifest.as_object() else { + return self.clone(); + }; + if manifest.get("reviewMode").and_then(Value::as_str) != Some("deep") { + return self.clone(); + } + + let mut policy = self.clone(); + if let Some(strategy_level) = + DeepReviewStrategyLevel::from_value(manifest.get("strategyLevel")) + { + policy.strategy_level = strategy_level; + } + + let Some(execution_policy) = manifest.get("executionPolicy").and_then(Value::as_object) + else { + return policy; + }; + + policy.reviewer_timeout_seconds = clamp_u64( + execution_policy.get("reviewerTimeoutSeconds"), + 0, + MAX_TIMEOUT_SECONDS, + policy.reviewer_timeout_seconds, + ); + policy.judge_timeout_seconds = clamp_u64( + execution_policy.get("judgeTimeoutSeconds"), + 0, + MAX_TIMEOUT_SECONDS, + policy.judge_timeout_seconds, + ); + policy.reviewer_file_split_threshold = clamp_usize( + execution_policy.get("reviewerFileSplitThreshold"), + 0, + usize::MAX, + policy.reviewer_file_split_threshold, + ); + policy.max_same_role_instances = clamp_usize( + execution_policy.get("maxSameRoleInstances"), + 1, + MAX_SAME_ROLE_INSTANCES, + policy.max_same_role_instances, 
+ ); + policy.max_retries_per_role = clamp_usize( + execution_policy.get("maxRetriesPerRole"), + 0, + MAX_RETRIES_PER_ROLE, + policy.max_retries_per_role, + ); + + policy + } + + /// Extract the concurrency policy from a run manifest, if present. + pub fn concurrency_policy_from_manifest( + &self, + raw_manifest: &Value, + ) -> DeepReviewConcurrencyPolicy { + raw_manifest + .get("concurrencyPolicy") + .map(DeepReviewConcurrencyPolicy::from_manifest) + .unwrap_or_default() + } + /// Returns true when the file count exceeds the split threshold and /// `max_same_role_instances > 1`, meaning the orchestrator should /// partition the file list across multiple same-role reviewer instances. @@ -225,143 +806,1582 @@ impl DeepReviewExecutionPolicy { / self.reviewer_file_split_threshold; needed.clamp(1, self.max_same_role_instances) } -} -pub async fn load_default_deep_review_policy() -> BitFunResult { - let config_service = GlobalConfigManager::get_service().await.map_err(|error| { - BitFunError::config(format!( - "Failed to load DeepReview execution policy because config service is unavailable: {}", - error - )) - })?; + /// Auto-select strategy level based on change risk factors. + /// Returns the recommended level and a human-readable rationale. 
+ pub fn auto_select_strategy( + &self, + risk: &ChangeRiskFactors, + ) -> (DeepReviewStrategyLevel, String) { + let score = risk.file_count + + risk.total_lines_changed / 100 + + risk.files_in_security_paths * 3 + + risk.cross_crate_changes * 2; - let raw_config = match config_service - .get_config::(Some(DEFAULT_REVIEW_TEAM_CONFIG_PATH)) - .await - { - Ok(config) => Some(config), - Err(error) if is_missing_default_review_team_config_error(&error) => { - warn!( - "DeepReview policy config missing at {}, using defaults", - DEFAULT_REVIEW_TEAM_CONFIG_PATH - ); - None - } - Err(error) => { - return Err(BitFunError::config(format!( - "Failed to load DeepReview execution policy from {}: {}", - DEFAULT_REVIEW_TEAM_CONFIG_PATH, error - ))); + match score { + 0..=5 => ( + DeepReviewStrategyLevel::Quick, + format!( + "Small change ({} files, {} lines). Quick scan sufficient.", + risk.file_count, risk.total_lines_changed + ), + ), + 6..=20 => ( + DeepReviewStrategyLevel::Normal, + format!( + "Medium change ({} files, {} lines). Standard review recommended.", + risk.file_count, risk.total_lines_changed + ), + ), + _ => ( + DeepReviewStrategyLevel::Deep, + format!( + "Large/high-risk change ({} files, {} lines, {} security files). Deep review recommended.", + risk.file_count, risk.total_lines_changed, risk.files_in_security_paths + ), + ), } - }; + } +} - Ok(DeepReviewExecutionPolicy::from_config_value( - raw_config.as_ref(), - )) +/// Dynamic concurrency control for deep review reviewer launches. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DeepReviewConcurrencyPolicy { + /// Maximum parallel reviewer instances at once. + pub max_parallel_instances: usize, + /// Whether to stagger launches (wait N seconds between batches). + pub stagger_seconds: u64, + /// Maximum time an over-cap reviewer launch can wait before being skipped. + pub max_queue_wait_seconds: u64, + /// Whether to batch extras separately from core reviewers. 
+ pub batch_extras_separately: bool, } -pub fn is_missing_default_review_team_config_error(error: &BitFunError) -> bool { - matches!(error, BitFunError::NotFound(message) - if message == &format!("Config path '{}' not found", DEFAULT_REVIEW_TEAM_CONFIG_PATH)) +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DeepReviewEffectiveConcurrencySnapshot { + pub configured_max_parallel_instances: usize, + pub learned_parallel_instances: usize, + pub effective_parallel_instances: usize, + pub user_override_parallel_instances: Option, + pub retry_after_remaining_ms: Option, } -fn normalize_extra_subagent_ids(raw: Option<&Value>) -> Vec { - let Some(values) = raw.and_then(Value::as_array) else { - return Vec::new(); - }; +#[derive(Debug, Clone)] +struct DeepReviewEffectiveConcurrencyState { + configured_max_parallel_instances: usize, + learned_parallel_instances: usize, + user_override_parallel_instances: Option, + successful_observation_count: usize, + retry_after_until: Option, +} - let disallowed = disallowed_extra_subagent_ids(); - let mut seen = HashSet::new(); - let mut normalized = Vec::new(); +impl DeepReviewEffectiveConcurrencyState { + fn new(configured_max_parallel_instances: usize) -> Self { + let configured_max_parallel_instances = + Self::normalize_configured_max(configured_max_parallel_instances); + Self { + configured_max_parallel_instances, + learned_parallel_instances: configured_max_parallel_instances, + user_override_parallel_instances: None, + successful_observation_count: 0, + retry_after_until: None, + } + } - for value in values { - let Some(id) = value_to_id(value) else { - continue; - }; - if id.is_empty() || disallowed.contains(id.as_str()) || !seen.insert(id.clone()) { - continue; + fn normalize_configured_max(configured_max_parallel_instances: usize) -> usize { + configured_max_parallel_instances.max(1) + } + + fn rebase_configured_max(&mut self, configured_max_parallel_instances: usize) { + let configured_max_parallel_instances = + 
Self::normalize_configured_max(configured_max_parallel_instances); + if self.configured_max_parallel_instances == configured_max_parallel_instances { + return; } - normalized.push(id); + + self.configured_max_parallel_instances = configured_max_parallel_instances; + self.learned_parallel_instances = self + .learned_parallel_instances + .clamp(1, configured_max_parallel_instances); + self.user_override_parallel_instances = self + .user_override_parallel_instances + .map(|value| value.clamp(1, configured_max_parallel_instances)); } - normalized -} + fn effective_parallel_instances(&self, now: Instant) -> usize { + if let Some(user_override) = self.user_override_parallel_instances { + return user_override.clamp(1, self.configured_max_parallel_instances); + } -fn normalize_member_strategy_overrides( - raw: Option<&Value>, -) -> HashMap { - let Some(values) = raw.and_then(Value::as_object) else { - return HashMap::new(); - }; + if self + .retry_after_until + .is_some_and(|retry_after_until| retry_after_until > now) + { + return 1; + } - let mut normalized = HashMap::new(); - for (subagent_id, value) in values { - let id = subagent_id.trim(); - let Some(strategy_level) = DeepReviewStrategyLevel::from_value(Some(value)) else { - continue; - }; - if !id.is_empty() { - normalized.insert(id.to_string(), strategy_level); + self.learned_parallel_instances + .clamp(1, self.configured_max_parallel_instances) + } + + fn record_capacity_error( + &mut self, + reason: DeepReviewCapacityQueueReason, + retry_after: Option, + now: Instant, + ) { + self.successful_observation_count = 0; + self.learned_parallel_instances = self.learned_parallel_instances.saturating_sub(1).max(1); + + if matches!(reason, DeepReviewCapacityQueueReason::RetryAfter) || retry_after.is_some() { + self.retry_after_until = retry_after.map(|duration| now + duration); } } - normalized -} + fn record_success(&mut self, now: Instant) { + if self + .retry_after_until + .is_some_and(|retry_after_until| 
retry_after_until > now) + { + return; + } + if self + .retry_after_until + .is_some_and(|retry_after_until| retry_after_until <= now) + { + self.retry_after_until = None; + } -fn disallowed_extra_subagent_ids() -> HashSet<&'static str> { - CORE_REVIEWER_AGENT_TYPES - .into_iter() - .chain([ - REVIEW_JUDGE_AGENT_TYPE, - DEEP_REVIEW_AGENT_TYPE, - REVIEW_FIXER_AGENT_TYPE, - ]) - .collect() + if self.learned_parallel_instances >= self.configured_max_parallel_instances { + self.successful_observation_count = 0; + return; + } + + self.successful_observation_count = self.successful_observation_count.saturating_add(1); + if self.successful_observation_count >= EFFECTIVE_CONCURRENCY_RECOVERY_SUCCESS_WINDOW { + self.learned_parallel_instances = + (self.learned_parallel_instances + 1).min(self.configured_max_parallel_instances); + self.successful_observation_count = 0; + } + } + + fn set_user_override(&mut self, user_override_parallel_instances: Option) { + self.user_override_parallel_instances = user_override_parallel_instances + .map(|value| value.clamp(1, self.configured_max_parallel_instances)); + } + + fn snapshot(&self, now: Instant) -> DeepReviewEffectiveConcurrencySnapshot { + let retry_after_remaining_ms = + self.retry_after_until + .and_then(|retry_after_until| match retry_after_until > now { + true => Some( + u64::try_from(retry_after_until.duration_since(now).as_millis()) + .unwrap_or(u64::MAX), + ), + false => None, + }); + + DeepReviewEffectiveConcurrencySnapshot { + configured_max_parallel_instances: self.configured_max_parallel_instances, + learned_parallel_instances: self + .learned_parallel_instances + .clamp(1, self.configured_max_parallel_instances), + effective_parallel_instances: self.effective_parallel_instances(now), + user_override_parallel_instances: self.user_override_parallel_instances, + retry_after_remaining_ms, + } + } } -fn value_to_id(value: &Value) -> Option { - match value { - Value::String(s) => Some(s.trim().to_string()), - _ => None, 
impl Default for DeepReviewConcurrencyPolicy {
    /// Defaults: configured parallel cap, no stagger, default queue wait,
    /// and extras batched separately from core reviewers.
    fn default() -> Self {
        Self {
            max_parallel_instances: DEFAULT_MAX_PARALLEL_INSTANCES,
            stagger_seconds: 0,
            max_queue_wait_seconds: DEFAULT_MAX_QUEUE_WAIT_SECONDS,
            batch_extras_separately: true,
        }
    }
}

impl DeepReviewConcurrencyPolicy {
    /// Parses a `concurrencyPolicy` manifest object, clamping each field to
    /// a sane range and falling back to defaults for anything missing or
    /// non-object input.
    pub fn from_manifest(raw: &Value) -> Self {
        let Some(obj) = raw.as_object() else {
            return Self::default();
        };

        let max_parallel_instances = clamp_usize(
            obj.get("maxParallelInstances"),
            1,
            16,
            DEFAULT_MAX_PARALLEL_INSTANCES,
        );
        let stagger_seconds = clamp_u64(obj.get("staggerSeconds"), 0, 60, 0);
        let max_queue_wait_seconds = clamp_u64(
            obj.get("maxQueueWaitSeconds"),
            0,
            MAX_QUEUE_WAIT_SECONDS,
            DEFAULT_MAX_QUEUE_WAIT_SECONDS,
        );
        let batch_extras_separately = obj
            .get("batchExtrasSeparately")
            .and_then(Value::as_bool)
            .unwrap_or(true);

        Self {
            max_parallel_instances,
            stagger_seconds,
            max_queue_wait_seconds,
            batch_extras_separately,
        }
    }
+ pub fn effective_max_same_role_instances(&self, policy: &DeepReviewExecutionPolicy) -> usize { + let role_count = reviewer_agent_type_count() + policy.extra_subagent_ids.len(); + let max_per_role = self.max_parallel_instances / role_count.max(1); + max_per_role.max(1).min(policy.max_same_role_instances) + } - let min_i64 = i64::try_from(min).unwrap_or(i64::MAX); - let max_i64 = i64::try_from(max).unwrap_or(i64::MAX); - value.clamp(min_i64, max_i64) as usize + /// Check whether the current number of active launches exceeds the cap. + /// Returns `Ok(())` if the launch is allowed, or an error describing why not. + pub fn check_launch_allowed( + &self, + active_count: usize, + role: DeepReviewSubagentRole, + is_judge_pending: bool, + ) -> Result<(), DeepReviewPolicyViolation> { + match role { + DeepReviewSubagentRole::Reviewer => { + if active_count >= self.max_parallel_instances { + return Err(DeepReviewPolicyViolation::new( + "deep_review_concurrency_cap_reached", + format!( + "Maximum parallel reviewer instances reached ({}/{}). Wait for running reviewers to complete before launching more.", + active_count, self.max_parallel_instances + ), + )); + } + } + DeepReviewSubagentRole::Judge => { + if active_count > 0 { + return Err(DeepReviewPolicyViolation::new( + "deep_review_judge_launch_blocked_by_reviewers", + format!( + "ReviewJudge cannot launch while {} reviewer(s) are still active. 
Wait for reviewers to complete first.", + active_count + ), + )); + } + if is_judge_pending { + return Err(DeepReviewPolicyViolation::new( + "deep_review_judge_already_pending", + "ReviewJudge is already pending or running in this turn.", + )); + } + } + } + Ok(()) + } } -fn number_as_i64(value: &Value) -> Option { - value.as_i64().or_else(|| { - value - .as_u64() - .map(|value| i64::try_from(value).unwrap_or(i64::MAX)) - }) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum DeepReviewCapacityQueueReason { + ProviderRateLimit, + ProviderConcurrencyLimit, + RetryAfter, + LocalConcurrencyCap, + TemporaryOverload, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DeepReviewCapacityQueueDecision { + pub queueable: bool, + pub reason: Option, + pub retry_after_seconds: Option, +} + +impl DeepReviewCapacityQueueDecision { + fn queueable(reason: DeepReviewCapacityQueueReason, retry_after_seconds: Option) -> Self { + Self { + queueable: true, + reason: Some(reason), + retry_after_seconds, + } + } + + fn fail_fast() -> Self { + Self { + queueable: false, + reason: None, + retry_after_seconds: None, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum DeepReviewReviewerQueueStatus { + QueuedForCapacity, + PausedByUser, + Running, + CapacitySkipped, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DeepReviewReviewerQueueState { + pub status: DeepReviewReviewerQueueStatus, + pub reason: Option, + pub queue_elapsed_ms: u64, + pub run_elapsed_ms: u64, +} + +impl DeepReviewReviewerQueueState { + pub fn queued_for_capacity( + reason: DeepReviewCapacityQueueReason, + queue_elapsed_ms: u64, + ) -> Self { + Self { + status: DeepReviewReviewerQueueStatus::QueuedForCapacity, + reason: Some(reason), + queue_elapsed_ms, + run_elapsed_ms: 0, + } + } + + pub fn 
paused_by_user(queue_elapsed_ms: u64) -> Self { + Self { + status: DeepReviewReviewerQueueStatus::PausedByUser, + reason: None, + queue_elapsed_ms, + run_elapsed_ms: 0, + } + } + + pub fn running(queue_elapsed_ms: u64, run_elapsed_ms: u64) -> Self { + Self { + status: DeepReviewReviewerQueueStatus::Running, + reason: None, + queue_elapsed_ms, + run_elapsed_ms, + } + } + + pub fn capacity_skipped(reason: DeepReviewCapacityQueueReason, queue_elapsed_ms: u64) -> Self { + Self { + status: DeepReviewReviewerQueueStatus::CapacitySkipped, + reason: Some(reason), + queue_elapsed_ms, + run_elapsed_ms: 0, + } + } + + pub fn timeout_elapsed_ms(&self) -> u64 { + self.run_elapsed_ms + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum DeepReviewQueueControlAction { + Pause, + Continue, + Cancel, + SkipOptional, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DeepReviewQueueControlSnapshot { + pub paused: bool, + pub cancelled: bool, + pub skip_optional: bool, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct DeepReviewQueueControlKey { + parent_dialog_turn_id: String, + tool_id: String, +} + +impl DeepReviewQueueControlKey { + fn new(parent_dialog_turn_id: &str, tool_id: &str) -> Option { + let parent_dialog_turn_id = parent_dialog_turn_id.trim(); + let tool_id = tool_id.trim(); + if parent_dialog_turn_id.is_empty() || tool_id.is_empty() { + return None; + } + + Some(Self { + parent_dialog_turn_id: parent_dialog_turn_id.to_string(), + tool_id: tool_id.to_string(), + }) + } +} + +#[derive(Default)] +struct DeepReviewQueueControlTracker { + paused_tools: DashMap, + cancelled_tools: DashMap, + skip_optional_turns: DashMap, +} + +impl DeepReviewQueueControlTracker { + fn apply( + &self, + parent_dialog_turn_id: &str, + tool_id: &str, + action: DeepReviewQueueControlAction, + ) -> DeepReviewQueueControlSnapshot { + let now = Instant::now(); + let Some(key) = 
DeepReviewQueueControlKey::new(parent_dialog_turn_id, tool_id) else { + return DeepReviewQueueControlSnapshot { + paused: false, + cancelled: false, + skip_optional: false, + }; + }; + + match action { + DeepReviewQueueControlAction::Pause => { + self.paused_tools.insert(key.clone(), now); + } + DeepReviewQueueControlAction::Continue => { + self.paused_tools.remove(&key); + } + DeepReviewQueueControlAction::Cancel => { + self.cancelled_tools.insert(key.clone(), now); + self.paused_tools.remove(&key); + } + DeepReviewQueueControlAction::SkipOptional => { + self.skip_optional_turns + .insert(key.parent_dialog_turn_id.clone(), now); + } + } + + self.snapshot(parent_dialog_turn_id, tool_id) + } + + fn snapshot( + &self, + parent_dialog_turn_id: &str, + tool_id: &str, + ) -> DeepReviewQueueControlSnapshot { + let Some(key) = DeepReviewQueueControlKey::new(parent_dialog_turn_id, tool_id) else { + return DeepReviewQueueControlSnapshot { + paused: false, + cancelled: false, + skip_optional: false, + }; + }; + let skip_optional = self + .skip_optional_turns + .contains_key(&key.parent_dialog_turn_id); + + DeepReviewQueueControlSnapshot { + paused: self.paused_tools.contains_key(&key), + cancelled: self.cancelled_tools.contains_key(&key), + skip_optional, + } + } + + fn clear_tool(&self, parent_dialog_turn_id: &str, tool_id: &str) { + if let Some(key) = DeepReviewQueueControlKey::new(parent_dialog_turn_id, tool_id) { + self.paused_tools.remove(&key); + self.cancelled_tools.remove(&key); + } + } +} + +pub fn classify_deep_review_capacity_error( + code: &str, + message: &str, + retry_after_seconds: Option, +) -> DeepReviewCapacityQueueDecision { + let code = code.trim().to_ascii_lowercase(); + let message = message.trim().to_ascii_lowercase(); + let combined = format!("{code} {message}"); + + if contains_any( + &combined, + &[ + "auth", + "api key", + "unauthorized", + "permission", + "quota", + "billing", + "exhausted", + "invalid_model", + "invalid model", + "model does not 
exist", + "user_cancel", + "cancelled", + "canceled", + "invalid_tooling", + "subagent_not_allowed", + "not allowed", + "policy", + "validation", + ], + ) { + return DeepReviewCapacityQueueDecision::fail_fast(); + } + + if code == "deep_review_concurrency_cap_reached" { + return DeepReviewCapacityQueueDecision::queueable( + DeepReviewCapacityQueueReason::LocalConcurrencyCap, + retry_after_seconds, + ); + } + + if retry_after_seconds.is_some() { + return DeepReviewCapacityQueueDecision::queueable( + DeepReviewCapacityQueueReason::RetryAfter, + retry_after_seconds, + ); + } + + if contains_any(&combined, &["rate limit", "rate_limit", "429"]) { + return DeepReviewCapacityQueueDecision::queueable( + DeepReviewCapacityQueueReason::ProviderRateLimit, + retry_after_seconds, + ); + } + + if contains_any( + &combined, + &[ + "too many concurrent", + "concurrency limit", + "parallel request", + "concurrent requests", + "max concurrent", + ], + ) { + return DeepReviewCapacityQueueDecision::queueable( + DeepReviewCapacityQueueReason::ProviderConcurrencyLimit, + retry_after_seconds, + ); + } + + if contains_any( + &combined, + &[ + "temporarily overloaded", + "temporary overload", + "overloaded", + "capacity", + "try again later", + "retry later", + ], + ) { + return DeepReviewCapacityQueueDecision::queueable( + DeepReviewCapacityQueueReason::TemporaryOverload, + retry_after_seconds, + ); + } + + DeepReviewCapacityQueueDecision::fail_fast() +} + +fn contains_any(value: &str, needles: &[&str]) -> bool { + needles.iter().any(|needle| value.contains(needle)) +} + +#[derive(Debug)] +struct DeepReviewTurnBudget { + judge_calls: usize, + /// Tracks total reviewer calls (across all roles) per turn. + /// Capped by `max_same_role_instances * reviewer_agent_type_count() + + /// extra_subagent_ids.len()` so the orchestrator cannot spawn an unbounded + /// number of same-role instances. 
+ reviewer_calls: usize, + reviewer_calls_by_subagent: HashMap, + retries_used_by_subagent: HashMap, + active_reviewers: usize, + concurrency_cap_rejections: usize, + capacity_skips: usize, + shared_context_uses: HashMap, + effective_concurrency: Option, + updated_at: Instant, +} + +impl DeepReviewTurnBudget { + fn new(now: Instant) -> Self { + Self { + judge_calls: 0, + reviewer_calls: 0, + reviewer_calls_by_subagent: HashMap::new(), + retries_used_by_subagent: HashMap::new(), + active_reviewers: 0, + concurrency_cap_rejections: 0, + capacity_skips: 0, + shared_context_uses: HashMap::new(), + effective_concurrency: None, + updated_at: now, + } + } + + fn effective_concurrency_mut( + &mut self, + configured_max_parallel_instances: usize, + ) -> &mut DeepReviewEffectiveConcurrencyState { + let state = self.effective_concurrency.get_or_insert_with(|| { + DeepReviewEffectiveConcurrencyState::new(configured_max_parallel_instances) + }); + state.rebase_configured_max(configured_max_parallel_instances); + state + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DeepReviewSharedContextDuplicate { + pub tool_name: String, + pub file_path: String, + pub call_count: usize, + pub reviewer_count: usize, +} + +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DeepReviewSharedContextMeasurementSnapshot { + pub total_calls: usize, + pub duplicate_calls: usize, + pub duplicate_context_count: usize, + pub repeated_contexts: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct DeepReviewSharedContextKey { + tool_name: String, + file_path: String, +} + +#[derive(Debug, Clone, Default)] +struct DeepReviewSharedContextUseRecord { + call_count: usize, + reviewer_types: HashSet, +} + +pub struct DeepReviewActiveReviewerGuard<'a> { + tracker: &'a DeepReviewBudgetTracker, + parent_dialog_turn_id: String, + released: bool, +} + +impl Drop for 
DeepReviewActiveReviewerGuard<'_> { + fn drop(&mut self) { + if !self.released { + self.tracker + .finish_active_reviewer(&self.parent_dialog_turn_id); + self.released = true; + } + } +} + +pub struct DeepReviewBudgetTracker { + turns: DashMap, + last_pruned_at: std::sync::Mutex, +} + +impl Default for DeepReviewBudgetTracker { + fn default() -> Self { + Self { + turns: DashMap::new(), + last_pruned_at: std::sync::Mutex::new(Instant::now()), + } + } +} + +impl DeepReviewBudgetTracker { + pub fn record_shared_context_tool_use( + &self, + parent_dialog_turn_id: &str, + subagent_type: &str, + tool_name: &str, + file_path: &str, + ) -> DeepReviewSharedContextMeasurementSnapshot { + if parent_dialog_turn_id.trim().is_empty() { + return DeepReviewSharedContextMeasurementSnapshot::default(); + } + let Some(tool_name) = normalize_shared_context_tool_name(tool_name) else { + return self.shared_context_measurement_snapshot(parent_dialog_turn_id); + }; + let Some(file_path) = normalize_shared_context_file_path(file_path) else { + return self.shared_context_measurement_snapshot(parent_dialog_turn_id); + }; + + let now = Instant::now(); + if let Ok(last_pruned) = self.last_pruned_at.lock() { + if now.saturating_duration_since(*last_pruned) >= PRUNE_INTERVAL { + drop(last_pruned); + self.prune_stale(now); + } + } + + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + let record = budget + .shared_context_uses + .entry(DeepReviewSharedContextKey { + tool_name: tool_name.to_string(), + file_path, + }) + .or_default(); + record.call_count = record.call_count.saturating_add(1); + if !subagent_type.trim().is_empty() { + record + .reviewer_types + .insert(subagent_type.trim().to_string()); + } + budget.updated_at = now; + + shared_context_measurement_snapshot_from_uses(&budget.shared_context_uses) + } + + pub fn shared_context_measurement_snapshot( + &self, + parent_dialog_turn_id: &str, + ) -> 
DeepReviewSharedContextMeasurementSnapshot { + self.turns + .get(parent_dialog_turn_id) + .map(|budget| { + shared_context_measurement_snapshot_from_uses(&budget.shared_context_uses) + }) + .unwrap_or_default() + } + + pub fn record_task( + &self, + parent_dialog_turn_id: &str, + policy: &DeepReviewExecutionPolicy, + role: DeepReviewSubagentRole, + subagent_type: &str, + is_retry: bool, + ) -> Result<(), DeepReviewPolicyViolation> { + let now = Instant::now(); + if let Ok(last_pruned) = self.last_pruned_at.lock() { + if now.saturating_duration_since(*last_pruned) >= PRUNE_INTERVAL { + drop(last_pruned); + self.prune_stale(now); + } + } + + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + + match role { + DeepReviewSubagentRole::Reviewer => { + let subagent_type = normalize_budget_subagent_type(subagent_type)?; + if is_retry { + if policy.max_retries_per_role == 0 { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_budget_exhausted", + format!( + "Retry budget is disabled for DeepReview reviewer '{}'", + subagent_type + ), + )); + } + if !budget + .reviewer_calls_by_subagent + .contains_key(subagent_type.as_str()) + { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_without_initial_attempt", + format!( + "Cannot retry DeepReview reviewer '{}' before an initial attempt in this turn", + subagent_type + ), + )); + } + let retry_count = budget + .retries_used_by_subagent + .entry(subagent_type.clone()) + .or_insert(0); + if *retry_count >= policy.max_retries_per_role { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_budget_exhausted", + format!( + "Retry budget exhausted for DeepReview reviewer '{}' (max retries: {})", + subagent_type, policy.max_retries_per_role + ), + )); + } + *retry_count += 1; + budget.updated_at = now; + return Ok(()); + } + + let max_reviewer_calls = policy.max_same_role_instances + * (reviewer_agent_type_count() + 
policy.extra_subagent_ids.len()); + if budget.reviewer_calls >= max_reviewer_calls { + return Err(DeepReviewPolicyViolation::new( + "deep_review_reviewer_budget_exhausted", + format!( + "Reviewer launch budget exhausted for this DeepReview turn (max calls: {})", + max_reviewer_calls + ), + )); + } + budget.reviewer_calls += 1; + *budget + .reviewer_calls_by_subagent + .entry(subagent_type) + .or_insert(0) += 1; + } + DeepReviewSubagentRole::Judge => { + if is_retry { + return Err(DeepReviewPolicyViolation::new( + "deep_review_judge_retry_disallowed", + "ReviewJudge retry is not covered by the reviewer retry budget", + )); + } + let max_judge_calls = 1; + if budget.judge_calls >= max_judge_calls { + return Err(DeepReviewPolicyViolation::new( + "deep_review_judge_budget_exhausted", + format!( + "ReviewJudge launch budget exhausted for this DeepReview turn (max calls: {})", + max_judge_calls + ), + )); + } + + budget.judge_calls += 1; + } + } + + budget.updated_at = now; + Ok(()) + } + + pub fn record_concurrency_cap_rejection(&self, parent_dialog_turn_id: &str) { + if parent_dialog_turn_id.trim().is_empty() { + return; + } + + let now = Instant::now(); + if let Ok(last_pruned) = self.last_pruned_at.lock() { + if now.saturating_duration_since(*last_pruned) >= PRUNE_INTERVAL { + drop(last_pruned); + self.prune_stale(now); + } + } + + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + budget.concurrency_cap_rejections += 1; + budget.updated_at = now; + } + + pub fn record_capacity_skip(&self, parent_dialog_turn_id: &str) { + if parent_dialog_turn_id.trim().is_empty() { + return; + } + + let now = Instant::now(); + if let Ok(last_pruned) = self.last_pruned_at.lock() { + if now.saturating_duration_since(*last_pruned) >= PRUNE_INTERVAL { + drop(last_pruned); + self.prune_stale(now); + } + } + + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| 
DeepReviewTurnBudget::new(now)); + budget.capacity_skips += 1; + budget.updated_at = now; + } + + pub fn begin_active_reviewer<'a>( + &'a self, + parent_dialog_turn_id: &str, + ) -> DeepReviewActiveReviewerGuard<'a> { + let now = Instant::now(); + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + budget.active_reviewers = budget.active_reviewers.saturating_add(1); + budget.updated_at = now; + + DeepReviewActiveReviewerGuard { + tracker: self, + parent_dialog_turn_id: parent_dialog_turn_id.to_string(), + released: false, + } + } + + pub fn try_begin_active_reviewer<'a>( + &'a self, + parent_dialog_turn_id: &str, + max_active_reviewers: usize, + ) -> Option> { + let now = Instant::now(); + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + if budget.active_reviewers >= max_active_reviewers { + return None; + } + + budget.active_reviewers = budget.active_reviewers.saturating_add(1); + budget.updated_at = now; + Some(DeepReviewActiveReviewerGuard { + tracker: self, + parent_dialog_turn_id: parent_dialog_turn_id.to_string(), + released: false, + }) + } + + fn finish_active_reviewer(&self, parent_dialog_turn_id: &str) { + if let Some(mut budget) = self.turns.get_mut(parent_dialog_turn_id) { + budget.active_reviewers = budget.active_reviewers.saturating_sub(1); + budget.updated_at = Instant::now(); + } + } + + fn prune_stale(&self, now: Instant) { + self.turns + .retain(|_, budget| now.saturating_duration_since(budget.updated_at) <= BUDGET_TTL); + if let Ok(mut last_pruned) = self.last_pruned_at.lock() { + *last_pruned = now; + } + } + + /// Explicitly clean up all budget tracking data. + /// Call this when the application is shutting down or when the review session ends. 
+ pub fn cleanup(&self) { + self.turns.clear(); + if let Ok(mut last_pruned) = self.last_pruned_at.lock() { + *last_pruned = Instant::now(); + } + } + + /// Returns the number of reviewer calls recorded for a given turn. + /// Used by the concurrency enforcement to check if a new launch is allowed. + pub fn active_reviewer_count(&self, parent_dialog_turn_id: &str) -> usize { + self.turns + .get(parent_dialog_turn_id) + .map(|budget| budget.active_reviewers) + .unwrap_or(0) + } + + /// Returns true if a judge call has been recorded for a given turn. + pub fn has_judge_been_launched(&self, parent_dialog_turn_id: &str) -> bool { + self.turns + .get(parent_dialog_turn_id) + .map(|budget| budget.judge_calls > 0) + .unwrap_or(false) + } + + pub fn concurrency_cap_rejection_count(&self, parent_dialog_turn_id: &str) -> usize { + self.turns + .get(parent_dialog_turn_id) + .map(|budget| budget.concurrency_cap_rejections) + .unwrap_or(0) + } + + pub fn capacity_skip_count(&self, parent_dialog_turn_id: &str) -> usize { + self.turns + .get(parent_dialog_turn_id) + .map(|budget| budget.capacity_skips) + .unwrap_or(0) + } + + pub fn effective_concurrency_snapshot( + &self, + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, + ) -> DeepReviewEffectiveConcurrencySnapshot { + if parent_dialog_turn_id.trim().is_empty() { + return DeepReviewEffectiveConcurrencyState::new(configured_max_parallel_instances) + .snapshot(Instant::now()); + } + + let now = Instant::now(); + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + budget.updated_at = now; + budget + .effective_concurrency_mut(configured_max_parallel_instances) + .snapshot(now) + } + + pub fn effective_parallel_instances( + &self, + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, + ) -> usize { + self.effective_concurrency_snapshot( + parent_dialog_turn_id, + configured_max_parallel_instances, + ) + 
.effective_parallel_instances + } + + pub fn record_effective_concurrency_capacity_error( + &self, + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, + reason: DeepReviewCapacityQueueReason, + retry_after: Option, + ) -> DeepReviewEffectiveConcurrencySnapshot { + if parent_dialog_turn_id.trim().is_empty() { + return DeepReviewEffectiveConcurrencyState::new(configured_max_parallel_instances) + .snapshot(Instant::now()); + } + + let now = Instant::now(); + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + budget.updated_at = now; + let state = budget.effective_concurrency_mut(configured_max_parallel_instances); + state.record_capacity_error(reason, retry_after, now); + state.snapshot(now) + } + + pub fn record_effective_concurrency_success( + &self, + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, + ) -> DeepReviewEffectiveConcurrencySnapshot { + if parent_dialog_turn_id.trim().is_empty() { + return DeepReviewEffectiveConcurrencyState::new(configured_max_parallel_instances) + .snapshot(Instant::now()); + } + + let now = Instant::now(); + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + budget.updated_at = now; + let state = budget.effective_concurrency_mut(configured_max_parallel_instances); + state.record_success(now); + state.snapshot(now) + } + + pub fn set_effective_concurrency_user_override( + &self, + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, + user_override_parallel_instances: Option, + ) -> DeepReviewEffectiveConcurrencySnapshot { + if parent_dialog_turn_id.trim().is_empty() { + return DeepReviewEffectiveConcurrencyState::new(configured_max_parallel_instances) + .snapshot(Instant::now()); + } + + let now = Instant::now(); + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| 
DeepReviewTurnBudget::new(now)); + budget.updated_at = now; + let state = budget.effective_concurrency_mut(configured_max_parallel_instances); + state.set_user_override(user_override_parallel_instances); + state.snapshot(now) + } +} + +fn normalize_shared_context_tool_name(tool_name: &str) -> Option<&'static str> { + let tool_name = tool_name.trim(); + if tool_name.eq_ignore_ascii_case("Read") { + Some("Read") + } else if tool_name.eq_ignore_ascii_case("GetFileDiff") { + Some("GetFileDiff") + } else { + None + } +} + +fn normalize_shared_context_file_path(file_path: &str) -> Option { + let mut file_path = file_path.trim().replace('\\', "/"); + while file_path.starts_with("./") { + file_path = file_path[2..].to_string(); + } + (!file_path.is_empty()).then_some(file_path) +} + +fn shared_context_measurement_snapshot_from_uses( + uses: &HashMap, +) -> DeepReviewSharedContextMeasurementSnapshot { + let total_calls = uses.values().map(|record| record.call_count).sum(); + let duplicate_calls = uses + .values() + .map(|record| record.call_count.saturating_sub(1)) + .sum(); + let mut repeated_contexts: Vec = uses + .iter() + .filter_map(|(key, record)| { + (record.call_count > 1).then(|| DeepReviewSharedContextDuplicate { + tool_name: key.tool_name.clone(), + file_path: key.file_path.clone(), + call_count: record.call_count, + reviewer_count: record.reviewer_types.len(), + }) + }) + .collect(); + repeated_contexts.sort_by(|left, right| { + right + .call_count + .cmp(&left.call_count) + .then_with(|| right.reviewer_count.cmp(&left.reviewer_count)) + .then_with(|| left.tool_name.cmp(&right.tool_name)) + .then_with(|| left.file_path.cmp(&right.file_path)) + }); + let duplicate_context_count = repeated_contexts.len(); + + DeepReviewSharedContextMeasurementSnapshot { + total_calls, + duplicate_calls, + duplicate_context_count, + repeated_contexts, + } +} + +static GLOBAL_DEEP_REVIEW_BUDGET_TRACKER: LazyLock = + LazyLock::new(DeepReviewBudgetTracker::default); +static 
GLOBAL_DEEP_REVIEW_QUEUE_CONTROL_TRACKER: LazyLock = + LazyLock::new(DeepReviewQueueControlTracker::default); + +pub async fn load_default_deep_review_policy() -> BitFunResult { + let config_service = GlobalConfigManager::get_service().await.map_err(|error| { + BitFunError::config(format!( + "Failed to load DeepReview execution policy because config service is unavailable: {}", + error + )) + })?; + + let raw_config = match config_service + .get_config::(Some(DEFAULT_REVIEW_TEAM_CONFIG_PATH)) + .await + { + Ok(config) => Some(config), + Err(error) if is_missing_default_review_team_config_error(&error) => { + warn!( + "DeepReview policy config missing at {}, using defaults", + DEFAULT_REVIEW_TEAM_CONFIG_PATH + ); + None + } + Err(error) => { + return Err(BitFunError::config(format!( + "Failed to load DeepReview execution policy from {}: {}", + DEFAULT_REVIEW_TEAM_CONFIG_PATH, error + ))); + } + }; + + Ok(DeepReviewExecutionPolicy::from_config_value( + raw_config.as_ref(), + )) +} + +pub fn is_missing_default_review_team_config_error(error: &BitFunError) -> bool { + matches!(error, BitFunError::NotFound(message) + if message == &format!("Config path '{}' not found", DEFAULT_REVIEW_TEAM_CONFIG_PATH)) +} + +pub fn record_deep_review_task_budget( + parent_dialog_turn_id: &str, + policy: &DeepReviewExecutionPolicy, + role: DeepReviewSubagentRole, + subagent_type: &str, + is_retry: bool, +) -> Result<(), DeepReviewPolicyViolation> { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_task( + parent_dialog_turn_id, + policy, + role, + subagent_type, + is_retry, + ) +} + +pub fn record_deep_review_concurrency_cap_rejection(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_concurrency_cap_rejection(parent_dialog_turn_id) +} + +pub fn record_deep_review_capacity_skip(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_capacity_skip(parent_dialog_turn_id) +} + +pub fn record_deep_review_shared_context_tool_use( + parent_dialog_turn_id: 
&str, + subagent_type: &str, + tool_name: &str, + file_path: &str, +) -> DeepReviewSharedContextMeasurementSnapshot { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_shared_context_tool_use( + parent_dialog_turn_id, + subagent_type, + tool_name, + file_path, + ) +} + +pub fn deep_review_shared_context_measurement_snapshot( + parent_dialog_turn_id: &str, +) -> DeepReviewSharedContextMeasurementSnapshot { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.shared_context_measurement_snapshot(parent_dialog_turn_id) +} + +pub fn try_begin_deep_review_active_reviewer( + parent_dialog_turn_id: &str, + max_active_reviewers: usize, +) -> Option> { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER + .try_begin_active_reviewer(parent_dialog_turn_id, max_active_reviewers) +} + +pub fn deep_review_effective_concurrency_snapshot( + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, +) -> DeepReviewEffectiveConcurrencySnapshot { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER + .effective_concurrency_snapshot(parent_dialog_turn_id, configured_max_parallel_instances) +} + +pub fn deep_review_effective_parallel_instances( + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, +) -> usize { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER + .effective_parallel_instances(parent_dialog_turn_id, configured_max_parallel_instances) +} + +pub fn record_deep_review_effective_concurrency_capacity_error( + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, + reason: DeepReviewCapacityQueueReason, + retry_after: Option, +) -> DeepReviewEffectiveConcurrencySnapshot { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_effective_concurrency_capacity_error( + parent_dialog_turn_id, + configured_max_parallel_instances, + reason, + retry_after, + ) +} + +pub fn record_deep_review_effective_concurrency_success( + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, +) -> DeepReviewEffectiveConcurrencySnapshot { + 
GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_effective_concurrency_success( + parent_dialog_turn_id, + configured_max_parallel_instances, + ) +} + +pub fn set_deep_review_effective_concurrency_user_override( + parent_dialog_turn_id: &str, + configured_max_parallel_instances: usize, + user_override_parallel_instances: Option, +) -> DeepReviewEffectiveConcurrencySnapshot { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.set_effective_concurrency_user_override( + parent_dialog_turn_id, + configured_max_parallel_instances, + user_override_parallel_instances, + ) +} + +/// Returns the number of active reviewer calls for a given turn. +pub fn deep_review_active_reviewer_count(parent_dialog_turn_id: &str) -> usize { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.active_reviewer_count(parent_dialog_turn_id) +} + +/// Returns true if a judge has been launched for a given turn. +pub fn deep_review_has_judge_been_launched(parent_dialog_turn_id: &str) -> bool { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.has_judge_been_launched(parent_dialog_turn_id) +} + +pub fn deep_review_concurrency_cap_rejection_count(parent_dialog_turn_id: &str) -> usize { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.concurrency_cap_rejection_count(parent_dialog_turn_id) +} + +pub fn deep_review_capacity_skip_count(parent_dialog_turn_id: &str) -> usize { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.capacity_skip_count(parent_dialog_turn_id) +} + +pub fn apply_deep_review_queue_control( + parent_dialog_turn_id: &str, + tool_id: &str, + action: DeepReviewQueueControlAction, +) -> DeepReviewQueueControlSnapshot { + GLOBAL_DEEP_REVIEW_QUEUE_CONTROL_TRACKER.apply(parent_dialog_turn_id, tool_id, action) +} + +pub fn deep_review_queue_control_snapshot( + parent_dialog_turn_id: &str, + tool_id: &str, +) -> DeepReviewQueueControlSnapshot { + GLOBAL_DEEP_REVIEW_QUEUE_CONTROL_TRACKER.snapshot(parent_dialog_turn_id, tool_id) +} + +pub fn clear_deep_review_queue_control_for_tool(parent_dialog_turn_id: &str, tool_id: &str) { + 
GLOBAL_DEEP_REVIEW_QUEUE_CONTROL_TRACKER.clear_tool(parent_dialog_turn_id, tool_id) +} + +/// Returns the number of retries used for a specific subagent type in a given turn. +pub fn deep_review_retries_used(parent_dialog_turn_id: &str, subagent_type: &str) -> usize { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER + .turns + .get(parent_dialog_turn_id) + .map(|budget| { + budget + .retries_used_by_subagent + .get(subagent_type) + .copied() + .unwrap_or(0) + }) + .unwrap_or(0) +} + +/// Returns the fallback max retries per role when an effective run policy is unavailable. +pub fn deep_review_max_retries_per_role(_parent_dialog_turn_id: &str) -> usize { + DEFAULT_MAX_RETRIES_PER_ROLE +} + +fn collect_manifest_members(raw: Option<&Value>, output: &mut HashSet) { + let Some(values) = raw.and_then(Value::as_array) else { + return; + }; + + for member in values { + if let Some(id) = manifest_member_subagent_id(member) { + output.insert(id); + } + } +} + +fn manifest_member_subagent_id(value: &Value) -> Option { + let id = value + .get("subagentId") + .or_else(|| value.get("subagent_id")) + .and_then(Value::as_str)? 
+ .trim(); + (!id.is_empty()).then(|| id.to_string()) +} + +fn normalize_extra_subagent_ids(raw: Option<&Value>) -> Vec { + let Some(values) = raw.and_then(Value::as_array) else { + return Vec::new(); + }; + + let disallowed = disallowed_extra_subagent_ids(); + let mut seen = HashSet::new(); + let mut normalized = Vec::new(); + + for value in values { + let Some(id) = value_to_id(value) else { + continue; + }; + if id.is_empty() || disallowed.contains(id.as_str()) || !seen.insert(id.clone()) { + continue; + } + normalized.push(id); + } + + normalized +} + +fn normalize_member_strategy_overrides( + raw: Option<&Value>, +) -> HashMap { + let Some(values) = raw.and_then(Value::as_object) else { + return HashMap::new(); + }; + + let mut normalized = HashMap::new(); + for (subagent_id, value) in values { + let id = subagent_id.trim(); + let Some(strategy_level) = DeepReviewStrategyLevel::from_value(Some(value)) else { + continue; + }; + if !id.is_empty() { + normalized.insert(id.to_string(), strategy_level); + } + } + + normalized +} + +fn disallowed_extra_subagent_ids() -> HashSet<&'static str> { + CORE_REVIEWER_AGENT_TYPES + .into_iter() + .chain(CONDITIONAL_REVIEWER_AGENT_TYPES) + .chain([ + REVIEW_JUDGE_AGENT_TYPE, + DEEP_REVIEW_AGENT_TYPE, + REVIEW_FIXER_AGENT_TYPE, + ]) + .collect() +} + +fn reviewer_agent_type_count() -> usize { + CORE_REVIEWER_AGENT_TYPES.len() + CONDITIONAL_REVIEWER_AGENT_TYPES.len() +} + +fn normalize_budget_subagent_type( + subagent_type: &str, +) -> Result { + let normalized = subagent_type.trim(); + if normalized.is_empty() { + return Err(DeepReviewPolicyViolation::new( + "deep_review_subagent_type_missing", + "DeepReview task budget requires a non-empty subagent type", + )); + } + + Ok(normalized.to_string()) +} + +fn value_to_id(value: &Value) -> Option { + match value { + Value::String(s) => Some(s.trim().to_string()), + _ => None, + } +} + +fn clamp_u64(raw: Option<&Value>, min: u64, max: u64, fallback: u64) -> u64 { + let Some(value) = 
raw.and_then(number_as_i64) else { + return fallback; + }; + + let min_i64 = i64::try_from(min).unwrap_or(i64::MAX); + let max_i64 = i64::try_from(max).unwrap_or(i64::MAX); + value.clamp(min_i64, max_i64) as u64 +} + +fn clamp_usize(raw: Option<&Value>, min: usize, max: usize, fallback: usize) -> usize { + let Some(value) = raw.and_then(number_as_i64) else { + return fallback; + }; + + let min_i64 = i64::try_from(min).unwrap_or(i64::MAX); + let max_i64 = i64::try_from(max).unwrap_or(i64::MAX); + value.clamp(min_i64, max_i64) as usize +} + +fn number_as_i64(value: &Value) -> Option { + value.as_i64().or_else(|| { + value + .as_u64() + .map(|value| i64::try_from(value).unwrap_or(i64::MAX)) + }) +} + +/// Incremental review cache stores completed reviewer outputs keyed by packet_id. +/// When a deep review is re-run with the same target fingerprint, cached outputs +/// are reused instead of re-dispatching reviewers. +#[derive(Clone)] +pub struct DeepReviewIncrementalCache { + fingerprint: String, + packets: HashMap, +} + +impl DeepReviewIncrementalCache { + pub fn new(fingerprint: &str) -> Self { + Self { + fingerprint: fingerprint.to_string(), + packets: HashMap::new(), + } + } + + pub fn from_value(value: &Value) -> Self { + let obj = value.as_object(); + let fingerprint = obj + .and_then(|o| o.get("fingerprint")) + .and_then(Value::as_str) + .unwrap_or("") + .to_string(); + let packets = obj + .and_then(|o| o.get("packets")) + .and_then(Value::as_object) + .map(|map| { + map.iter() + .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string()))) + .collect() + }) + .unwrap_or_default(); + Self { + fingerprint, + packets, + } + } + + pub fn to_value(&self) -> Value { + json!({ + "fingerprint": self.fingerprint, + "packets": self.packets, + }) + } + + pub fn fingerprint(&self) -> &str { + &self.fingerprint + } + + pub fn store_packet(&mut self, packet_id: &str, output: &str) { + self.packets + .insert(packet_id.to_string(), output.to_string()); + } + + pub fn 
get_packet(&self, packet_id: &str) -> Option<&str> { + self.packets.get(packet_id).map(|s| s.as_str()) + } + + pub fn is_empty(&self) -> bool { + self.packets.is_empty() + } + + pub fn len(&self) -> usize { + self.packets.len() + } + + /// Check if the cached fingerprint matches the fingerprint in the run manifest. + /// Returns false if the manifest has no incrementalReviewCache section. + pub fn matches_manifest(&self, manifest: &Value) -> bool { + manifest + .get("incrementalReviewCache") + .and_then(|ic| ic.get("fingerprint")) + .and_then(Value::as_str) + .map(|fp| fp == self.fingerprint) + .unwrap_or(false) + } } #[cfg(test)] mod tests { use super::{ - is_missing_default_review_team_config_error, DeepReviewExecutionPolicy, - DeepReviewStrategyLevel, DeepReviewSubagentRole, REVIEW_FIXER_AGENT_TYPE, + is_missing_default_review_team_config_error, DeepReviewBudgetTracker, + DeepReviewExecutionPolicy, DeepReviewIncrementalCache, DeepReviewRunManifestGate, + DeepReviewStrategyLevel, DeepReviewSubagentRole, REVIEWER_ARCHITECTURE_AGENT_TYPE, + REVIEWER_PERFORMANCE_AGENT_TYPE, REVIEWER_SECURITY_AGENT_TYPE, REVIEW_FIXER_AGENT_TYPE, + REVIEW_JUDGE_AGENT_TYPE, }; use crate::util::errors::BitFunError; use serde_json::json; + use serde_json::Value; + use std::time::Duration; #[test] fn only_missing_default_review_team_path_can_fallback_to_defaults() { @@ -391,6 +2411,53 @@ mod tests { ); } + #[test] + fn frontend_reviewer_is_conditional_not_core() { + let policy = DeepReviewExecutionPolicy::default(); + + assert!(!super::CORE_REVIEWER_AGENT_TYPES.contains(&super::REVIEWER_FRONTEND_AGENT_TYPE)); + assert!( + super::CONDITIONAL_REVIEWER_AGENT_TYPES.contains(&super::REVIEWER_FRONTEND_AGENT_TYPE) + ); + assert_eq!( + policy + .classify_subagent(super::REVIEWER_FRONTEND_AGENT_TYPE) + .unwrap(), + DeepReviewSubagentRole::Reviewer + ); + } + + #[test] + fn default_review_team_definition_exposes_role_manifest() { + let definition = super::default_review_team_definition(); + let 
role_ids: Vec<&str> = definition + .core_roles + .iter() + .map(|role| role.subagent_id.as_str()) + .collect(); + + assert_eq!(definition.default_strategy_level, "normal"); + assert!(role_ids.contains(&super::REVIEWER_BUSINESS_LOGIC_AGENT_TYPE)); + assert!(role_ids.contains(&super::REVIEWER_ARCHITECTURE_AGENT_TYPE)); + assert!(role_ids.contains(&super::REVIEWER_FRONTEND_AGENT_TYPE)); + assert!(role_ids.contains(&super::REVIEW_JUDGE_AGENT_TYPE)); + assert!(definition.core_roles.iter().any(|role| { + role.subagent_id == super::REVIEWER_FRONTEND_AGENT_TYPE && role.conditional + })); + assert!(definition + .hidden_agent_ids + .contains(&super::REVIEWER_FRONTEND_AGENT_TYPE.to_string())); + assert!(definition + .disallowed_extra_subagent_ids + .contains(&super::REVIEWER_FRONTEND_AGENT_TYPE.to_string())); + assert!(definition + .strategy_profiles + .get("quick") + .expect("quick strategy") + .role_directives + .contains_key(super::REVIEWER_FRONTEND_AGENT_TYPE)); + } + #[test] fn parses_review_strategy_and_member_overrides_from_config() { let raw = json!({ @@ -436,11 +2503,82 @@ mod tests { } #[test] - fn classify_rejects_unknown_subagent() { - let policy = DeepReviewExecutionPolicy::default(); - let result = policy.classify_subagent("UnknownAgent"); - assert!(result.is_err()); - assert_eq!(result.unwrap_err().code, "deep_review_subagent_not_allowed"); + fn classify_rejects_unknown_subagent() { + let policy = DeepReviewExecutionPolicy::default(); + let result = policy.classify_subagent("UnknownAgent"); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().code, "deep_review_subagent_not_allowed"); + } + + #[test] + fn run_manifest_gate_allows_only_active_reviewers() { + let manifest = json!({ + "reviewMode": "deep", + "coreReviewers": [ + { "subagentId": "ReviewBusinessLogic" } + ], + "enabledExtraReviewers": [ + { "subagentId": "ExtraReviewer" } + ], + "qualityGateReviewer": { "subagentId": "ReviewJudge" }, + "skippedReviewers": [ + { "subagentId": 
"ReviewFrontend", "reason": "not_applicable" } + ] + }); + + let gate = DeepReviewRunManifestGate::from_value(&manifest) + .expect("valid run manifest should produce a gate"); + + gate.ensure_active("ReviewBusinessLogic").unwrap(); + gate.ensure_active("ExtraReviewer").unwrap(); + gate.ensure_active("ReviewJudge").unwrap(); + + let violation = gate.ensure_active("ReviewFrontend").unwrap_err(); + assert_eq!(violation.code, "deep_review_subagent_not_active_for_target"); + assert!(violation.message.contains("ReviewFrontend")); + assert!(violation.message.contains("not_applicable")); + } + + #[test] + fn run_manifest_gate_is_absent_without_review_team_shape() { + let manifest = json!({ + "reviewMode": "deep", + "skippedReviewers": [ + { "subagentId": "ReviewFrontend", "reason": "not_applicable" } + ] + }); + + assert!(DeepReviewRunManifestGate::from_value(&manifest).is_none()); + } + + #[test] + fn run_manifest_gate_accepts_work_packet_roster() { + let manifest = json!({ + "reviewMode": "deep", + "workPackets": [ + { + "packetId": "reviewer:ReviewBusinessLogic", + "subagentId": "ReviewBusinessLogic" + }, + { + "packet_id": "judge:ReviewJudge", + "subagent_id": "ReviewJudge" + } + ], + "skippedReviewers": [ + { "subagentId": "ReviewFrontend", "reason": "not_applicable" } + ] + }); + + let gate = DeepReviewRunManifestGate::from_value(&manifest) + .expect("work packet manifest should produce a gate"); + + gate.ensure_active("ReviewBusinessLogic").unwrap(); + gate.ensure_active("ReviewJudge").unwrap(); + + let violation = gate.ensure_active("ReviewFrontend").unwrap_err(); + assert_eq!(violation.code, "deep_review_subagent_not_active_for_target"); + assert!(violation.message.contains("not_applicable")); } #[test] @@ -485,6 +2623,43 @@ mod tests { .contains(&"DeepReview".to_string())); } + #[test] + fn budget_tracker_caps_judge_calls_per_turn() { + let policy = DeepReviewExecutionPolicy::default(); + let tracker = DeepReviewBudgetTracker::default(); + + // turn-1: one judge 
call allowed + tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Judge, + REVIEW_JUDGE_AGENT_TYPE, + false, + ) + .unwrap(); + assert!(tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Judge, + REVIEW_JUDGE_AGENT_TYPE, + false, + ) + .is_err()); + + // turn-2: fresh budget, should succeed + tracker + .record_task( + "turn-2", + &policy, + DeepReviewSubagentRole::Judge, + REVIEW_JUDGE_AGENT_TYPE, + false, + ) + .unwrap(); + } + #[test] fn effective_timeout_zero_cap_allows_any_requested() { let policy = DeepReviewExecutionPolicy::from_config_value(Some(&json!({ @@ -503,6 +2678,63 @@ mod tests { ); } + #[test] + fn predictive_timeout_scales_with_target_size_and_reviewer_count() { + let policy = DeepReviewExecutionPolicy::default(); + + assert_eq!( + policy.predictive_timeout( + DeepReviewSubagentRole::Reviewer, + DeepReviewStrategyLevel::Normal, + 25, + 0, + 5, + ), + 675 + ); + assert_eq!( + policy.predictive_timeout( + DeepReviewSubagentRole::Judge, + DeepReviewStrategyLevel::Normal, + 25, + 0, + 5, + ), + 1350 + ); + } + + #[test] + fn run_manifest_execution_policy_overrides_static_timeouts() { + let policy = DeepReviewExecutionPolicy::from_config_value(Some(&json!({ + "reviewer_timeout_seconds": 300, + "judge_timeout_seconds": 240, + "reviewer_file_split_threshold": 20, + "max_same_role_instances": 3 + }))); + let manifest = json!({ + "reviewMode": "deep", + "strategyLevel": "normal", + "executionPolicy": { + "reviewerTimeoutSeconds": 675, + "judgeTimeoutSeconds": 1350, + "reviewerFileSplitThreshold": 10, + "maxSameRoleInstances": 4 + }, + "coreReviewers": [ + { "subagentId": "ReviewBusinessLogic" } + ], + "qualityGateReviewer": { "subagentId": "ReviewJudge" } + }); + + let effective = policy.with_run_manifest_execution_policy(&manifest); + + assert_eq!(effective.reviewer_timeout_seconds, 675); + assert_eq!(effective.judge_timeout_seconds, 1350); + assert_eq!(effective.reviewer_file_split_threshold, 10); + 
assert_eq!(effective.max_same_role_instances, 4); + } + #[test] fn default_file_split_threshold_and_max_instances() { let policy = DeepReviewExecutionPolicy::default(); @@ -563,7 +2795,113 @@ mod tests { } #[test] - fn max_same_role_instances_only_enforces_positive_minimum() { + fn budget_tracker_caps_reviewer_calls_by_max_same_role_instances() { + let policy = DeepReviewExecutionPolicy::from_config_value(Some(&json!({ + "max_same_role_instances": 2 + }))); + let tracker = DeepReviewBudgetTracker::default(); + + // Default policy: 5 core reviewers * 2 max instances = 10 reviewer calls allowed + for _ in 0..10 { + tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Reviewer, + "ReviewBusinessLogic", + false, + ) + .unwrap(); + } + // 11th reviewer call should be rejected + assert!(tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Reviewer, + "ReviewSecurity", + false, + ) + .is_err()); + } + + #[test] + fn budget_tracker_allows_one_retry_after_initial_reviewer_budget() { + let policy = DeepReviewExecutionPolicy::from_config_value(Some(&json!({ + "max_same_role_instances": 1, + "max_retries_per_role": 1 + }))); + let tracker = DeepReviewBudgetTracker::default(); + + for reviewer in [ + "ReviewBusinessLogic", + "ReviewPerformance", + "ReviewSecurity", + "ReviewArchitecture", + "ReviewFrontend", + ] { + tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Reviewer, + reviewer, + false, + ) + .unwrap(); + } + + assert!(tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Reviewer, + "ReviewSecurity", + false, + ) + .is_err()); + tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Reviewer, + "ReviewSecurity", + true, + ) + .unwrap(); + + let violation = tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Reviewer, + "ReviewSecurity", + true, + ) + .unwrap_err(); + assert_eq!(violation.code, "deep_review_retry_budget_exhausted"); + } + + #[test] + 
fn budget_tracker_rejects_retry_without_initial_reviewer_call() { + let policy = DeepReviewExecutionPolicy::default(); + let tracker = DeepReviewBudgetTracker::default(); + + let violation = tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Reviewer, + "ReviewSecurity", + true, + ) + .unwrap_err(); + + assert_eq!(violation.code, "deep_review_retry_without_initial_attempt"); + } + + #[test] + fn max_same_role_instances_clamped_to_range() { // Value 0 should be clamped to 1 let policy = DeepReviewExecutionPolicy::from_config_value(Some(&json!({ "max_same_role_instances": 0 @@ -576,4 +2914,596 @@ mod tests { }))); assert_eq!(policy.max_same_role_instances, 100); } + + #[test] + fn auto_select_strategy_quick_for_small_changes() { + let policy = DeepReviewExecutionPolicy::default(); + let risk = super::ChangeRiskFactors { + file_count: 2, + total_lines_changed: 80, + files_in_security_paths: 0, + max_cyclomatic_complexity_delta: 0, + cross_crate_changes: 0, + }; + let (level, rationale) = policy.auto_select_strategy(&risk); + assert_eq!(level, DeepReviewStrategyLevel::Quick); + assert!(rationale.contains("2 files")); + assert!(rationale.contains("80 lines")); + } + + #[test] + fn auto_select_strategy_normal_for_medium_changes() { + let policy = DeepReviewExecutionPolicy::default(); + let risk = super::ChangeRiskFactors { + file_count: 8, + total_lines_changed: 400, + files_in_security_paths: 0, + max_cyclomatic_complexity_delta: 0, + cross_crate_changes: 0, + }; + let (level, rationale) = policy.auto_select_strategy(&risk); + assert_eq!(level, DeepReviewStrategyLevel::Normal); + assert!(rationale.contains("8 files")); + } + + #[test] + fn auto_select_strategy_deep_for_large_or_risky_changes() { + let policy = DeepReviewExecutionPolicy::default(); + let risk = super::ChangeRiskFactors { + file_count: 30, + total_lines_changed: 2000, + files_in_security_paths: 3, + max_cyclomatic_complexity_delta: 0, + cross_crate_changes: 2, + }; + let (level, 
rationale) = policy.auto_select_strategy(&risk); + assert_eq!(level, DeepReviewStrategyLevel::Deep); + assert!(rationale.contains("30 files")); + assert!(rationale.contains("3 security files")); + } + + #[test] + fn auto_select_strategy_security_paths_boost_score() { + let policy = super::DeepReviewExecutionPolicy::default(); + // 4 files + 0 lines/100 + 2 security * 3 = 10 → Normal + let risk = super::ChangeRiskFactors { + file_count: 4, + total_lines_changed: 0, + files_in_security_paths: 2, + max_cyclomatic_complexity_delta: 0, + cross_crate_changes: 0, + }; + let (level, _) = policy.auto_select_strategy(&risk); + assert_eq!(level, DeepReviewStrategyLevel::Normal); + } + + #[test] + fn concurrency_policy_default_values() { + let policy = super::DeepReviewConcurrencyPolicy::default(); + assert_eq!(policy.max_parallel_instances, 4); + assert_eq!(policy.stagger_seconds, 0); + assert!(policy.batch_extras_separately); + } + + #[test] + fn concurrency_policy_from_manifest() { + let raw = json!({ + "maxParallelInstances": 6, + "staggerSeconds": 5, + "batchExtrasSeparately": false + }); + let policy = super::DeepReviewConcurrencyPolicy::from_manifest(&raw); + assert_eq!(policy.max_parallel_instances, 6); + assert_eq!(policy.stagger_seconds, 5); + assert!(!policy.batch_extras_separately); + } + + #[test] + fn concurrency_effective_max_same_role_instances() { + let exec_policy = DeepReviewExecutionPolicy::default(); + let conc_policy = super::DeepReviewConcurrencyPolicy { + max_parallel_instances: 4, + stagger_seconds: 0, + max_queue_wait_seconds: 60, + batch_extras_separately: true, + }; + // 5 reviewer types (4 core + 1 conditional), 4 / 5 = 0 → clamped to 1 + assert_eq!( + conc_policy.effective_max_same_role_instances(&exec_policy), + 1 + ); + + let conc_policy_12 = super::DeepReviewConcurrencyPolicy { + max_parallel_instances: 12, + stagger_seconds: 0, + max_queue_wait_seconds: 60, + batch_extras_separately: true, + }; + // 12 / 5 = 2, capped by default 
max_same_role_instances (3) → 2 + assert_eq!( + conc_policy_12.effective_max_same_role_instances(&exec_policy), + 2 + ); + } + + #[test] + fn concurrency_check_launch_allowed() { + let policy = super::DeepReviewConcurrencyPolicy::default(); + // 0 active reviewers → reviewer allowed + assert!(policy + .check_launch_allowed(0, DeepReviewSubagentRole::Reviewer, false) + .is_ok()); + // 4 active reviewers (at cap) → reviewer blocked + let err = policy + .check_launch_allowed(4, DeepReviewSubagentRole::Reviewer, false) + .unwrap_err(); + assert_eq!(err.code, "deep_review_concurrency_cap_reached"); + // 1 active reviewer → judge blocked + let err = policy + .check_launch_allowed(1, DeepReviewSubagentRole::Judge, false) + .unwrap_err(); + assert_eq!(err.code, "deep_review_judge_launch_blocked_by_reviewers"); + // 0 active reviewers, judge not pending → judge allowed + assert!(policy + .check_launch_allowed(0, DeepReviewSubagentRole::Judge, false) + .is_ok()); + // 0 active reviewers, judge pending → blocked + let err = policy + .check_launch_allowed(0, DeepReviewSubagentRole::Judge, true) + .unwrap_err(); + assert_eq!(err.code, "deep_review_judge_already_pending"); + } + + #[test] + fn concurrency_policy_from_run_manifest() { + let policy = DeepReviewExecutionPolicy::default(); + let manifest = json!({ + "reviewMode": "deep", + "concurrencyPolicy": { + "maxParallelInstances": 3, + "staggerSeconds": 10, + "maxQueueWaitSeconds": 45 + } + }); + let conc = policy.concurrency_policy_from_manifest(&manifest); + assert_eq!(conc.max_parallel_instances, 3); + assert_eq!(conc.stagger_seconds, 10); + assert_eq!(conc.max_queue_wait_seconds, 45); + assert!(conc.batch_extras_separately); + } + + #[test] + fn active_reviewer_guard_tracks_running_reviewers_only() { + let tracker = DeepReviewBudgetTracker::default(); + let policy = DeepReviewExecutionPolicy::default(); + + tracker + .record_task( + "turn-active", + &policy, + DeepReviewSubagentRole::Reviewer, + 
REVIEWER_SECURITY_AGENT_TYPE, + false, + ) + .unwrap(); + assert_eq!(tracker.active_reviewer_count("turn-active"), 0); + + { + let _guard = tracker.begin_active_reviewer("turn-active"); + assert_eq!(tracker.active_reviewer_count("turn-active"), 1); + } + + assert_eq!(tracker.active_reviewer_count("turn-active"), 0); + } + + #[test] + fn active_reviewer_try_begin_respects_capacity_atomically() { + let tracker = DeepReviewBudgetTracker::default(); + let first = tracker + .try_begin_active_reviewer("turn-atomic", 1) + .expect("first reviewer should acquire capacity"); + + assert!(tracker + .try_begin_active_reviewer("turn-atomic", 1) + .is_none()); + assert_eq!(tracker.active_reviewer_count("turn-atomic"), 1); + + drop(first); + + assert!(tracker + .try_begin_active_reviewer("turn-atomic", 1) + .is_some()); + } + + #[test] + fn capacity_skip_count_is_tracked_separately_from_hard_rejections() { + let tracker = DeepReviewBudgetTracker::default(); + + tracker.record_capacity_skip("turn-skip"); + tracker.record_capacity_skip("turn-skip"); + tracker.record_concurrency_cap_rejection("turn-skip"); + + assert_eq!(tracker.capacity_skip_count("turn-skip"), 2); + assert_eq!(tracker.concurrency_cap_rejection_count("turn-skip"), 1); + } + + #[test] + fn shared_context_measurement_tracks_duplicate_readonly_file_context_without_content() { + let tracker = DeepReviewBudgetTracker::default(); + + tracker.record_shared_context_tool_use( + "turn-shared-context", + REVIEWER_SECURITY_AGENT_TYPE, + "Read", + ".\\src\\lib.rs", + ); + tracker.record_shared_context_tool_use( + "turn-shared-context", + REVIEWER_PERFORMANCE_AGENT_TYPE, + "Read", + "src/lib.rs", + ); + tracker.record_shared_context_tool_use( + "turn-shared-context", + REVIEWER_SECURITY_AGENT_TYPE, + "GetFileDiff", + "src/lib.rs", + ); + tracker.record_shared_context_tool_use( + "turn-shared-context", + REVIEWER_ARCHITECTURE_AGENT_TYPE, + "Read", + "src/other.rs", + ); + + let snapshot = 
tracker.shared_context_measurement_snapshot("turn-shared-context"); + + assert_eq!(snapshot.total_calls, 4); + assert_eq!(snapshot.duplicate_calls, 1); + assert_eq!(snapshot.duplicate_context_count, 1); + assert_eq!(snapshot.repeated_contexts.len(), 1); + assert_eq!(snapshot.repeated_contexts[0].tool_name, "Read"); + assert_eq!(snapshot.repeated_contexts[0].file_path, "src/lib.rs"); + assert_eq!(snapshot.repeated_contexts[0].call_count, 2); + assert_eq!(snapshot.repeated_contexts[0].reviewer_count, 2); + } + + #[test] + fn effective_concurrency_lowers_after_capacity_errors_without_exceeding_hard_cap() { + let tracker = DeepReviewBudgetTracker::default(); + + assert_eq!(tracker.effective_parallel_instances("turn-effective", 4), 4); + + tracker.record_effective_concurrency_capacity_error( + "turn-effective", + 4, + super::DeepReviewCapacityQueueReason::LocalConcurrencyCap, + None, + ); + assert_eq!(tracker.effective_parallel_instances("turn-effective", 4), 3); + + for _ in 0..8 { + tracker.record_effective_concurrency_capacity_error( + "turn-effective", + 4, + super::DeepReviewCapacityQueueReason::LocalConcurrencyCap, + None, + ); + } + assert_eq!(tracker.effective_parallel_instances("turn-effective", 4), 1); + } + + #[test] + fn effective_concurrency_recovers_after_success_observation_window() { + let tracker = DeepReviewBudgetTracker::default(); + + tracker.record_effective_concurrency_capacity_error( + "turn-recover", + 4, + super::DeepReviewCapacityQueueReason::LocalConcurrencyCap, + None, + ); + assert_eq!(tracker.effective_parallel_instances("turn-recover", 4), 3); + + tracker.record_effective_concurrency_success("turn-recover", 4); + tracker.record_effective_concurrency_success("turn-recover", 4); + assert_eq!(tracker.effective_parallel_instances("turn-recover", 4), 3); + + tracker.record_effective_concurrency_success("turn-recover", 4); + assert_eq!(tracker.effective_parallel_instances("turn-recover", 4), 4); + } + + #[test] + fn 
effective_concurrency_respects_retry_after_before_recovery() { + let tracker = DeepReviewBudgetTracker::default(); + + let snapshot = tracker.record_effective_concurrency_capacity_error( + "turn-retry-after", + 4, + super::DeepReviewCapacityQueueReason::RetryAfter, + Some(Duration::from_secs(60)), + ); + assert_eq!(snapshot.learned_parallel_instances, 3); + assert_eq!(snapshot.effective_parallel_instances, 1); + assert!(snapshot.retry_after_remaining_ms.unwrap_or_default() > 0); + + for _ in 0..3 { + tracker.record_effective_concurrency_success("turn-retry-after", 4); + } + assert_eq!( + tracker.effective_parallel_instances("turn-retry-after", 4), + 1 + ); + } + + #[test] + fn effective_concurrency_user_override_is_bounded_and_visible() { + let tracker = DeepReviewBudgetTracker::default(); + + tracker.record_effective_concurrency_capacity_error( + "turn-override", + 4, + super::DeepReviewCapacityQueueReason::ProviderConcurrencyLimit, + None, + ); + tracker.set_effective_concurrency_user_override("turn-override", 4, Some(9)); + + let snapshot = tracker.effective_concurrency_snapshot("turn-override", 4); + assert_eq!(snapshot.configured_max_parallel_instances, 4); + assert_eq!(snapshot.learned_parallel_instances, 3); + assert_eq!(snapshot.user_override_parallel_instances, Some(4)); + assert_eq!(snapshot.effective_parallel_instances, 4); + + tracker.set_effective_concurrency_user_override("turn-override", 4, Some(0)); + let snapshot = tracker.effective_concurrency_snapshot("turn-override", 4); + assert_eq!(snapshot.user_override_parallel_instances, Some(1)); + assert_eq!(snapshot.effective_parallel_instances, 1); + } + + #[test] + fn capacity_error_classifier_queues_only_transient_capacity_failures() { + let queueable_cases = [ + ( + "provider_rate_limit", + "Provider rate limit exceeded", + None, + super::DeepReviewCapacityQueueReason::ProviderRateLimit, + ), + ( + "provider_error", + "Too many concurrent requests for this account", + None, + 
super::DeepReviewCapacityQueueReason::ProviderConcurrencyLimit, + ), + ( + "provider_unavailable", + "Model is temporarily overloaded", + None, + super::DeepReviewCapacityQueueReason::TemporaryOverload, + ), + ( + "provider_error", + "Retry later", + Some(30), + super::DeepReviewCapacityQueueReason::RetryAfter, + ), + ( + "deep_review_concurrency_cap_reached", + "Maximum parallel reviewer instances reached", + None, + super::DeepReviewCapacityQueueReason::LocalConcurrencyCap, + ), + ]; + + for (code, message, retry_after_seconds, expected_reason) in queueable_cases { + let decision = + super::classify_deep_review_capacity_error(code, message, retry_after_seconds); + assert!(decision.queueable, "{code} should be queueable"); + assert_eq!(decision.reason, Some(expected_reason)); + } + } + + #[test] + fn capacity_error_classifier_fails_fast_for_non_capacity_failures() { + let non_queueable_cases = [ + ("authentication_failed", "API key is invalid"), + ( + "provider_quota_exhausted", + "Quota exhausted for this billing period", + ), + ("billing_required", "Billing is not configured"), + ("invalid_model", "The requested model does not exist"), + ("user_cancelled", "User cancelled the operation"), + ( + "deep_review_subagent_not_allowed", + "Subagent is not allowed", + ), + ("invalid_tooling", "Review agent is missing GetFileDiff"), + ]; + + for (code, message) in non_queueable_cases { + let decision = super::classify_deep_review_capacity_error(code, message, None); + assert!(!decision.queueable, "{code} should fail fast"); + assert_eq!(decision.reason, None); + } + } + + #[test] + fn queue_state_keeps_queue_wait_out_of_reviewer_timeout() { + let queued = super::DeepReviewReviewerQueueState::queued_for_capacity( + super::DeepReviewCapacityQueueReason::ProviderConcurrencyLimit, + 45_000, + ); + assert_eq!( + queued.status, + super::DeepReviewReviewerQueueStatus::QueuedForCapacity + ); + assert_eq!(queued.queue_elapsed_ms, 45_000); + assert_eq!(queued.run_elapsed_ms, 0); + 
assert_eq!(queued.timeout_elapsed_ms(), 0); + + let running = super::DeepReviewReviewerQueueState::running(45_000, 8_000); + assert_eq!( + running.status, + super::DeepReviewReviewerQueueStatus::Running + ); + assert_eq!(running.queue_elapsed_ms, 45_000); + assert_eq!(running.run_elapsed_ms, 8_000); + assert_eq!(running.timeout_elapsed_ms(), 8_000); + } + + #[test] + fn paused_queue_state_does_not_consume_reviewer_timeout() { + let paused = super::DeepReviewReviewerQueueState::paused_by_user(120_000); + + assert_eq!( + paused.status, + super::DeepReviewReviewerQueueStatus::PausedByUser + ); + assert_eq!(paused.queue_elapsed_ms, 120_000); + assert_eq!(paused.run_elapsed_ms, 0); + assert_eq!(paused.timeout_elapsed_ms(), 0); + assert_eq!(paused.reason, None); + } + + #[test] + fn queue_control_pause_continue_cancel_are_tool_scoped() { + let turn_id = "turn-queue-control-tool"; + let primary_tool_id = "tool-queue-control-a"; + let other_tool_id = "tool-queue-control-b"; + + let paused = super::apply_deep_review_queue_control( + turn_id, + primary_tool_id, + super::DeepReviewQueueControlAction::Pause, + ); + assert!(paused.paused); + assert!(!paused.cancelled); + + let other = super::deep_review_queue_control_snapshot(turn_id, other_tool_id); + assert!(!other.paused); + assert!(!other.cancelled); + + let continued = super::apply_deep_review_queue_control( + turn_id, + primary_tool_id, + super::DeepReviewQueueControlAction::Continue, + ); + assert!(!continued.paused); + assert!(!continued.cancelled); + + let cancelled = super::apply_deep_review_queue_control( + turn_id, + primary_tool_id, + super::DeepReviewQueueControlAction::Cancel, + ); + assert!(!cancelled.paused); + assert!(cancelled.cancelled); + + super::clear_deep_review_queue_control_for_tool(turn_id, primary_tool_id); + let cleared = super::deep_review_queue_control_snapshot(turn_id, primary_tool_id); + assert!(!cleared.paused); + assert!(!cleared.cancelled); + } + + #[test] + fn 
queue_control_skip_optional_is_turn_scoped() { + let turn_id = "turn-queue-control-optional"; + let primary_tool_id = "tool-queue-control-primary"; + let other_tool_id = "tool-queue-control-other"; + + let snapshot = super::apply_deep_review_queue_control( + turn_id, + primary_tool_id, + super::DeepReviewQueueControlAction::SkipOptional, + ); + assert!(snapshot.skip_optional); + + let other = super::deep_review_queue_control_snapshot(turn_id, other_tool_id); + assert!(other.skip_optional); + + super::clear_deep_review_queue_control_for_tool(turn_id, primary_tool_id); + let after_tool_clear = super::deep_review_queue_control_snapshot(turn_id, other_tool_id); + assert!(after_tool_clear.skip_optional); + } + + // --- Incremental review cache tests --- + + #[test] + fn incremental_cache_builds_and_reads() { + let mut cache = DeepReviewIncrementalCache::new("fp-abc123"); + assert_eq!(cache.fingerprint(), "fp-abc123"); + assert!(cache.is_empty()); + + cache.store_packet("reviewer:ReviewSecurity", "Found 2 security issues"); + cache.store_packet("reviewer:ReviewBusinessLogic", "All good"); + assert_eq!(cache.len(), 2); + assert!(!cache.is_empty()); + + assert_eq!( + cache.get_packet("reviewer:ReviewSecurity"), + Some("Found 2 security issues") + ); + assert_eq!(cache.get_packet("reviewer:ReviewArchitecture"), None); + } + + #[test] + fn incremental_cache_matches_fingerprint() { + let cache = DeepReviewIncrementalCache::new("fp-abc123"); + let manifest = json!({ + "incrementalReviewCache": { + "fingerprint": "fp-abc123" + } + }); + assert!(cache.matches_manifest(&manifest)); + + let wrong_manifest = json!({ + "incrementalReviewCache": { + "fingerprint": "fp-other" + } + }); + assert!(!cache.matches_manifest(&wrong_manifest)); + } + + #[test] + fn incremental_cache_to_and_from_value() { + let mut cache = DeepReviewIncrementalCache::new("fp-test"); + cache.store_packet("reviewer:ReviewSecurity", "sec result"); + cache.store_packet("reviewer:ReviewBusinessLogic", "logic 
result"); + + let value = cache.to_value(); + let restored = DeepReviewIncrementalCache::from_value(&value); + assert_eq!(restored.fingerprint(), "fp-test"); + assert_eq!(restored.len(), 2); + assert_eq!( + restored.get_packet("reviewer:ReviewSecurity"), + Some("sec result") + ); + } + + #[test] + fn incremental_cache_preserves_split_packet_keys() { + let mut cache = DeepReviewIncrementalCache::new("fp-split"); + cache.store_packet("reviewer:ReviewSecurity:group-1-of-2", "sec group 1"); + cache.store_packet("reviewer:ReviewSecurity:group-2-of-2", "sec group 2"); + + let restored = DeepReviewIncrementalCache::from_value(&cache.to_value()); + + assert_eq!( + restored.get_packet("reviewer:ReviewSecurity:group-1-of-2"), + Some("sec group 1") + ); + assert_eq!( + restored.get_packet("reviewer:ReviewSecurity:group-2-of-2"), + Some("sec group 2") + ); + assert_eq!(restored.get_packet("ReviewSecurity"), None); + } + + #[test] + fn incremental_cache_from_null_value() { + let cache = DeepReviewIncrementalCache::from_value(&Value::Null); + assert!(cache.is_empty()); + assert_eq!(cache.fingerprint(), ""); + } } diff --git a/src/crates/core/src/agentic/events/types.rs b/src/crates/core/src/agentic/events/types.rs index f326e1762..c40b4ce34 100644 --- a/src/crates/core/src/agentic/events/types.rs +++ b/src/crates/core/src/agentic/events/types.rs @@ -8,7 +8,8 @@ use crate::agentic::core::SessionState; pub use bitfun_events::agentic::ErrorCategory; pub use bitfun_events::{ AgenticEvent as BaseAgenticEvent, AgenticEventEnvelope as EventEnvelope, - AgenticEventPriority as EventPriority, SubagentParentInfo, ToolEventData, + AgenticEventPriority as EventPriority, DeepReviewQueueReason, DeepReviewQueueState, + DeepReviewQueueStatus, SubagentParentInfo, ToolEventData, }; // ============ Core layer AgenticEvent extension ============ diff --git a/src/crates/core/src/agentic/execution/execution_engine.rs b/src/crates/core/src/agentic/execution/execution_engine.rs index 
14427257f..4a2dbd2c1 100644 --- a/src/crates/core/src/agentic/execution/execution_engine.rs +++ b/src/crates/core/src/agentic/execution/execution_engine.rs @@ -7,6 +7,7 @@ use super::types::{ExecutionContext, ExecutionResult, RoundContext, RoundResult} use crate::agentic::agents::{ get_agent_registry, PromptBuilder, PromptBuilderContext, RemoteExecutionHints, }; +use crate::agentic::context_profile::{ContextProfilePolicy, ModelCapabilityProfile}; use crate::agentic::core::{ render_system_reminder, Message, MessageContent, MessageHelper, MessageRole, MessageSemanticKind, RequestReasoningTokenPolicy, Session, @@ -27,20 +28,34 @@ use crate::infrastructure::ai::get_global_ai_client_factory; use crate::service::config::get_global_config_service; use crate::service::config::types::{ModelCapability, ModelCategory}; use crate::service::remote_ssh::workspace_state::get_remote_workspace_manager; -use crate::util::elapsed_ms_u64; use crate::util::errors::{BitFunError, BitFunResult}; use crate::util::token_counter::TokenCounter; use crate::util::types::Message as AIMessage; use crate::util::types::ToolDefinition; +use crate::util::{elapsed_ms_u64, truncate_at_char_boundary}; use log::{debug, error, info, trace, warn}; +use sha2::{Digest, Sha256}; use std::collections::{HashMap, HashSet}; use std::path::Path; use std::sync::Arc; use tokio_util::sync::CancellationToken; /// Execution engine configuration -#[derive(Debug, Clone, Default)] -pub struct ExecutionEngineConfig; +#[derive(Debug, Clone)] +pub struct ExecutionEngineConfig { + pub max_rounds: usize, + /// Max consecutive rounds with identical tool-call signatures before loop detection triggers. 
+ pub max_consecutive_same_tool: usize, +} + +impl Default for ExecutionEngineConfig { + fn default() -> Self { + Self { + max_rounds: crate::service::config::types::DEFAULT_MAX_ROUNDS, + max_consecutive_same_tool: 3, + } + } +} #[derive(Debug, Clone)] pub struct ContextCompactionOutcome { @@ -55,12 +70,168 @@ pub struct ContextCompactionOutcome { pub applied: bool, } +#[derive(Debug, Clone)] +struct ContextHealthSnapshot { + token_usage_ratio: f32, + microcompact_count: usize, + full_compression_count: usize, + compression_failure_count: u32, + repeated_tool_signature_count: usize, + consecutive_failed_commands: usize, +} + +impl ContextHealthSnapshot { + fn from_runtime_observations( + token_usage_ratio: f32, + microcompact_count: usize, + full_compression_count: usize, + compression_failure_count: u32, + recent_tool_signatures: &[String], + messages: &[Message], + ) -> Self { + Self { + token_usage_ratio, + microcompact_count, + full_compression_count, + compression_failure_count, + repeated_tool_signature_count: Self::repeated_tool_signature_count( + recent_tool_signatures, + ), + consecutive_failed_commands: Self::consecutive_failed_commands(messages), + } + } + + fn token_usage_ratio(current_tokens: usize, context_window: usize) -> f32 { + if context_window == 0 { + return 0.0; + } + current_tokens as f32 / context_window as f32 + } + + fn log(&self, session_id: &str, turn_id: &str, round_index: usize, stage: &str) { + debug!( + "Context health snapshot: session_id={}, turn_id={}, round_index={}, stage={}, token_usage={:.3}, microcompact_count={}, full_compression_count={}, compression_failure_count={}, repeated_tool_signature_count={}, consecutive_failed_commands={}", + session_id, + turn_id, + round_index, + stage, + self.token_usage_ratio, + self.microcompact_count, + self.full_compression_count, + self.compression_failure_count, + self.repeated_tool_signature_count, + self.consecutive_failed_commands + ); + } + + fn log_policy_thresholds( + &self, + 
session_id: &str, + turn_id: &str, + round_index: usize, + policy: &ContextProfilePolicy, + ) { + if policy.has_repeated_tool_loop(self.repeated_tool_signature_count) { + debug!( + "Context profile repeated-tool threshold reached: session_id={}, turn_id={}, round_index={}, profile={:?}, repeated_tool_signature_count={}, threshold={}", + session_id, + turn_id, + round_index, + policy.profile, + self.repeated_tool_signature_count, + policy.repeated_tool_signature_threshold + ); + } + + if policy.has_consecutive_command_failure_loop(self.consecutive_failed_commands) { + warn!( + "Context profile command-failure threshold reached: session_id={}, turn_id={}, round_index={}, profile={:?}, consecutive_failed_commands={}, threshold={}", + session_id, + turn_id, + round_index, + policy.profile, + self.consecutive_failed_commands, + policy.consecutive_failed_command_threshold + ); + } + } + + fn repeated_tool_signature_count(recent_tool_signatures: &[String]) -> usize { + let Some(last_signature) = recent_tool_signatures.last() else { + return 0; + }; + + let repeated_count = recent_tool_signatures + .iter() + .rev() + .take_while(|signature| *signature == last_signature) + .count(); + + if repeated_count >= 2 { + repeated_count + } else { + 0 + } + } + + fn consecutive_failed_commands(messages: &[Message]) -> usize { + let mut failures = 0; + for message in messages.iter().rev() { + let Some(failed) = Self::command_result_failed(message) else { + continue; + }; + + if failed { + failures += 1; + } else { + break; + } + } + failures + } + + fn command_result_failed(message: &Message) -> Option { + let MessageContent::ToolResult { + tool_name, + result, + is_error, + .. 
+ } = &message.content + else { + return None; + }; + + if !matches!(tool_name.as_str(), "Bash" | "Git") { + return None; + } + + Some(Self::tool_result_failed(result, *is_error)) + } + + fn tool_result_failed(result: &serde_json::Value, is_error: bool) -> bool { + is_error + || Self::bool_field(result, "timed_out") == Some(true) + || Self::bool_field(result, "interrupted") == Some(true) + || Self::bool_field(result, "success") == Some(false) + || Self::numeric_field(result, "exit_code").is_some_and(|code| code != 0) + } + + fn bool_field(value: &serde_json::Value, key: &str) -> Option { + value.get(key).and_then(|field| field.as_bool()) + } + + fn numeric_field(value: &serde_json::Value, key: &str) -> Option { + value.get(key).and_then(|field| field.as_i64()) + } +} + /// Execution engine pub struct ExecutionEngine { round_executor: Arc, event_queue: Arc, session_manager: Arc, context_compressor: Arc, + config: ExecutionEngineConfig, } impl ExecutionEngine { @@ -72,13 +243,14 @@ impl ExecutionEngine { event_queue: Arc, session_manager: Arc, context_compressor: Arc, - _config: ExecutionEngineConfig, + config: ExecutionEngineConfig, ) -> Self { Self { round_executor, event_queue, session_manager, context_compressor, + config, } } @@ -93,6 +265,20 @@ impl ExecutionEngine { ) } + fn tool_signature_args_summary(args_str: &str) -> String { + if args_str.len() <= 128 { + return args_str.to_string(); + } + + let args_hash = hex::encode(Sha256::digest(args_str.as_bytes())); + format!( + "{}..#{}:sha256={}", + truncate_at_char_boundary(args_str, 64), + args_str.len(), + args_hash + ) + } + fn assistant_has_tool_calls(message: &Message) -> bool { matches!( &message.content, @@ -512,6 +698,7 @@ impl ExecutionEngine { steering_interrupt: None, cancellation_token: CancellationToken::new(), workspace_services: context.workspace_services.clone(), + recover_partial_on_cancel: context.recover_partial_on_cancel, }; // Tools are disabled here (None) — model must respond in plain 
text. @@ -742,6 +929,7 @@ impl ExecutionEngine { context_window: usize, tool_definitions: &Option>, system_prompt_message: Message, + compression_contract_limit: usize, tail_policy: CompressionTailPolicy, ) -> BitFunResult)>> { let event_subagent_parent_info = subagent_parent_info.map(|info| info.clone().into()); @@ -783,14 +971,18 @@ impl ExecutionEngine { .await; // Execute compression + let compression_contract = self + .session_manager + .compression_contract_for_session(session_id, compression_contract_limit); match self .context_compressor - .compress_turns( + .compress_turns_with_contract( session_id, context_window, turn_index_to_keep, turns, tail_policy, + compression_contract, ) .await { @@ -955,9 +1147,30 @@ impl ExecutionEngine { }); } + let is_review_subagent = get_agent_registry() + .get_subagent_is_review(&session.agent_type) + .unwrap_or(false); + let model_id = session.config.model_id.as_deref().unwrap_or_default(); + let context_profile_policy = ContextProfilePolicy::for_agent_context_and_model( + &session.agent_type, + is_review_subagent, + model_id, + model_id, + ); + let compression_contract = self.session_manager.compression_contract_for_session( + session_id, + context_profile_policy.compression_contract_limit, + ); match self .context_compressor - .compress_turns(session_id, context_window, turns.len(), turns, tail_policy) + .compress_turns_with_contract( + session_id, + context_window, + turns.len(), + turns, + tail_policy, + compression_contract, + ) .await { Ok(compression_result) => { @@ -1215,6 +1428,32 @@ impl ExecutionEngine { ); } + let model_capability_profile = ModelCapabilityProfile::from_resolved_model( + &resolved_primary_model_id, + &ai_client.config.model, + ); + let is_review_subagent = agent_registry + .get_subagent_is_review(&agent_type) + .unwrap_or(false); + let context_profile_policy = ContextProfilePolicy::for_agent_context( + &agent_type, + is_review_subagent, + model_capability_profile, + ); + debug!( + "Context 
profile policy selected: session_id={}, agent_type={}, profile={:?}, model_capability={:?}, microcompact_keep_recent={}, microcompact_trigger_ratio={:.2}, compression_contract_limit={}, subagent_concurrency_cap={}, repeated_tool_signature_threshold={}, consecutive_failed_command_threshold={}", + context.session_id, + agent_type, + context_profile_policy.profile, + model_capability_profile, + context_profile_policy.microcompact_keep_recent, + context_profile_policy.microcompact_trigger_ratio, + context_profile_policy.compression_contract_limit, + context_profile_policy.subagent_concurrency_cap, + context_profile_policy.repeated_tool_signature_threshold, + context_profile_policy.consecutive_failed_command_threshold + ); + // 3. Get System Prompt from current Agent debug!( "Building system prompt from agent: {}, model={}", @@ -1259,6 +1498,13 @@ impl ExecutionEngine { let mut consecutive_compression_failures: u32 = 0; const MAX_CONSECUTIVE_COMPRESSION_FAILURES: u32 = 3; + // P0: Loop detection: track recent tool call signatures + let mut recent_tool_signatures: Vec = Vec::new(); + let mut loop_detected = false; + let mut microcompact_count = 0usize; + let mut full_compression_count = 0usize; + let mut compression_failure_count = 0u32; + // Save the last token usage statistics let mut last_usage: Option = None; @@ -1320,8 +1566,7 @@ impl ExecutionEngine { let enable_context_compression = session.config.enable_context_compression; let compression_threshold = session.config.compression_threshold; - let microcompact_config = - crate::agentic::session::compression::microcompact::MicrocompactConfig::default(); + let microcompact_config = context_profile_policy.microcompact_config(); let mut execution_context_vars = context.context.clone(); execution_context_vars.insert( @@ -1363,6 +1608,15 @@ impl ExecutionEngine { // Loop to execute model rounds loop { + if completed_rounds >= self.config.max_rounds { + warn!( + "Reached max rounds limit: {}, stopping execution", + 
self.config.max_rounds + ); + finalization_reason = Some("max_rounds"); + break; + } + // Check and compress before sending AI request let mut current_tokens = Self::estimate_request_tokens_internal(&messages, tool_definitions.as_deref()); @@ -1381,20 +1635,29 @@ impl ExecutionEngine { if enable_context_compression && token_usage_ratio >= microcompact_config.trigger_ratio { if let Some(mc_result) = - crate::agentic::session::compression::microcompact::microcompact_messages( + crate::agentic::session::compression::microcompact::microcompact_messages_with_evidence( &mut messages, µcompact_config, + crate::agentic::session::compression::microcompact::MicrocompactEvidenceScope { + session_id: &context.session_id, + turn_id: &context.dialog_turn_id, + }, ) { + microcompact_count += 1; + for event in mc_result.evidence_events.iter().cloned() { + self.session_manager.append_evidence_event(event); + } current_tokens = Self::estimate_request_tokens_internal( &mut messages, tool_definitions.as_deref(), ); debug!( - "Round {} after microcompact: cleared={}, kept={}, tokens now {} ({:.1}%)", + "Round {} after microcompact: cleared={}, kept={}, evidence_events={}, tokens now {} ({:.1}%)", round_index, mc_result.tools_cleared, mc_result.tools_kept, + mc_result.evidence_events_preserved, current_tokens, (current_tokens as f32 / context_window as f32) * 100.0 ); @@ -1440,6 +1703,7 @@ impl ExecutionEngine { context_window, &tool_definitions, system_prompt_message.clone(), + context_profile_policy.compression_contract_limit, CompressionTailPolicy::PreserveLiveFrontier, ) .await @@ -1455,6 +1719,7 @@ impl ExecutionEngine { ); messages = compressed_messages; + full_compression_count += 1; consecutive_compression_failures = 0; } Ok(None) => { @@ -1463,6 +1728,7 @@ impl ExecutionEngine { } Err(e) => { consecutive_compression_failures += 1; + compression_failure_count += 1; error!( "Round {} compression failed ({}/{}): {}, continuing with uncompressed context", round_index, @@ -1496,6 
+1762,23 @@ impl ExecutionEngine { ); } + let before_send_tokens = + Self::estimate_request_tokens_internal(&messages, tool_definitions.as_deref()); + ContextHealthSnapshot::from_runtime_observations( + ContextHealthSnapshot::token_usage_ratio(before_send_tokens, context_window), + microcompact_count, + full_compression_count, + compression_failure_count, + &recent_tool_signatures, + &messages, + ) + .log( + &context.session_id, + &context.dialog_turn_id, + round_index, + "before_send", + ); + // Create round context let mut round_context_vars = execution_context_vars.clone(); if context.skip_tool_confirmation { @@ -1523,6 +1806,7 @@ impl ExecutionEngine { }), cancellation_token: CancellationToken::new(), workspace_services: context.workspace_services.clone(), + recover_partial_on_cancel: context.recover_partial_on_cancel, }; // Execute single model round @@ -1608,6 +1892,62 @@ impl ExecutionEngine { last_partial_recovery_reason = round_result.partial_recovery_reason.clone(); } + // P0: Consecutive same-tool-call loop detection + if !round_result.tool_calls.is_empty() { + let mut sigs: Vec = round_result + .tool_calls + .iter() + .map(|tc| { + let args_str = tc.arguments.to_string(); + let args_summary = Self::tool_signature_args_summary(&args_str); + format!("{}:{}", tc.tool_name, args_summary) + }) + .collect(); + sigs.sort(); + let round_sig = sigs.join("|"); + recent_tool_signatures.push(round_sig); + } else { + recent_tool_signatures.clear(); + } + + let after_round_tokens = + Self::estimate_request_tokens_internal(&messages, tool_definitions.as_deref()); + let after_round_health = ContextHealthSnapshot::from_runtime_observations( + ContextHealthSnapshot::token_usage_ratio(after_round_tokens, context_window), + microcompact_count, + full_compression_count, + compression_failure_count, + &recent_tool_signatures, + &messages, + ); + after_round_health.log( + &context.session_id, + &context.dialog_turn_id, + round_index, + "after_round", + ); + 
after_round_health.log_policy_thresholds( + &context.session_id, + &context.dialog_turn_id, + round_index, + &context_profile_policy, + ); + + let max_consec = context_profile_policy + .effective_loop_threshold(self.config.max_consecutive_same_tool); + if recent_tool_signatures.len() >= max_consec { + let tail = &recent_tool_signatures[recent_tool_signatures.len() - max_consec..]; + if tail.windows(2).all(|w| w[0] == w[1]) { + warn!( + "Loop detected: {} consecutive rounds with identical tool signatures, stopping", + max_consec + ); + loop_detected = true; + finalization_reason = Some("loop_detected"); + break; + } + } + // User-steering messages submitted while this turn is running: drain and inject // them as user messages into the working history before starting the next round // (Codex-style mid-turn injection). This does NOT end the current turn, in @@ -1881,7 +2221,11 @@ impl ExecutionEngine { let finish_reason = FinishReason::Complete; // success reflects whether we ended with a usable final answer. 
- let success = !matches!(effective_finish_reason, "finalize_failed" | "empty_round"); + let success = !loop_detected + && !matches!( + effective_finish_reason, + "finalize_failed" | "empty_round" | "max_rounds" + ); // Emit dialog turn completed event debug!("Preparing to send DialogTurnCompleted event"); @@ -2082,11 +2426,12 @@ impl ExecutionEngine { #[cfg(test)] mod tests { - use super::ExecutionEngine; + use super::{ContextHealthSnapshot, ExecutionEngine}; use crate::agentic::core::{Message, ToolCall, ToolResult}; use crate::service::config::types::AIConfig; use crate::service::config::types::AIModelConfig; use serde_json::json; + use sha2::{Digest, Sha256}; fn build_model(id: &str, name: &str, model_name: &str) -> AIModelConfig { AIModelConfig { @@ -2131,6 +2476,76 @@ mod tests { ); } + #[test] + fn tool_signature_args_summary_truncates_on_utf8_boundary() { + let args = format!("{}{}", "a".repeat(62), "案".repeat(30)); + let args_hash = hex::encode(Sha256::digest(args.as_bytes())); + + let summary = ExecutionEngine::tool_signature_args_summary(&args); + + assert_eq!( + summary, + format!("{}..#{}:sha256={}", "a".repeat(62), args.len(), args_hash) + ); + } + + #[test] + fn tool_signature_args_summary_keeps_short_arguments() { + let args = r#"{"content":"short"}"#; + + let summary = ExecutionEngine::tool_signature_args_summary(args); + + assert_eq!(summary, args); + } + + #[test] + fn tool_signature_args_summary_distinguishes_same_prefix_and_length() { + let first = format!("{}{}", "x".repeat(64), "a".repeat(80)); + let second = format!("{}{}", "x".repeat(64), "b".repeat(80)); + + let first_summary = ExecutionEngine::tool_signature_args_summary(&first); + let second_summary = ExecutionEngine::tool_signature_args_summary(&second); + + assert_eq!(first.len(), second.len()); + assert_ne!(first, second); + assert_ne!(first_summary, second_summary); + } + + #[test] + fn context_health_snapshot_scores_repeated_tool_signatures() { + let signatures = vec![ + 
r#"Bash:{"command":"cargo test"}"#.to_string(), + r#"Bash:{"command":"cargo test"}"#.to_string(), + r#"Bash:{"command":"cargo test"}"#.to_string(), + ]; + + let snapshot = + ContextHealthSnapshot::from_runtime_observations(0.82, 2, 1, 0, &signatures, &[]); + + assert!((snapshot.token_usage_ratio - 0.82).abs() < f32::EPSILON); + assert_eq!(snapshot.microcompact_count, 2); + assert_eq!(snapshot.full_compression_count, 1); + assert_eq!(snapshot.compression_failure_count, 0); + assert_eq!(snapshot.repeated_tool_signature_count, 3); + assert_eq!(snapshot.consecutive_failed_commands, 0); + } + + #[test] + fn context_health_snapshot_counts_consecutive_failed_commands() { + let messages = vec![ + command_result("Bash", true, Some(0)), + command_result("Bash", false, Some(1)), + command_result("Git", false, Some(128)), + ]; + + let snapshot = + ContextHealthSnapshot::from_runtime_observations(0.44, 0, 0, 2, &[], &messages); + + assert_eq!(snapshot.repeated_tool_signature_count, 0); + assert_eq!(snapshot.consecutive_failed_commands, 2); + assert_eq!(snapshot.compression_failure_count, 2); + } + #[test] fn assistant_has_tool_calls_detects_mixed_tool_message() { let message = Message::assistant_with_tools( @@ -2182,4 +2597,20 @@ mod tests { assistant, ])); } + + fn command_result(tool_name: &str, success: bool, exit_code: Option) -> Message { + Message::tool_result(ToolResult { + tool_id: format!("{}-tool", tool_name), + tool_name: tool_name.to_string(), + result: json!({ + "success": success, + "exit_code": exit_code, + "command": format!("{} command", tool_name), + }), + result_for_assistant: None, + is_error: !success, + duration_ms: Some(1), + image_attachments: None, + }) + } } diff --git a/src/crates/core/src/agentic/execution/round_executor.rs b/src/crates/core/src/agentic/execution/round_executor.rs index 7c5ffc76f..85603cf27 100644 --- a/src/crates/core/src/agentic/execution/round_executor.rs +++ b/src/crates/core/src/agentic/execution/round_executor.rs @@ -2,7 +2,7 
@@ //! //! Executes a single model round: calls AI, processes streaming responses, executes tools -use super::stream_processor::{StreamProcessor, StreamResult}; +use super::stream_processor::{StreamProcessOptions, StreamProcessor, StreamResult}; use super::types::{FinishReason, RoundContext, RoundResult}; use crate::agentic::core::{Message, ToolCall}; use crate::agentic::events::{AgenticEvent, EventPriority, EventQueue, ToolEventData}; @@ -180,7 +180,7 @@ impl RoundExecutor { let stream_started_at = Instant::now(); match self .stream_processor - .process_stream( + .process_stream_with_options( ai_stream, StreamProcessor::derive_watchdog_timeout(ai_client.stream_idle_timeout()), raw_sse_rx, // Pass raw SSE data receiver (for error diagnosis) @@ -189,6 +189,9 @@ impl RoundExecutor { round_id.clone(), subagent_parent_info.clone(), &cancel_token, + StreamProcessOptions { + recover_partial_on_cancel: context.recover_partial_on_cancel, + }, ) .await { diff --git a/src/crates/core/src/agentic/execution/stream_processor.rs b/src/crates/core/src/agentic/execution/stream_processor.rs index ff35e2fb2..45738b233 100644 --- a/src/crates/core/src/agentic/execution/stream_processor.rs +++ b/src/crates/core/src/agentic/execution/stream_processor.rs @@ -141,6 +141,11 @@ pub struct StreamProcessError { pub has_effective_output: bool, } +#[derive(Debug, Clone, Copy, Default)] +pub struct StreamProcessOptions { + pub recover_partial_on_cancel: bool, +} + impl StreamProcessError { fn new(error: BitFunError, has_effective_output: bool) -> Self { Self { @@ -704,6 +709,32 @@ impl StreamProcessor { /// * `cancellation_token` - Cancellation token #[allow(clippy::too_many_arguments)] pub async fn process_stream( + &self, + stream: futures::stream::BoxStream<'static, Result>, + watchdog_timeout: Option, + raw_sse_rx: Option>, + session_id: String, + dialog_turn_id: String, + round_id: String, + subagent_parent_info: Option, + cancellation_token: &tokio_util::sync::CancellationToken, + ) -> 
Result { + self.process_stream_with_options( + stream, + watchdog_timeout, + raw_sse_rx, + session_id, + dialog_turn_id, + round_id, + subagent_parent_info, + cancellation_token, + StreamProcessOptions::default(), + ) + .await + } + + #[allow(clippy::too_many_arguments)] + pub async fn process_stream_with_options( &self, mut stream: futures::stream::BoxStream<'static, Result>, watchdog_timeout: Option, @@ -713,6 +744,7 @@ impl StreamProcessor { round_id: String, subagent_parent_info: Option, cancellation_token: &tokio_util::sync::CancellationToken, + options: StreamProcessOptions, ) -> Result { let mut ctx = StreamContext::new(session_id, dialog_turn_id, round_id, subagent_parent_info); @@ -754,6 +786,14 @@ impl StreamProcessor { // Check cancellation token _ = cancellation_token.cancelled() => { debug!("Cancel token detected, stopping stream processing: session_id={}", ctx.session_id); + if options.recover_partial_on_cancel && ctx.can_recover_as_partial_result() { + self.send_thinking_end_if_needed(&mut ctx).await; + ctx.force_finish_pending_tool_calls(); + ctx.partial_recovery_reason = + Some("Stream processing cancelled after partial output".to_string()); + self.log_stream_result(&ctx); + break; + } self.graceful_shutdown_from_ctx(&mut ctx, "User cancelled stream processing".to_string()).await; return Err(StreamProcessError::new( BitFunError::Cancelled("Stream processing cancelled".to_string()), @@ -906,7 +946,7 @@ impl StreamProcessor { #[cfg(test)] mod tests { - use super::StreamProcessor; + use super::{StreamProcessOptions, StreamProcessor}; use crate::agentic::events::{EventQueue, EventQueueConfig}; use crate::infrastructure::ai::ai_stream_handlers::{ UnifiedResponse, UnifiedTokenUsage, UnifiedToolCall, @@ -941,6 +981,47 @@ mod tests { } } + #[tokio::test] + async fn recovers_partial_text_when_cancellation_allows_partial_recovery() { + let processor = build_processor(); + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + tx.send(Ok(UnifiedResponse { + 
text: Some("Partial reviewer evidence.".to_string()), + ..Default::default() + })) + .expect("send partial chunk"); + let _keep_stream_open = tx; + let cancellation_token = CancellationToken::new(); + let cancel_clone = cancellation_token.clone(); + tokio::spawn(async move { + tokio::time::sleep(Duration::from_millis(10)).await; + cancel_clone.cancel(); + }); + + let result = processor + .process_stream_with_options( + tokio_stream::wrappers::UnboundedReceiverStream::new(rx).boxed(), + None, + None, + "session_1".to_string(), + "turn_1".to_string(), + "round_1".to_string(), + None, + &cancellation_token, + StreamProcessOptions { + recover_partial_on_cancel: true, + }, + ) + .await + .expect("partial stream result"); + + assert_eq!(result.full_text, "Partial reviewer evidence."); + assert!(result + .partial_recovery_reason + .as_deref() + .is_some_and(|reason| reason.contains("cancelled"))); + } + #[tokio::test] async fn keeps_collecting_tool_args_across_usage_chunks() { let processor = build_processor(); diff --git a/src/crates/core/src/agentic/execution/types.rs b/src/crates/core/src/agentic/execution/types.rs index 5afd01c82..51ce884d6 100644 --- a/src/crates/core/src/agentic/execution/types.rs +++ b/src/crates/core/src/agentic/execution/types.rs @@ -32,6 +32,9 @@ pub struct ExecutionContext { /// When set, engine drains pending user steering messages at each round boundary /// and injects them into the dialog history without ending the turn. pub round_steering: Option>, + /// When true, stream cancellation may be converted into a partial assistant + /// result if text/tool output has already been produced. 
+ pub recover_partial_on_cancel: bool, } /// Round context @@ -54,6 +57,7 @@ pub struct RoundContext { pub steering_interrupt: Option, pub cancellation_token: CancellationToken, pub workspace_services: Option, + pub recover_partial_on_cancel: bool, } /// Round result diff --git a/src/crates/core/src/agentic/mod.rs b/src/crates/core/src/agentic/mod.rs index 6303e305d..12ca1c86d 100644 --- a/src/crates/core/src/agentic/mod.rs +++ b/src/crates/core/src/agentic/mod.rs @@ -17,6 +17,7 @@ pub mod execution; pub mod tools; // Coordination module +pub mod context_profile; pub mod coordination; pub mod deep_review_policy; @@ -43,6 +44,7 @@ mod util; pub mod insights; pub use agents::*; +pub use context_profile::*; pub use coordination::*; pub use core::*; pub use events::{queue, router, types as event_types}; diff --git a/src/crates/core/src/agentic/persistence/manager.rs b/src/crates/core/src/agentic/persistence/manager.rs index 3447e1191..5adb0de6f 100644 --- a/src/crates/core/src/agentic/persistence/manager.rs +++ b/src/crates/core/src/agentic/persistence/manager.rs @@ -659,6 +659,9 @@ impl PersistenceManager { tags: existing.map(|value| value.tags.clone()).unwrap_or_default(), custom_metadata: existing.and_then(|value| value.custom_metadata.clone()), todos: existing.and_then(|value| value.todos.clone()), + deep_review_run_manifest: existing + .and_then(|value| value.deep_review_run_manifest.clone()), + deep_review_cache: existing.and_then(|value| value.deep_review_cache.clone()), workspace_path: Some(workspace_root), workspace_hostname, unread_completion: existing.and_then(|value| value.unread_completion.clone()), diff --git a/src/crates/core/src/agentic/session/compression/compressor.rs b/src/crates/core/src/agentic/session/compression/compressor.rs index c3bc8b690..48819d346 100644 --- a/src/crates/core/src/agentic/session/compression/compressor.rs +++ b/src/crates/core/src/agentic/session/compression/compressor.rs @@ -3,11 +3,12 @@ //! 
Responsible only for transforming a session context into a compressed one. use super::fallback::{ - build_structured_compression_summary, CompressionFallbackOptions, CompressionSummaryArtifact, + build_structured_compression_summary_with_contract, CompressionFallbackOptions, + CompressionSummaryArtifact, }; use crate::agentic::core::{ - render_system_reminder, CompressedTodoSnapshot, CompressionEntry, CompressionPayload, Message, - MessageHelper, MessageRole, MessageSemanticKind, + render_system_reminder, CompressedTodoSnapshot, CompressionContract, CompressionEntry, + CompressionPayload, Message, MessageHelper, MessageRole, MessageSemanticKind, }; use crate::infrastructure::ai::{get_global_ai_client_factory, AIClient}; use crate::util::errors::{BitFunError, BitFunResult}; @@ -192,12 +193,32 @@ impl ContextCompressor { } pub async fn compress_turns( + &self, + session_id: &str, + context_window: usize, + turn_index_to_keep: usize, + turns: Vec, + tail_policy: CompressionTailPolicy, + ) -> BitFunResult { + self.compress_turns_with_contract( + session_id, + context_window, + turn_index_to_keep, + turns, + tail_policy, + None, + ) + .await + } + + pub async fn compress_turns_with_contract( &self, session_id: &str, context_window: usize, turn_index_to_keep: usize, mut turns: Vec, tail_policy: CompressionTailPolicy, + contract: Option, ) -> BitFunResult { if turns.is_empty() { debug!("No turns need compression: session_id={}", session_id); @@ -230,7 +251,7 @@ impl ContextCompressor { let mut has_model_summary = false; if !turns.is_empty() { let mut summary_artifact = self - .execute_compression_with_fallback(turns, context_window) + .execute_compression_with_fallback(turns, context_window, contract) .await?; if turns_to_keep.is_empty() { self.append_todo_snapshot(&mut summary_artifact, last_todo.clone()); @@ -340,6 +361,7 @@ impl ContextCompressor { &self, turns_to_compress: Vec, context_window: usize, + contract: Option, ) -> BitFunResult { let summary_result = match 
get_global_ai_client_factory().await { Ok(ai_client_factory) => match ai_client_factory @@ -347,8 +369,13 @@ impl ContextCompressor { .await { Ok(ai_client) => { - self.execute_compression(ai_client, turns_to_compress.clone(), context_window) - .await + self.execute_compression( + ai_client, + turns_to_compress.clone(), + context_window, + contract.as_ref(), + ) + .await } Err(err) => Err(BitFunError::AIClient(format!( "Failed to get AI client: {}", @@ -364,12 +391,26 @@ impl ContextCompressor { match summary_result { Ok(summary) => { trace!("Compression summary: {}", summary); + let mut payload = CompressionPayload::from_summary(summary.clone()); + let summary_text = + if let Some(contract) = contract.filter(|contract| !contract.is_empty()) { + payload.entries.insert( + 0, + CompressionEntry::Contract { + contract: contract.clone(), + }, + ); + format!( + "{}\n\nPrevious conversation is summarized below:\n{}", + contract.render_for_model(), + summary + ) + } else { + format!("Previous conversation is summarized below:\n{}", summary) + }; Ok(CompressionSummaryArtifact { - summary_text: format!( - "Previous conversation is summarized below:\n{}", - summary - ), - payload: CompressionPayload::from_summary(summary), + summary_text, + payload, used_model_summary: true, }) } @@ -378,12 +419,13 @@ impl ContextCompressor { "Model-based compression failed, falling back to structured local compression: {}", err ); - let summary_artifact = build_structured_compression_summary( + let summary_artifact = build_structured_compression_summary_with_contract( turns_to_compress .into_iter() .map(|turn| turn.messages) .collect(), &self.build_fallback_options(context_window), + contract, ); Ok(summary_artifact) } @@ -426,6 +468,7 @@ impl ContextCompressor { ai_client: Arc, turns_to_compress: Vec, context_window: usize, + contract: Option<&CompressionContract>, ) -> BitFunResult { debug!("Compressing {} turn(s)", turns_to_compress.len()); @@ -483,6 +526,7 @@ Be thorough and precise. 
Do not lose important technical details from either the ai_client.clone(), gen_system_message_for_summary(&summary), cur_messages, + contract, ) .await?; cur_messages = Vec::new(); @@ -506,6 +550,7 @@ Be thorough and precise. Do not lose important technical details from either the ai_client.clone(), gen_system_message_for_summary(&summary), messages_part1, + contract, ) .await?; request_cnt += 1; @@ -518,6 +563,7 @@ Be thorough and precise. Do not lose important technical details from either the ai_client.clone(), gen_system_message_for_summary(&summary), messages_part2, + contract, ) .await?; request_cnt += 1; @@ -540,6 +586,7 @@ Be thorough and precise. Do not lose important technical details from either the ai_client.clone(), gen_system_message_for_summary(&summary), cur_messages, + contract, ) .await?; request_cnt += 1; @@ -553,9 +600,16 @@ Be thorough and precise. Do not lose important technical details from either the ai_client: Arc, system_message_for_summary: Message, messages: Vec, + contract: Option<&CompressionContract>, ) -> BitFunResult { let raw_summary = self - .generate_summary_with_retry(ai_client, system_message_for_summary, messages, 2) + .generate_summary_with_retry( + ai_client, + system_message_for_summary, + messages, + contract, + 2, + ) .await?; Self::normalize_model_summary_output(&raw_summary).ok_or_else(|| { BitFunError::AIClient( @@ -570,6 +624,7 @@ Be thorough and precise. Do not lose important technical details from either the ai_client: Arc, system_message_for_summary: Message, messages: Vec, + contract: Option<&CompressionContract>, max_tries: usize, ) -> BitFunResult { let mut summary_messages = vec![AIMessage::from(system_message_for_summary)]; @@ -578,7 +633,7 @@ Be thorough and precise. 
Do not lose important technical details from either the ai_msg.reasoning_content = None; ai_msg })); - summary_messages.push(AIMessage::user(self.get_compact_prompt())); + summary_messages.push(AIMessage::user(self.get_compact_prompt(contract))); let mut last_error = None; let base_wait_time_ms = 500; @@ -624,9 +679,21 @@ Be thorough and precise. Do not lose important technical details from either the Err(BitFunError::AIClient(error_msg)) } - fn get_compact_prompt(&self) -> String { - r#"Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions. + fn get_compact_prompt(&self, contract: Option<&CompressionContract>) -> String { + let contract_instruction = contract + .filter(|contract| !contract.is_empty()) + .map(|contract| { + format!( + "\n\nThe following compaction contract is authoritative factual context from tool observations. Preserve every field from it in the final :\n{}\n", + contract.render_for_model() + ) + }) + .unwrap_or_default(); + + format!( + r#"Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions. This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing development work without losing context. +{contract_instruction} Before providing your final summary, wrap your analysis in tags to organize your thoughts and ensure you've covered all necessary points. Then output the final retained summary in tags. Important: only the content inside will be kept as compressed history. The section is transient and will be discarded, so do not put any required final information only in . 
@@ -712,7 +779,7 @@ Here's an example of how your output should be structured: Please provide your summary based on the conversation so far, following this structure and ensuring precision and thoroughness in your response. "# - .to_string() + ) } } @@ -729,7 +796,8 @@ fn extract_tag_content<'a>(text: &'a str, tag: &str) -> Option<&'a str> { mod tests { use super::{CompressionTailPolicy, ContextCompressor, TurnWithTokens}; use crate::agentic::core::{ - render_system_reminder, CompressionEntry, CompressionPayload, Message, MessageSemanticKind, + render_system_reminder, CompressionContract, CompressionContractItem, CompressionEntry, + CompressionPayload, Message, MessageSemanticKind, }; fn make_turn(messages: Vec) -> TurnWithTokens { @@ -857,6 +925,28 @@ mod tests { assert!(marker.contains("historical context")); } + #[test] + fn model_summary_prompt_includes_compaction_contract() { + let compressor = ContextCompressor::new(Default::default()); + let contract = CompressionContract { + touched_files: vec!["src/lib.rs".to_string()], + verification_commands: vec![CompressionContractItem { + target: "cargo test".to_string(), + status: "succeeded".to_string(), + summary: "Tests passed.".to_string(), + error_kind: None, + }], + blocking_failures: Vec::new(), + subagent_statuses: Vec::new(), + }; + + let prompt = compressor.get_compact_prompt(Some(&contract)); + + assert!(prompt.contains("authoritative factual context")); + assert!(prompt.contains("src/lib.rs")); + assert!(prompt.contains("cargo test")); + } + #[test] fn model_summary_output_uses_summary_tag_body_only() { let normalized = ContextCompressor::normalize_model_summary_output( diff --git a/src/crates/core/src/agentic/session/compression/fallback/mod.rs b/src/crates/core/src/agentic/session/compression/fallback/mod.rs index 2e46b9596..dc0aa0b62 100644 --- a/src/crates/core/src/agentic/session/compression/fallback/mod.rs +++ b/src/crates/core/src/agentic/session/compression/fallback/mod.rs @@ -4,6 +4,7 @@ mod 
render; mod sanitize; mod types; +use crate::agentic::core::{CompressionContract, CompressionEntry}; use builder::build_entries_from_turns; use payload::trim_payload_to_budget; use render::render_payload_for_model; @@ -14,7 +15,18 @@ pub fn build_structured_compression_summary( turns: Vec>, options: &CompressionFallbackOptions, ) -> CompressionSummaryArtifact { - let entries = build_entries_from_turns(turns, options); + build_structured_compression_summary_with_contract(turns, options, None) +} + +pub fn build_structured_compression_summary_with_contract( + turns: Vec>, + options: &CompressionFallbackOptions, + contract: Option, +) -> CompressionSummaryArtifact { + let mut entries = build_entries_from_turns(turns, options); + if let Some(contract) = contract.filter(|contract| !contract.is_empty()) { + entries.insert(0, CompressionEntry::Contract { contract }); + } let trimmed_payload = trim_payload_to_budget(entries, options); let summary_text = render_payload_for_model(&trimmed_payload); diff --git a/src/crates/core/src/agentic/session/compression/fallback/payload.rs b/src/crates/core/src/agentic/session/compression/fallback/payload.rs index 29497e80a..4721cae6e 100644 --- a/src/crates/core/src/agentic/session/compression/fallback/payload.rs +++ b/src/crates/core/src/agentic/session/compression/fallback/payload.rs @@ -14,15 +14,29 @@ pub(super) fn trim_payload_to_budget( } let units = flatten_entries_to_units(entries); - let mut selected_units = Vec::new(); - - for unit in units.into_iter().rev() { + let mut selected_units: Vec = units + .iter() + .filter_map(|unit| match unit { + CompressionUnit::Contract { .. } => Some(unit.clone()), + _ => None, + }) + .collect(); + let history_units: Vec = units + .into_iter() + .filter(|unit| !matches!(unit, CompressionUnit::Contract { .. 
})) + .collect(); + + for unit in history_units.into_iter().rev() { let mut candidate_units = vec![unit.clone()]; candidate_units.extend(selected_units.clone()); let candidate_payload = rebuild_payload_from_units(candidate_units); if estimate_payload_tokens(&candidate_payload) <= options.max_tokens { - selected_units.insert(0, unit); + let history_insert_index = selected_units + .iter() + .take_while(|selected| matches!(selected, CompressionUnit::Contract { .. })) + .count(); + selected_units.insert(history_insert_index, unit); } } @@ -34,6 +48,9 @@ fn flatten_entries_to_units(entries: Vec) -> Vec { + units.push(CompressionUnit::Contract { contract }); + } CompressionEntry::ModelSummary { text } => { units.push(CompressionUnit::ModelSummary { text }); } @@ -72,6 +89,16 @@ fn rebuild_payload_from_units(units: Vec) -> CompressionPayload for unit in units { match unit { + CompressionUnit::Contract { contract } => { + flush_rebuilt_turn( + &mut entries, + &mut current_turn_entry_id, + &mut current_turn_id, + &mut current_messages, + &mut current_todo, + ); + entries.push(CompressionEntry::Contract { contract }); + } CompressionUnit::ModelSummary { text } => { flush_rebuilt_turn( &mut entries, diff --git a/src/crates/core/src/agentic/session/compression/fallback/render.rs b/src/crates/core/src/agentic/session/compression/fallback/render.rs index 4e635d128..fd0335226 100644 --- a/src/crates/core/src/agentic/session/compression/fallback/render.rs +++ b/src/crates/core/src/agentic/session/compression/fallback/render.rs @@ -1,5 +1,6 @@ use crate::agentic::core::{ - CompressedMessage, CompressedMessageRole, CompressionEntry, CompressionPayload, + CompressedMessage, CompressedMessageRole, CompressionContract, CompressionEntry, + CompressionPayload, }; use serde_json::{json, Value}; @@ -9,12 +10,16 @@ pub(super) fn render_payload_for_model(payload: &CompressionPayload) -> String { .to_string(); } - let mut sections = Vec::new(); + let mut contract_sections = Vec::new(); + let 
mut history_sections = Vec::new(); for (index, entry) in payload.entries.iter().enumerate() { match entry { + CompressionEntry::Contract { contract } => { + contract_sections.push(render_contract(contract)); + } CompressionEntry::ModelSummary { text } => { - sections.push(format!( + history_sections.push(format!( "Earlier summarized history {}:\n{}", index + 1, text @@ -41,14 +46,20 @@ pub(super) fn render_payload_for_model(payload: &CompressionPayload) -> String { } } } - sections.push(lines.join("\n")); + history_sections.push(lines.join("\n")); } } } + let mut sections = contract_sections; + sections.extend(history_sections); sections.join("\n\n") } +fn render_contract(contract: &CompressionContract) -> String { + contract.render_for_model() +} + fn render_compressed_message( lines: &mut Vec, message: &CompressedMessage, diff --git a/src/crates/core/src/agentic/session/compression/fallback/tests.rs b/src/crates/core/src/agentic/session/compression/fallback/tests.rs index e90e006ef..1a8be5f32 100644 --- a/src/crates/core/src/agentic/session/compression/fallback/tests.rs +++ b/src/crates/core/src/agentic/session/compression/fallback/tests.rs @@ -1,7 +1,11 @@ -use super::{build_structured_compression_summary, CompressionFallbackOptions}; +use super::{ + build_structured_compression_summary, build_structured_compression_summary_with_contract, + CompressionFallbackOptions, +}; use crate::agentic::core::{ - render_system_reminder, render_user_query, CompressedMessageRole, CompressionEntry, - CompressionPayload, Message, MessageSemanticKind, ToolCall, ToolResult, + render_system_reminder, render_user_query, CompressedMessageRole, CompressionContract, + CompressionContractItem, CompressionEntry, CompressionPayload, Message, MessageSemanticKind, + ToolCall, ToolResult, }; use serde_json::json; @@ -172,3 +176,58 @@ fn groups_consecutive_assistant_messages_under_single_role_header() { .summary_text .contains("Updated the styling changes.")); } + +#[test] +fn 
renders_contract_facts_even_when_tool_results_are_cleared() { + let contract = CompressionContract { + touched_files: vec!["src/main.rs".to_string()], + verification_commands: vec![CompressionContractItem { + target: "cargo test".to_string(), + status: "succeeded".to_string(), + summary: "Verification command completed.".to_string(), + error_kind: None, + }], + blocking_failures: vec![CompressionContractItem { + target: "pnpm run type-check:web".to_string(), + status: "failed".to_string(), + summary: "Type check failed before compression.".to_string(), + error_kind: Some("exit_code:2".to_string()), + }], + subagent_statuses: vec![CompressionContractItem { + target: "ReviewSecurity".to_string(), + status: "partial_timeout".to_string(), + summary: "Security reviewer timed out after partial output.".to_string(), + error_kind: Some("timeout".to_string()), + }], + }; + + let summary_artifact = build_structured_compression_summary_with_contract( + vec![vec![Message::tool_result(ToolResult { + tool_id: "tool_1".to_string(), + tool_name: "Read".to_string(), + result: json!({"content": "large output omitted"}), + result_for_assistant: Some("large output omitted".to_string()), + is_error: false, + duration_ms: None, + image_attachments: None, + })]], + &default_options(), + Some(contract), + ); + + assert!(summary_artifact + .summary_text + .contains("Compaction contract:")); + assert!(summary_artifact.summary_text.contains("src/main.rs")); + assert!(summary_artifact.summary_text.contains("cargo test")); + assert!(summary_artifact + .summary_text + .contains("pnpm run type-check:web")); + assert!(summary_artifact.summary_text.contains("exit_code:2")); + assert!(summary_artifact.summary_text.contains("ReviewSecurity")); + assert!(summary_artifact.summary_text.contains("partial_timeout")); + assert!(matches!( + &summary_artifact.payload.entries[0], + CompressionEntry::Contract { .. 
} + )); +} diff --git a/src/crates/core/src/agentic/session/compression/fallback/types.rs b/src/crates/core/src/agentic/session/compression/fallback/types.rs index dd27488d5..e90edaf42 100644 --- a/src/crates/core/src/agentic/session/compression/fallback/types.rs +++ b/src/crates/core/src/agentic/session/compression/fallback/types.rs @@ -1,4 +1,6 @@ -use crate::agentic::core::{CompressedMessage, CompressedTodoSnapshot, CompressionPayload}; +use crate::agentic::core::{ + CompressedMessage, CompressedTodoSnapshot, CompressionContract, CompressionPayload, +}; #[derive(Debug, Clone)] pub struct CompressionFallbackOptions { @@ -18,6 +20,9 @@ pub struct CompressionSummaryArtifact { #[derive(Debug, Clone)] pub(super) enum CompressionUnit { + Contract { + contract: CompressionContract, + }, ModelSummary { text: String, }, diff --git a/src/crates/core/src/agentic/session/compression/microcompact.rs b/src/crates/core/src/agentic/session/compression/microcompact.rs index c751b51e3..92b7ce9c7 100644 --- a/src/crates/core/src/agentic/session/compression/microcompact.rs +++ b/src/crates/core/src/agentic/session/compression/microcompact.rs @@ -9,6 +9,9 @@ //! Design reference: Claude Code `microCompact.ts` (time-based clearing path). use crate::agentic::core::{Message, MessageContent}; +use crate::agentic::session::{ + EvidenceLedgerEvent, EvidenceLedgerEventStatus, EvidenceLedgerTargetKind, +}; use log::{debug, info}; use std::collections::HashSet; @@ -57,6 +60,15 @@ impl Default for MicrocompactConfig { pub struct MicrocompactResult { pub tools_cleared: usize, pub tools_kept: usize, + pub evidence_events: Vec, + pub evidence_events_preserved: usize, +} + +/// Session/turn scope used when preserving facts for cleared tool results. +#[derive(Debug, Clone, Copy)] +pub struct MicrocompactEvidenceScope<'a> { + pub session_id: &'a str, + pub turn_id: &'a str, } /// Run microcompact on the message list **in place**. 
@@ -66,6 +78,23 @@ pub struct MicrocompactResult { pub fn microcompact_messages( messages: &mut [Message], config: &MicrocompactConfig, +) -> Option { + microcompact_messages_internal(messages, config, None) +} + +/// Run microcompact and preserve a ledger event for each cleared tool result. +pub fn microcompact_messages_with_evidence( + messages: &mut [Message], + config: &MicrocompactConfig, + evidence_scope: MicrocompactEvidenceScope<'_>, +) -> Option { + microcompact_messages_internal(messages, config, Some(evidence_scope)) +} + +fn microcompact_messages_internal( + messages: &mut [Message], + config: &MicrocompactConfig, + evidence_scope: Option>, ) -> Option { let compactable = default_compactable_tools(); @@ -96,7 +125,25 @@ pub fn microcompact_messages( } let mut cleared = 0usize; + let mut evidence_events = Vec::new(); for &idx in to_clear { + let already_cleared = matches!( + &messages[idx].content, + MessageContent::ToolResult { + result_for_assistant, + .. + } if result_for_assistant.as_deref() == Some(CLEARED_PLACEHOLDER) + ); + if already_cleared { + continue; + } + + if let Some(scope) = evidence_scope { + if let Some(event) = build_evidence_event_for_tool_result(&messages[idx], scope) { + evidence_events.push(event); + } + } + let msg = &mut messages[idx]; if let MessageContent::ToolResult { ref mut result, @@ -105,10 +152,6 @@ pub fn microcompact_messages( .. 
} = msg.content { - // Skip if already cleared - if result_for_assistant.as_deref() == Some(CLEARED_PLACEHOLDER) { - continue; - } *result = serde_json::json!(CLEARED_PLACEHOLDER); *result_for_assistant = Some(CLEARED_PLACEHOLDER.to_string()); *image_attachments = None; @@ -123,27 +166,176 @@ pub fn microcompact_messages( } let kept = compactable_indices.len() - cleared; + let evidence_events_preserved = evidence_events.len(); info!( - "Microcompact: cleared {} tool result(s), kept {} recent", - cleared, kept + "Microcompact: cleared {} tool result(s), kept {} recent, preserved {} evidence event(s)", + cleared, kept, evidence_events_preserved ); debug!( - "Microcompact details: total_compactable={}, keep_recent={}, cleared={}", + "Microcompact details: total_compactable={}, keep_recent={}, cleared={}, evidence_events={}", compactable_indices.len(), config.keep_recent, - cleared + cleared, + evidence_events_preserved ); Some(MicrocompactResult { tools_cleared: cleared, tools_kept: kept, + evidence_events, + evidence_events_preserved, }) } +fn build_evidence_event_for_tool_result( + message: &Message, + scope: MicrocompactEvidenceScope<'_>, +) -> Option { + let MessageContent::ToolResult { + tool_name, + result, + is_error, + .. 
+ } = &message.content + else { + return None; + }; + + let turn_id = message.metadata.turn_id.as_deref().unwrap_or(scope.turn_id); + let target_kind = infer_target_kind(tool_name); + let target = infer_target(tool_name, result); + let status = infer_event_status(result, *is_error); + let mut event = EvidenceLedgerEvent::new( + scope.session_id, + turn_id, + tool_name, + target_kind, + target, + status, + format!( + "Preserved {} tool result before microcompact clearing.", + tool_name + ), + ); + + if let Some(error_kind) = infer_error_kind(result, *is_error) { + event = event.with_error_kind(error_kind); + } + + let touched_files = infer_touched_files(tool_name, result); + if !touched_files.is_empty() { + event = event.with_touched_files(touched_files); + } + + if let Some(artifact_path) = infer_artifact_path(result) { + event = event.with_artifact_path(artifact_path); + } + + Some(event) +} + +fn infer_target_kind(tool_name: &str) -> EvidenceLedgerTargetKind { + match tool_name { + "Bash" | "Git" => EvidenceLedgerTargetKind::Command, + "Read" | "Grep" | "Glob" | "LS" | "Edit" | "Write" | "Delete" | "GetFileDiff" => { + EvidenceLedgerTargetKind::File + } + _ => EvidenceLedgerTargetKind::Unknown, + } +} + +fn infer_target(tool_name: &str, result: &serde_json::Value) -> String { + match tool_name { + "Bash" | "Git" => string_field(result, "command") + .or_else(|| { + let operation = string_field(result, "operation")?; + Some(format!("git {}", operation)) + }) + .unwrap_or_else(|| tool_name.to_string()), + "Read" | "Edit" | "Write" | "Delete" | "GetFileDiff" => string_field(result, "file_path") + .or_else(|| string_field(result, "path")) + .unwrap_or_else(|| tool_name.to_string()), + "Grep" => string_field(result, "pattern") + .or_else(|| string_field(result, "path")) + .unwrap_or_else(|| tool_name.to_string()), + "Glob" => string_field(result, "pattern") + .or_else(|| string_field(result, "path")) + .unwrap_or_else(|| tool_name.to_string()), + "LS" => 
string_field(result, "path") + .or_else(|| string_field(result, "directory")) + .unwrap_or_else(|| tool_name.to_string()), + _ => string_field(result, "target").unwrap_or_else(|| tool_name.to_string()), + } +} + +fn infer_event_status(result: &serde_json::Value, is_error: bool) -> EvidenceLedgerEventStatus { + if is_error + || bool_field(result, "timed_out") == Some(true) + || bool_field(result, "interrupted") == Some(true) + || bool_field(result, "success") == Some(false) + || numeric_field(result, "exit_code").is_some_and(|code| code != 0) + { + EvidenceLedgerEventStatus::Failed + } else { + EvidenceLedgerEventStatus::Succeeded + } +} + +fn infer_error_kind(result: &serde_json::Value, is_error: bool) -> Option { + if bool_field(result, "timed_out") == Some(true) { + return Some("timeout".to_string()); + } + if bool_field(result, "interrupted") == Some(true) { + return Some("interrupted".to_string()); + } + if let Some(exit_code) = numeric_field(result, "exit_code") { + if exit_code != 0 { + return Some(format!("exit_code:{}", exit_code)); + } + } + if is_error || result.get("error").is_some() || bool_field(result, "success") == Some(false) { + return Some("tool_error".to_string()); + } + None +} + +fn infer_touched_files(tool_name: &str, result: &serde_json::Value) -> Vec { + match tool_name { + "Edit" | "Write" | "Delete" => string_field(result, "file_path") + .or_else(|| string_field(result, "path")) + .into_iter() + .collect(), + _ => Vec::new(), + } +} + +fn infer_artifact_path(result: &serde_json::Value) -> Option { + string_field(result, "artifact_path") + .or_else(|| string_field(result, "output_file")) + .or_else(|| string_field(result, "transcript_path")) +} + +fn string_field(result: &serde_json::Value, key: &str) -> Option { + result + .get(key) + .and_then(|value| value.as_str()) + .filter(|value| !value.trim().is_empty()) + .map(ToString::to_string) +} + +fn bool_field(result: &serde_json::Value, key: &str) -> Option { + 
result.get(key).and_then(|value| value.as_bool()) +} + +fn numeric_field(result: &serde_json::Value, key: &str) -> Option { + result.get(key).and_then(|value| value.as_i64()) +} + #[cfg(test)] mod tests { use super::*; use crate::agentic::core::{Message, ToolResult}; + use serde_json::json; fn make_tool_result(tool_name: &str, content: &str) -> Message { Message::tool_result(ToolResult { @@ -157,6 +349,22 @@ mod tests { }) } + fn make_tool_result_with_data( + tool_name: &str, + data: serde_json::Value, + assistant_text: &str, + ) -> Message { + Message::tool_result(ToolResult { + tool_id: format!("id_{}", tool_name), + tool_name: tool_name.to_string(), + result: data, + result_for_assistant: Some(assistant_text.to_string()), + is_error: false, + duration_ms: None, + image_attachments: None, + }) + } + #[test] fn clears_old_compactable_results() { let mut messages = vec![ @@ -251,4 +459,121 @@ mod tests { let r2 = microcompact_messages(&mut messages, &config); assert!(r2.is_none()); } + + #[test] + fn preserves_read_target_before_clearing_tool_result() { + let mut messages = vec![ + make_tool_result_with_data( + "Read", + json!({ + "file_path": "src/main.rs", + "content": "fn main() {}", + "success": true + }), + "Read lines 1-1 from src/main.rs", + ) + .with_turn_id("turn-old".to_string()), + make_tool_result("Read", "recent"), + ]; + + let config = MicrocompactConfig { + keep_recent: 1, + trigger_ratio: 0.0, + }; + let result = microcompact_messages_with_evidence( + &mut messages, + &config, + MicrocompactEvidenceScope { + session_id: "session-a", + turn_id: "turn-current", + }, + ) + .expect("microcompact result"); + + assert_eq!(result.tools_cleared, 1); + assert_eq!(result.evidence_events_preserved, 1); + assert_eq!(result.evidence_events[0].session_id, "session-a"); + assert_eq!(result.evidence_events[0].turn_id, "turn-old"); + assert_eq!(result.evidence_events[0].tool_name, "Read"); + assert_eq!( + result.evidence_events[0].target_kind, + 
EvidenceLedgerTargetKind::File + ); + assert_eq!(result.evidence_events[0].target, "src/main.rs"); + assert_eq!( + result.evidence_events[0].status, + EvidenceLedgerEventStatus::Succeeded + ); + } + + #[test] + fn preserves_failed_command_error_kind_before_clearing() { + let mut messages = vec![ + make_tool_result_with_data( + "Bash", + json!({ + "command": "cargo test", + "success": false, + "exit_code": 1, + "output": "test failed" + }), + "Command failed", + ), + make_tool_result("Read", "recent"), + ]; + + let config = MicrocompactConfig { + keep_recent: 1, + trigger_ratio: 0.0, + }; + let result = microcompact_messages_with_evidence( + &mut messages, + &config, + MicrocompactEvidenceScope { + session_id: "session-a", + turn_id: "turn-a", + }, + ) + .expect("microcompact result"); + + let event = &result.evidence_events[0]; + assert_eq!(event.target_kind, EvidenceLedgerTargetKind::Command); + assert_eq!(event.target, "cargo test"); + assert_eq!(event.status, EvidenceLedgerEventStatus::Failed); + assert_eq!( + event.exit_code_or_error_kind.as_deref(), + Some("exit_code:1") + ); + } + + #[test] + fn preserves_mutated_file_in_touched_files_before_clearing() { + let mut messages = vec![ + make_tool_result_with_data( + "Edit", + json!({ + "file_path": "src/lib.rs", + "success": true + }), + "Successfully edited src/lib.rs", + ), + make_tool_result("Read", "recent"), + ]; + + let config = MicrocompactConfig { + keep_recent: 1, + trigger_ratio: 0.0, + }; + let result = microcompact_messages_with_evidence( + &mut messages, + &config, + MicrocompactEvidenceScope { + session_id: "session-a", + turn_id: "turn-a", + }, + ) + .expect("microcompact result"); + + assert_eq!(result.evidence_events[0].touched_files, vec!["src/lib.rs"]); + } } diff --git a/src/crates/core/src/agentic/session/evidence_ledger.rs b/src/crates/core/src/agentic/session/evidence_ledger.rs new file mode 100644 index 000000000..c3ec77f6f --- /dev/null +++ 
b/src/crates/core/src/agentic/session/evidence_ledger.rs @@ -0,0 +1,540 @@ +use crate::agentic::core::{CompressionContract, CompressionContractItem}; +use dashmap::DashMap; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +const MAX_PARTIAL_OUTPUT_BYTES: usize = 8_000; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum EvidenceLedgerTargetKind { + #[serde(rename = "file")] + File, + #[serde(rename = "command")] + Command, + #[serde(rename = "subagent")] + Subagent, + #[serde(rename = "artifact")] + Artifact, + #[serde(rename = "checkpoint")] + Checkpoint, + #[serde(rename = "unknown")] + Unknown, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum EvidenceLedgerEventStatus { + #[serde(rename = "created")] + Created, + #[serde(rename = "succeeded")] + Succeeded, + #[serde(rename = "failed")] + Failed, + #[serde(rename = "partial_timeout")] + PartialTimeout, + #[serde(rename = "cancelled")] + Cancelled, + #[serde(rename = "unknown")] + Unknown, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvidenceLedgerCheckpoint { + #[serde(skip_serializing_if = "Option::is_none")] + pub current_branch: Option, + pub dirty_state_summary: String, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub touched_files: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub diff_hash: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvidenceLedgerEvent { + pub event_id: String, + pub session_id: String, + pub turn_id: String, + pub tool_name: String, + pub target_kind: EvidenceLedgerTargetKind, + pub target: String, + pub status: EvidenceLedgerEventStatus, + pub exit_code_or_error_kind: Option, + pub touched_files: Vec, + pub artifact_path: Option, + pub summary: String, + pub partial_output: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub checkpoint: Option, + pub 
created_at_ms: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvidenceLedgerSummaryItem { + pub event_id: String, + pub turn_id: String, + pub tool_name: String, + pub target_kind: EvidenceLedgerTargetKind, + pub target: String, + pub status: EvidenceLedgerEventStatus, + pub summary: String, + pub error_kind: Option, + pub partial_output: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub checkpoint: Option, +} + +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvidenceLedgerSummary { + pub touched_files: Vec, + pub latest_failed_commands: Vec, + pub latest_verification_commands: Vec, + pub partial_subagent_results: Vec, + pub latest_checkpoints: Vec, +} + +#[derive(Debug, Default)] +pub struct SessionEvidenceLedger { + events_by_session: Arc>>, +} + +impl EvidenceLedgerEvent { + pub fn new( + session_id: impl Into, + turn_id: impl Into, + tool_name: impl Into, + target_kind: EvidenceLedgerTargetKind, + target: impl Into, + status: EvidenceLedgerEventStatus, + summary: impl Into, + ) -> Self { + Self { + event_id: uuid::Uuid::new_v4().to_string(), + session_id: session_id.into(), + turn_id: turn_id.into(), + tool_name: tool_name.into(), + target_kind, + target: target.into(), + status, + exit_code_or_error_kind: None, + touched_files: Vec::new(), + artifact_path: None, + summary: summary.into(), + partial_output: None, + checkpoint: None, + created_at_ms: current_time_millis(), + } + } + + pub fn checkpoint_created( + session_id: impl Into, + turn_id: impl Into, + tool_name: impl Into, + target: impl Into, + checkpoint: EvidenceLedgerCheckpoint, + ) -> Self { + let target = target.into(); + Self::new( + session_id, + turn_id, + tool_name, + EvidenceLedgerTargetKind::Checkpoint, + target.clone(), + EvidenceLedgerEventStatus::Created, + format!("Checkpoint created before modifying {}.", target), + ) + .with_touched_files(checkpoint.touched_files.clone()) + 
.with_checkpoint(checkpoint) + } + + pub fn with_error_kind(mut self, error_kind: impl Into) -> Self { + self.exit_code_or_error_kind = Some(error_kind.into()); + self + } + + pub fn with_partial_output(mut self, partial_output: impl Into) -> Self { + let partial_output = partial_output.into(); + self.partial_output = Some(truncate_string_at_char_boundary( + &partial_output, + MAX_PARTIAL_OUTPUT_BYTES, + )); + self + } + + pub fn with_touched_files(mut self, touched_files: Vec) -> Self { + self.touched_files = touched_files; + self + } + + pub fn with_artifact_path(mut self, artifact_path: impl Into) -> Self { + self.artifact_path = Some(artifact_path.into()); + self + } + + pub fn with_checkpoint(mut self, checkpoint: EvidenceLedgerCheckpoint) -> Self { + self.checkpoint = Some(checkpoint); + self + } +} + +impl SessionEvidenceLedger { + pub fn new() -> Self { + Self::default() + } + + pub fn append(&self, event: EvidenceLedgerEvent) -> EvidenceLedgerEvent { + self.events_by_session + .entry(event.session_id.clone()) + .or_default() + .push(event.clone()); + event + } + + pub fn events_for_turn(&self, session_id: &str, turn_id: &str) -> Vec { + self.events_by_session + .get(session_id) + .map(|events| { + events + .iter() + .filter(|event| event.turn_id == turn_id) + .cloned() + .collect() + }) + .unwrap_or_default() + } + + pub fn summary_for_session(&self, session_id: &str, limit: usize) -> EvidenceLedgerSummary { + let Some(events) = self.events_by_session.get(session_id) else { + return EvidenceLedgerSummary::default(); + }; + + let mut touched_files = Vec::new(); + let mut latest_failed_commands = Vec::new(); + let mut latest_verification_commands = Vec::new(); + let mut partial_subagent_results = Vec::new(); + let mut latest_checkpoints = Vec::new(); + + for event in events.iter().rev() { + for file in &event.touched_files { + if !touched_files.contains(file) { + touched_files.push(file.clone()); + } + } + + if event.target_kind == 
EvidenceLedgerTargetKind::Command + && event.status == EvidenceLedgerEventStatus::Failed + && latest_failed_commands.len() < limit + { + latest_failed_commands.push(event.into()); + } + + if event.target_kind == EvidenceLedgerTargetKind::Command + && is_verification_command(&event.target) + && latest_verification_commands.len() < limit + { + latest_verification_commands.push(event.into()); + } + + if event.target_kind == EvidenceLedgerTargetKind::Subagent + && event.status == EvidenceLedgerEventStatus::PartialTimeout + && partial_subagent_results.len() < limit + { + partial_subagent_results.push(event.into()); + } + + if event.target_kind == EvidenceLedgerTargetKind::Checkpoint + && event.status == EvidenceLedgerEventStatus::Created + && latest_checkpoints.len() < limit + { + latest_checkpoints.push(event.into()); + } + } + + touched_files.truncate(limit); + + EvidenceLedgerSummary { + touched_files, + latest_failed_commands, + latest_verification_commands, + partial_subagent_results, + latest_checkpoints, + } + } +} + +impl From<&EvidenceLedgerEvent> for EvidenceLedgerSummaryItem { + fn from(event: &EvidenceLedgerEvent) -> Self { + Self { + event_id: event.event_id.clone(), + turn_id: event.turn_id.clone(), + tool_name: event.tool_name.clone(), + target_kind: event.target_kind.clone(), + target: event.target.clone(), + status: event.status.clone(), + summary: event.summary.clone(), + error_kind: event.exit_code_or_error_kind.clone(), + partial_output: event.partial_output.clone(), + checkpoint: event.checkpoint.clone(), + } + } +} + +impl From for CompressionContract { + fn from(summary: EvidenceLedgerSummary) -> Self { + Self { + touched_files: summary.touched_files, + verification_commands: summary + .latest_verification_commands + .into_iter() + .map(compression_contract_item_from_summary_item) + .collect(), + blocking_failures: summary + .latest_failed_commands + .into_iter() + .map(compression_contract_item_from_summary_item) + .collect(), + 
subagent_statuses: summary + .partial_subagent_results + .into_iter() + .map(compression_contract_item_from_summary_item) + .collect(), + } + } +} + +fn compression_contract_item_from_summary_item( + item: EvidenceLedgerSummaryItem, +) -> CompressionContractItem { + CompressionContractItem { + target: item.target, + status: event_status_label(&item.status).to_string(), + summary: item.summary, + error_kind: item.error_kind, + } +} + +fn event_status_label(status: &EvidenceLedgerEventStatus) -> &'static str { + match status { + EvidenceLedgerEventStatus::Created => "created", + EvidenceLedgerEventStatus::Succeeded => "succeeded", + EvidenceLedgerEventStatus::Failed => "failed", + EvidenceLedgerEventStatus::PartialTimeout => "partial_timeout", + EvidenceLedgerEventStatus::Cancelled => "cancelled", + EvidenceLedgerEventStatus::Unknown => "unknown", + } +} + +fn current_time_millis() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|duration| duration.as_millis().min(u128::from(u64::MAX)) as u64) + .unwrap_or(0) +} + +fn is_verification_command(command: &str) -> bool { + let command = command.to_ascii_lowercase(); + command.contains(" test") + || command.starts_with("test") + || command.contains("cargo test") + || command.contains("pnpm") + || command.contains("npm test") + || command.contains("yarn test") + || command.contains("vitest") + || command.contains("type-check") + || command.contains("lint") +} + +fn truncate_string_at_char_boundary(value: &str, max_bytes: usize) -> String { + crate::util::truncate_at_char_boundary(value, max_bytes).to_string() +} + +#[cfg(test)] +mod tests { + use super::{ + EvidenceLedgerCheckpoint, EvidenceLedgerEvent, EvidenceLedgerEventStatus, + EvidenceLedgerTargetKind, SessionEvidenceLedger, + }; + + #[test] + fn ledger_reads_events_scoped_by_session_and_turn() { + let ledger = SessionEvidenceLedger::new(); + let event = EvidenceLedgerEvent::new( + "session-a", + "turn-a", + "Task", + 
EvidenceLedgerTargetKind::Subagent, + "ReviewSecurity", + EvidenceLedgerEventStatus::PartialTimeout, + "Security reviewer timed out after partial output.", + ) + .with_error_kind("timeout") + .with_partial_output("Found token logging before timeout."); + + let appended = ledger.append(event); + + assert!(!appended.event_id.is_empty()); + assert_eq!( + ledger.events_for_turn("session-a", "turn-a"), + vec![appended.clone()] + ); + assert!(ledger.events_for_turn("session-a", "other-turn").is_empty()); + assert!(ledger.events_for_turn("other-session", "turn-a").is_empty()); + } + + #[test] + fn checkpoint_created_event_preserves_recovery_boundary_metadata() { + let checkpoint = EvidenceLedgerCheckpoint { + current_branch: Some("feature/context".to_string()), + dirty_state_summary: "staged=1, unstaged=2, untracked=3".to_string(), + touched_files: vec!["src/lib.rs".to_string()], + diff_hash: Some("abc123".to_string()), + }; + + let event = EvidenceLedgerEvent::checkpoint_created( + "session-a", + "turn-a", + "Edit", + "src/lib.rs", + checkpoint.clone(), + ); + + assert_eq!(event.target_kind, EvidenceLedgerTargetKind::Checkpoint); + assert_eq!(event.status, EvidenceLedgerEventStatus::Created); + assert_eq!(event.touched_files, vec!["src/lib.rs"]); + assert_eq!(event.checkpoint.as_ref(), Some(&checkpoint)); + } + + #[test] + fn summary_projects_latest_checkpoints() { + let ledger = SessionEvidenceLedger::new(); + ledger.append(EvidenceLedgerEvent::checkpoint_created( + "session-a", + "turn-a", + "Delete", + "src/old.rs", + EvidenceLedgerCheckpoint { + current_branch: Some("feature/context".to_string()), + dirty_state_summary: "staged=0, unstaged=1, untracked=0".to_string(), + touched_files: vec!["src/old.rs".to_string()], + diff_hash: Some("def456".to_string()), + }, + )); + + let summary = ledger.summary_for_session("session-a", 10); + + assert_eq!(summary.latest_checkpoints.len(), 1); + assert_eq!(summary.latest_checkpoints[0].target, "src/old.rs"); + assert_eq!( + 
summary.latest_checkpoints[0] + .checkpoint + .as_ref() + .and_then(|checkpoint| checkpoint.current_branch.as_deref()), + Some("feature/context") + ); + } + + #[test] + fn summary_projects_partial_subagent_results() { + let ledger = SessionEvidenceLedger::new(); + ledger.append( + EvidenceLedgerEvent::new( + "session-a", + "turn-a", + "Task", + EvidenceLedgerTargetKind::Subagent, + "ReviewSecurity", + EvidenceLedgerEventStatus::PartialTimeout, + "Security reviewer timed out after partial output.", + ) + .with_error_kind("timeout") + .with_partial_output("Found token logging before timeout."), + ); + + let summary = ledger.summary_for_session("session-a", 10); + + assert_eq!(summary.partial_subagent_results.len(), 1); + assert_eq!(summary.partial_subagent_results[0].target, "ReviewSecurity"); + assert_eq!( + summary.partial_subagent_results[0] + .partial_output + .as_deref(), + Some("Found token logging before timeout.") + ); + } + + #[test] + fn partial_output_is_truncated_on_utf8_boundary() { + let ledger = SessionEvidenceLedger::new(); + let output = format!("{}{}", "a".repeat(7_999), "测"); + ledger.append( + EvidenceLedgerEvent::new( + "session-a", + "turn-a", + "Task", + EvidenceLedgerTargetKind::Subagent, + "ReviewSecurity", + EvidenceLedgerEventStatus::PartialTimeout, + "Security reviewer timed out after partial output.", + ) + .with_partial_output(output), + ); + + let summary = ledger.summary_for_session("session-a", 10); + let partial_output = summary.partial_subagent_results[0] + .partial_output + .as_deref() + .expect("partial output"); + + assert_eq!(partial_output.len(), 7_999); + assert!(partial_output.is_char_boundary(partial_output.len())); + } + + #[test] + fn summary_projects_into_compression_contract() { + let ledger = SessionEvidenceLedger::new(); + ledger.append( + EvidenceLedgerEvent::new( + "session-a", + "turn-a", + "Edit", + EvidenceLedgerTargetKind::File, + "src/main.rs", + EvidenceLedgerEventStatus::Succeeded, + "Edited main file.", + ) + 
.with_touched_files(vec!["src/main.rs".to_string()]), + ); + ledger.append( + EvidenceLedgerEvent::new( + "session-a", + "turn-a", + "Bash", + EvidenceLedgerTargetKind::Command, + "cargo test", + EvidenceLedgerEventStatus::Failed, + "Tests failed before compression.", + ) + .with_error_kind("exit_code:1"), + ); + ledger.append(EvidenceLedgerEvent::new( + "session-a", + "turn-a", + "Task", + EvidenceLedgerTargetKind::Subagent, + "ReviewSecurity", + EvidenceLedgerEventStatus::PartialTimeout, + "Security reviewer timed out after partial output.", + )); + + let contract: crate::agentic::core::CompressionContract = + ledger.summary_for_session("session-a", 10).into(); + + assert_eq!(contract.touched_files, vec!["src/main.rs"]); + assert_eq!(contract.verification_commands[0].target, "cargo test"); + assert_eq!( + contract.blocking_failures[0].error_kind.as_deref(), + Some("exit_code:1") + ); + assert_eq!(contract.subagent_statuses[0].target, "ReviewSecurity"); + assert_eq!(contract.subagent_statuses[0].status, "partial_timeout"); + } +} diff --git a/src/crates/core/src/agentic/session/mod.rs b/src/crates/core/src/agentic/session/mod.rs index 1b0b22a94..54578fb87 100644 --- a/src/crates/core/src/agentic/session/mod.rs +++ b/src/crates/core/src/agentic/session/mod.rs @@ -4,8 +4,10 @@ pub mod compression; pub mod context_store; +pub mod evidence_ledger; pub mod session_manager; pub use compression::*; pub use context_store::*; +pub use evidence_ledger::*; pub use session_manager::*; diff --git a/src/crates/core/src/agentic/session/session_manager.rs b/src/crates/core/src/agentic/session/session_manager.rs index 12b7287bb..7a8a49edb 100644 --- a/src/crates/core/src/agentic/session/session_manager.rs +++ b/src/crates/core/src/agentic/session/session_manager.rs @@ -3,19 +3,23 @@ //! 
Responsible for session CRUD, lifecycle management, and resource association use crate::agentic::core::{ - new_turn_id, CompressionState, Message, MessageSemanticKind, ProcessingPhase, Session, - SessionConfig, SessionKind, SessionState, SessionSummary, TurnStats, + new_turn_id, CompressionContract, CompressionState, Message, MessageSemanticKind, + ProcessingPhase, Session, SessionConfig, SessionKind, SessionState, SessionSummary, TurnStats, }; use crate::agentic::image_analysis::ImageContextData; use crate::agentic::persistence::PersistenceManager; -use crate::agentic::session::SessionContextStore; +use crate::agentic::session::{ + EvidenceLedgerCheckpoint, EvidenceLedgerEvent, EvidenceLedgerEventStatus, + EvidenceLedgerSummary, EvidenceLedgerTargetKind, SessionContextStore, SessionEvidenceLedger, +}; use crate::infrastructure::ai::get_global_ai_client_factory; use crate::service::config::{ get_app_language_code, get_global_config_service, short_model_user_language_instruction, subscribe_config_updates, ConfigUpdateEvent, }; use crate::service::session::{ - DialogTurnData, DialogTurnKind, ModelRoundData, TextItemData, TurnStatus, UserMessageData, + DialogTurnData, DialogTurnKind, ModelRoundData, SessionMetadata, TextItemData, TurnStatus, + UserMessageData, }; use crate::service::snapshot::ensure_snapshot_manager_for_workspace; use crate::util::errors::{BitFunError, BitFunResult}; @@ -226,6 +230,30 @@ mod tests { assert_eq!(title, "New Session"); } + + #[tokio::test] + async fn records_subagent_partial_timeout_in_evidence_ledger() { + let persistence_manager = Arc::new( + PersistenceManager::new(Arc::new(PathManager::new().expect("path manager"))) + .expect("persistence manager"), + ); + let manager = test_manager(persistence_manager); + + let event = manager.record_subagent_partial_timeout( + "session-a", + "turn-a", + "ReviewSecurity", + "Found token logging before timeout.", + Some("timeout"), + ); + + assert!(!event.event_id.is_empty()); + let events = 
manager.evidence_events_for_turn("session-a", "turn-a"); + assert_eq!(events, vec![event.clone()]); + let summary = manager.evidence_summary_for_session("session-a", 10); + assert_eq!(summary.partial_subagent_results.len(), 1); + assert_eq!(summary.partial_subagent_results[0].event_id, event.event_id); + } } /// Session manager @@ -241,6 +269,7 @@ pub struct SessionManager { /// Sub-components context_store: Arc, + evidence_ledger: Arc, persistence_manager: Arc, /// Configuration @@ -577,6 +606,7 @@ impl SessionManager { sessions: Arc::new(DashMap::new()), session_workspace_index: Arc::new(DashMap::new()), context_store, + evidence_ledger: Arc::new(SessionEvidenceLedger::new()), persistence_manager, config, }; @@ -591,6 +621,76 @@ impl SessionManager { manager } + pub fn append_evidence_event(&self, event: EvidenceLedgerEvent) -> EvidenceLedgerEvent { + self.evidence_ledger.append(event) + } + + pub fn record_checkpoint_created( + &self, + session_id: &str, + turn_id: &str, + tool_name: &str, + target: &str, + checkpoint: EvidenceLedgerCheckpoint, + ) -> EvidenceLedgerEvent { + self.append_evidence_event(EvidenceLedgerEvent::checkpoint_created( + session_id, turn_id, tool_name, target, checkpoint, + )) + } + + pub fn evidence_events_for_turn( + &self, + session_id: &str, + turn_id: &str, + ) -> Vec { + self.evidence_ledger.events_for_turn(session_id, turn_id) + } + + pub fn evidence_summary_for_session( + &self, + session_id: &str, + limit: usize, + ) -> EvidenceLedgerSummary { + self.evidence_ledger.summary_for_session(session_id, limit) + } + + pub fn compression_contract_for_session( + &self, + session_id: &str, + limit: usize, + ) -> Option { + let contract: CompressionContract = + self.evidence_summary_for_session(session_id, limit).into(); + (!contract.is_empty()).then_some(contract) + } + + pub fn record_subagent_partial_timeout( + &self, + session_id: &str, + turn_id: &str, + subagent_type: &str, + partial_output: &str, + error_kind: Option<&str>, + ) -> 
EvidenceLedgerEvent { + let summary = format!( + "Subagent {} timed out after producing partial output.", + subagent_type + ); + let event = EvidenceLedgerEvent::new( + session_id, + turn_id, + "Task", + EvidenceLedgerTargetKind::Subagent, + subagent_type, + EvidenceLedgerEventStatus::PartialTimeout, + summary, + ) + .with_error_kind(error_kind.unwrap_or("timeout")) + .with_partial_output(partial_output); + + self.append_evidence_event(event) + } + /// Decide whether the given session model id is still usable. /// /// `model_id` is treated as "usable" when: @@ -691,6 +791,7 @@ impl SessionManager { let sessions = self.sessions.clone(); let session_workspace_index = self.session_workspace_index.clone(); let context_store = self.context_store.clone(); + let evidence_ledger = self.evidence_ledger.clone(); let persistence_manager = self.persistence_manager.clone(); let manager_config = self.config.clone(); @@ -709,6 +810,7 @@ impl SessionManager { sessions, session_workspace_index, context_store, + evidence_ledger, persistence_manager, config: manager_config, }; @@ -1435,6 +1537,26 @@ impl SessionManager { } } + pub async fn load_session_metadata( + &self, + workspace_path: &Path, + session_id: &str, + ) -> BitFunResult> { + self.persistence_manager + .load_session_metadata(workspace_path, session_id) + .await + } + + pub async fn save_session_metadata( + &self, + workspace_path: &Path, + metadata: &SessionMetadata, + ) -> BitFunResult<()> { + self.persistence_manager + .save_session_metadata(workspace_path, metadata) + .await + } + // ============ Dialog Turn Management ============ #[allow(clippy::too_many_arguments)] diff --git a/src/crates/core/src/agentic/tools/framework.rs b/src/crates/core/src/agentic/tools/framework.rs index f67b55ac1..4e54f9070 100644 --- a/src/crates/core/src/agentic/tools/framework.rs +++ b/src/crates/core/src/agentic/tools/framework.rs @@ -1,4 +1,7 @@ //! 
Tool framework - Tool interface definition and execution context +use crate::agentic::coordination::get_global_coordinator; +use crate::agentic::deep_review_policy::record_deep_review_shared_context_tool_use; +use crate::agentic::session::EvidenceLedgerCheckpoint; use crate::agentic::tools::restrictions::{ is_local_path_within_root, is_remote_posix_path_within_root, ToolPathOperation, ToolRuntimeRestrictions, @@ -10,13 +13,16 @@ use crate::agentic::tools::workspace_paths::{ use crate::agentic::workspace::WorkspaceServices; use crate::agentic::WorkspaceBinding; use crate::infrastructure::get_path_manager_arc; +use crate::service::git::{GitDiffParams, GitService}; use crate::service::remote_ssh::workspace_state::remote_workspace_runtime_root; use crate::service::{get_workspace_runtime_service_arc, WorkspaceRuntimeContext}; use crate::util::errors::BitFunResult; use crate::util::types::ToolImageAttachment; use async_trait::async_trait; +use log::warn; use serde::{Deserialize, Serialize}; use serde_json::Value; +use sha2::{Digest, Sha256}; use std::collections::HashMap; use std::path::{Path, PathBuf}; use tokio_util::sync::CancellationToken; @@ -95,6 +101,107 @@ impl ToolUseContext { self.workspace_services.as_ref().map(|s| s.shell.as_ref()) } + pub async fn record_light_checkpoint( + &self, + tool_name: &str, + target: &str, + touched_files: Vec, + ) { + let Some(session_id) = self.session_id.as_deref() else { + return; + }; + let Some(turn_id) = self.dialog_turn_id.as_deref() else { + return; + }; + let Some(coordinator) = get_global_coordinator() else { + return; + }; + + let checkpoint = self.build_light_checkpoint(touched_files).await; + coordinator + .get_session_manager() + .record_checkpoint_created(session_id, turn_id, tool_name, target, checkpoint); + } + + async fn build_light_checkpoint(&self, touched_files: Vec) -> EvidenceLedgerCheckpoint { + let mut checkpoint = EvidenceLedgerCheckpoint { + current_branch: None, + dirty_state_summary: 
"workspace_unavailable".to_string(), + touched_files, + diff_hash: None, + }; + + if self.is_remote() { + checkpoint.dirty_state_summary = + "remote_workspace_git_metadata_unavailable".to_string(); + return checkpoint; + } + + let Some(workspace_root) = self.workspace_root() else { + return checkpoint; + }; + + match GitService::get_status(workspace_root).await { + Ok(status) => { + checkpoint.current_branch = Some(status.current_branch); + checkpoint.dirty_state_summary = format!( + "staged={}, unstaged={}, untracked={}", + status.staged.len(), + status.unstaged.len(), + status.untracked.len() + ); + } + Err(error) => { + checkpoint.dirty_state_summary = format!("git_status_unavailable: {}", error); + } + } + + checkpoint.diff_hash = self + .checkpoint_diff_hash(workspace_root, &checkpoint.touched_files) + .await; + checkpoint + } + + async fn checkpoint_diff_hash( + &self, + workspace_root: &Path, + touched_files: &[String], + ) -> Option { + let files = touched_files + .iter() + .filter_map(|file| git_relative_path(workspace_root, file)) + .collect::>(); + + if files.is_empty() { + return None; + } + + let mut diff = String::new(); + for staged in [false, true] { + let params = GitDiffParams { + files: Some(files.clone()), + staged: Some(staged), + ..Default::default() + }; + match GitService::get_diff(workspace_root, ¶ms).await { + Ok(part) => diff.push_str(&part), + Err(error) => { + warn!( + "Failed to collect checkpoint diff hash: staged={}, error={}", + staged, error + ); + return None; + } + } + } + + if diff.is_empty() { + return None; + } + + Some(hex::encode(Sha256::digest(diff.as_bytes()))) + } + pub fn enforce_tool_runtime_restrictions(&self, tool_name: &str) -> BitFunResult<()> { self.runtime_tool_restrictions .ensure_tool_allowed(tool_name) @@ -358,7 +465,7 @@ impl ToolUseContext { } #[cfg(test)] -mod tests { +mod path_resolution_tests { use super::ToolUseContext; use crate::agentic::tools::ToolRuntimeRestrictions; use 
crate::agentic::WorkspaceBinding; @@ -511,6 +618,74 @@ impl ToolResult { } } +fn git_relative_path(workspace_root: &Path, path: &str) -> Option { + if is_bitfun_runtime_uri(path) { + return None; + } + + let path = Path::new(path); + let relative = if path.is_absolute() { + path.strip_prefix(workspace_root).ok()? + } else { + path + }; + + Some(relative.to_string_lossy().replace('\\', "/")) +} + +fn custom_data_str<'a>(context: &'a ToolUseContext, key: &str) -> Option<&'a str> { + context + .custom_data + .get(key) + .and_then(Value::as_str) + .map(str::trim) + .filter(|value| !value.is_empty()) +} + +fn maybe_record_deep_review_shared_context_tool_use( + tool_name: &str, + input: &Value, + context: &ToolUseContext, +) { + if !tool_name.eq_ignore_ascii_case("Read") && !tool_name.eq_ignore_ascii_case("GetFileDiff") { + return; + } + if !custom_data_str(context, "deep_review_subagent_role") + .is_some_and(|role| role.eq_ignore_ascii_case("reviewer")) + { + return; + } + let Some(parent_turn_id) = custom_data_str(context, "deep_review_parent_dialog_turn_id") else { + return; + }; + let Some(file_path) = input + .get("file_path") + .and_then(Value::as_str) + .map(str::trim) + .filter(|value| !value.is_empty()) + else { + return; + }; + let measured_path = if context.is_remote() { + None + } else { + context + .workspace_root() + .and_then(|workspace_root| git_relative_path(workspace_root, file_path)) + } + .unwrap_or_else(|| file_path.to_string()); + let subagent_type = custom_data_str(context, "deep_review_subagent_type") + .or(context.agent_type.as_deref()) + .unwrap_or("unknown"); + + record_deep_review_shared_context_tool_use( + parent_turn_id, + subagent_type, + tool_name, + &measured_path, + ); +} + /// Tool trait #[async_trait] pub trait Tool: Send + Sync { @@ -638,7 +813,7 @@ pub trait Tool: Send + Sync { /// execution to [`call_impl`], so most tools should override `call_impl` /// instead of overriding this method directly. 
async fn call(&self, input: &Value, context: &ToolUseContext) -> BitFunResult> { - if let Some(cancellation_token) = context.cancellation_token.as_ref() { + let result = if let Some(cancellation_token) = context.cancellation_token.as_ref() { tokio::select! { result = self.call_impl(input, context) => { result @@ -650,7 +825,11 @@ pub trait Tool: Send + Sync { } } else { self.call_impl(input, context).await + }; + if result.is_ok() { + maybe_record_deep_review_shared_context_tool_use(self.name(), input, context); } + result } } @@ -659,3 +838,90 @@ pub trait Tool: Send + Sync { pub struct ToolRenderOptions { pub verbose: bool, } + +#[cfg(test)] +mod shared_context_tests { + use super::{Tool, ToolResult, ToolUseContext}; + use crate::agentic::deep_review_policy::deep_review_shared_context_measurement_snapshot; + use crate::agentic::tools::ToolRuntimeRestrictions; + use crate::util::errors::BitFunResult; + use async_trait::async_trait; + use serde_json::{json, Value}; + use std::collections::HashMap; + + struct MeasurementReadTool; + + #[async_trait] + impl Tool for MeasurementReadTool { + fn name(&self) -> &str { + "Read" + } + + async fn description(&self) -> BitFunResult { + Ok("Read file".to_string()) + } + + fn input_schema(&self) -> Value { + json!({ + "type": "object", + "properties": { + "file_path": { "type": "string" } + } + }) + } + + async fn call_impl( + &self, + _input: &Value, + _context: &ToolUseContext, + ) -> BitFunResult> { + Ok(vec![ToolResult::ok( + json!({ "ok": true }), + Some("ok".to_string()), + )]) + } + } + + #[tokio::test] + async fn call_records_deep_review_read_file_measurement_without_touching_result() { + let parent_turn_id = format!("turn-framework-measure-{}", uuid::Uuid::new_v4()); + let mut custom_data = HashMap::new(); + custom_data.insert( + "deep_review_parent_dialog_turn_id".to_string(), + json!(parent_turn_id.clone()), + ); + custom_data.insert("deep_review_subagent_role".to_string(), json!("reviewer")); + custom_data.insert( + 
"deep_review_subagent_type".to_string(), + json!("ReviewSecurity"), + ); + let context = ToolUseContext { + tool_call_id: Some("tool-read".to_string()), + agent_type: Some("ReviewSecurity".to_string()), + session_id: Some("subagent-session".to_string()), + dialog_turn_id: Some("subagent-turn".to_string()), + workspace: None, + custom_data, + computer_use_host: None, + cancellation_token: None, + runtime_tool_restrictions: ToolRuntimeRestrictions::default(), + workspace_services: None, + }; + let tool = MeasurementReadTool; + + let result = tool + .call(&json!({ "file_path": ".\\src\\lib.rs" }), &context) + .await + .expect("read tool call should succeed"); + tool.call(&json!({ "file_path": "src/lib.rs" }), &context) + .await + .expect("read tool call should succeed"); + + assert_eq!(result.len(), 1); + let snapshot = deep_review_shared_context_measurement_snapshot(&parent_turn_id); + assert_eq!(snapshot.total_calls, 2); + assert_eq!(snapshot.duplicate_calls, 1); + assert_eq!(snapshot.repeated_contexts[0].tool_name, "Read"); + assert_eq!(snapshot.repeated_contexts[0].file_path, "src/lib.rs"); + } +} diff --git a/src/crates/core/src/agentic/tools/implementations/bash_tool.rs b/src/crates/core/src/agentic/tools/implementations/bash_tool.rs index e3e6ea6ac..0af76e54a 100644 --- a/src/crates/core/src/agentic/tools/implementations/bash_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/bash_tool.rs @@ -593,6 +593,12 @@ Usage notes: .and_then(|v| v.as_str()) .ok_or_else(|| BitFunError::tool("command is required".to_string()))?; + if command_needs_light_checkpoint(command_str) { + context + .record_light_checkpoint("Bash", command_str, Vec::new()) + .await; + } + // Remote workspace: execute via injected workspace shell if context.is_remote() { let Some(ws_shell) = context.ws_shell() else { @@ -959,6 +965,39 @@ Usage notes: } } +fn command_needs_light_checkpoint(command: &str) -> bool { + let command = command.trim().to_ascii_lowercase(); + let 
mutating_prefixes = [ + "rm ", + "rmdir ", + "del ", + "erase ", + "move ", + "mv ", + "cp ", + "git reset", + "git clean", + "git checkout", + "git switch", + "git merge", + "git rebase", + "git pull", + "git stash", + "git commit", + "cargo fmt", + "cargo fix", + "rustfmt", + "prettier --write", + ]; + + mutating_prefixes + .iter() + .any(|prefix| command.starts_with(prefix)) + || command.contains(" --fix") + || command.contains(" > ") + || command.contains(" >> ") +} + impl BashTool { fn background_output_file_path( context: &ToolUseContext, @@ -1163,6 +1202,15 @@ impl BashTool { mod tests { use super::*; + #[test] + fn checkpoint_detection_flags_mutating_bash_commands() { + assert!(command_needs_light_checkpoint("cargo fmt")); + assert!(command_needs_light_checkpoint("pnpm lint --fix")); + assert!(command_needs_light_checkpoint("rm -rf target/tmp")); + assert!(!command_needs_light_checkpoint("cargo test")); + assert!(!command_needs_light_checkpoint("git status")); + } + #[test] fn truncate_output_preserving_tail_keeps_end_of_output() { let input = "BEGIN-".to_string() + &"x".repeat(120) + "-IMPORTANT-END"; diff --git a/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs b/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs index 075950dc0..b298d1725 100644 --- a/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs @@ -2,17 +2,40 @@ //! //! Used to get structured code review results. 
+use crate::agentic::agents::get_agent_registry; +use crate::agentic::context_profile::ContextProfilePolicy; +use crate::agentic::coordination::get_global_coordinator; +use crate::agentic::core::CompressionContract; +use crate::agentic::deep_review_policy::{ + deep_review_shared_context_measurement_snapshot, DeepReviewIncrementalCache, +}; use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; use crate::service::config::get_app_language_code; use crate::service::i18n::code_review_copy_for_language; use crate::util::errors::BitFunResult; use async_trait::async_trait; -use log::warn; +use log::{debug, warn}; use serde_json::{json, Value}; +use std::collections::HashSet; /// Code review tool definition pub struct CodeReviewTool; +struct DeepReviewCacheUpdate { + value: Value, + hit_count: usize, + miss_count: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct DeepReviewSharedContextDiagnostics { + total_calls: usize, + duplicate_calls: usize, + duplicate_context_count: usize, + max_duplicate_call_count: usize, + max_duplicate_reviewer_count: usize, +} + impl CodeReviewTool { pub fn new() -> Self { Self @@ -204,6 +227,19 @@ impl CodeReviewTool { "type": "string", "description": reviewer_summary_desc }, + "partial_output": { + "type": "string", + "description": "Partial reviewer output captured before timeout or cancellation" + }, + "packet_id": { + "type": "string", + "description": "Deep Review work packet id associated with this reviewer output" + }, + "packet_status_source": { + "type": "string", + "enum": ["reported", "inferred", "missing"], + "description": "Whether packet_id/status was reported by the reviewer, inferred from scheduling metadata, or missing" + }, "issue_count": { "type": "integer", "description": "Validated issue count for this reviewer" @@ -333,6 +369,52 @@ impl CodeReviewTool { }, "additionalProperties": false }, + "reliability_signals": { + "type": "array", + "description": "Structured reliability/status signals for 
Deep Review report UI and export", + "items": { + "type": "object", + "properties": { + "kind": { + "type": "string", + "enum": [ + "context_pressure", + "compression_preserved", + "cache_hit", + "cache_miss", + "concurrency_limited", + "partial_reviewer", + "retry_guidance", + "skipped_reviewers", + "token_budget_limited", + "user_decision" + ], + "description": "Reliability signal category" + }, + "severity": { + "type": "string", + "enum": ["info", "warning", "action"], + "description": "User-facing severity of this signal" + }, + "count": { + "type": "integer", + "minimum": 0, + "description": "Optional affected item count" + }, + "source": { + "type": "string", + "enum": ["runtime", "manifest", "report", "inferred"], + "description": "Where this reliability signal came from" + }, + "detail": { + "type": "string", + "description": "Short user-facing detail for this signal" + } + }, + "required": ["kind", "severity"], + "additionalProperties": false + } + }, "schema_version": { "type": "integer", "description": "Schema version for forward compatibility", @@ -351,10 +433,596 @@ impl CodeReviewTool { .is_some_and(|agent_type| agent_type == "DeepReview") } + fn normalized_non_empty_string(value: Option<&Value>) -> Option { + value + .and_then(Value::as_str) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(str::to_string) + } + + fn packet_string_field<'a>(packet: &'a Value, keys: &[&str]) -> Option<&'a str> { + keys.iter() + .find_map(|key| packet.get(*key).and_then(Value::as_str)) + .map(str::trim) + .filter(|value| !value.is_empty()) + } + + fn reviewer_match_tokens(reviewer: &Value) -> Vec { + ["name", "specialty"] + .iter() + .filter_map(|key| Self::normalized_non_empty_string(reviewer.get(*key))) + .map(|value| value.to_ascii_lowercase()) + .collect() + } + + fn packet_match_tokens(packet: &Value) -> Vec { + [ + &["subagentId", "subagent_id", "subagent_type"][..], + &["displayName", "display_name"][..], + &["roleName", "role"][..], + ] + .iter() + 
.filter_map(|keys| Self::packet_string_field(packet, keys)) + .map(|value| value.to_ascii_lowercase()) + .collect() + } + + fn infer_unique_packet_id_for_reviewer( + reviewer: &Value, + run_manifest: Option<&Value>, + ) -> Option { + let reviewer_tokens = Self::reviewer_match_tokens(reviewer); + if reviewer_tokens.is_empty() { + return None; + } + + let manifest = run_manifest?; + let packets = manifest + .get("workPackets") + .or_else(|| manifest.get("work_packets"))? + .as_array()?; + let mut matches = packets.iter().filter_map(|packet| { + let packet_id = Self::packet_string_field(packet, &["packetId", "packet_id"])?; + let packet_tokens = Self::packet_match_tokens(packet); + let matched = packet_tokens + .iter() + .any(|packet_token| reviewer_tokens.iter().any(|token| token == packet_token)); + matched.then(|| packet_id.to_string()) + }); + let first = matches.next()?; + if matches.next().is_some() { + None + } else { + Some(first) + } + } + + fn fill_deep_review_packet_metadata(input: &mut Value, run_manifest: Option<&Value>) { + let Some(reviewers) = input.get_mut("reviewers").and_then(Value::as_array_mut) else { + return; + }; + + for reviewer in reviewers { + let packet_id = Self::normalized_non_empty_string(reviewer.get("packet_id")); + let packet_status_source = + Self::normalized_non_empty_string(reviewer.get("packet_status_source")); + let inferred_packet_id = if packet_id.is_none() { + Self::infer_unique_packet_id_for_reviewer(reviewer, run_manifest) + } else { + None + }; + + let Some(object) = reviewer.as_object_mut() else { + continue; + }; + + if packet_id.is_some() { + if packet_status_source.is_none() { + object.insert("packet_status_source".to_string(), json!("reported")); + } + } else if let Some(inferred_packet_id) = inferred_packet_id { + object.insert("packet_id".to_string(), json!(inferred_packet_id)); + object.insert("packet_status_source".to_string(), json!("inferred")); + } else if packet_status_source.is_none() { + 
object.insert("packet_status_source".to_string(), json!("missing")); + } + } + } + + fn value_for_any_key<'a>(value: &'a Value, keys: &[&str]) -> Option<&'a Value> { + keys.iter().find_map(|key| value.get(*key)) + } + + fn bool_for_any_key(value: &Value, keys: &[&str]) -> bool { + Self::value_for_any_key(value, keys) + .and_then(Value::as_bool) + .unwrap_or(false) + } + + fn u64_for_any_key(value: &Value, keys: &[&str]) -> Option { + Self::value_for_any_key(value, keys).and_then(Value::as_u64) + } + + fn has_non_empty_array_for_any_key(value: &Value, keys: &[&str]) -> bool { + Self::value_for_any_key(value, keys) + .and_then(Value::as_array) + .is_some_and(|items| !items.is_empty()) + } + + fn count_partial_reviewers(input: &Value) -> usize { + input + .get("reviewers") + .and_then(Value::as_array) + .map(|reviewers| { + reviewers + .iter() + .filter(|reviewer| { + let status = reviewer + .get("status") + .and_then(Value::as_str) + .map(str::trim) + .unwrap_or_default(); + let has_partial_output = reviewer + .get("partial_output") + .and_then(Value::as_str) + .map(str::trim) + .is_some_and(|output| !output.is_empty()); + status == "partial_timeout" + || (matches!(status, "timed_out" | "cancelled_by_user") + && has_partial_output) + }) + .count() + }) + .unwrap_or(0) + } + + fn count_manifest_skipped_reviewers(run_manifest: Option<&Value>) -> usize { + run_manifest + .and_then(|manifest| { + Self::value_for_any_key(manifest, &["skippedReviewers", "skipped_reviewers"]) + }) + .and_then(Value::as_array) + .map(Vec::len) + .unwrap_or(0) + } + + fn count_token_budget_limited_reviewers(run_manifest: Option<&Value>) -> usize { + let Some(manifest) = run_manifest else { + return 0; + }; + let mut skipped_by_budget = HashSet::new(); + + if let Some(skipped_ids) = + Self::value_for_any_key(manifest, &["tokenBudget", "token_budget"]) + .and_then(|token_budget| { + Self::value_for_any_key( + token_budget, + &["skippedReviewerIds", "skipped_reviewer_ids"], + ) + }) + 
.and_then(Value::as_array) + { + for value in skipped_ids { + if let Some(id) = value.as_str().map(str::trim).filter(|id| !id.is_empty()) { + skipped_by_budget.insert(id.to_string()); + } + } + } + + if let Some(skipped_reviewers) = + Self::value_for_any_key(manifest, &["skippedReviewers", "skipped_reviewers"]) + .and_then(Value::as_array) + { + for reviewer in skipped_reviewers { + let reason = Self::packet_string_field(reviewer, &["reason"]); + if reason != Some("budget_limited") { + continue; + } + if let Some(id) = + Self::packet_string_field(reviewer, &["subagentId", "subagent_id"]) + { + skipped_by_budget.insert(id.to_string()); + } + } + } + + skipped_by_budget.len() + } + + fn count_decision_items(input: &Value) -> usize { + let needs_decision_count = input + .pointer("/report_sections/remediation_groups/needs_decision") + .and_then(Value::as_array) + .map(|items| { + items + .iter() + .filter_map(Value::as_str) + .map(str::trim) + .filter(|item| !item.is_empty()) + .count() + }) + .unwrap_or(0); + if needs_decision_count > 0 { + return needs_decision_count; + } + + let recommended_action = input + .pointer("/summary/recommended_action") + .and_then(Value::as_str) + .map(str::trim) + .unwrap_or_default(); + usize::from(recommended_action == "block") + } + + fn has_reliability_signal(input: &Value, kind: &str) -> bool { + input + .get("reliability_signals") + .and_then(Value::as_array) + .is_some_and(|signals| { + signals.iter().any(|signal| { + signal + .get("kind") + .and_then(Value::as_str) + .is_some_and(|value| value == kind) + }) + }) + } + + fn push_reliability_signal_if_missing(input: &mut Value, signal: Value) { + let Some(kind) = signal.get("kind").and_then(Value::as_str) else { + return; + }; + if Self::has_reliability_signal(input, kind) { + return; + } + if !input + .get("reliability_signals") + .is_some_and(Value::is_array) + { + input["reliability_signals"] = json!([]); + } + if let Some(signals) = input + .get_mut("reliability_signals") + 
.and_then(Value::as_array_mut)
+        {
+            signals.push(signal);
+        }
+    }
+
+    fn compression_contract_for_context(context: &ToolUseContext) -> Option<CompressionContract> {
+        let session_id = context.session_id.as_deref()?;
+        let coordinator = get_global_coordinator()?;
+        let session = coordinator.get_session_manager().get_session(session_id)?;
+        let agent_type = Some(session.agent_type.as_str());
+        let model_id = session.config.model_id.as_deref();
+        let limit = Self::reliability_contract_limit(agent_type, model_id);
+        let contract = coordinator
+            .get_session_manager()
+            .compression_contract_for_session(session_id, limit)?;
+        Self::should_report_compression_preserved(
+            session.compression_state.compression_count,
+            Some(&contract),
+        )
+        .then_some(contract)
+    }
+
+    fn reliability_contract_limit(agent_type: Option<&str>, model_id: Option<&str>) -> usize {
+        let agent_type = agent_type
+            .map(str::trim)
+            .filter(|agent_type| !agent_type.is_empty())
+            .unwrap_or("DeepReview");
+        let model_id = model_id
+            .map(str::trim)
+            .filter(|model_id| !model_id.is_empty())
+            .unwrap_or_default();
+        let is_review_subagent = get_agent_registry()
+            .get_subagent_is_review(agent_type)
+            .unwrap_or(false);
+
+        ContextProfilePolicy::for_agent_context_and_model(
+            agent_type,
+            is_review_subagent,
+            model_id,
+            model_id,
+        )
+        .compression_contract_limit
+    }
+
+    fn should_report_compression_preserved(
+        compression_count: usize,
+        compression_contract: Option<&CompressionContract>,
+    ) -> bool {
+        compression_count > 0 && compression_contract.is_some_and(|contract| !contract.is_empty())
+    }
+
+    fn compression_contract_signal_count(contract: &CompressionContract) -> usize {
+        contract.touched_files.len()
+            + contract.verification_commands.len()
+            + contract.blocking_failures.len()
+            + contract.subagent_statuses.len()
+    }
+
+    fn fill_deep_review_reliability_signals(
+        input: &mut Value,
+        run_manifest: Option<&Value>,
+        compression_contract: Option<&CompressionContract>,
+    ) {
+        if let Some(token_budget) =
run_manifest.and_then(|manifest| { + Self::value_for_any_key(manifest, &["tokenBudget", "token_budget"]) + }) { + let has_context_pressure = + Self::bool_for_any_key( + token_budget, + &["largeDiffSummaryFirst", "large_diff_summary_first"], + ) || Self::has_non_empty_array_for_any_key(token_budget, &["warnings"]); + if has_context_pressure { + let count = Self::u64_for_any_key( + token_budget, + &["estimatedReviewerCalls", "estimated_reviewer_calls"], + ) + .unwrap_or(0); + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "context_pressure", + "severity": "info", + "count": count, + "source": "runtime" + }), + ); + } + } + + let skipped_reviewer_count = Self::count_manifest_skipped_reviewers(run_manifest); + if skipped_reviewer_count > 0 { + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "skipped_reviewers", + "severity": "info", + "count": skipped_reviewer_count, + "source": "manifest" + }), + ); + } + + let token_budget_limited_reviewer_count = + Self::count_token_budget_limited_reviewers(run_manifest); + if token_budget_limited_reviewer_count > 0 { + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "token_budget_limited", + "severity": "warning", + "count": token_budget_limited_reviewer_count, + "source": "manifest" + }), + ); + } + + if let Some(contract) = compression_contract.filter(|contract| !contract.is_empty()) { + let count = Self::compression_contract_signal_count(contract); + if count > 0 { + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "compression_preserved", + "severity": "info", + "count": count, + "source": "runtime" + }), + ); + } + } + + let partial_reviewer_count = Self::count_partial_reviewers(input); + if partial_reviewer_count > 0 { + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "partial_reviewer", + "severity": "warning", + "count": partial_reviewer_count, + "source": "runtime" + }), + ); + } + + if partial_reviewer_count 
> 0 { + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "retry_guidance", + "severity": "warning", + "count": partial_reviewer_count, + "source": "runtime" + }), + ); + } + + let decision_item_count = Self::count_decision_items(input); + if decision_item_count > 0 { + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "user_decision", + "severity": "action", + "count": decision_item_count, + "source": "report" + }), + ); + } + } + + fn fill_deep_review_runtime_tracker_signals(input: &mut Value, dialog_turn_id: Option<&str>) { + let Some(dialog_turn_id) = dialog_turn_id + .map(str::trim) + .filter(|value| !value.is_empty()) + else { + return; + }; + let count = + crate::agentic::deep_review_policy::deep_review_concurrency_cap_rejection_count( + dialog_turn_id, + ) + crate::agentic::deep_review_policy::deep_review_capacity_skip_count(dialog_turn_id); + if count > 0 { + Self::push_reliability_signal_if_missing( + input, + json!({ + "kind": "concurrency_limited", + "severity": "warning", + "count": count, + "source": "runtime" + }), + ); + } + } + + fn deep_review_shared_context_diagnostics( + dialog_turn_id: Option<&str>, + ) -> Option { + let dialog_turn_id = dialog_turn_id + .map(str::trim) + .filter(|value| !value.is_empty())?; + let snapshot = deep_review_shared_context_measurement_snapshot(dialog_turn_id); + if snapshot.total_calls == 0 { + return None; + } + + Some(DeepReviewSharedContextDiagnostics { + total_calls: snapshot.total_calls, + duplicate_calls: snapshot.duplicate_calls, + duplicate_context_count: snapshot.duplicate_context_count, + max_duplicate_call_count: snapshot + .repeated_contexts + .iter() + .map(|context| context.call_count) + .max() + .unwrap_or(0), + max_duplicate_reviewer_count: snapshot + .repeated_contexts + .iter() + .map(|context| context.reviewer_count) + .max() + .unwrap_or(0), + }) + } + + fn log_deep_review_shared_context_diagnostics(dialog_turn_id: Option<&str>) { + let Some(diagnostics) 
= Self::deep_review_shared_context_diagnostics(dialog_turn_id) else {
+            return;
+        };
+
+        debug!(
+            "DeepReview shared context measurement: total_calls={}, duplicate_calls={}, duplicate_context_count={}, max_duplicate_call_count={}, max_duplicate_reviewer_count={}",
+            diagnostics.total_calls,
+            diagnostics.duplicate_calls,
+            diagnostics.duplicate_context_count,
+            diagnostics.max_duplicate_call_count,
+            diagnostics.max_duplicate_reviewer_count
+        );
+    }
+
+    fn deep_review_cache_fingerprint(run_manifest: Option<&Value>) -> Option<String> {
+        let manifest = run_manifest?;
+        let cache_config = Self::value_for_any_key(
+            manifest,
+            &["incrementalReviewCache", "incremental_review_cache"],
+        )?;
+        Self::packet_string_field(cache_config, &["fingerprint"]).map(str::to_string)
+    }
+
+    fn deep_review_cache_from_completed_reviewers(
+        input: &Value,
+        run_manifest: Option<&Value>,
+        existing_cache: Option<&Value>,
+    ) -> Option<DeepReviewCacheUpdate> {
+        let fingerprint = Self::deep_review_cache_fingerprint(run_manifest)?;
+        let matching_existing_cache = existing_cache
+            .map(DeepReviewIncrementalCache::from_value)
+            .filter(|cache| cache.fingerprint() == fingerprint);
+        let mut cache = matching_existing_cache
+            .clone()
+            .unwrap_or_else(|| DeepReviewIncrementalCache::new(&fingerprint));
+        let mut stored_count = 0usize;
+        let mut hit_count = 0usize;
+        let mut miss_count = 0usize;
+
+        if let Some(reviewers) = input.get("reviewers").and_then(Value::as_array) {
+            for reviewer in reviewers {
+                let is_completed = reviewer
+                    .get("status")
+                    .and_then(Value::as_str)
+                    .map(str::trim)
+                    .is_some_and(|status| status == "completed");
+                if !is_completed {
+                    continue;
+                }
+                let Some(packet_id) = Self::normalized_non_empty_string(reviewer.get("packet_id"))
+                else {
+                    continue;
+                };
+                if matching_existing_cache
+                    .as_ref()
+                    .and_then(|cache| cache.get_packet(&packet_id))
+                    .is_some()
+                {
+                    hit_count += 1;
+                } else {
+                    miss_count += 1;
+                }
+                let output =
+                    serde_json::to_string(reviewer).unwrap_or_else(|_|
reviewer.to_string()); + cache.store_packet(&packet_id, &output); + stored_count += 1; + } + } + + (stored_count > 0).then(|| DeepReviewCacheUpdate { + value: cache.to_value(), + hit_count, + miss_count, + }) + } + + async fn persist_deep_review_cache( + context: &ToolUseContext, + cache_value: Value, + ) -> BitFunResult<()> { + let Some(session_id) = context.session_id.as_deref() else { + return Ok(()); + }; + let Some(workspace) = context.workspace.as_ref() else { + return Ok(()); + }; + let Some(coordinator) = get_global_coordinator() else { + return Ok(()); + }; + let session_storage_path = workspace.session_storage_path(); + let session_manager = coordinator.get_session_manager(); + let Some(mut metadata) = session_manager + .load_session_metadata(&session_storage_path, session_id) + .await? + else { + return Ok(()); + }; + + metadata.deep_review_cache = Some(cache_value); + session_manager + .save_session_metadata(&session_storage_path, &metadata) + .await + } + /// Validate and fill missing fields with default values /// /// When AI-returned data is missing certain fields, fill with default values to avoid entire review failure - fn validate_and_fill_defaults(input: &mut Value, deep_review: bool) { + fn validate_and_fill_defaults( + input: &mut Value, + deep_review: bool, + run_manifest: Option<&Value>, + compression_contract: Option<&CompressionContract>, + ) { // Fill summary default values if input.get("summary").is_none() { warn!("CodeReview tool missing summary field, using default values"); @@ -410,6 +1078,10 @@ impl CodeReviewTool { if input.get("reviewers").is_none() { input["reviewers"] = json!([]); } + if deep_review { + Self::fill_deep_review_packet_metadata(input, run_manifest); + Self::fill_deep_review_reliability_signals(input, run_manifest, compression_contract); + } if input.get("remediation_plan").is_none() { input["remediation_plan"] = json!([]); @@ -493,10 +1165,94 @@ impl Tool for CodeReviewTool { context: &ToolUseContext, ) -> 
BitFunResult<Vec<ToolResult>> {
         let mut filled_input = input.clone();
+        let deep_review = Self::is_deep_review_context(Some(context));
+        let compression_contract = deep_review
+            .then(|| Self::compression_contract_for_context(context))
+            .flatten();
+        let mut run_manifest = context.custom_data.get("deep_review_run_manifest").cloned();
+        let mut existing_cache = run_manifest
+            .as_ref()
+            .and_then(|manifest| manifest.get("deepReviewCache"))
+            .cloned();
+        if deep_review && (run_manifest.is_none() || existing_cache.is_none()) {
+            if let (Some(session_id), Some(workspace), Some(coordinator)) = (
+                context.session_id.as_deref(),
+                context.workspace.as_ref(),
+                get_global_coordinator(),
+            ) {
+                let session_storage_path = workspace.session_storage_path();
+                match coordinator
+                    .get_session_manager()
+                    .load_session_metadata(&session_storage_path, session_id)
+                    .await
+                {
+                    Ok(Some(metadata)) => {
+                        if run_manifest.is_none() {
+                            run_manifest = metadata.deep_review_run_manifest;
+                        }
+                        if existing_cache.is_none() {
+                            existing_cache = metadata.deep_review_cache;
+                        }
+                    }
+                    Ok(None) => {}
+                    Err(error) => {
+                        warn!(
+                            "Failed to load DeepReview session metadata for review cache: session_id={}, error={}",
+                            session_id, error
+                        );
+                    }
+                }
+            }
+        }
         Self::validate_and_fill_defaults(
             &mut filled_input,
-            Self::is_deep_review_context(Some(context)),
+            deep_review,
+            run_manifest.as_ref(),
+            compression_contract.as_ref(),
         );
+        if deep_review {
+            Self::fill_deep_review_runtime_tracker_signals(
+                &mut filled_input,
+                context.dialog_turn_id.as_deref(),
+            );
+            Self::log_deep_review_shared_context_diagnostics(context.dialog_turn_id.as_deref());
+            if let Some(cache_update) = Self::deep_review_cache_from_completed_reviewers(
+                &filled_input,
+                run_manifest.as_ref(),
+                existing_cache.as_ref(),
+            ) {
+                if cache_update.hit_count > 0 {
+                    Self::push_reliability_signal_if_missing(
+                        &mut filled_input,
+                        json!({
+                            "kind": "cache_hit",
+                            "severity": "info",
+                            "count": cache_update.hit_count,
+                            "source": "runtime"
}), + ); + } + if cache_update.miss_count > 0 { + Self::push_reliability_signal_if_missing( + &mut filled_input, + json!({ + "kind": "cache_miss", + "severity": "info", + "count": cache_update.miss_count, + "source": "runtime" + }), + ); + } + if let Err(error) = + Self::persist_deep_review_cache(context, cache_update.value).await + { + warn!( + "Failed to persist DeepReview incremental cache: error={}", + error + ); + } + } + } Ok(vec![ToolResult::Result { data: filled_input, @@ -509,6 +1265,7 @@ impl Tool for CodeReviewTool { #[cfg(test)] mod tests { use super::CodeReviewTool; + use crate::agentic::core::{CompressionContract, CompressionContractItem}; use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; use serde_json::json; use std::collections::HashMap; @@ -550,6 +1307,65 @@ mod tests { } } + #[tokio::test] + async fn deep_review_schema_accepts_reviewer_partial_output() { + let tool = CodeReviewTool::new(); + let context = tool_context(Some("DeepReview")); + let schema = tool + .input_schema_for_model_with_context(Some(&context)) + .await; + let reviewer_properties = &schema["properties"]["reviewers"]["items"]["properties"]; + + assert_eq!(reviewer_properties["partial_output"]["type"], "string"); + } + + #[tokio::test] + async fn deep_review_schema_accepts_reviewer_packet_fallback_metadata() { + let tool = CodeReviewTool::new(); + let context = tool_context(Some("DeepReview")); + let schema = tool + .input_schema_for_model_with_context(Some(&context)) + .await; + let reviewer_properties = &schema["properties"]["reviewers"]["items"]["properties"]; + + assert_eq!(reviewer_properties["packet_id"]["type"], "string"); + assert_eq!( + reviewer_properties["packet_status_source"]["enum"], + json!(["reported", "inferred", "missing"]) + ); + } + + #[tokio::test] + async fn deep_review_schema_accepts_structured_reliability_signals() { + let tool = CodeReviewTool::new(); + let context = tool_context(Some("DeepReview")); + let schema = tool + 
.input_schema_for_model_with_context(Some(&context)) + .await; + let reliability_properties = + &schema["properties"]["reliability_signals"]["items"]["properties"]; + + assert_eq!( + reliability_properties["kind"]["enum"], + json!([ + "context_pressure", + "compression_preserved", + "cache_hit", + "cache_miss", + "concurrency_limited", + "partial_reviewer", + "retry_guidance", + "skipped_reviewers", + "token_budget_limited", + "user_decision" + ]) + ); + assert_eq!( + reliability_properties["source"]["enum"], + json!(["runtime", "manifest", "report", "inferred"]) + ); + } + #[tokio::test] async fn deep_review_submission_defaults_missing_mode_to_deep() { let tool = CodeReviewTool::new(); @@ -577,4 +1393,628 @@ mod tests { assert!(data["reviewers"].as_array().is_some()); assert!(data["remediation_plan"].as_array().is_some()); } + + #[tokio::test] + async fn deep_review_submission_infers_unique_reviewer_packet_from_manifest() { + let tool = CodeReviewTool::new(); + let mut context = tool_context(Some("DeepReview")); + context.custom_data.insert( + "deep_review_run_manifest".to_string(), + json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "displayName": "Security Reviewer", + "roleName": "Security Reviewer" + } + ] + }), + ); + + let result = tool + .call_impl( + &json!({ + "summary": { + "overall_assessment": "No blocking issues", + "risk_level": "low", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [], + "reviewers": [ + { + "name": "Security Reviewer", + "specialty": "security", + "status": "completed", + "summary": "Checked the security packet." + } + ] + }), + &context, + ) + .await + .expect("submit review result"); + + let ToolResult::Result { data, .. 
} = &result[0] else { + panic!("expected tool result"); + }; + assert_eq!(data["reviewers"][0]["packet_id"], "reviewer:ReviewSecurity"); + assert_eq!(data["reviewers"][0]["packet_status_source"], "inferred"); + } + + #[tokio::test] + async fn deep_review_submission_marks_uninferable_packet_metadata_as_missing() { + let tool = CodeReviewTool::new(); + let context = tool_context(Some("DeepReview")); + let result = tool + .call_impl( + &json!({ + "summary": { + "overall_assessment": "No blocking issues", + "risk_level": "low", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [], + "reviewers": [ + { + "name": "Unknown Reviewer", + "specialty": "unknown", + "status": "completed", + "summary": "Packet was omitted." + } + ] + }), + &context, + ) + .await + .expect("submit review result"); + + let ToolResult::Result { data, .. } = &result[0] else { + panic!("expected tool result"); + }; + assert!(data["reviewers"][0].get("packet_id").is_none()); + assert_eq!(data["reviewers"][0]["packet_status_source"], "missing"); + } + + #[tokio::test] + async fn deep_review_submission_marks_existing_packet_metadata_as_reported() { + let tool = CodeReviewTool::new(); + let context = tool_context(Some("DeepReview")); + let result = tool + .call_impl( + &json!({ + "summary": { + "overall_assessment": "No blocking issues", + "risk_level": "low", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [], + "reviewers": [ + { + "name": "Security Reviewer", + "specialty": "security", + "status": "completed", + "summary": "Packet was reported.", + "packet_id": "reviewer:ReviewSecurity" + } + ] + }), + &context, + ) + .await + .expect("submit review result"); + + let ToolResult::Result { data, .. 
} = &result[0] else { + panic!("expected tool result"); + }; + assert_eq!(data["reviewers"][0]["packet_id"], "reviewer:ReviewSecurity"); + assert_eq!(data["reviewers"][0]["packet_status_source"], "reported"); + } + + #[tokio::test] + async fn deep_review_submission_fills_runtime_reliability_signals() { + let tool = CodeReviewTool::new(); + let mut context = tool_context(Some("DeepReview")); + context.custom_data.insert( + "deep_review_run_manifest".to_string(), + json!({ + "tokenBudget": { + "largeDiffSummaryFirst": true, + "warnings": [], + "estimatedReviewerCalls": 7, + "skippedReviewerIds": ["CustomPerf"] + }, + "skippedReviewers": [ + { + "subagentId": "ReviewFrontend", + "reason": "not_applicable" + }, + { + "subagentId": "CustomPerf", + "reason": "budget_limited" + } + ] + }), + ); + + let result = tool + .call_impl( + &json!({ + "summary": { + "overall_assessment": "Review completed with reduced confidence", + "risk_level": "medium", + "recommended_action": "request_changes" + }, + "issues": [], + "positive_points": [], + "reviewers": [ + { + "name": "Security Reviewer", + "specialty": "security", + "status": "partial_timeout", + "summary": "Timed out after partial evidence.", + "partial_output": "Found one likely issue before timeout." + } + ], + "report_sections": { + "remediation_groups": { + "needs_decision": [ + "Decide whether to block the release." + ] + } + } + }), + &context, + ) + .await + .expect("submit review result"); + + let ToolResult::Result { data, .. 
} = &result[0] else { + panic!("expected tool result"); + }; + assert_eq!( + data["reliability_signals"], + json!([ + { + "kind": "context_pressure", + "severity": "info", + "count": 7, + "source": "runtime" + }, + { + "kind": "skipped_reviewers", + "severity": "info", + "count": 2, + "source": "manifest" + }, + { + "kind": "token_budget_limited", + "severity": "warning", + "count": 1, + "source": "manifest" + }, + { + "kind": "partial_reviewer", + "severity": "warning", + "count": 1, + "source": "runtime" + }, + { + "kind": "retry_guidance", + "severity": "warning", + "count": 1, + "source": "runtime" + }, + { + "kind": "user_decision", + "severity": "action", + "count": 1, + "source": "report" + } + ]) + ); + } + + #[tokio::test] + async fn deep_review_submission_fills_concurrency_limited_from_runtime_tracker() { + use crate::agentic::deep_review_policy::record_deep_review_concurrency_cap_rejection; + + let tool = CodeReviewTool::new(); + let mut context = tool_context(Some("DeepReview")); + context.dialog_turn_id = Some("turn-code-review-cap-signal".to_string()); + record_deep_review_concurrency_cap_rejection("turn-code-review-cap-signal"); + + let result = tool + .call_impl( + &json!({ + "summary": { + "overall_assessment": "Review completed with launch backpressure", + "risk_level": "medium", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [] + }), + &context, + ) + .await + .expect("submit review result"); + + let ToolResult::Result { data, .. 
} = &result[0] else { + panic!("expected tool result"); + }; + assert_eq!( + data["reliability_signals"], + json!([ + { + "kind": "concurrency_limited", + "severity": "warning", + "count": 1, + "source": "runtime" + } + ]) + ); + } + + #[tokio::test] + async fn deep_review_shared_context_diagnostics_stays_out_of_report() { + use crate::agentic::deep_review_policy::record_deep_review_shared_context_tool_use; + + let turn_id = "turn-code-review-shared-context-diagnostics"; + record_deep_review_shared_context_tool_use(turn_id, "ReviewSecurity", "Read", "src/lib.rs"); + record_deep_review_shared_context_tool_use( + turn_id, + "ReviewPerformance", + "Read", + "src/lib.rs", + ); + record_deep_review_shared_context_tool_use( + turn_id, + "ReviewArchitecture", + "GetFileDiff", + "src/lib.rs", + ); + + let diagnostics = CodeReviewTool::deep_review_shared_context_diagnostics(Some(turn_id)) + .expect("diagnostics should be available for measured turn"); + assert_eq!(diagnostics.total_calls, 3); + assert_eq!(diagnostics.duplicate_calls, 1); + assert_eq!(diagnostics.duplicate_context_count, 1); + assert_eq!(diagnostics.max_duplicate_call_count, 2); + assert_eq!(diagnostics.max_duplicate_reviewer_count, 2); + + let tool = CodeReviewTool::new(); + let mut context = tool_context(Some("DeepReview")); + context.dialog_turn_id = Some(turn_id.to_string()); + + let result = tool + .call_impl( + &json!({ + "summary": { + "overall_assessment": "Review completed", + "risk_level": "low", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [] + }), + &context, + ) + .await + .expect("submit review result"); + + let ToolResult::Result { data, .. 
} = &result[0] else { + panic!("expected tool result"); + }; + assert!(data.get("shared_context_measurement").is_none()); + assert!(data.get("runtime_diagnostics").is_none()); + assert!(data.get("reliability_signals").is_none()); + } + + #[tokio::test] + async fn deep_review_submission_folds_capacity_skips_into_concurrency_limited_signal() { + use crate::agentic::deep_review_policy::record_deep_review_capacity_skip; + + record_deep_review_capacity_skip("turn-code-review-capacity-skip"); + + let tool = CodeReviewTool::new(); + let mut context = tool_context(Some("DeepReview")); + context.dialog_turn_id = Some("turn-code-review-capacity-skip".to_string()); + + let result = tool + .call_impl( + &json!({ + "summary": { + "overall_assessment": "Review completed after queue skip", + "risk_level": "medium", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [] + }), + &context, + ) + .await + .expect("submit review result"); + + let ToolResult::Result { data, .. } = &result[0] else { + panic!("expected tool result"); + }; + + assert_eq!( + data["reliability_signals"], + json!([ + { + "kind": "concurrency_limited", + "severity": "warning", + "count": 1, + "source": "runtime" + } + ]) + ); + } + + #[test] + fn deep_review_defaults_include_compression_contract_reliability_signal() { + let contract = CompressionContract { + touched_files: vec!["src/web-ui/src/flow_chat/utils/codeReviewReport.ts".to_string()], + verification_commands: vec![CompressionContractItem { + target: "pnpm --dir src/web-ui run test:run".to_string(), + status: "succeeded".to_string(), + summary: "Frontend report tests passed.".to_string(), + error_kind: None, + }], + blocking_failures: vec![], + subagent_statuses: vec![], + }; + let mut input = json!({ + "summary": { + "overall_assessment": "No blocking issues", + "risk_level": "low", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [] + }); + + CodeReviewTool::validate_and_fill_defaults(&mut input, 
true, None, Some(&contract)); + + assert_eq!( + input["reliability_signals"], + json!([ + { + "kind": "compression_preserved", + "severity": "info", + "count": 2, + "source": "runtime" + } + ]) + ); + } + + #[test] + fn deep_review_reliability_contract_limit_uses_context_profile_policy() { + assert_eq!( + CodeReviewTool::reliability_contract_limit(Some("DeepReview"), Some("gpt-5")), + 8 + ); + assert_eq!( + CodeReviewTool::reliability_contract_limit(Some("DeepReview"), Some("gpt-5-mini")), + 4 + ); + } + + #[test] + fn deep_review_compression_signal_requires_completed_compression() { + let contract = CompressionContract { + touched_files: vec!["src/main.rs".to_string()], + verification_commands: vec![], + blocking_failures: vec![], + subagent_statuses: vec![], + }; + + assert!(!CodeReviewTool::should_report_compression_preserved( + 0, + Some(&contract) + )); + assert!(CodeReviewTool::should_report_compression_preserved( + 1, + Some(&contract) + )); + assert!(!CodeReviewTool::should_report_compression_preserved( + 1, + Some(&CompressionContract::default()) + )); + } + + #[test] + fn deep_review_incremental_cache_stores_completed_reviewers_by_packet_id() { + use crate::agentic::deep_review_policy::DeepReviewIncrementalCache; + + let manifest = json!({ + "incrementalReviewCache": { + "fingerprint": "fp-review-v2" + }, + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "displayName": "Security Reviewer" + }, + { + "packetId": "reviewer:ReviewPerformance:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewPerformance", + "displayName": "Performance Reviewer" + } + ] + }); + let mut input = json!({ + "summary": { + "overall_assessment": "Review completed", + "risk_level": "medium", + "recommended_action": "request_changes" + }, + "issues": [], + "positive_points": [], + "reviewers": [ + { + "name": "Security Reviewer", + "specialty": "security", + "status": "completed", + 
"summary": "Found one high-risk issue." + }, + { + "name": "Performance Reviewer", + "specialty": "performance", + "status": "partial_timeout", + "summary": "Timed out before completion.", + "partial_output": "Large render path was still being checked." + } + ] + }); + + CodeReviewTool::validate_and_fill_defaults(&mut input, true, Some(&manifest), None); + let cache_update = CodeReviewTool::deep_review_cache_from_completed_reviewers( + &input, + Some(&manifest), + None, + ) + .expect("completed reviewer should produce cache value"); + let cache = DeepReviewIncrementalCache::from_value(&cache_update.value); + + assert_eq!(cache.fingerprint(), "fp-review-v2"); + assert_eq!(cache_update.hit_count, 0); + assert_eq!(cache_update.miss_count, 1); + assert!(cache + .get_packet("reviewer:ReviewSecurity:group-1-of-1") + .is_some_and(|output| output.contains("Found one high-risk issue."))); + assert_eq!( + cache.get_packet("reviewer:ReviewPerformance:group-1-of-1"), + None + ); + } + + #[test] + fn deep_review_incremental_cache_replaces_stale_existing_cache() { + use crate::agentic::deep_review_policy::DeepReviewIncrementalCache; + + let manifest = json!({ + "incrementalReviewCache": { + "fingerprint": "fp-new" + }, + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "displayName": "Security Reviewer" + } + ] + }); + let mut stale_cache = DeepReviewIncrementalCache::new("fp-old"); + stale_cache.store_packet("reviewer:ReviewSecurity", "stale output"); + let mut input = json!({ + "summary": { + "overall_assessment": "Review completed", + "risk_level": "low", + "recommended_action": "approve" + }, + "issues": [], + "positive_points": [], + "reviewers": [ + { + "name": "Security Reviewer", + "specialty": "security", + "status": "completed", + "summary": "Fresh security output." 
+ } + ] + }); + + CodeReviewTool::validate_and_fill_defaults(&mut input, true, Some(&manifest), None); + let cache_update = CodeReviewTool::deep_review_cache_from_completed_reviewers( + &input, + Some(&manifest), + Some(&stale_cache.to_value()), + ) + .expect("completed reviewer should replace stale cache"); + let cache = DeepReviewIncrementalCache::from_value(&cache_update.value); + + assert_eq!(cache.fingerprint(), "fp-new"); + assert_eq!(cache_update.hit_count, 0); + assert_eq!(cache_update.miss_count, 1); + assert!(cache + .get_packet("reviewer:ReviewSecurity") + .is_some_and(|output| output.contains("Fresh security output."))); + assert!(!cache + .get_packet("reviewer:ReviewSecurity") + .is_some_and(|output| output.contains("stale output"))); + } + + #[test] + fn deep_review_incremental_cache_counts_existing_packet_hits() { + use crate::agentic::deep_review_policy::DeepReviewIncrementalCache; + + let manifest = json!({ + "incrementalReviewCache": { + "fingerprint": "fp-existing" + }, + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "displayName": "Security Reviewer" + }, + { + "packetId": "reviewer:ReviewPerformance", + "phase": "reviewer", + "subagentId": "ReviewPerformance", + "displayName": "Performance Reviewer" + } + ] + }); + let mut existing_cache = DeepReviewIncrementalCache::new("fp-existing"); + existing_cache.store_packet("reviewer:ReviewSecurity", "cached security output"); + let mut input = json!({ + "summary": { + "overall_assessment": "Review completed", + "risk_level": "medium", + "recommended_action": "request_changes" + }, + "issues": [], + "positive_points": [], + "reviewers": [ + { + "name": "Security Reviewer", + "specialty": "security", + "status": "completed", + "summary": "Reused security output." + }, + { + "name": "Performance Reviewer", + "specialty": "performance", + "status": "completed", + "summary": "Fresh performance output." 
+ } + ] + }); + + CodeReviewTool::validate_and_fill_defaults(&mut input, true, Some(&manifest), None); + let cache_update = CodeReviewTool::deep_review_cache_from_completed_reviewers( + &input, + Some(&manifest), + Some(&existing_cache.to_value()), + ) + .expect("completed reviewers should update cache"); + + assert_eq!(cache_update.hit_count, 1); + assert_eq!(cache_update.miss_count, 1); + } } diff --git a/src/crates/core/src/agentic/tools/implementations/delete_file_tool.rs b/src/crates/core/src/agentic/tools/implementations/delete_file_tool.rs index d27dc38e7..3a5495537 100644 --- a/src/crates/core/src/agentic/tools/implementations/delete_file_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/delete_file_tool.rs @@ -12,7 +12,7 @@ use tokio::fs; /// File deletion tool - provides safe file/directory deletion functionality /// -/// This tool automatically integrates with the snapshot system, all deletion operations are recorded and support rollback +/// This tool records a lightweight checkpoint before deletion. Rollback is not automatic. pub struct DeleteFileTool; impl Default for DeleteFileTool { @@ -34,7 +34,7 @@ impl Tool for DeleteFileTool { } async fn description(&self) -> BitFunResult { - Ok(r#"Deletes a file or directory from the filesystem. This operation is tracked by the snapshot system and can be rolled back if needed. + Ok(r#"Deletes a file or directory from the filesystem. This operation records a lightweight checkpoint before deletion, but rollback is not automatic. Usage guidelines: 1. **File Deletion**: @@ -53,8 +53,8 @@ Usage guidelines: - The path must exist in the filesystem 4. **Safety Features**: - - All deletions are tracked by the snapshot system - - Users can review and roll back deletions if needed + - Deletions record a lightweight checkpoint when session context is available + - The checkpoint captures Git branch/dirty-state metadata when cheap - The tool requires user confirmation for execution 5. 
**Best Practices**: @@ -79,8 +79,8 @@ Example for directory: Important notes: - NEVER use bash `rm` commands when this tool is available - - This tool provides better safety through the snapshot system - - All deletions can be rolled back through the snapshot interface + - This tool provides better safety through checkpoint metadata + - Rollback is not automatic; use the recorded checkpoint metadata to guide recovery - The tool will fail gracefully if permissions are insufficient"#.to_string()) } @@ -294,6 +294,13 @@ Important notes: let resolved = context.resolve_tool_path(path_str)?; context.enforce_path_operation(ToolPathOperation::Delete, &resolved)?; + context + .record_light_checkpoint( + "Delete", + &resolved.logical_path, + vec![resolved.logical_path.clone()], + ) + .await; // Remote workspace path: delete via shell command if resolved.uses_remote_workspace_backend() { diff --git a/src/crates/core/src/agentic/tools/implementations/file_edit_tool.rs b/src/crates/core/src/agentic/tools/implementations/file_edit_tool.rs index e8950ee6c..a23e42d52 100644 --- a/src/crates/core/src/agentic/tools/implementations/file_edit_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/file_edit_tool.rs @@ -200,6 +200,13 @@ Usage: let resolved = context.resolve_tool_path(file_path)?; context.enforce_path_operation(ToolPathOperation::Edit, &resolved)?; + context + .record_light_checkpoint( + "Edit", + &resolved.logical_path, + vec![resolved.logical_path.clone()], + ) + .await; // For remote workspace paths, use the abstract FS to read → edit in memory → write back. 
if resolved.uses_remote_workspace_backend() { diff --git a/src/crates/core/src/agentic/tools/implementations/file_write_tool.rs b/src/crates/core/src/agentic/tools/implementations/file_write_tool.rs index 3a77c1f45..8e34c4a3b 100644 --- a/src/crates/core/src/agentic/tools/implementations/file_write_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/file_write_tool.rs @@ -190,6 +190,13 @@ Usage: let resolved = context.resolve_tool_path(file_path)?; context.enforce_path_operation(ToolPathOperation::Write, &resolved)?; + context + .record_light_checkpoint( + "Write", + &resolved.logical_path, + vec![resolved.logical_path.clone()], + ) + .await; let content = input .get("content") diff --git a/src/crates/core/src/agentic/tools/implementations/git_tool.rs b/src/crates/core/src/agentic/tools/implementations/git_tool.rs index 007c2f1e7..6d9e27d95 100644 --- a/src/crates/core/src/agentic/tools/implementations/git_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/git_tool.rs @@ -1071,6 +1071,16 @@ When creating commits, use this format for the commit message: args.unwrap_or("") ); + if git_operation_needs_light_checkpoint(operation, args) { + context + .record_light_checkpoint( + "Git", + &format!("git {} {}", operation, args.unwrap_or("").trim()), + Vec::new(), + ) + .await; + } + let start_time = std::time::Instant::now(); // Remote SSH workspace: run git on the server (not libgit2 on the PC). 
@@ -1126,6 +1136,15 @@ When creating commits, use this format for the commit message: } } +fn git_operation_needs_light_checkpoint(operation: &str, args: Option<&str>) -> bool { + match operation { + "add" | "commit" | "pull" | "checkout" | "switch" | "merge" | "rebase" | "stash" + | "reset" | "restore" | "clean" | "cherry-pick" => true, + "branch" => args.is_some_and(|value| !value.trim().is_empty()), + _ => false, + } +} + impl Default for GitTool { fn default() -> Self { Self::new() @@ -1136,7 +1155,7 @@ impl Default for GitTool { mod tests { use crate::agentic::tools::framework::Tool; - use super::{GitTool, ParsedDiffArgs}; + use super::{git_operation_needs_light_checkpoint, GitTool, ParsedDiffArgs}; use serde_json::json; #[tokio::test] @@ -1165,6 +1184,28 @@ mod tests { .contains("operation is required")); } + #[test] + fn checkpoint_detection_flags_mutating_git_operations() { + assert!(git_operation_needs_light_checkpoint( + "checkout", + Some("main") + )); + assert!(git_operation_needs_light_checkpoint( + "reset", + Some("--hard HEAD") + )); + assert!(git_operation_needs_light_checkpoint( + "branch", + Some("-D old") + )); + assert!(!git_operation_needs_light_checkpoint("status", None)); + assert!(!git_operation_needs_light_checkpoint( + "diff", + Some("-- src/lib.rs") + )); + assert!(!git_operation_needs_light_checkpoint("branch", None)); + } + #[test] fn parse_diff_args_empty() { let r = GitTool::parse_diff_args(""); diff --git a/src/crates/core/src/agentic/tools/implementations/session_message_tool.rs b/src/crates/core/src/agentic/tools/implementations/session_message_tool.rs index b4718c6be..226cc8723 100644 --- a/src/crates/core/src/agentic/tools/implementations/session_message_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/session_message_tool.rs @@ -408,6 +408,7 @@ When overriding an existing session's agent_type, only switching between "agenti source_workspace_path: source_workspace, }), None, + None, ) .await 
.map_err(BitFunError::tool)?; diff --git a/src/crates/core/src/agentic/tools/implementations/task_tool.rs b/src/crates/core/src/agentic/tools/implementations/task_tool.rs index 6d5e1137d..38e5f27cb 100644 --- a/src/crates/core/src/agentic/tools/implementations/task_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/task_tool.rs @@ -1,6 +1,21 @@ use crate::agentic::agents::{get_agent_registry, AgentInfo}; use crate::agentic::coordination::get_global_coordinator; -use crate::agentic::deep_review_policy::{load_default_deep_review_policy, DEEP_REVIEW_AGENT_TYPE}; +use crate::agentic::deep_review_policy::{ + classify_deep_review_capacity_error, clear_deep_review_queue_control_for_tool, + deep_review_active_reviewer_count, deep_review_effective_concurrency_snapshot, + deep_review_effective_parallel_instances, deep_review_has_judge_been_launched, + deep_review_max_retries_per_role, deep_review_queue_control_snapshot, + load_default_deep_review_policy, record_deep_review_capacity_skip, + record_deep_review_effective_concurrency_capacity_error, + record_deep_review_effective_concurrency_success, record_deep_review_task_budget, + try_begin_deep_review_active_reviewer, DeepReviewActiveReviewerGuard, + DeepReviewCapacityQueueReason, DeepReviewConcurrencyPolicy, DeepReviewExecutionPolicy, + DeepReviewIncrementalCache, DeepReviewPolicyViolation, DeepReviewRunManifestGate, + DeepReviewSubagentRole, DEEP_REVIEW_AGENT_TYPE, +}; +use crate::agentic::events::{ + DeepReviewQueueReason, DeepReviewQueueState, DeepReviewQueueStatus, ErrorCategory, +}; use crate::agentic::tools::framework::{ Tool, ToolRenderOptions, ToolResult, ToolUseContext, ValidationResult, }; @@ -8,13 +23,37 @@ use crate::agentic::tools::pipeline::SubagentParentInfo; use crate::agentic::tools::InputValidator; use crate::util::errors::{BitFunError, BitFunResult}; use async_trait::async_trait; +use log::warn; use serde_json::{json, Value}; +use std::collections::{HashMap, HashSet}; use std::path::Path; 
+use tokio::time::{sleep, Duration, Instant}; pub struct TaskTool; const LARGE_TASK_PROMPT_SOFT_LINE_LIMIT: usize = 180; const LARGE_TASK_PROMPT_SOFT_BYTE_LIMIT: usize = 16 * 1024; +#[cfg(test)] +const DEEP_REVIEW_QUEUE_POLL_INTERVAL: Duration = Duration::from_millis(10); +#[cfg(not(test))] +const DEEP_REVIEW_QUEUE_POLL_INTERVAL: Duration = Duration::from_secs(1); + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DeepReviewQueueWaitSkipReason { + QueueExpired, + UserCancelled, + OptionalSkipped, +} + +enum DeepReviewQueueWaitOutcome { + Ready { + guard: DeepReviewActiveReviewerGuard<'static>, + }, + Skipped { + queue_elapsed_ms: u64, + skip_reason: DeepReviewQueueWaitSkipReason, + }, +} impl Default for TaskTool { fn default() -> Self { @@ -27,6 +66,636 @@ impl TaskTool { Self } + fn string_for_any_key<'a>(value: &'a Value, keys: &[&str]) -> Option<&'a str> { + keys.iter().find_map(|key| { + value + .get(*key) + .and_then(Value::as_str) + .map(str::trim) + .filter(|value| !value.is_empty()) + }) + } + + fn value_for_any_key<'a>(value: &'a Value, keys: &[&str]) -> Option<&'a Value> { + keys.iter().find_map(|key| value.get(*key)) + } + + fn u64_for_any_key(value: &Value, keys: &[&str]) -> Option { + keys.iter() + .find_map(|key| value.get(*key).and_then(Value::as_u64)) + } + + fn string_array_for_any_key( + value: &Value, + keys: &[&str], + ) -> Result, DeepReviewPolicyViolation> { + let Some(array) = Self::value_for_any_key(value, keys).and_then(Value::as_array) else { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_missing_coverage", + format!("Retry coverage requires array field '{}'", keys[0]), + )); + }; + + let mut result = Vec::with_capacity(array.len()); + for item in array { + let Some(path) = item.as_str().map(str::trim).filter(|path| !path.is_empty()) else { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_invalid_coverage", + format!( + "Retry coverage field '{}' must contain non-empty strings", + keys[0] + ), + )); 
+ }; + result.push(path.to_string()); + } + + Ok(result) + } + + fn work_packets_from_manifest(run_manifest: Option<&Value>) -> Option<&Vec> { + run_manifest? + .get("workPackets") + .or_else(|| run_manifest?.get("work_packets"))? + .as_array() + } + + fn packet_id_from_description(description: Option<&str>) -> Option { + let description = description?; + let start = description.find("[packet ")? + "[packet ".len(); + let packet_id = description[start..].split(']').next()?.trim(); + (!packet_id.is_empty()).then(|| packet_id.to_string()) + } + + fn packet_belongs_to_subagent(packet: &Value, subagent_type: &str) -> bool { + Self::string_for_any_key( + packet, + &["subagentId", "subagent_id", "subagentType", "subagent_type"], + ) + .is_some_and(|value| value == subagent_type) + } + + fn packet_id_for_manifest_packet(packet: &Value) -> Option<&str> { + Self::string_for_any_key(packet, &["packetId", "packet_id"]) + } + + fn deep_review_packet_id_for_cache( + subagent_type: &str, + description: Option<&str>, + run_manifest: Option<&Value>, + ) -> Option { + let packets = Self::work_packets_from_manifest(run_manifest)?; + + if let Some(description_packet_id) = Self::packet_id_from_description(description) { + return packets + .iter() + .any(|packet| { + Self::packet_id_for_manifest_packet(packet) + .is_some_and(|packet_id| packet_id == description_packet_id) + && Self::packet_belongs_to_subagent(packet, subagent_type) + }) + .then_some(description_packet_id); + } + + let mut matches = packets.iter().filter_map(|packet| { + if Self::packet_belongs_to_subagent(packet, subagent_type) { + Self::packet_id_for_manifest_packet(packet).map(str::to_string) + } else { + None + } + }); + let packet_id = matches.next()?; + if matches.next().is_some() { + None + } else { + Some(packet_id) + } + } + + fn attach_deep_review_cache(run_manifest: &mut Value, cache_value: Option) { + if run_manifest.get("deepReviewCache").is_some() { + return; + } + let Some(cache_value) = cache_value else 
{ + return; + }; + if let Some(object) = run_manifest.as_object_mut() { + object.insert("deepReviewCache".to_string(), cache_value); + } + } + + fn deep_review_retry_guidance_max_retries( + effective_policy: Option<&DeepReviewExecutionPolicy>, + dialog_turn_id: &str, + ) -> usize { + effective_policy + .map(|policy| policy.max_retries_per_role) + .unwrap_or_else(|| deep_review_max_retries_per_role(dialog_turn_id)) + } + + fn manifest_packet_by_id<'a>( + run_manifest: Option<&'a Value>, + packet_id: &str, + subagent_type: &str, + ) -> Option<&'a Value> { + Self::work_packets_from_manifest(run_manifest)? + .iter() + .find(|packet| { + Self::packet_id_for_manifest_packet(packet).is_some_and(|id| id == packet_id) + && Self::packet_belongs_to_subagent(packet, subagent_type) + }) + } + + fn assigned_scope_files_for_packet( + packet: &Value, + ) -> Result, DeepReviewPolicyViolation> { + let Some(scope) = Self::value_for_any_key(packet, &["assignedScope", "assigned_scope"]) + else { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_missing_packet_scope", + "Retry source packet is missing assigned_scope", + )); + }; + Self::string_array_for_any_key(scope, &["files"]) + } + + fn is_retryable_capacity_reason(reason: &str) -> bool { + matches!( + reason, + "local_concurrency_cap" + | "provider_rate_limit" + | "provider_concurrency_limit" + | "retry_after" + | "temporary_overload" + ) + } + + fn ensure_deep_review_retry_coverage( + input: &Value, + subagent_type: &str, + run_manifest: Option<&Value>, + ) -> Result, DeepReviewPolicyViolation> { + let Some(coverage) = Self::value_for_any_key(input, &["retry_coverage", "retryCoverage"]) + else { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_missing_coverage", + "DeepReview retry requires structured retry_coverage metadata", + )); + }; + let packet_id = Self::string_for_any_key(coverage, &["source_packet_id", "sourcePacketId"]) + .ok_or_else(|| { + DeepReviewPolicyViolation::new( + 
"deep_review_retry_missing_packet_id", + "DeepReview retry coverage requires source_packet_id", + ) + })?; + let source_status = Self::string_for_any_key(coverage, &["source_status", "sourceStatus"]) + .ok_or_else(|| { + DeepReviewPolicyViolation::new( + "deep_review_retry_missing_status", + "DeepReview retry coverage requires source_status", + ) + })?; + match source_status { + "partial_timeout" => {} + "capacity_skipped" => { + let capacity_reason = + Self::string_for_any_key(coverage, &["capacity_reason", "capacityReason"]) + .unwrap_or_default(); + if !Self::is_retryable_capacity_reason(capacity_reason) { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_non_retryable_status", + format!( + "DeepReview retry cannot redispatch non-transient capacity reason '{}'", + capacity_reason + ), + )); + } + } + other => { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_non_retryable_status", + format!( + "DeepReview retry only supports partial_timeout or transient capacity failures, not '{}'", + other + ), + )); + } + } + + let packet = Self::manifest_packet_by_id(run_manifest, packet_id, subagent_type) + .ok_or_else(|| { + DeepReviewPolicyViolation::new( + "deep_review_retry_unknown_packet", + format!( + "DeepReview retry source packet '{}' does not match reviewer '{}'", + packet_id, subagent_type + ), + ) + })?; + let original_files = Self::assigned_scope_files_for_packet(packet)?; + Self::ensure_deep_review_retry_timeout(input, packet)?; + let retry_scope_files = + Self::string_array_for_any_key(coverage, &["retry_scope_files", "retryScopeFiles"])?; + let covered_files = + Self::string_array_for_any_key(coverage, &["covered_files", "coveredFiles"])?; + if retry_scope_files.is_empty() { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_empty_scope", + "DeepReview retry requires at least one retry_scope_files entry", + )); + } + + let original_file_set: HashSet<&str> = original_files.iter().map(String::as_str).collect(); + 
let mut retry_file_set = HashSet::new(); + for file in &retry_scope_files { + if !retry_file_set.insert(file.as_str()) { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_duplicate_scope_file", + format!("DeepReview retry scope repeats file '{}'", file), + )); + } + if !original_file_set.contains(file.as_str()) { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_scope_outside_packet", + format!( + "DeepReview retry file '{}' is outside source packet '{}'", + file, packet_id + ), + )); + } + } + if retry_scope_files.len() >= original_files.len() { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_scope_not_reduced", + "DeepReview retry_scope_files must be smaller than the source packet scope", + )); + } + + for file in &covered_files { + if !original_file_set.contains(file.as_str()) { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_coverage_outside_packet", + format!( + "DeepReview retry covered file '{}' is outside source packet '{}'", + file, packet_id + ), + )); + } + if retry_file_set.contains(file.as_str()) { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_coverage_overlaps_scope", + format!( + "DeepReview retry covered file '{}' cannot also be in retry_scope_files", + file + ), + )); + } + } + + Ok(retry_scope_files) + } + + fn ensure_deep_review_retry_timeout( + input: &Value, + packet: &Value, + ) -> Result<(), DeepReviewPolicyViolation> { + let retry_timeout_seconds = + Self::u64_for_any_key(input, &["timeout_seconds", "timeoutSeconds"]).unwrap_or(0); + if retry_timeout_seconds == 0 { + return Err(DeepReviewPolicyViolation::new( + "deep_review_retry_timeout_required", + "DeepReview retry requires a positive timeout_seconds value", + )); + } + + let source_timeout_seconds = + Self::u64_for_any_key(packet, &["timeoutSeconds", "timeout_seconds"]).unwrap_or(0); + if source_timeout_seconds > 0 && retry_timeout_seconds >= source_timeout_seconds { + return 
Err(DeepReviewPolicyViolation::new( + "deep_review_retry_timeout_not_reduced", + format!( + "DeepReview retry timeout_seconds ({}) must be lower than source timeout ({})", + retry_timeout_seconds, source_timeout_seconds + ), + )); + } + + Ok(()) + } + + fn prompt_with_deep_review_retry_scope(prompt: &str, retry_scope_files: &[String]) -> String { + let mut scoped_prompt = String::new(); + scoped_prompt.push_str("\n"); + scoped_prompt.push_str( + "This is a bounded DeepReview retry. Review only the following retry_scope_files and treat any other files as background context only:\n", + ); + for file in retry_scope_files { + scoped_prompt.push_str("- "); + scoped_prompt.push_str(file); + scoped_prompt.push('\n'); + } + scoped_prompt.push_str("\n\n"); + scoped_prompt.push_str(prompt); + scoped_prompt + } + + fn queue_reason_to_event_reason( + reason: DeepReviewCapacityQueueReason, + ) -> DeepReviewQueueReason { + match reason { + DeepReviewCapacityQueueReason::ProviderRateLimit => { + DeepReviewQueueReason::ProviderRateLimit + } + DeepReviewCapacityQueueReason::ProviderConcurrencyLimit => { + DeepReviewQueueReason::ProviderConcurrencyLimit + } + DeepReviewCapacityQueueReason::RetryAfter => DeepReviewQueueReason::RetryAfter, + DeepReviewCapacityQueueReason::LocalConcurrencyCap => { + DeepReviewQueueReason::LocalConcurrencyCap + } + DeepReviewCapacityQueueReason::TemporaryOverload => { + DeepReviewQueueReason::TemporaryOverload + } + } + } + + fn queue_reason_to_snake_case(reason: DeepReviewCapacityQueueReason) -> &'static str { + match reason { + DeepReviewCapacityQueueReason::ProviderRateLimit => "provider_rate_limit", + DeepReviewCapacityQueueReason::ProviderConcurrencyLimit => "provider_concurrency_limit", + DeepReviewCapacityQueueReason::RetryAfter => "retry_after", + DeepReviewCapacityQueueReason::LocalConcurrencyCap => "local_concurrency_cap", + DeepReviewCapacityQueueReason::TemporaryOverload => "temporary_overload", + } + } + + fn 
deep_review_capacity_reason_for_provider_error(
+        error: &BitFunError,
+    ) -> Option<DeepReviewCapacityQueueReason> {
+        let detail = error.error_detail();
+        let error_message = error.to_string();
+        let code = detail.provider_code.as_deref().unwrap_or_default();
+        let message = detail
+            .provider_message
+            .as_deref()
+            .unwrap_or(error_message.as_str());
+        let decision = classify_deep_review_capacity_error(code, message, None);
+        if decision.queueable {
+            return decision.reason;
+        }
+
+        match detail.category {
+            ErrorCategory::RateLimit => Some(DeepReviewCapacityQueueReason::ProviderRateLimit),
+            ErrorCategory::ProviderUnavailable => {
+                Some(DeepReviewCapacityQueueReason::TemporaryOverload)
+            }
+            _ => None,
+        }
+    }
+
+    fn deep_review_capacity_skip_result_for_provider_reason(
+        reason: DeepReviewCapacityQueueReason,
+        dialog_turn_id: &str,
+        subagent_type: &str,
+        conc_policy: &DeepReviewConcurrencyPolicy,
+        duration_ms: u128,
+    ) -> (Value, String) {
+        let snapshot = record_deep_review_effective_concurrency_capacity_error(
+            dialog_turn_id,
+            conc_policy.max_parallel_instances,
+            reason,
+            None,
+        );
+        record_deep_review_capacity_skip(dialog_turn_id);
+
+        let duration_ms = u64::try_from(duration_ms).unwrap_or(u64::MAX);
+        let reason_code = Self::queue_reason_to_snake_case(reason);
+        let assistant_message = format!(
+            "Subagent '{}' was skipped because the provider reported transient DeepReview capacity pressure ({}).\n",
+            subagent_type, reason_code
+        );
+        let data = json!({
+            "duration": duration_ms,
+            "status": "capacity_skipped",
+            "queue_elapsed_ms": 0,
+            "max_queue_wait_seconds": conc_policy.max_queue_wait_seconds,
+            "queue_skip_reason": reason_code,
+            "effective_parallel_instances": snapshot.effective_parallel_instances
+        });
+
+        (data, assistant_message)
+    }
+
+    async fn emit_deep_review_queue_state(
+        session_id: &str,
+        dialog_turn_id: &str,
+        tool_id: &str,
+        subagent_type: &str,
+        status: DeepReviewQueueStatus,
+        reason: Option<DeepReviewCapacityQueueReason>,
+        queued_reviewer_count: usize,
+        active_reviewer_count:
usize, + optional_reviewer_count: Option, + effective_parallel_instances: Option, + queue_elapsed_ms: u64, + max_queue_wait_seconds: u64, + ) { + let run_elapsed_ms = matches!(&status, DeepReviewQueueStatus::Running).then_some(0); + if let Some(coordinator) = get_global_coordinator() { + coordinator + .emit_deep_review_queue_state_changed( + session_id, + dialog_turn_id, + DeepReviewQueueState { + tool_id: tool_id.to_string(), + subagent_type: subagent_type.to_string(), + status, + reason: reason.map(Self::queue_reason_to_event_reason), + queued_reviewer_count, + active_reviewer_count: Some(active_reviewer_count), + effective_parallel_instances, + optional_reviewer_count, + queue_elapsed_ms: Some(queue_elapsed_ms), + run_elapsed_ms, + max_queue_wait_seconds: Some(max_queue_wait_seconds), + session_concurrency_high: false, + }, + ) + .await; + } + } + + async fn wait_for_deep_review_reviewer_capacity( + session_id: &str, + dialog_turn_id: &str, + tool_id: &str, + subagent_type: &str, + conc_policy: &DeepReviewConcurrencyPolicy, + is_optional_reviewer: bool, + ) -> BitFunResult { + let decision = classify_deep_review_capacity_error( + "deep_review_concurrency_cap_reached", + "Maximum parallel reviewer instances reached", + None, + ); + let reason = decision + .reason + .unwrap_or(DeepReviewCapacityQueueReason::LocalConcurrencyCap); + let started_at = Instant::now(); + let max_wait = Duration::from_secs(conc_policy.max_queue_wait_seconds); + let mut paused_since: Option = None; + let mut paused_total = Duration::ZERO; + let optional_reviewer_count = is_optional_reviewer.then_some(1); + + loop { + let now = Instant::now(); + let current_pause_elapsed = paused_since + .map(|paused_at| now.saturating_duration_since(paused_at)) + .unwrap_or_default(); + let queue_elapsed = now + .saturating_duration_since(started_at) + .saturating_sub(paused_total) + .saturating_sub(current_pause_elapsed); + let queue_elapsed_ms = 
u64::try_from(queue_elapsed.as_millis()).unwrap_or(u64::MAX); + let active_reviewers = deep_review_active_reviewer_count(dialog_turn_id); + let effective_parallel_instances = deep_review_effective_parallel_instances( + dialog_turn_id, + conc_policy.max_parallel_instances, + ); + + let control_snapshot = deep_review_queue_control_snapshot(dialog_turn_id, tool_id); + if control_snapshot.cancelled + || (is_optional_reviewer && control_snapshot.skip_optional) + { + record_deep_review_capacity_skip(dialog_turn_id); + clear_deep_review_queue_control_for_tool(dialog_turn_id, tool_id); + Self::emit_deep_review_queue_state( + session_id, + dialog_turn_id, + tool_id, + subagent_type, + DeepReviewQueueStatus::CapacitySkipped, + Some(reason), + 0, + active_reviewers, + optional_reviewer_count, + Some(effective_parallel_instances), + queue_elapsed_ms, + conc_policy.max_queue_wait_seconds, + ) + .await; + return Ok(DeepReviewQueueWaitOutcome::Skipped { + queue_elapsed_ms, + skip_reason: if control_snapshot.cancelled { + DeepReviewQueueWaitSkipReason::UserCancelled + } else { + DeepReviewQueueWaitSkipReason::OptionalSkipped + }, + }); + } + + if control_snapshot.paused { + if paused_since.is_none() { + paused_since = Some(now); + } + Self::emit_deep_review_queue_state( + session_id, + dialog_turn_id, + tool_id, + subagent_type, + DeepReviewQueueStatus::PausedByUser, + Some(reason), + 1, + active_reviewers, + optional_reviewer_count, + Some(effective_parallel_instances), + queue_elapsed_ms, + conc_policy.max_queue_wait_seconds, + ) + .await; + sleep(DEEP_REVIEW_QUEUE_POLL_INTERVAL).await; + continue; + } + + if let Some(paused_at) = paused_since.take() { + paused_total += now.saturating_duration_since(paused_at); + } + + if let Some(guard) = + try_begin_deep_review_active_reviewer(dialog_turn_id, effective_parallel_instances) + { + let active_reviewer_count = deep_review_active_reviewer_count(dialog_turn_id); + clear_deep_review_queue_control_for_tool(dialog_turn_id, tool_id); + 
Self::emit_deep_review_queue_state( + session_id, + dialog_turn_id, + tool_id, + subagent_type, + DeepReviewQueueStatus::Running, + None, + 0, + active_reviewer_count, + optional_reviewer_count, + Some(effective_parallel_instances), + queue_elapsed_ms, + conc_policy.max_queue_wait_seconds, + ) + .await; + return Ok(DeepReviewQueueWaitOutcome::Ready { guard }); + } + + if queue_elapsed >= max_wait { + let snapshot = record_deep_review_effective_concurrency_capacity_error( + dialog_turn_id, + conc_policy.max_parallel_instances, + reason, + decision.retry_after_seconds.map(Duration::from_secs), + ); + record_deep_review_capacity_skip(dialog_turn_id); + clear_deep_review_queue_control_for_tool(dialog_turn_id, tool_id); + Self::emit_deep_review_queue_state( + session_id, + dialog_turn_id, + tool_id, + subagent_type, + DeepReviewQueueStatus::CapacitySkipped, + Some(reason), + 0, + active_reviewers, + optional_reviewer_count, + Some(snapshot.effective_parallel_instances), + queue_elapsed_ms, + conc_policy.max_queue_wait_seconds, + ) + .await; + return Ok(DeepReviewQueueWaitOutcome::Skipped { + queue_elapsed_ms, + skip_reason: DeepReviewQueueWaitSkipReason::QueueExpired, + }); + } + + Self::emit_deep_review_queue_state( + session_id, + dialog_turn_id, + tool_id, + subagent_type, + DeepReviewQueueStatus::QueuedForCapacity, + Some(reason), + 1, + active_reviewers, + optional_reviewer_count, + Some(effective_parallel_instances), + queue_elapsed_ms, + conc_policy.max_queue_wait_seconds, + ) + .await; + + let remaining = max_wait.saturating_sub(queue_elapsed); + sleep(DEEP_REVIEW_QUEUE_POLL_INTERVAL.min(remaining)).await; + } + } + fn format_agent_descriptions(&self, agents: &[AgentInfo]) -> String { if agents.is_empty() { return String::new(); @@ -76,6 +745,7 @@ Usage notes: - The 'workspace_path' parameter must still be provided explicitly for the Explore and FileFinder agent. - Use 'model_id' when a caller needs a specific model or model slot for the subagent. 
Omit it to use the agent default. - Use 'timeout_seconds' when you need a hard deadline for the subagent. Omit it or set it to 0 to disable the timeout. +- For DeepReview only, set 'retry' to true when re-dispatching a reviewer after that same reviewer returned partial_timeout or an explicit transient capacity failure in the current turn. Retry calls must include retry_coverage with source_packet_id, source_status, covered_files, and a smaller retry_scope_files list. - Launch multiple agents concurrently whenever possible, to maximize performance; to do that, use a single message with multiple tool calls - When the agent is done, it will return a single message back to you. - The agent's outputs should generally be trusted @@ -197,6 +867,45 @@ impl Tool for TaskTool { "type": "integer", "minimum": 0, "description": "Optional timeout for this subagent task in seconds. Use 0 or omit it to disable the timeout." + }, + "retry": { + "type": "boolean", + "description": "DeepReview only: true when this Task call is a retry for the same reviewer role after partial_timeout or an explicit transient capacity failure in the current turn." + }, + "retry_coverage": { + "type": "object", + "description": "DeepReview retry only: structured coverage metadata proving the retry is bounded. Required when retry=true.", + "properties": { + "source_packet_id": { + "type": "string", + "description": "The original reviewer packet_id being retried." + }, + "source_status": { + "type": "string", + "enum": ["partial_timeout", "capacity_skipped"], + "description": "The retryable source status." + }, + "capacity_reason": { + "type": "string", + "description": "Required for capacity_skipped; must be a transient capacity reason such as local_concurrency_cap, provider_rate_limit, provider_concurrency_limit, retry_after, or temporary_overload." + }, + "covered_files": { + "type": "array", + "items": { "type": "string" }, + "description": "Files already covered by the source attempt." 
+ }, + "retry_scope_files": { + "type": "array", + "items": { "type": "string" }, + "description": "Smaller file list to retry. Every entry must belong to the source packet and must not overlap covered_files." + } + }, + "required": [ + "source_packet_id", + "source_status", + "covered_files", + "retry_scope_files" + ] } }, "required": [ @@ -289,6 +998,10 @@ impl Tool for TaskTool { let start_time = std::time::Instant::now(); // description is only used for frontend display + let description = input + .get("description") + .and_then(Value::as_str) + .map(str::to_string); let mut prompt = input .get("prompt") @@ -339,6 +1052,7 @@ impl Tool for TaskTool { } None => None, }; + let is_retry = input.get("retry").and_then(Value::as_bool).unwrap_or(false); let current_workspace_path = context .workspace_root() .map(|path| path.to_string_lossy().into_owned()); @@ -415,6 +1129,17 @@ impl Tool for TaskTool { "dialog_turn_id is required in context".to_string(), )); }; + let mut deep_review_effective_policy: Option = None; + let mut deep_review_active_guard: Option> = None; + let mut deep_review_reviewer_configured_max_parallel_instances: Option = None; + let mut deep_review_concurrency_policy: Option = None; + let mut deep_review_is_optional_reviewer = false; + let mut deep_review_retry_scope_files: Option> = None; + let mut deep_review_subagent_role: Option = None; + + // Get global coordinator + let coordinator = get_global_coordinator() + .ok_or_else(|| BitFunError::tool("coordinator not initialized".to_string()))?; if context .agent_type @@ -422,12 +1147,46 @@ impl Tool for TaskTool { .map(str::trim) .is_some_and(|agent_type| agent_type == DEEP_REVIEW_AGENT_TYPE) { - let policy = load_default_deep_review_policy().await.map_err(|error| { + let base_policy = load_default_deep_review_policy().await.map_err(|error| { BitFunError::tool(format!( "Failed to load DeepReview execution policy: {}", error )) })?; + let mut run_manifest = 
context.custom_data.get("deep_review_run_manifest").cloned(); + if let Some(workspace) = context.workspace.as_ref() { + let session_storage_path = workspace.session_storage_path(); + match coordinator + .get_session_manager() + .load_session_metadata(&session_storage_path, &session_id) + .await + { + Ok(Some(metadata)) => { + if run_manifest.is_none() { + run_manifest = metadata.deep_review_run_manifest; + } + if let Some(run_manifest) = run_manifest.as_mut() { + Self::attach_deep_review_cache( + run_manifest, + metadata.deep_review_cache, + ); + } + } + Ok(None) => {} + Err(error) => { + warn!( + "Failed to load DeepReview session metadata for run-manifest policy: session_id={}, error={}", + session_id, error + ); + } + } + } + let policy = if let Some(manifest) = run_manifest.as_ref() { + base_policy.with_run_manifest_execution_policy(manifest) + } else { + base_policy + }; + deep_review_effective_policy = Some(policy.clone()); let role = policy .classify_subagent(&subagent_type) .map_err(|violation| { @@ -436,6 +1195,33 @@ impl Tool for TaskTool { violation.to_tool_error_message() )) })?; + deep_review_subagent_role = Some(role); + if let Some(gate) = run_manifest + .as_ref() + .and_then(DeepReviewRunManifestGate::from_value) + { + gate.ensure_active(&subagent_type).map_err(|violation| { + BitFunError::tool(format!( + "DeepReview Task policy violation: {}", + violation.to_tool_error_message() + )) + })?; + } + if is_retry && role == DeepReviewSubagentRole::Reviewer { + deep_review_retry_scope_files = Some( + Self::ensure_deep_review_retry_coverage( + input, + &subagent_type, + run_manifest.as_ref(), + ) + .map_err(|violation| { + BitFunError::tool(format!( + "DeepReview Task policy violation: {}", + violation.to_tool_error_message() + )) + })?, + ); + } let is_readonly = get_agent_registry() .get_subagent_is_readonly(&subagent_type) .unwrap_or(false); @@ -467,38 +1253,302 @@ impl Tool for TaskTool { ))); } timeout_seconds = policy.effective_timeout_seconds(role, 
timeout_seconds); + + // Check incremental review cache before queueing. A cache hit does + // not consume runtime reviewer capacity or reviewer timeout. + if role == DeepReviewSubagentRole::Reviewer && !is_retry { + if let Some(cache_value) = + run_manifest.as_ref().and_then(|m| m.get("deepReviewCache")) + { + let cache = DeepReviewIncrementalCache::from_value(cache_value); + if cache.matches_manifest(run_manifest.as_ref().unwrap_or(&Value::Null)) { + if let Some(packet_id) = Self::deep_review_packet_id_for_cache( + &subagent_type, + description.as_deref(), + run_manifest.as_ref(), + ) { + if let Some(cached_output) = cache.get_packet(&packet_id) { + let cached_result = format!( + "Subagent '{}' result (from incremental review cache):\n\n{}\n", + subagent_type, cached_output + ); + return Ok(vec![ToolResult::ok( + json!({ "cached": true, "packet_id": packet_id }), + Some(cached_result), + )]); + } + } + } + } + } + + // Enforce dynamic concurrency policy from the run manifest. + let conc_policy = policy + .concurrency_policy_from_manifest(run_manifest.as_ref().unwrap_or(&Value::Null)); + deep_review_concurrency_policy = Some(conc_policy.clone()); + match role { + DeepReviewSubagentRole::Reviewer => { + deep_review_reviewer_configured_max_parallel_instances = + Some(conc_policy.max_parallel_instances); + let effective_parallel_instances = deep_review_effective_parallel_instances( + &dialog_turn_id, + conc_policy.max_parallel_instances, + ); + let is_optional_reviewer = policy + .extra_subagent_ids + .iter() + .any(|id| id == &subagent_type); + deep_review_is_optional_reviewer = is_optional_reviewer; + if let Some(guard) = try_begin_deep_review_active_reviewer( + &dialog_turn_id, + effective_parallel_instances, + ) { + deep_review_active_guard = Some(guard); + } else { + match Self::wait_for_deep_review_reviewer_capacity( + &session_id, + &dialog_turn_id, + &tool_call_id, + &subagent_type, + &conc_policy, + is_optional_reviewer, + ) + .await? 
+                    {
+                        DeepReviewQueueWaitOutcome::Ready { guard } => {
+                            deep_review_active_guard = Some(guard);
+                        }
+                        DeepReviewQueueWaitOutcome::Skipped {
+                            queue_elapsed_ms,
+                            skip_reason,
+                        } => {
+                            let queue_skip_reason = match skip_reason {
+                                DeepReviewQueueWaitSkipReason::QueueExpired => "queue_expired",
+                                DeepReviewQueueWaitSkipReason::UserCancelled => {
+                                    "user_cancelled"
+                                }
+                                DeepReviewQueueWaitSkipReason::OptionalSkipped => {
+                                    "optional_skipped"
+                                }
+                            };
+                            let assistant_message = match skip_reason {
+                                DeepReviewQueueWaitSkipReason::QueueExpired => format!(
+                                    "Subagent '{}' was skipped because the DeepReview capacity queue reached its maximum wait ({}s) after waiting {}ms.\n",
+                                    subagent_type,
+                                    conc_policy.max_queue_wait_seconds,
+                                    queue_elapsed_ms
+                                ),
+                                DeepReviewQueueWaitSkipReason::UserCancelled => format!(
+                                    "Subagent '{}' was skipped because the DeepReview capacity queue was cancelled by the user after {}ms.\n",
+                                    subagent_type, queue_elapsed_ms
+                                ),
+                                DeepReviewQueueWaitSkipReason::OptionalSkipped => format!(
+                                    "Subagent '{}' was skipped because optional DeepReview queued reviewers were skipped by the user after {}ms.\n",
+                                    subagent_type, queue_elapsed_ms
+                                ),
+                            };
+                            return Ok(vec![ToolResult::Result {
+                                data: json!({
+                                    "duration": start_time.elapsed().as_millis(),
+                                    "status": "capacity_skipped",
+                                    "queue_elapsed_ms": queue_elapsed_ms,
+                                    "max_queue_wait_seconds": conc_policy.max_queue_wait_seconds,
+                                    "queue_skip_reason": queue_skip_reason,
+                                    "effective_parallel_instances": deep_review_effective_concurrency_snapshot(
+                                        &dialog_turn_id,
+                                        conc_policy.max_parallel_instances,
+                                    ).effective_parallel_instances
+                                }),
+                                result_for_assistant: Some(assistant_message),
+                                image_attachments: None,
+                            }]);
+                        }
+                    }
+                }
+            }
+            DeepReviewSubagentRole::Judge => {
+                let active_reviewers = deep_review_active_reviewer_count(&dialog_turn_id);
+                let judge_pending = deep_review_has_judge_been_launched(&dialog_turn_id);
+                conc_policy
+                    .check_launch_allowed(active_reviewers, role, judge_pending)
+                    .map_err(|violation| {
+                        
BitFunError::tool(format!( + "DeepReview concurrency policy violation: {}", + violation.to_tool_error_message() + )) + })?; + } + } + record_deep_review_task_budget( + &dialog_turn_id, + &policy, + role, + &subagent_type, + is_retry, + ) + .map_err(|violation| { + BitFunError::tool(format!( + "DeepReview Task policy violation: {}", + violation.to_tool_error_message() + )) + })?; } - // Get global coordinator - let coordinator = get_global_coordinator() - .ok_or_else(|| BitFunError::tool("coordinator not initialized".to_string()))?; + if let Some(retry_scope_files) = deep_review_retry_scope_files.as_ref() { + prompt = Self::prompt_with_deep_review_retry_scope(&prompt, retry_scope_files); + } let parent_info = SubagentParentInfo { - tool_call_id, - session_id, - dialog_turn_id, + tool_call_id: tool_call_id.clone(), + session_id: session_id.clone(), + dialog_turn_id: dialog_turn_id.clone(), }; - let result = coordinator + let subagent_context = deep_review_subagent_role.map(|role| { + let mut values = HashMap::new(); + values.insert( + "deep_review_subagent_role".to_string(), + match role { + DeepReviewSubagentRole::Reviewer => "reviewer", + DeepReviewSubagentRole::Judge => "judge", + } + .to_string(), + ); + values.insert( + "deep_review_subagent_type".to_string(), + subagent_type.clone(), + ); + values + }); + let result = match coordinator .execute_subagent( subagent_type.clone(), prompt, parent_info, Some(effective_workspace_path.clone()), - None, + subagent_context, context.cancellation_token.as_ref(), model_id, timeout_seconds, ) - .await?; + .await + { + Ok(result) => result, + Err(error) => { + if matches!( + deep_review_subagent_role, + Some(DeepReviewSubagentRole::Reviewer) + ) { + if let (Some(reason), Some(conc_policy)) = ( + Self::deep_review_capacity_reason_for_provider_error(&error), + deep_review_concurrency_policy.as_ref(), + ) { + drop(deep_review_active_guard.take()); + let (data, assistant_message) = + 
Self::deep_review_capacity_skip_result_for_provider_reason( + reason, + &dialog_turn_id, + &subagent_type, + conc_policy, + start_time.elapsed().as_millis(), + ); + let effective_parallel_instances = data + .get("effective_parallel_instances") + .and_then(Value::as_u64) + .and_then(|value| usize::try_from(value).ok()); + Self::emit_deep_review_queue_state( + &session_id, + &dialog_turn_id, + &tool_call_id, + &subagent_type, + DeepReviewQueueStatus::CapacitySkipped, + Some(reason), + 0, + deep_review_active_reviewer_count(&dialog_turn_id), + deep_review_is_optional_reviewer.then_some(1), + effective_parallel_instances, + 0, + conc_policy.max_queue_wait_seconds, + ) + .await; + return Ok(vec![ToolResult::Result { + data, + result_for_assistant: Some(assistant_message), + image_attachments: None, + }]); + } + } + return Err(error); + } + }; + if !result.is_partial_timeout() { + if let Some(configured_max_parallel_instances) = + deep_review_reviewer_configured_max_parallel_instances + { + record_deep_review_effective_concurrency_success( + &dialog_turn_id, + configured_max_parallel_instances, + ); + } + } + drop(deep_review_active_guard); let duration = start_time.elapsed().as_millis(); + let status = if result.is_partial_timeout() { + "partial_timeout" + } else { + "completed" + }; - Ok(vec![ToolResult::Result { - data: json!({"duration": duration}), - result_for_assistant: Some(format!( + // Build retry hint for deep review reviewer timeouts. + let retry_hint = if result.is_partial_timeout() && !is_retry { + let retries_used = crate::agentic::deep_review_policy::deep_review_retries_used( + &dialog_turn_id, + &subagent_type, + ); + let max_retries = Self::deep_review_retry_guidance_max_retries( + deep_review_effective_policy.as_ref(), + &dialog_turn_id, + ); + if max_retries > 0 && retries_used < max_retries { + format!( + "\n\nThis reviewer timed out. 
You may retry with 'retry: true' only if you can provide retry_coverage with source_packet_id, source_status='partial_timeout', covered_files, and a smaller retry_scope_files list. Retries used: {}/{}.", + retries_used, max_retries + ) + } else { + String::new() + } + } else { + String::new() + }; + + let result_for_assistant = if result.is_partial_timeout() { + format!( + "Subagent '{}' timed out with partial result:\n\n{}\n{}", + subagent_type, result.text, retry_hint + ) + } else { + format!( "Subagent '{}' completed successfully with result:\n\n{}\n", subagent_type, result.text - )), + ) + }; + let mut data = json!({ + "duration": duration, + "status": status + }); + if result.is_partial_timeout() { + data["partial_output"] = json!(result.text); + if let Some(reason) = result.reason.as_deref() { + data["reason"] = json!(reason); + } + if let Some(event_id) = result.ledger_event_id() { + data["ledger_event_id"] = json!(event_id); + } + } + + Ok(vec![ToolResult::Result { + data, + result_for_assistant: Some(result_for_assistant), image_attachments: None, }]) } @@ -507,7 +1557,9 @@ impl Tool for TaskTool { #[cfg(test)] mod tests { use super::TaskTool; - use crate::agentic::deep_review_policy::{DeepReviewExecutionPolicy, DeepReviewSubagentRole}; + use crate::agentic::deep_review_policy::{ + DeepReviewBudgetTracker, DeepReviewExecutionPolicy, DeepReviewSubagentRole, + }; use crate::agentic::tools::framework::Tool; use serde_json::json; @@ -592,4 +1644,731 @@ mod tests { Some(240) ); } + + #[test] + fn deep_review_policy_saturates_oversized_numeric_limits() { + let policy = DeepReviewExecutionPolicy::from_config_value(Some(&json!({ + "reviewer_timeout_seconds": u64::MAX, + "judge_timeout_seconds": u64::MAX + }))); + + assert_eq!(policy.reviewer_timeout_seconds, 3600); + assert_eq!(policy.judge_timeout_seconds, 3600); + } + + #[test] + fn deep_review_budget_tracker_caps_judge_per_turn() { + let policy = DeepReviewExecutionPolicy::default(); + let tracker = 
DeepReviewBudgetTracker::default(); + + tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Judge, + "ReviewJudge", + false, + ) + .unwrap(); + assert!(tracker + .record_task( + "turn-1", + &policy, + DeepReviewSubagentRole::Judge, + "ReviewJudge", + false, + ) + .is_err()); + + tracker + .record_task( + "turn-2", + &policy, + DeepReviewSubagentRole::Judge, + "ReviewJudge", + false, + ) + .unwrap(); + } + + #[test] + fn deep_review_concurrency_policy_blocks_reviewer_at_cap() { + use crate::agentic::deep_review_policy::DeepReviewConcurrencyPolicy; + + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 2, + stagger_seconds: 0, + max_queue_wait_seconds: 60, + batch_extras_separately: true, + }; + // 0 active → allowed + assert!(policy + .check_launch_allowed(0, DeepReviewSubagentRole::Reviewer, false) + .is_ok()); + // 1 active → allowed + assert!(policy + .check_launch_allowed(1, DeepReviewSubagentRole::Reviewer, false) + .is_ok()); + // 2 active (at cap) → blocked + assert!(policy + .check_launch_allowed(2, DeepReviewSubagentRole::Reviewer, false) + .is_err()); + } + + #[test] + fn deep_review_concurrency_policy_returns_structured_cap_rejection() { + use crate::agentic::deep_review_policy::DeepReviewConcurrencyPolicy; + + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 2, + stagger_seconds: 0, + max_queue_wait_seconds: 60, + batch_extras_separately: true, + }; + let violation = policy + .check_launch_allowed(2, DeepReviewSubagentRole::Reviewer, false) + .expect_err("reviewer launch at cap should be rejected"); + let message = format!( + "DeepReview concurrency policy violation: {}", + violation.to_tool_error_message() + ); + + assert!(message.contains("deep_review_concurrency_cap_reached")); + assert!(message.contains("Maximum parallel reviewer instances reached")); + } + + #[tokio::test] + async fn deep_review_capacity_queue_skips_after_max_wait() { + use crate::agentic::deep_review_policy::{ + 
deep_review_capacity_skip_count, deep_review_concurrency_cap_rejection_count, + deep_review_effective_parallel_instances, try_begin_deep_review_active_reviewer, + DeepReviewConcurrencyPolicy, + }; + + let _occupied_a = try_begin_deep_review_active_reviewer("turn-queue-skip", 2) + .expect("precondition should occupy first reviewer capacity"); + let _occupied_b = try_begin_deep_review_active_reviewer("turn-queue-skip", 2) + .expect("precondition should occupy second reviewer capacity"); + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 2, + stagger_seconds: 0, + max_queue_wait_seconds: 0, + batch_extras_separately: true, + }; + + let outcome = TaskTool::wait_for_deep_review_reviewer_capacity( + "session-queue-skip", + "turn-queue-skip", + "tool-queue-skip", + "ReviewSecurity", + &policy, + false, + ) + .await + .expect("queue wait should resolve"); + + match outcome { + super::DeepReviewQueueWaitOutcome::Skipped { + queue_elapsed_ms, .. + } => { + assert!(queue_elapsed_ms < 100); + } + super::DeepReviewQueueWaitOutcome::Ready { .. 
} => { + panic!("occupied capacity should skip with maxQueueWaitSeconds=0"); + } + } + assert_eq!(deep_review_capacity_skip_count("turn-queue-skip"), 1); + assert_eq!( + deep_review_concurrency_cap_rejection_count("turn-queue-skip"), + 0 + ); + assert_eq!( + deep_review_effective_parallel_instances("turn-queue-skip", 2), + 1 + ); + } + + #[tokio::test] + async fn deep_review_capacity_queue_cancel_control_skips_waiting_reviewer() { + use crate::agentic::deep_review_policy::{ + apply_deep_review_queue_control, deep_review_capacity_skip_count, + try_begin_deep_review_active_reviewer, DeepReviewConcurrencyPolicy, + DeepReviewQueueControlAction, + }; + + let turn_id = "turn-queue-cancel"; + let tool_id = "tool-queue-cancel"; + let _occupied = try_begin_deep_review_active_reviewer(turn_id, 1) + .expect("precondition should occupy reviewer capacity"); + apply_deep_review_queue_control(turn_id, tool_id, DeepReviewQueueControlAction::Cancel); + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 1, + stagger_seconds: 0, + max_queue_wait_seconds: 60, + batch_extras_separately: true, + }; + + let outcome = TaskTool::wait_for_deep_review_reviewer_capacity( + "session-queue-cancel", + turn_id, + tool_id, + "ReviewSecurity", + &policy, + false, + ) + .await + .expect("queue wait should resolve"); + + match outcome { + super::DeepReviewQueueWaitOutcome::Skipped { + queue_elapsed_ms, .. + } => { + assert!(queue_elapsed_ms < 100); + } + super::DeepReviewQueueWaitOutcome::Ready { .. 
} => { + panic!("cancelled queue control should skip the waiting reviewer"); + } + } + assert_eq!(deep_review_capacity_skip_count(turn_id), 1); + } + + #[tokio::test] + async fn deep_review_capacity_queue_pause_does_not_expire_until_continued() { + use crate::agentic::deep_review_policy::{ + apply_deep_review_queue_control, try_begin_deep_review_active_reviewer, + DeepReviewConcurrencyPolicy, DeepReviewQueueControlAction, + }; + + let turn_id = "turn-queue-pause"; + let tool_id = "tool-queue-pause"; + let _occupied = try_begin_deep_review_active_reviewer(turn_id, 1) + .expect("precondition should occupy reviewer capacity"); + apply_deep_review_queue_control(turn_id, tool_id, DeepReviewQueueControlAction::Pause); + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 1, + stagger_seconds: 0, + max_queue_wait_seconds: 0, + batch_extras_separately: true, + }; + let turn_id_owned = turn_id.to_string(); + let tool_id_owned = tool_id.to_string(); + + let handle = tokio::spawn(async move { + TaskTool::wait_for_deep_review_reviewer_capacity( + "session-queue-pause", + &turn_id_owned, + &tool_id_owned, + "ReviewSecurity", + &policy, + false, + ) + .await + }); + + tokio::time::sleep(tokio::time::Duration::from_millis(30)).await; + assert!( + !handle.is_finished(), + "paused queue wait should not expire while user pause is active" + ); + + apply_deep_review_queue_control(turn_id, tool_id, DeepReviewQueueControlAction::Continue); + let outcome = tokio::time::timeout(tokio::time::Duration::from_millis(500), handle) + .await + .expect("continued queue wait should finish") + .expect("spawned wait should not panic") + .expect("queue wait should resolve"); + match outcome { + super::DeepReviewQueueWaitOutcome::Skipped { + queue_elapsed_ms, .. + } => { + assert!(queue_elapsed_ms < 100); + } + super::DeepReviewQueueWaitOutcome::Ready { .. 
} => { + panic!("occupied capacity should skip after pause is continued"); + } + } + } + + #[tokio::test] + async fn deep_review_capacity_queue_skip_optional_skips_optional_waiter() { + use crate::agentic::deep_review_policy::{ + apply_deep_review_queue_control, try_begin_deep_review_active_reviewer, + DeepReviewConcurrencyPolicy, DeepReviewQueueControlAction, + }; + + let turn_id = "turn-queue-skip-optional"; + let tool_id = "tool-queue-skip-optional"; + let _occupied = try_begin_deep_review_active_reviewer(turn_id, 1) + .expect("precondition should occupy reviewer capacity"); + apply_deep_review_queue_control( + turn_id, + tool_id, + DeepReviewQueueControlAction::SkipOptional, + ); + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 1, + stagger_seconds: 0, + max_queue_wait_seconds: 60, + batch_extras_separately: true, + }; + + let outcome = TaskTool::wait_for_deep_review_reviewer_capacity( + "session-queue-skip-optional", + turn_id, + tool_id, + "ReviewCustom", + &policy, + true, + ) + .await + .expect("queue wait should resolve"); + + match outcome { + super::DeepReviewQueueWaitOutcome::Skipped { + queue_elapsed_ms, .. + } => { + assert!(queue_elapsed_ms < 100); + } + super::DeepReviewQueueWaitOutcome::Ready { .. 
} => { + panic!("optional queue control should skip optional reviewer"); + } + } + } + + #[test] + fn deep_review_concurrency_policy_blocks_judge_with_active_reviewers() { + use crate::agentic::deep_review_policy::DeepReviewConcurrencyPolicy; + + let policy = DeepReviewConcurrencyPolicy::default(); + // 1 active reviewer → judge blocked + assert!(policy + .check_launch_allowed(1, DeepReviewSubagentRole::Judge, false) + .is_err()); + // 0 active reviewers, no judge pending → judge allowed + assert!(policy + .check_launch_allowed(0, DeepReviewSubagentRole::Judge, false) + .is_ok()); + // 0 active reviewers, judge already pending → blocked + assert!(policy + .check_launch_allowed(0, DeepReviewSubagentRole::Judge, true) + .is_err()); + } + + #[test] + fn deep_review_incremental_cache_hit_returns_cached_result() { + use crate::agentic::deep_review_policy::DeepReviewIncrementalCache; + + let mut cache = DeepReviewIncrementalCache::new("fp-test-123"); + cache.store_packet("ReviewSecurity", "Found 2 security issues"); + + // Cache hit + let result = cache.get_packet("ReviewSecurity"); + assert_eq!(result, Some("Found 2 security issues")); + + // Cache miss + assert_eq!(cache.get_packet("ReviewPerformance"), None); + } + + #[test] + fn deep_review_incremental_cache_fingerprint_mismatch_skips() { + use crate::agentic::deep_review_policy::DeepReviewIncrementalCache; + + let cache = DeepReviewIncrementalCache::new("fp-old"); + let manifest = serde_json::json!({ + "incrementalReviewCache": { + "fingerprint": "fp-new" + } + }); + // Fingerprint mismatch → cache should not match + assert!(!cache.matches_manifest(&manifest)); + } + + #[test] + fn deep_review_cache_packet_id_prefers_task_description_packet() { + let manifest = serde_json::json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-2", + "phase": "reviewer", + "subagentId": "ReviewSecurity" + }, + { + "packetId": "reviewer:ReviewSecurity:group-2-of-2", + "phase": "reviewer", + "subagentId": 
"ReviewSecurity" + } + ] + }); + + assert_eq!( + TaskTool::deep_review_packet_id_for_cache( + "ReviewSecurity", + Some("Security review [packet reviewer:ReviewSecurity:group-2-of-2]"), + Some(&manifest), + ), + Some("reviewer:ReviewSecurity:group-2-of-2".to_string()) + ); + } + + #[test] + fn deep_review_cache_packet_id_uses_unique_manifest_packet() { + let manifest = serde_json::json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewBusinessLogic", + "phase": "reviewer", + "subagentId": "ReviewBusinessLogic" + } + ] + }); + + assert_eq!( + TaskTool::deep_review_packet_id_for_cache( + "ReviewBusinessLogic", + Some("Logic review"), + Some(&manifest), + ), + Some("reviewer:ReviewBusinessLogic".to_string()) + ); + } + + #[test] + fn deep_review_cache_packet_id_does_not_guess_split_packets() { + let manifest = serde_json::json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewPerformance:group-1-of-2", + "phase": "reviewer", + "subagentId": "ReviewPerformance" + }, + { + "packetId": "reviewer:ReviewPerformance:group-2-of-2", + "phase": "reviewer", + "subagentId": "ReviewPerformance" + } + ] + }); + + assert_eq!( + TaskTool::deep_review_packet_id_for_cache( + "ReviewPerformance", + Some("Performance review"), + Some(&manifest), + ), + None + ); + } + + #[test] + fn deep_review_cache_packet_id_ignores_description_for_other_subagent() { + let manifest = serde_json::json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewSecurity" + } + ] + }); + + assert_eq!( + TaskTool::deep_review_packet_id_for_cache( + "ReviewPerformance", + Some("Performance review [packet reviewer:ReviewSecurity:group-1-of-1]"), + Some(&manifest), + ), + None + ); + } + + #[test] + fn deep_review_retry_guidance_includes_budget_info() { + // Verify that the retry budget tracking functions work correctly + // for the retry guidance injected in task_tool. 
+ use crate::agentic::deep_review_policy::{ + deep_review_max_retries_per_role, deep_review_retries_used, + }; + + // Default max retries should be 1 + assert_eq!(deep_review_max_retries_per_role("nonexistent-turn"), 1); + + // Retries used for a nonexistent turn should be 0 + assert_eq!( + deep_review_retries_used("nonexistent-turn", "ReviewSecurity"), + 0 + ); + } + + #[test] + fn deep_review_retry_guidance_uses_manifest_policy_limit() { + use crate::agentic::deep_review_policy::DeepReviewExecutionPolicy; + + let manifest = serde_json::json!({ + "reviewMode": "deep", + "executionPolicy": { + "maxRetriesPerRole": 2 + } + }); + let policy = + DeepReviewExecutionPolicy::default().with_run_manifest_execution_policy(&manifest); + + assert_eq!( + TaskTool::deep_review_retry_guidance_max_retries(Some(&policy), "nonexistent-turn"), + 2 + ); + } + + #[test] + fn deep_review_retry_rejects_missing_structured_coverage() { + let manifest = json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "timeoutSeconds": 600, + "assignedScope": { + "files": [ + "src/crates/core/src/auth.rs", + "src/crates/core/src/token.rs" + ] + } + } + ] + }); + let input = json!({ + "retry": true + }); + + let violation = + TaskTool::ensure_deep_review_retry_coverage(&input, "ReviewSecurity", Some(&manifest)) + .expect_err("missing retry coverage should be rejected"); + + assert_eq!(violation.code, "deep_review_retry_missing_coverage"); + } + + #[test] + fn deep_review_retry_rejects_broad_scope() { + let manifest = json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "timeoutSeconds": 600, + "assignedScope": { + "files": [ + "src/crates/core/src/auth.rs", + "src/crates/core/src/token.rs" + ] + } + } + ] + }); + let input = json!({ + "retry": true, + "timeout_seconds": 300, + "retry_coverage": { + "source_packet_id": 
"reviewer:ReviewSecurity:group-1-of-1", + "source_status": "partial_timeout", + "covered_files": [ + "src/crates/core/src/auth.rs" + ], + "retry_scope_files": [ + "src/crates/core/src/auth.rs", + "src/crates/core/src/token.rs" + ] + } + }); + + let violation = + TaskTool::ensure_deep_review_retry_coverage(&input, "ReviewSecurity", Some(&manifest)) + .expect_err("retrying the full packet should be rejected"); + + assert_eq!(violation.code, "deep_review_retry_scope_not_reduced"); + } + + #[test] + fn deep_review_retry_rejects_timeout_that_is_not_lowered() { + let manifest = json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "timeoutSeconds": 600, + "assignedScope": { + "files": [ + "src/crates/core/src/auth.rs", + "src/crates/core/src/token.rs" + ] + } + } + ] + }); + let input = json!({ + "retry": true, + "timeout_seconds": 600, + "retry_coverage": { + "source_packet_id": "reviewer:ReviewSecurity:group-1-of-1", + "source_status": "partial_timeout", + "covered_files": [ + "src/crates/core/src/auth.rs" + ], + "retry_scope_files": [ + "src/crates/core/src/token.rs" + ] + } + }); + + let violation = + TaskTool::ensure_deep_review_retry_coverage(&input, "ReviewSecurity", Some(&manifest)) + .expect_err("retry timeout must be lower than source timeout"); + + assert_eq!(violation.code, "deep_review_retry_timeout_not_reduced"); + } + + #[test] + fn deep_review_retry_rejects_non_queueable_capacity_reason() { + let manifest = json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "timeoutSeconds": 600, + "assignedScope": { + "files": [ + "src/crates/core/src/auth.rs", + "src/crates/core/src/token.rs" + ] + } + } + ] + }); + let input = json!({ + "retry": true, + "retry_coverage": { + "source_packet_id": "reviewer:ReviewSecurity:group-1-of-1", + "source_status": "capacity_skipped", + 
"capacity_reason": "auth_error", + "covered_files": [], + "retry_scope_files": [ + "src/crates/core/src/token.rs" + ] + } + }); + + let violation = + TaskTool::ensure_deep_review_retry_coverage(&input, "ReviewSecurity", Some(&manifest)) + .expect_err("non-queueable capacity failures must fail fast"); + + assert_eq!(violation.code, "deep_review_retry_non_retryable_status"); + } + + #[test] + fn deep_review_provider_capacity_error_builds_capacity_skipped_payload_and_lowers_effective_cap( + ) { + use crate::agentic::deep_review_policy::{ + deep_review_effective_concurrency_snapshot, DeepReviewConcurrencyPolicy, + }; + use crate::util::BitFunError; + + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 3, + stagger_seconds: 0, + max_queue_wait_seconds: 30, + batch_extras_separately: true, + }; + let turn_id = "turn-provider-capacity-skip"; + let reason = TaskTool::deep_review_capacity_reason_for_provider_error(&BitFunError::ai( + "Provider error: provider=openai, code=429, message=rate limit exceeded", + )) + .expect("provider rate limit should surface as capacity_skipped"); + let (data, assistant_message) = + TaskTool::deep_review_capacity_skip_result_for_provider_reason( + reason, + turn_id, + "ReviewSecurity", + &policy, + 42, + ); + + assert_eq!(data["status"], "capacity_skipped"); + assert_eq!(data["queue_skip_reason"], "provider_rate_limit"); + assert_eq!(data["effective_parallel_instances"], 2); + assert!(assistant_message.contains("status=\"capacity_skipped\"")); + assert!(assistant_message.contains("reason=\"provider_rate_limit\"")); + assert_eq!( + deep_review_effective_concurrency_snapshot(turn_id, 3).effective_parallel_instances, + 2 + ); + } + + #[test] + fn deep_review_provider_quota_error_is_not_capacity_skipped() { + use crate::util::BitFunError; + + let reason = TaskTool::deep_review_capacity_reason_for_provider_error(&BitFunError::ai( + "Provider error: provider=glm, code=1113, message=insufficient quota", + )); + + assert!( + 
reason.is_none(), + "quota errors should remain fail-fast instead of entering capacity queue flow" + ); + } + + #[test] + fn deep_review_retry_accepts_reduced_partial_timeout_scope() { + let manifest = json!({ + "workPackets": [ + { + "packetId": "reviewer:ReviewSecurity:group-1-of-1", + "phase": "reviewer", + "subagentId": "ReviewSecurity", + "timeoutSeconds": 600, + "assignedScope": { + "files": [ + "src/crates/core/src/auth.rs", + "src/crates/core/src/token.rs" + ] + } + } + ] + }); + let input = json!({ + "retry": true, + "timeout_seconds": 300, + "retry_coverage": { + "source_packet_id": "reviewer:ReviewSecurity:group-1-of-1", + "source_status": "partial_timeout", + "covered_files": [ + "src/crates/core/src/auth.rs" + ], + "retry_scope_files": [ + "src/crates/core/src/token.rs" + ] + } + }); + + let retry_scope = + TaskTool::ensure_deep_review_retry_coverage(&input, "ReviewSecurity", Some(&manifest)) + .expect("reduced retry scope should be accepted"); + + assert_eq!(retry_scope, vec!["src/crates/core/src/token.rs"]); + } + + #[test] + fn deep_review_retry_scope_prompt_prepend_bounds_review_files() { + let prompt = TaskTool::prompt_with_deep_review_retry_scope( + "Continue the security review.", + &["src/crates/core/src/token.rs".to_string()], + ); + + assert!(prompt.starts_with("")); + assert!(prompt.contains("Review only the following retry_scope_files")); + assert!(prompt.contains("- src/crates/core/src/token.rs")); + assert!(prompt.ends_with("Continue the security review.")); + } } diff --git a/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs b/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs index 5fc183d00..68bbd094c 100644 --- a/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs +++ b/src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs @@ -1163,6 +1163,51 @@ impl ToolPipeline { ); } } + if let Some(raw_manifest) = + task.context.context_vars.get("deep_review_run_manifest") + { + if let Ok(manifest) = 
serde_json::from_str::<serde_json::Value>(raw_manifest) {
+                        map.insert("deep_review_run_manifest".to_string(), manifest);
+                    }
+                }
+                if let Some(role) = task.context.context_vars.get("deep_review_subagent_role") {
+                    if !role.trim().is_empty() {
+                        map.insert(
+                            "deep_review_subagent_role".to_string(),
+                            serde_json::json!(role.trim()),
+                        );
+                    }
+                }
+                if let Some(subagent_type) =
+                    task.context.context_vars.get("deep_review_subagent_type")
+                {
+                    if !subagent_type.trim().is_empty() {
+                        map.insert(
+                            "deep_review_subagent_type".to_string(),
+                            serde_json::json!(subagent_type.trim()),
+                        );
+                    }
+                }
+                if map
+                    .get("deep_review_subagent_role")
+                    .and_then(serde_json::Value::as_str)
+                    .is_some_and(|role| role == "reviewer")
+                {
+                    if let Some(parent_info) = task.context.subagent_parent_info.as_ref() {
+                        map.insert(
+                            "deep_review_parent_tool_call_id".to_string(),
+                            serde_json::json!(parent_info.tool_call_id.clone()),
+                        );
+                        map.insert(
+                            "deep_review_parent_session_id".to_string(),
+                            serde_json::json!(parent_info.session_id.clone()),
+                        );
+                        map.insert(
+                            "deep_review_parent_dialog_turn_id".to_string(),
+                            serde_json::json!(parent_info.dialog_turn_id.clone()),
+                        );
+                    }
+                }
                 map
             },
diff --git a/src/crates/core/src/service/config/types.rs b/src/crates/core/src/service/config/types.rs
index 36f1727ba..4debc9c5d 100644
--- a/src/crates/core/src/service/config/types.rs
+++ b/src/crates/core/src/service/config/types.rs
@@ -536,6 +536,10 @@ pub struct AIConfig {
     /// Allow Computer use (desktop automation) when the desktop host is available (all session modes).
     #[serde(default)]
     pub computer_use_enabled: bool,
+
+    /// Maximum number of rounds per dialog turn before soft-pausing.
+    #[serde(default = "default_max_rounds")]
+    pub max_rounds: usize,
 }

 impl AIConfig {
@@ -679,6 +683,12 @@ fn default_subagent_max_concurrency() -> usize {
     5
 }

+pub const DEFAULT_MAX_ROUNDS: usize = 200;
+
+fn default_max_rounds() -> usize {
+    DEFAULT_MAX_ROUNDS
+}
+
 impl Default for ModeConfig {
     fn default() -> Self {
         Self {
@@ -1485,6 +1495,7 @@ impl Default for AIConfig {
             skip_tool_confirmation: true,
             debug_mode_config: DebugModeConfig::default(),
             computer_use_enabled: false,
+            max_rounds: default_max_rounds(),
         }
     }
 }
diff --git a/src/crates/core/src/service/cron/service.rs b/src/crates/core/src/service/cron/service.rs
index c7e74a09d..a8ac1060c 100644
--- a/src/crates/core/src/service/cron/service.rs
+++ b/src/crates/core/src/service/cron/service.rs
@@ -519,6 +519,7 @@ impl CronService {
             scheduled_job_policy(),
             None,
             None,
+            None,
         )
         .await;
diff --git a/src/crates/core/src/service/git/git_service.rs b/src/crates/core/src/service/git/git_service.rs
index 177f016d6..e1523e1b7 100644
--- a/src/crates/core/src/service/git/git_service.rs
+++ b/src/crates/core/src/service/git/git_service.rs
@@ -14,6 +14,48 @@ pub struct GitService;

 type CommitStats = (Option<usize>, Option<usize>, Option<usize>);

+fn parse_name_status_output(output: &str) -> Vec<GitChangedFile> {
+    output
+        .lines()
+        .filter_map(|line| {
+            let mut parts = line.split('\t');
+            let raw_status = parts.next()?.trim();
+            if raw_status.is_empty() {
+                return None;
+            }
+
+            let status = match raw_status.chars().next().unwrap_or_default() {
+                'A' => GitChangedFileStatus::Added,
+                'M' => GitChangedFileStatus::Modified,
+                'D' => GitChangedFileStatus::Deleted,
+                'R' => GitChangedFileStatus::Renamed,
+                'C' => GitChangedFileStatus::Copied,
+                _ => GitChangedFileStatus::Unknown,
+            };
+
+            match status {
+                GitChangedFileStatus::Renamed | GitChangedFileStatus::Copied => {
+                    let old_path = parts.next()?.to_string();
+                    let path = parts.next()?.to_string();
+                    Some(GitChangedFile {
+                        path,
+                        old_path: Some(old_path),
+                        status,
+                    })
+                }
+                _ => {
+                    let path = 
parts.next()?.to_string();
+                    Some(GitChangedFile {
+                        path,
+                        old_path: None,
+                        status,
+                    })
+                }
+            }
+        })
+        .collect()
+}
+
 impl GitService {
     /// Checks whether the path is a Git repository.
     pub async fn is_repository<P: AsRef<Path>>(path: P) -> Result<bool, GitError> {
@@ -769,6 +811,38 @@ impl GitService {
         execute_git_command(&repo_path, &args).await
     }

+    /// Gets changed files using `git diff --name-status`.
+    pub async fn get_changed_files<P: AsRef<Path>>(
+        path: P,
+        params: &GitChangedFilesParams,
+    ) -> Result<Vec<GitChangedFile>, GitError> {
+        let repo_path = path.as_ref().to_string_lossy();
+
+        let mut args = vec!["diff", "--name-status"];
+        let range;
+
+        if params.staged.unwrap_or(false) {
+            args.push("--cached");
+        }
+
+        match (&params.source, &params.target) {
+            (Some(src), Some(tgt)) => {
+                range = format!("{}..{}", src, tgt);
+                args.push(&range);
+            }
+            (Some(src), None) => {
+                args.push(src);
+            }
+            (None, Some(tgt)) => {
+                args.push(tgt);
+            }
+            (None, None) => {}
+        }
+
+        let output = execute_git_command(&repo_path, &args).await?;
+        Ok(parse_name_status_output(&output))
+    }
+
     /// Gets file content.
/// /// # Parameters @@ -1125,3 +1199,46 @@ impl GitService { }) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_name_status_output_for_common_statuses() { + let files = parse_name_status_output( + "M\tsrc/main.rs\nA\tsrc/new.rs\nD\tsrc/old.rs\nR100\tsrc/old_name.rs\tsrc/new_name.rs\nC087\tsrc/source.rs\tsrc/copy.rs\n", + ); + + assert_eq!( + files, + vec![ + GitChangedFile { + path: "src/main.rs".to_string(), + old_path: None, + status: GitChangedFileStatus::Modified, + }, + GitChangedFile { + path: "src/new.rs".to_string(), + old_path: None, + status: GitChangedFileStatus::Added, + }, + GitChangedFile { + path: "src/old.rs".to_string(), + old_path: None, + status: GitChangedFileStatus::Deleted, + }, + GitChangedFile { + path: "src/new_name.rs".to_string(), + old_path: Some("src/old_name.rs".to_string()), + status: GitChangedFileStatus::Renamed, + }, + GitChangedFile { + path: "src/copy.rs".to_string(), + old_path: Some("src/source.rs".to_string()), + status: GitChangedFileStatus::Copied, + }, + ], + ); + } +} diff --git a/src/crates/core/src/service/git/git_types.rs b/src/crates/core/src/service/git/git_types.rs index 4bd937317..7d2786ce3 100644 --- a/src/crates/core/src/service/git/git_types.rs +++ b/src/crates/core/src/service/git/git_types.rs @@ -163,6 +163,31 @@ pub struct GitDiffParams { pub stat: Option, } +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct GitChangedFilesParams { + pub source: Option, + pub target: Option, + pub staged: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum GitChangedFileStatus { + Added, + Modified, + Deleted, + Renamed, + Copied, + Unknown, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct GitChangedFile { + pub path: String, + pub old_path: Option, + pub status: GitChangedFileStatus, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct GitOperationResult { pub success: 
bool, diff --git a/src/crates/core/src/service/remote_connect/remote_server.rs b/src/crates/core/src/service/remote_connect/remote_server.rs index 51b003ef7..9276304d0 100644 --- a/src/crates/core/src/service/remote_connect/remote_server.rs +++ b/src/crates/core/src/service/remote_connect/remote_server.rs @@ -1780,6 +1780,7 @@ impl RemoteExecutionDispatcher { binding_workspace, submission_policy, None, + None, image_payload, ) .await diff --git a/src/crates/core/src/service/session/types.rs b/src/crates/core/src/service/session/types.rs index 0387f4ced..a6600025d 100644 --- a/src/crates/core/src/service/session/types.rs +++ b/src/crates/core/src/service/session/types.rs @@ -78,6 +78,27 @@ pub struct SessionMetadata { #[serde(skip_serializing_if = "Option::is_none")] pub todos: Option, + /// Deep Review run manifest for this session, when the session was launched + /// from Code Review Team. + #[serde( + default, + skip_serializing_if = "Option::is_none", + alias = "deep_review_run_manifest", + alias = "deepReviewRunManifest" + )] + pub deep_review_run_manifest: Option, + + /// Cached reviewer outputs from previous deep review runs in this session. + /// Keyed by packet_id, value is the reviewer's output text. + /// Used for incremental review: when the fingerprint matches, skip re-dispatching. 
+ #[serde( + default, + skip_serializing_if = "Option::is_none", + alias = "deep_review_cache", + alias = "deepReviewCache" + )] + pub deep_review_cache: Option, + /// Workspace path this session belongs to (normalized source workspace root, not mirror dir) #[serde(skip_serializing_if = "Option::is_none", alias = "workspace_path")] pub workspace_path: Option, @@ -526,6 +547,8 @@ impl SessionMetadata { tags: Vec::new(), custom_metadata: None, todos: None, + deep_review_run_manifest: None, + deep_review_cache: None, workspace_path: None, workspace_hostname: None, unread_completion: None, @@ -746,4 +769,44 @@ mod tests { assert!(!metadata.is_subagent()); assert!(metadata.is_standard()); } + + #[test] + fn session_metadata_preserves_deep_review_run_manifest() { + let payload = serde_json::json!({ + "sessionId": "session-1", + "sessionName": "Deep Review", + "agentType": "DeepReview", + "sessionKind": "standard", + "modelName": "fast", + "createdAt": 1, + "lastActiveAt": 1, + "turnCount": 0, + "messageCount": 0, + "toolCallCount": 0, + "status": "active", + "deep_review_run_manifest": { + "reviewMode": "deep", + "coreReviewers": [ + { "subagentId": "ReviewBusinessLogic" } + ], + "skippedReviewers": [ + { "subagentId": "ReviewFrontend", "reason": "not_applicable" } + ] + } + }); + + let metadata: SessionMetadata = + serde_json::from_value(payload).expect("metadata should deserialize"); + + assert_eq!( + metadata.deep_review_run_manifest.as_ref().unwrap()["reviewMode"], + "deep" + ); + + let serialized = serde_json::to_value(&metadata).expect("metadata should serialize"); + assert_eq!( + serialized["deepReviewRunManifest"]["coreReviewers"][0]["subagentId"], + "ReviewBusinessLogic" + ); + } } diff --git a/src/crates/core/tests/context_profile.rs b/src/crates/core/tests/context_profile.rs new file mode 100644 index 000000000..096a941d1 --- /dev/null +++ b/src/crates/core/tests/context_profile.rs @@ -0,0 +1,172 @@ +use bitfun_core::agentic::context_profile::{ + 
ContextProfile, ContextProfilePolicy, ModelCapabilityProfile, +}; +use bitfun_core::agentic::session::MicrocompactConfig; + +#[test] +fn context_profile_maps_long_running_agents_to_long_task_profile() { + for agent_type in [ + "agentic", + "DeepReview", + "DeepResearch", + "ComputerUse", + "Team", + "ReviewFrontend", + "ReviewSecurity", + ] { + assert_eq!( + ContextProfile::for_agent_type(agent_type), + ContextProfile::LongTask, + "{agent_type} should use the long-task profile" + ); + } +} + +#[test] +fn context_profile_maps_conversation_agents_to_conversation_profile() { + for agent_type in ["Cowork", "Plan", "Claw", "unknown-custom-agent"] { + assert_eq!( + ContextProfile::for_agent_type(agent_type), + ContextProfile::Conversation, + "{agent_type} should use the conversation profile" + ); + } +} + +#[test] +fn context_profile_review_custom_subagents_can_be_promoted_to_long_task_profile() { + assert_eq!( + ContextProfile::for_agent_context("legal-domain-reviewer", true), + ContextProfile::LongTask + ); + assert_eq!( + ContextProfile::for_agent_context("legal-domain-reviewer", false), + ContextProfile::Conversation + ); +} + +#[test] +fn context_profile_long_task_policy_preserves_current_context_defaults() { + let policy = ContextProfilePolicy::for_agent_context( + "DeepReview", + false, + ModelCapabilityProfile::Standard, + ); + let default_microcompact = MicrocompactConfig::default(); + + assert_eq!(policy.profile, ContextProfile::LongTask); + assert_eq!( + policy.microcompact_config().keep_recent, + default_microcompact.keep_recent + ); + assert!( + (policy.microcompact_config().trigger_ratio - default_microcompact.trigger_ratio).abs() + < f32::EPSILON + ); + assert_eq!(policy.compression_contract_limit, 8); + assert_eq!(policy.subagent_concurrency_cap, 5); + assert_eq!(policy.repeated_tool_signature_threshold, 3); + assert_eq!(policy.consecutive_failed_command_threshold, 2); +} + +#[test] +fn context_profile_conversation_policy_keeps_more_recent_chat_context() 
{ + let policy = + ContextProfilePolicy::for_agent_context("Cowork", false, ModelCapabilityProfile::Standard); + + assert_eq!(policy.profile, ContextProfile::Conversation); + assert_eq!(policy.microcompact_config().keep_recent, 12); + assert!((policy.microcompact_config().trigger_ratio - 0.65).abs() < f32::EPSILON); + assert_eq!(policy.compression_contract_limit, 4); + assert_eq!(policy.subagent_concurrency_cap, 2); + assert_eq!(policy.repeated_tool_signature_threshold, 4); + assert_eq!(policy.consecutive_failed_command_threshold, 3); +} + +#[test] +fn context_profile_weak_model_override_shortens_contract_and_caps_fanout() { + let standard = ContextProfilePolicy::for_agent_context( + "DeepReview", + false, + ModelCapabilityProfile::Standard, + ); + let weak = + ContextProfilePolicy::for_agent_context("DeepReview", false, ModelCapabilityProfile::Weak); + + assert_eq!(weak.profile, ContextProfile::LongTask); + assert!(weak.compression_contract_limit < standard.compression_contract_limit); + assert!(weak.subagent_concurrency_cap < standard.subagent_concurrency_cap); + assert!(weak.repeated_tool_signature_threshold < standard.repeated_tool_signature_threshold); + assert_eq!(weak.compression_contract_limit, 4); + assert_eq!(weak.subagent_concurrency_cap, 2); + assert_eq!(weak.repeated_tool_signature_threshold, 2); +} + +#[test] +fn context_profile_model_capability_profile_only_marks_explicit_weak_models() { + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("claude-3-haiku")), + ModelCapabilityProfile::Weak + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("gpt-5.4-mini")), + ModelCapabilityProfile::Weak + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(Some("fast")), + ModelCapabilityProfile::Standard, + "configured model slots should not be treated as weak before resolving" + ); + assert_eq!( + ModelCapabilityProfile::from_model_id(None), + ModelCapabilityProfile::Standard + ); +} + +#[test] +fn 
context_profile_configured_subagent_concurrency_is_capped_by_policy() { + let long_task = ContextProfilePolicy::for_agent_context( + "DeepReview", + false, + ModelCapabilityProfile::Standard, + ); + let conversation = + ContextProfilePolicy::for_agent_context("Cowork", false, ModelCapabilityProfile::Standard); + + assert_eq!(long_task.effective_subagent_max_concurrency(64), 5); + assert_eq!(long_task.effective_subagent_max_concurrency(3), 3); + assert_eq!(conversation.effective_subagent_max_concurrency(64), 2); + assert_eq!(conversation.effective_subagent_max_concurrency(1), 1); +} + +#[test] +fn context_profile_subagent_policy_combines_parent_workload_and_child_model() { + let policy = ContextProfilePolicy::for_subagent_context_and_models( + "custom-security-reviewer", + true, + Some("claude-3-haiku"), + Some("DeepReview"), + false, + Some("gpt-5"), + ); + + assert_eq!(policy.profile, ContextProfile::LongTask); + assert_eq!(policy.compression_contract_limit, 4); + assert_eq!(policy.subagent_concurrency_cap, 2); + assert_eq!(policy.repeated_tool_signature_threshold, 2); +} + +#[test] +fn context_profile_subagent_policy_inherits_parent_long_task_when_child_is_plain() { + let policy = ContextProfilePolicy::for_subagent_context_and_models( + "Explore", + false, + None, + Some("DeepReview"), + false, + Some("gpt-5"), + ); + + assert_eq!(policy.profile, ContextProfile::LongTask); + assert_eq!(policy.subagent_concurrency_cap, 5); +} diff --git a/src/crates/events/src/agentic.rs b/src/crates/events/src/agentic.rs index 3906af0dc..5025270d1 100644 --- a/src/crates/events/src/agentic.rs +++ b/src/crates/events/src/agentic.rs @@ -1,4 +1,4 @@ -//! Agentic Events Definition +//! 
Agentic Events Definition use serde::{Deserialize, Serialize}; use std::time::SystemTime; @@ -77,6 +77,49 @@ pub struct SubagentParentInfo { pub dialog_turn_id: String, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum DeepReviewQueueStatus { + QueuedForCapacity, + PausedByUser, + Running, + CapacitySkipped, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum DeepReviewQueueReason { + ProviderRateLimit, + ProviderConcurrencyLimit, + RetryAfter, + LocalConcurrencyCap, + TemporaryOverload, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DeepReviewQueueState { + pub tool_id: String, + pub subagent_type: String, + pub status: DeepReviewQueueStatus, + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option, + pub queued_reviewer_count: usize, + #[serde(skip_serializing_if = "Option::is_none")] + pub active_reviewer_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub effective_parallel_instances: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub optional_reviewer_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub queue_elapsed_ms: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub run_elapsed_ms: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub max_queue_wait_seconds: Option, + #[serde(default)] + pub session_concurrency_high: bool, +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type")] pub enum AgenticEvent { @@ -250,6 +293,13 @@ pub enum AgenticEvent { subagent_parent_info: Option, }, + DeepReviewQueueStateChanged { + session_id: String, + turn_id: String, + queue_state: DeepReviewQueueState, + subagent_parent_info: Option, + }, + SystemError { session_id: Option, error: String, @@ -431,6 +481,7 @@ impl AgenticEvent { | Self::ModelRoundCompleted { session_id, .. 
} | Self::ToolEvent { session_id, .. } | Self::UserSteeringInjected { session_id, .. } + | Self::DeepReviewQueueStateChanged { session_id, .. } | Self::SessionModelAutoMigrated { session_id, .. } => Some(session_id), Self::SystemError { session_id, .. } => session_id.as_deref(), } @@ -446,6 +497,7 @@ impl AgenticEvent { Self::SessionStateChanged { .. } | Self::SessionTitleGenerated { .. } | Self::SessionModelAutoMigrated { .. } + | Self::DeepReviewQueueStateChanged { .. } | Self::ContextCompressionFailed { .. } => AgenticEventPriority::High, Self::ImageAnalysisStarted { .. } @@ -490,3 +542,52 @@ impl ToolEventData { } } } + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn deep_review_queue_state_event_serializes_stable_contract() { + let event = AgenticEvent::DeepReviewQueueStateChanged { + session_id: "review-session".to_string(), + turn_id: "turn-1".to_string(), + queue_state: DeepReviewQueueState { + tool_id: "task-1".to_string(), + subagent_type: "ReviewSecurity".to_string(), + status: DeepReviewQueueStatus::QueuedForCapacity, + reason: Some(DeepReviewQueueReason::ProviderConcurrencyLimit), + queued_reviewer_count: 2, + active_reviewer_count: Some(1), + effective_parallel_instances: Some(2), + optional_reviewer_count: Some(1), + queue_elapsed_ms: Some(1200), + run_elapsed_ms: None, + max_queue_wait_seconds: Some(60), + session_concurrency_high: true, + }, + subagent_parent_info: None, + }; + + assert_eq!(event.session_id(), Some("review-session")); + assert_eq!(event.default_priority(), AgenticEventPriority::High); + + let serialized = serde_json::to_value(event).expect("serialize event"); + assert_eq!(serialized["type"], "DeepReviewQueueStateChanged"); + assert_eq!(serialized["queue_state"]["status"], "queued_for_capacity"); + assert_eq!( + serialized["queue_state"]["reason"], + json!("provider_concurrency_limit") + ); + assert_eq!(serialized["queue_state"]["queue_elapsed_ms"], json!(1200)); + assert_eq!( + 
serialized["queue_state"]["effective_parallel_instances"], + json!(2) + ); + assert_eq!( + serialized["queue_state"]["run_elapsed_ms"], + serde_json::Value::Null + ); + } +} diff --git a/src/crates/events/src/lib.rs b/src/crates/events/src/lib.rs index cb983ace6..fe3884ac0 100644 --- a/src/crates/events/src/lib.rs +++ b/src/crates/events/src/lib.rs @@ -9,7 +9,8 @@ pub mod emitter; pub mod types; pub use agentic::{ - AgenticEvent, AgenticEventEnvelope, AgenticEventPriority, SubagentParentInfo, ToolEventData, + AgenticEvent, AgenticEventEnvelope, AgenticEventPriority, DeepReviewQueueReason, + DeepReviewQueueState, DeepReviewQueueStatus, SubagentParentInfo, ToolEventData, }; pub use emitter::EventEmitter; pub use types::*; diff --git a/src/crates/transport/src/adapters/tauri.rs b/src/crates/transport/src/adapters/tauri.rs index 197ebb1d2..c12e4f39d 100644 --- a/src/crates/transport/src/adapters/tauri.rs +++ b/src/crates/transport/src/adapters/tauri.rs @@ -384,6 +384,35 @@ impl TransportAdapter for TauriTransportAdapter { }), )?; } + AgenticEvent::DeepReviewQueueStateChanged { + session_id, + turn_id, + queue_state, + subagent_parent_info, + } => { + self.app_handle.emit( + "agentic://deep-review-queue-state-changed", + json!({ + "sessionId": session_id, + "turnId": turn_id, + "queueState": { + "toolId": queue_state.tool_id, + "subagentType": queue_state.subagent_type, + "status": queue_state.status, + "reason": queue_state.reason, + "queuedReviewerCount": queue_state.queued_reviewer_count, + "activeReviewerCount": queue_state.active_reviewer_count, + "effectiveParallelInstances": queue_state.effective_parallel_instances, + "optionalReviewerCount": queue_state.optional_reviewer_count, + "queueElapsedMs": queue_state.queue_elapsed_ms, + "runElapsedMs": queue_state.run_elapsed_ms, + "maxQueueWaitSeconds": queue_state.max_queue_wait_seconds, + "sessionConcurrencyHigh": queue_state.session_concurrency_high, + }, + "subagentParentInfo": subagent_parent_info, + }), + )?; + } 
AgenticEvent::ModelRoundCompleted { session_id, turn_id, diff --git a/src/crates/transport/src/adapters/websocket.rs b/src/crates/transport/src/adapters/websocket.rs index a18dbcc5c..f0d847424 100644 --- a/src/crates/transport/src/adapters/websocket.rs +++ b/src/crates/transport/src/adapters/websocket.rs @@ -179,6 +179,33 @@ impl TransportAdapter for WebSocketTransportAdapter { "finishReason": finish_reason, }) } + AgenticEvent::DeepReviewQueueStateChanged { + session_id, + turn_id, + queue_state, + subagent_parent_info, + } => { + json!({ + "type": "deep-review-queue-state-changed", + "sessionId": session_id, + "turnId": turn_id, + "queueState": { + "toolId": queue_state.tool_id, + "subagentType": queue_state.subagent_type, + "status": queue_state.status, + "reason": queue_state.reason, + "queuedReviewerCount": queue_state.queued_reviewer_count, + "activeReviewerCount": queue_state.active_reviewer_count, + "effectiveParallelInstances": queue_state.effective_parallel_instances, + "optionalReviewerCount": queue_state.optional_reviewer_count, + "queueElapsedMs": queue_state.queue_elapsed_ms, + "runElapsedMs": queue_state.run_elapsed_ms, + "maxQueueWaitSeconds": queue_state.max_queue_wait_seconds, + "sessionConcurrencyHigh": queue_state.session_concurrency_high, + }, + "subagentParentInfo": subagent_parent_info, + }) + } _ => return Ok(()), }; From 9981e22ec4b99577cd548567cb47b3b39a03f854 Mon Sep 17 00:00:00 2001 From: limit_yan Date: Sat, 9 May 2026 16:31:46 +0800 Subject: [PATCH 2/6] feat(deep-review): add adaptive review orchestration UI --- .../src/app/scenes/agents/AgentsScene.tsx | 5 +- .../app/scenes/agents/agentVisibility.test.ts | 13 + .../src/app/scenes/agents/agentVisibility.ts | 17 +- .../agents/components/AgentTeamCard.scss | 2 + .../agents/components/AgentTeamCard.test.tsx | 61 + .../agents/components/ReviewTeamPage.tsx | 2 + .../components/subagentEditorUtils.test.ts | 36 + .../agents/components/subagentEditorUtils.ts | 9 + 
.../app/scenes/agents/hooks/useAgentsList.ts | 18 +- .../src/flow_chat/components/ChatInput.tsx | 46 +- .../components/DeepReviewConsentDialog.scss | 215 +- .../DeepReviewConsentDialog.test.tsx | 367 +++ .../components/DeepReviewConsentDialog.tsx | 400 ++- .../components/btw/BtwSessionPanel.tsx | 8 + .../btw/DeepReviewActionBar.i18n.test.ts | 25 + .../components/btw/DeepReviewActionBar.scss | 50 + .../btw/DeepReviewActionBar.test.tsx | 118 + .../components/btw/DeepReviewActionBar.tsx | 162 +- .../components/modern/SessionFilesBadge.tsx | 52 +- .../services/AgenticEventListener.ts | 10 + .../flow_chat/services/BtwThreadService.ts | 3 + .../services/DeepReviewService.test.ts | 449 +++- .../flow_chat/services/DeepReviewService.ts | 402 ++- .../src/flow_chat/services/FlowChatManager.ts | 1 + .../flow-chat-manager/EventHandlerModule.ts | 34 + .../flow-chat-manager/MessageModule.ts | 5 + .../flow-chat-manager/TextChunkModule.test.ts | 2 + .../src/flow_chat/store/FlowChatStore.ts | 3 + .../store/deepReviewActionBarStore.test.ts | 91 + .../store/deepReviewActionBarStore.ts | 127 + .../CodeReviewReportExportActions.tsx | 52 +- .../tool-cards/CodeReviewToolCard.scss | 200 ++ .../tool-cards/CodeReviewToolCard.test.tsx | 381 +++ .../tool-cards/CodeReviewToolCard.tsx | 272 +- src/web-ui/src/flow_chat/types/flow-chat.ts | 4 + .../flow_chat/utils/codeReviewReport.test.ts | 479 ++++ .../src/flow_chat/utils/codeReviewReport.ts | 521 +++- .../utils/deepReviewCapacityGuard.test.ts | 86 + .../utils/deepReviewCapacityGuard.ts | 67 + .../utils/deepReviewContinuation.test.ts | 222 ++ .../flow_chat/utils/deepReviewContinuation.ts | 112 +- .../utils/deepReviewQueueStateEvents.test.ts | 73 + .../utils/deepReviewQueueStateEvents.ts | 33 + .../flow_chat/utils/sessionMetadata.test.ts | 21 + .../src/flow_chat/utils/sessionMetadata.ts | 3 + .../api/service-api/ACPClientAPI.ts | 3 + .../api/service-api/AgentAPI.ts | 72 +- .../infrastructure/api/service-api/GitAPI.ts | 37 +- 
src/web-ui/src/locales/en-US/flow-chat.json | 135 +- src/web-ui/src/locales/zh-CN/flow-chat.json | 135 +- src/web-ui/src/locales/zh-TW/flow-chat.json | 135 +- .../services/reviewSubagentCapabilities.ts | 47 + .../services/reviewTargetClassifier.test.ts | 97 + .../shared/services/reviewTargetClassifier.ts | 344 +++ .../reviewTeamLocaleCompleteness.test.ts | 83 + .../shared/services/reviewTeamService.test.ts | 1310 +++++++++- .../src/shared/services/reviewTeamService.ts | 2217 ++++++++++++++++- .../src/shared/types/session-history.ts | 7 + 58 files changed, 9616 insertions(+), 265 deletions(-) create mode 100644 src/web-ui/src/app/scenes/agents/agentVisibility.test.ts create mode 100644 src/web-ui/src/app/scenes/agents/components/AgentTeamCard.test.tsx create mode 100644 src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.test.tsx create mode 100644 src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.test.tsx create mode 100644 src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.test.ts create mode 100644 src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.ts create mode 100644 src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.test.ts create mode 100644 src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.ts create mode 100644 src/web-ui/src/shared/services/reviewSubagentCapabilities.ts create mode 100644 src/web-ui/src/shared/services/reviewTargetClassifier.test.ts create mode 100644 src/web-ui/src/shared/services/reviewTargetClassifier.ts create mode 100644 src/web-ui/src/shared/services/reviewTeamLocaleCompleteness.test.ts diff --git a/src/web-ui/src/app/scenes/agents/AgentsScene.tsx b/src/web-ui/src/app/scenes/agents/AgentsScene.tsx index ee5f03eec..a23a4687f 100644 --- a/src/web-ui/src/app/scenes/agents/AgentsScene.tsx +++ b/src/web-ui/src/app/scenes/agents/AgentsScene.tsx @@ -198,6 +198,7 @@ const AgentsHomeView: React.FC = () => { availableTools, getModeSkills, counts, + hiddenAgentIds, loadAgents, getModeConfig, handleSetTools, 
@@ -262,8 +263,8 @@ const AgentsHomeView: React.FC = () => { const coreAgents = useMemo(() => allAgents.filter((agent) => CORE_AGENT_IDS.has(agent.id)), [allAgents]); const visibleAgents = useMemo( - () => filteredAgents.filter(isAgentInOverviewZone), - [filteredAgents], + () => filteredAgents.filter((agent) => isAgentInOverviewZone(agent, hiddenAgentIds)), + [filteredAgents, hiddenAgentIds], ); const scrollToZone = useCallback((targetId: string) => { diff --git a/src/web-ui/src/app/scenes/agents/agentVisibility.test.ts b/src/web-ui/src/app/scenes/agents/agentVisibility.test.ts new file mode 100644 index 000000000..73577d82a --- /dev/null +++ b/src/web-ui/src/app/scenes/agents/agentVisibility.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; +import { isAgentInOverviewZone } from './agentVisibility'; + +describe('agentVisibility', () => { + it('hides review agents from backend-provided hidden ids', () => { + expect( + isAgentInOverviewZone( + { id: 'ReviewDocs' }, + new Set(['ReviewDocs']), + ), + ).toBe(false); + }); +}); diff --git a/src/web-ui/src/app/scenes/agents/agentVisibility.ts b/src/web-ui/src/app/scenes/agents/agentVisibility.ts index 65da93526..34031091a 100644 --- a/src/web-ui/src/app/scenes/agents/agentVisibility.ts +++ b/src/web-ui/src/app/scenes/agents/agentVisibility.ts @@ -1,6 +1,9 @@ /** Agent IDs hidden from the Agents overview UI (not listed, not counted). */ -export const HIDDEN_AGENT_IDS = new Set([ +export const STATIC_HIDDEN_AGENT_IDS = new Set([ 'Claw', +]); + +export const FALLBACK_REVIEW_HIDDEN_AGENT_IDS = new Set([ 'DeepReview', 'ReviewBusinessLogic', 'ReviewPerformance', @@ -10,10 +13,18 @@ export const HIDDEN_AGENT_IDS = new Set([ 'ReviewJudge', ]); +export const HIDDEN_AGENT_IDS = new Set([ + ...STATIC_HIDDEN_AGENT_IDS, + ...FALLBACK_REVIEW_HIDDEN_AGENT_IDS, +]); + /** Core mode agents shown in the top zone only; excluded from overview zone list and counts. 
*/ export const CORE_AGENT_IDS = new Set(['agentic', 'Cowork', 'ComputerUse']); /** Agents that appear in the bottom overview grid (same pool as filter chip counts). */ -export function isAgentInOverviewZone(agent: { id: string }): boolean { - return !HIDDEN_AGENT_IDS.has(agent.id) && !CORE_AGENT_IDS.has(agent.id); +export function isAgentInOverviewZone( + agent: { id: string }, + hiddenAgentIds: ReadonlySet = HIDDEN_AGENT_IDS, +): boolean { + return !hiddenAgentIds.has(agent.id) && !CORE_AGENT_IDS.has(agent.id); } diff --git a/src/web-ui/src/app/scenes/agents/components/AgentTeamCard.scss b/src/web-ui/src/app/scenes/agents/components/AgentTeamCard.scss index 291c6acb6..7a571c044 100644 --- a/src/web-ui/src/app/scenes/agents/components/AgentTeamCard.scss +++ b/src/web-ui/src/app/scenes/agents/components/AgentTeamCard.scss @@ -116,6 +116,8 @@ align-items: center; gap: 4px; flex-wrap: wrap; + min-width: 0; + max-width: 100%; position: relative; z-index: 1; } diff --git a/src/web-ui/src/app/scenes/agents/components/AgentTeamCard.test.tsx b/src/web-ui/src/app/scenes/agents/components/AgentTeamCard.test.tsx new file mode 100644 index 000000000..46409d5c8 --- /dev/null +++ b/src/web-ui/src/app/scenes/agents/components/AgentTeamCard.test.tsx @@ -0,0 +1,61 @@ +import React from 'react'; +import { readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import { renderToStaticMarkup } from 'react-dom/server'; +import { describe, expect, it } from 'vitest'; +import AgentTeamCard from './AgentTeamCard'; + +function readAgentTeamCardStylesheet(): string { + const stylesheet = readFileSync( + fileURLToPath(new URL('./AgentTeamCard.scss', import.meta.url)), + 'utf8', + ); + return stylesheet.replace(/\r\n/g, '\n'); +} + +function extractBlock(stylesheet: string, selector: string): string { + const escapedSelector = selector.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + const match = stylesheet.match(new RegExp(`${escapedSelector}\\s*\\{(?[\\s\\S]*?)\\n\\s*\\}`)); + 
return match?.groups?.body ?? ''; +} + +describe('AgentTeamCard', () => { + it('keeps role summary compact when the review team grows', () => { + const markup = renderToStaticMarkup( + undefined} + />, + ); + + const chipMatches = markup.match(/agent-team-card__tag-chip/g) ?? []; + expect(chipMatches).toHaveLength(3); + expect(markup).toContain('Business logic'); + expect(markup).toContain('Performance'); + expect(markup).toContain('Security'); + expect(markup).not.toContain('Architecture'); + expect(markup).not.toContain('Frontend'); + expect(markup).not.toContain('Judge'); + }); + + it('keeps role summary tags shrinkable and wrapping instead of clipping chips', () => { + const stylesheet = readAgentTeamCardStylesheet(); + const tagsBlock = extractBlock(stylesheet, '&__tags'); + const tagChipBlock = extractBlock(stylesheet, '&__tag-chip'); + + expect(tagsBlock).toContain('flex-wrap: wrap;'); + expect(tagsBlock).toContain('min-width: 0;'); + expect(tagsBlock).toContain('max-width: 100%;'); + expect(tagChipBlock).toContain('white-space: nowrap;'); + }); +}); diff --git a/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx b/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx index cc14943a3..089bf3971 100644 --- a/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx +++ b/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx @@ -33,6 +33,7 @@ import { useAgentsStore } from '../agentsStore'; import { DEFAULT_REVIEW_TEAM_EXECUTION_POLICY, DEFAULT_REVIEW_TEAM_MODEL, + FALLBACK_REVIEW_TEAM_DEFINITION, loadDefaultReviewTeam, REVIEW_STRATEGY_DEFINITIONS, type ReviewStrategyLevel, @@ -166,6 +167,7 @@ const ReviewTeamPage: React.FC = () => { strategyLevel: 'normal', memberStrategyOverrides: {}, executionPolicy: { ...DEFAULT_REVIEW_TEAM_EXECUTION_POLICY }, + definition: FALLBACK_REVIEW_TEAM_DEFINITION, members: [], coreMembers: [], extraMembers: [], diff --git a/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.test.ts 
b/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.test.ts index e606f0792..29307c7e9 100644 --- a/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.test.ts +++ b/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.test.ts @@ -1,13 +1,17 @@ import { describe, expect, it } from 'vitest'; import { + evaluateReviewSubagentToolReadiness, filterToolsForReviewMode, normalizeReviewModeState, type SubagentEditorToolInfo, } from './subagentEditorUtils'; const tools: SubagentEditorToolInfo[] = [ + { name: 'GetFileDiff', isReadonly: true }, { name: 'Read', isReadonly: true }, { name: 'Grep', isReadonly: true }, + { name: 'Glob', isReadonly: true }, + { name: 'LS', isReadonly: true }, { name: 'Write', isReadonly: false }, { name: 'Bash', isReadonly: false }, ]; @@ -15,12 +19,18 @@ const tools: SubagentEditorToolInfo[] = [ describe('subagentEditorUtils', () => { it('shows only readonly tools for review subagents', () => { expect(filterToolsForReviewMode(tools, true).map((tool) => tool.name)).toEqual([ + 'GetFileDiff', 'Read', 'Grep', + 'Glob', + 'LS', ]); expect(filterToolsForReviewMode(tools, false).map((tool) => tool.name)).toEqual([ + 'GetFileDiff', 'Read', 'Grep', + 'Glob', + 'LS', 'Write', 'Bash', ]); @@ -38,4 +48,30 @@ describe('subagentEditorUtils', () => { expect(Array.from(next.selectedTools)).toEqual(['Read']); expect(next.removedToolNames).toEqual(['Write', 'Bash']); }); + + it('marks review subagent tooling invalid when the minimum diff or read tool is missing', () => { + expect(evaluateReviewSubagentToolReadiness(new Set(['Read']))).toMatchObject({ + readiness: 'invalid', + missingRequiredTools: ['GetFileDiff'], + }); + }); + + it('marks review subagent tooling degraded when only the minimum tools are present', () => { + expect(evaluateReviewSubagentToolReadiness(new Set(['GetFileDiff', 'Read']))).toMatchObject({ + readiness: 'degraded', + missingRecommendedTools: ['Grep', 'Glob', 'LS'], + }); + }); + + it('marks review 
subagent tooling ready when the standard review tools are present', () => { + expect( + evaluateReviewSubagentToolReadiness( + new Set(['GetFileDiff', 'Read', 'Grep', 'Glob', 'LS']), + ), + ).toMatchObject({ + readiness: 'ready', + missingRequiredTools: [], + missingRecommendedTools: [], + }); + }); }); diff --git a/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.ts b/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.ts index 806528f86..07c906d69 100644 --- a/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.ts +++ b/src/web-ui/src/app/scenes/agents/components/subagentEditorUtils.ts @@ -3,6 +3,15 @@ export interface SubagentEditorToolInfo { isReadonly: boolean; } +export { + REVIEW_SUBAGENT_OPTIONAL_TOOLS, + REVIEW_SUBAGENT_RECOMMENDED_TOOLS, + REVIEW_SUBAGENT_REQUIRED_TOOLS, + evaluateReviewSubagentToolReadiness, + type ReviewSubagentToolReadiness, + type ReviewSubagentToolReadinessResult, +} from '@/shared/services/reviewSubagentCapabilities'; + export function filterToolsForReviewMode( tools: SubagentEditorToolInfo[], review: boolean, diff --git a/src/web-ui/src/app/scenes/agents/hooks/useAgentsList.ts b/src/web-ui/src/app/scenes/agents/hooks/useAgentsList.ts index 2f77e7d72..c172540a2 100644 --- a/src/web-ui/src/app/scenes/agents/hooks/useAgentsList.ts +++ b/src/web-ui/src/app/scenes/agents/hooks/useAgentsList.ts @@ -7,8 +7,9 @@ import type { ModeConfigItem, ModeSkillInfo } from '@/infrastructure/config/type import { useNotification } from '@/shared/notification-system'; import type { AgentWithCapabilities } from '../agentsStore'; import { enrichCapabilities } from '../utils'; -import { isAgentInOverviewZone } from '../agentVisibility'; +import { STATIC_HIDDEN_AGENT_IDS, isAgentInOverviewZone } from '../agentVisibility'; import { useCurrentWorkspace } from '@/infrastructure/contexts/WorkspaceContext'; +import { loadDefaultReviewTeamDefinition } from '@/shared/services/reviewTeamService'; export type FilterLevel = 
'all' | 'builtin' | 'user' | 'project'; export type FilterType = 'all' | 'mode' | 'subagent'; @@ -39,6 +40,9 @@ export function useAgentsList({ const [availableTools, setAvailableTools] = useState([]); const [modeSkills, setModeSkills] = useState>({}); const [modeConfigs, setModeConfigs] = useState>({}); + const [hiddenAgentIds, setHiddenAgentIds] = useState>( + () => new Set(STATIC_HIDDEN_AGENT_IDS), + ); const loadRequestIdRef = useRef(0); const loadAgents = useCallback(async () => { @@ -55,11 +59,12 @@ export function useAgentsList({ }; try { - const [modes, subagents, tools, configs] = await Promise.all([ + const [modes, subagents, tools, configs, reviewTeamDefinition] = await Promise.all([ agentAPI.getAvailableModes().catch(() => []), SubagentAPI.listSubagents({ workspacePath: workspacePath || undefined }).catch(() => []), fetchTools(), configAPI.getModeConfigs().catch(() => ({})), + loadDefaultReviewTeamDefinition().catch(() => undefined), ]); const skillEntries = await Promise.all( modes.map(async (mode) => [ @@ -101,6 +106,10 @@ export function useAgentsList({ setAvailableTools(tools); setModeSkills(Object.fromEntries(skillEntries)); setModeConfigs(configs as Record); + setHiddenAgentIds(new Set([ + ...STATIC_HIDDEN_AGENT_IDS, + ...(reviewTeamDefinition?.hiddenAgentIds ?? 
[]), + ])); } finally { if (requestId === loadRequestIdRef.current) { setLoading(false); @@ -233,8 +242,8 @@ export function useAgentsList({ }), [allAgents, filterLevel, filterType, searchQuery]); const overviewAgents = useMemo( - () => allAgents.filter(isAgentInOverviewZone), - [allAgents], + () => allAgents.filter((agent) => isAgentInOverviewZone(agent, hiddenAgentIds)), + [allAgents, hiddenAgentIds], ); const counts = useMemo(() => ({ @@ -253,6 +262,7 @@ export function useAgentsList({ availableTools, getModeSkills, counts, + hiddenAgentIds, loadAgents, getModeConfig, handleSetTools, diff --git a/src/web-ui/src/flow_chat/components/ChatInput.tsx b/src/web-ui/src/flow_chat/components/ChatInput.tsx index 9bb370bdc..4c9e9e0f5 100644 --- a/src/web-ui/src/flow_chat/components/ChatInput.tsx +++ b/src/web-ui/src/flow_chat/components/ChatInput.tsx @@ -36,8 +36,9 @@ import { startBtwThread } from '../services/BtwThreadService'; import { FlowChatManager } from '@/flow_chat'; import { DEEP_REVIEW_SLASH_COMMAND, - buildDeepReviewPromptFromSlashCommand, getDeepReviewLaunchErrorMessage, + buildDeepReviewLaunchFromSlashCommand, + buildDeepReviewPreviewFromSlashCommand, isDeepReviewSlashCommand, launchDeepReviewSession, } from '../services/DeepReviewService'; @@ -61,6 +62,7 @@ import { useDeepReviewConsent } from './DeepReviewConsentDialog'; import { useAgentCompanionActivity } from '../hooks/useAgentCompanionActivity'; import { useSessionReviewActivity } from '../hooks/useSessionReviewActivity'; import { shouldBlockDeepReviewCommand } from '../utils/deepReviewCommandGuard'; +import { deriveDeepReviewSessionConcurrencyGuard } from '../utils/deepReviewCapacityGuard'; import './ChatInput.scss'; const log = createLogger('ChatInput'); @@ -1449,24 +1451,34 @@ export const ChatInput: React.FC = ({ return; } - const confirmed = await confirmDeepReviewLaunch(); - if (!confirmed) { - return; - } - const originalPendingLargePastes = { ...pendingLargePastesRef.current }; - if 
(effectiveTargetSessionId) { - addToHistory(effectiveTargetSessionId, message); - } - setHistoryIndex(-1); - setSavedDraft(''); - dispatchInput({ type: 'CLEAR_VALUE' }); - clearPendingLargePastes(); - setQueuedInput(null); - setSlashCommandState({ isActive: false, kind: 'modes', query: '', selectedIndex: 0 }); try { - const prompt = await buildDeepReviewPromptFromSlashCommand( + const preview = await buildDeepReviewPreviewFromSlashCommand( + message, + effectiveTargetSession.workspacePath, + ); + const confirmed = await confirmDeepReviewLaunch(preview, { + sessionConcurrencyGuard: deriveDeepReviewSessionConcurrencyGuard( + flowChatState, + effectiveTargetSessionId, + ), + }); + if (!confirmed) { + return; + } + + if (effectiveTargetSessionId) { + addToHistory(effectiveTargetSessionId, message); + } + setHistoryIndex(-1); + setSavedDraft(''); + dispatchInput({ type: 'CLEAR_VALUE' }); + clearPendingLargePastes(); + setQueuedInput(null); + setSlashCommandState({ isActive: false, kind: 'modes', query: '', selectedIndex: 0 }); + + const { prompt, runManifest } = await buildDeepReviewLaunchFromSlashCommand( message, effectiveTargetSession.workspacePath, ); @@ -1476,6 +1488,7 @@ export const ChatInput: React.FC = ({ workspacePath: effectiveTargetSession.workspacePath, prompt, displayMessage: message, + runManifest, childSessionName: t('chatInput.deepreviewThreadTitle', { defaultValue: 'Deep review', }), @@ -1504,6 +1517,7 @@ export const ChatInput: React.FC = ({ currentReviewActivity, effectiveTargetSession, effectiveTargetSessionId, + flowChatState, inputState.value, isBtwSession, setQueuedInput, diff --git a/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.scss b/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.scss index ca709bd5d..c38a98d97 100644 --- a/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.scss +++ b/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.scss @@ -35,6 +35,12 @@ border: 1px solid color-mix(in srgb, 
var(--deep-review-accent) 22%, transparent); } +.deep-review-consent__fact-icon--warning { + color: color-mix(in srgb, var(--color-warning, #f59e0b) 82%, var(--color-text-primary)); + background: color-mix(in srgb, var(--color-warning, #f59e0b) 12%, transparent); + border-color: color-mix(in srgb, var(--color-warning, #f59e0b) 24%, transparent); +} + .deep-review-consent__heading { min-width: 0; @@ -43,7 +49,7 @@ color: var(--color-text-primary); font-size: 21px; font-weight: 680; - letter-spacing: -0.025em; + letter-spacing: 0; line-height: 1.22; } } @@ -52,7 +58,7 @@ color: color-mix(in srgb, var(--deep-review-accent) 58%, var(--color-text-muted)); font-size: 11px; font-weight: 720; - letter-spacing: 0.08em; + letter-spacing: 0; text-transform: uppercase; } @@ -92,8 +98,7 @@ line-height: 1.7; } -.deep-review-consent__safety-note, -.deep-review-consent__fact { +.deep-review-consent__safety-note { display: grid; grid-template-columns: auto minmax(0, 1fr); align-items: flex-start; @@ -118,16 +123,189 @@ color-mix(in srgb, var(--color-bg-elevated) 86%, transparent); } -.deep-review-consent__facts { +.deep-review-consent__capacity-note { display: grid; - grid-template-columns: repeat(2, minmax(0, 1fr)); + grid-template-columns: auto minmax(0, 1fr); + align-items: flex-start; gap: 12px; + padding: 12px 14px; + border: 1px solid color-mix(in srgb, var(--color-warning, #f59e0b) 28%, var(--border-subtle)); + border-radius: 8px; + background: color-mix(in srgb, var(--color-warning, #f59e0b) 8%, var(--color-bg-elevated)); + + p { + margin: 4px 0 0; + color: var(--color-text-secondary); + font-size: 12px; + line-height: 1.58; + } } -.deep-review-consent__fact { - min-height: 118px; +.deep-review-consent__summary { + display: flex; + flex-direction: column; + gap: 12px; padding: 14px; + border: 1px solid var(--border-subtle); + border-radius: 8px; + background: color-mix(in srgb, var(--color-bg-elevated) 88%, transparent); +} + +.deep-review-consent__summary-header { + 
display: grid; + grid-template-columns: minmax(0, 1fr) auto; + gap: 12px; + align-items: flex-start; + + p { + margin: 4px 0 0; + color: var(--color-text-secondary); + font-size: 12px; + line-height: 1.55; + } +} + +.deep-review-consent__summary-stats { + display: flex; + flex-wrap: wrap; + gap: 8px; + + span { + display: inline-flex; + align-items: center; + min-height: 24px; + padding: 2px 8px; + border: 1px solid color-mix(in srgb, var(--border-subtle) 88%, var(--deep-review-accent)); + border-radius: 999px; + color: var(--color-text-secondary); + background: color-mix(in srgb, var(--deep-review-accent) 7%, transparent); + font-size: 11px; + font-weight: 650; + line-height: 1.3; + } +} + +.deep-review-consent__summary-stats .deep-review-consent__summary-stat--warning { + border-color: color-mix(in srgb, var(--color-warning, #f59e0b) 34%, var(--border-subtle)); + background: color-mix(in srgb, var(--color-warning, #f59e0b) 9%, transparent); +} + +.deep-review-consent__reviewer-group { + display: flex; + flex-direction: column; + gap: 8px; +} + +.deep-review-consent__strategy-control { + display: grid; + grid-template-columns: minmax(0, 1fr) auto; + gap: 12px; + align-items: center; + padding: 10px; + border: 1px solid var(--border-subtle); border-radius: 8px; + background: color-mix(in srgb, var(--color-bg-primary) 78%, transparent); + + p { + margin: 4px 0 0; + color: var(--color-text-secondary); + font-size: 12px; + line-height: 1.45; + } +} + +.deep-review-consent__strategy-options { + display: inline-flex; + flex-wrap: wrap; + justify-content: flex-end; + gap: 6px; +} + +.deep-review-consent__strategy-option { + min-height: 28px; + padding: 4px 9px; + border: 1px solid var(--border-subtle); + border-radius: 6px; + background: color-mix(in srgb, var(--color-bg-elevated) 90%, transparent); + color: var(--color-text-secondary); + cursor: pointer; + font-size: 12px; + font-weight: 650; + line-height: 1.25; + transition: + background 160ms ease, + border-color 
160ms ease, + color 160ms ease; + + &:hover { + border-color: color-mix(in srgb, var(--deep-review-accent) 34%, var(--border-subtle)); + background: color-mix(in srgb, var(--deep-review-accent) 9%, var(--color-bg-elevated)); + color: var(--color-text-primary); + } +} + +.deep-review-consent__strategy-option--active { + border-color: color-mix(in srgb, var(--deep-review-accent) 58%, var(--border-base)); + background: color-mix(in srgb, var(--deep-review-accent) 15%, var(--color-bg-elevated)); + color: var(--color-text-primary); +} + +.deep-review-consent__reviewer-group-title { + display: inline-flex; + align-items: center; + gap: 6px; + color: var(--color-text-secondary); + font-size: 11px; + font-weight: 720; + text-transform: uppercase; +} + +.deep-review-consent__reviewer-group-title--warning { + color: color-mix(in srgb, var(--color-warning, #f59e0b) 78%, var(--color-text-primary)); +} + +.deep-review-consent__skipped-list { + display: flex; + flex-direction: column; + gap: 6px; + margin: 0; + padding: 0; + list-style: none; + + li { + display: grid; + grid-template-columns: minmax(0, 1fr) auto; + gap: 10px; + align-items: center; + min-height: 28px; + padding: 6px 8px; + border: 1px solid color-mix(in srgb, var(--color-warning, #f59e0b) 20%, var(--border-subtle)); + border-radius: 6px; + background: color-mix(in srgb, var(--color-warning, #f59e0b) 6%, transparent); + color: var(--color-text-secondary); + font-size: 12px; + line-height: 1.35; + + span { + min-width: 0; + color: var(--color-text-primary); + overflow-wrap: anywhere; + } + + strong { + color: color-mix(in srgb, var(--color-warning, #f59e0b) 72%, var(--color-text-primary)); + font-size: 11px; + font-weight: 720; + text-align: right; + white-space: normal; + } + } +} + +li.deep-review-consent__skipped-more { + grid-template-columns: minmax(0, 1fr); + background: color-mix(in srgb, var(--color-bg-primary) 82%, transparent); + color: var(--color-text-muted); } .deep-review-consent__fact-icon { @@ 
-143,14 +321,6 @@ line-height: 1.35; } -.deep-review-consent__token-estimate { - margin: 6px 0 0; - color: var(--color-text-muted); - font-size: 11px; - font-weight: 500; - font-variant-numeric: tabular-nums; -} - .deep-review-consent__footer { display: flex; align-items: center; @@ -191,11 +361,22 @@ grid-template-columns: minmax(0, 1fr) auto; } - .deep-review-consent__facts, .deep-review-consent__footer { grid-template-columns: 1fr; } + .deep-review-consent__skipped-list li { + grid-template-columns: 1fr; + } + + .deep-review-consent__strategy-control { + grid-template-columns: 1fr; + } + + .deep-review-consent__strategy-options { + justify-content: flex-start; + } + .deep-review-consent__footer { flex-direction: column; align-items: stretch; diff --git a/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.test.tsx b/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.test.tsx new file mode 100644 index 000000000..a84ecbed5 --- /dev/null +++ b/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.test.tsx @@ -0,0 +1,367 @@ +import React from 'react'; +import { act } from 'react'; +import { createRoot, type Root } from 'react-dom/client'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { useDeepReviewConsent } from './DeepReviewConsentDialog'; +import type { ReviewTeamRunManifest } from '@/shared/services/reviewTeamService'; + +const mockSaveReviewTeamProjectStrategyOverride = vi.hoisted(() => vi.fn()); + +vi.mock('react-i18next', () => ({ + useTranslation: () => ({ + t: (_key: string, options?: Record) => { + const value = typeof options?.defaultValue === 'string' ? options.defaultValue : _key; + return value.replace(/\{\{(\w+)\}\}/g, (_match, key) => String(options?.[key] ?? 
'')); + }, + }), +})); + +vi.mock('@/component-library', () => ({ + Button: ({ + children, + onClick, + }: { + children: React.ReactNode; + onClick?: () => void; + }) => , + Checkbox: ({ + checked, + label, + onChange, + }: { + checked: boolean; + label: string; + onChange: (event: React.ChangeEvent) => void; + }) => ( + + ), + Modal: ({ + children, + isOpen, + }: { + children: React.ReactNode; + isOpen: boolean; + }) => (isOpen ?
{children}
: null), +})); + +vi.mock('@/shared/services/reviewTeamService', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + saveReviewTeamProjectStrategyOverride: ( + ...args: Parameters + ) => mockSaveReviewTeamProjectStrategyOverride(...args), + }; +}); + +let JSDOMCtor: (new ( + html?: string, + options?: { pretendToBeVisual?: boolean; url?: string } +) => { window: Window & typeof globalThis }) | null = null; + +try { + const jsdom = await import('jsdom'); + JSDOMCtor = jsdom.JSDOM as typeof JSDOMCtor; +} catch { + JSDOMCtor = null; +} + +const describeWithJsdom = JSDOMCtor ? describe : describe.skip; + +function Harness({ + preview, + launchContext, + onResult, +}: { + preview?: ReviewTeamRunManifest; + launchContext?: unknown; + onResult: (confirmed: boolean) => void; +}) { + const { confirmDeepReviewLaunch, deepReviewConsentDialog } = useDeepReviewConsent(); + + return ( + <> + + {deepReviewConsentDialog} + + ); +} + +function buildPreview(): ReviewTeamRunManifest { + return { + reviewMode: 'deep', + workspacePath: '/test-fixtures/project-a', + policySource: 'default-review-team-config', + target: { + source: 'session_files', + resolution: 'resolved', + tags: ['backend_core'], + files: ['src/crates/core/src/service/config/types.rs'], + warnings: [], + }, + strategyLevel: 'normal', + strategyRecommendation: { + strategyLevel: 'deep', + score: 24, + rationale: 'Large/high-risk change (8 files, 900 lines; 2 security-sensitive files, 3 workspace areas). 
Deep review recommended.', + factors: { + fileCount: 8, + totalLinesChanged: 900, + lineCountSource: 'diff_stat', + securityFileCount: 2, + workspaceAreaCount: 3, + contractSurfaceChanged: true, + }, + }, + executionPolicy: { + reviewerTimeoutSeconds: 300, + judgeTimeoutSeconds: 240, + reviewerFileSplitThreshold: 20, + maxSameRoleInstances: 3, + }, + tokenBudget: { + mode: 'balanced', + estimatedReviewerCalls: 3, + maxReviewerCalls: 4, + maxExtraReviewers: 1, + largeDiffSummaryFirst: false, + skippedReviewerIds: [], + warnings: [], + }, + coreReviewers: [ + { + subagentId: 'ReviewBusinessLogic', + displayName: 'Logic reviewer', + roleName: 'Business Logic Reviewer', + model: 'fast', + configuredModel: 'fast', + defaultModelSlot: 'fast', + strategyLevel: 'normal', + strategySource: 'team', + strategyDirective: 'Review logic.', + locked: true, + source: 'core', + subagentSource: 'builtin', + }, + ], + qualityGateReviewer: { + subagentId: 'ReviewJudge', + displayName: 'Quality inspector', + roleName: 'Review Quality Inspector', + model: 'fast', + configuredModel: 'fast', + defaultModelSlot: 'fast', + strategyLevel: 'normal', + strategySource: 'team', + strategyDirective: 'Check report quality.', + locked: true, + source: 'core', + subagentSource: 'builtin', + }, + enabledExtraReviewers: [ + { + subagentId: 'CustomSecurity', + displayName: 'Custom security reviewer', + roleName: 'Additional Specialist Reviewer', + model: 'fast', + configuredModel: 'fast', + defaultModelSlot: 'fast', + strategyLevel: 'normal', + strategySource: 'team', + strategyDirective: 'Review security.', + locked: false, + source: 'extra', + subagentSource: 'user', + }, + ], + skippedReviewers: [ + { + subagentId: 'ReviewFrontend', + displayName: 'Frontend reviewer', + roleName: 'Frontend Reviewer', + model: 'fast', + configuredModel: 'fast', + defaultModelSlot: 'fast', + strategyLevel: 'normal', + strategySource: 'team', + strategyDirective: 'Review frontend.', + locked: true, + source: 'core', + 
subagentSource: 'builtin', + reason: 'not_applicable', + }, + { + subagentId: 'CustomInvalid', + displayName: 'Custom invalid reviewer', + roleName: 'Additional Specialist Reviewer', + model: 'fast', + configuredModel: 'fast', + defaultModelSlot: 'fast', + strategyLevel: 'normal', + strategySource: 'team', + strategyDirective: 'Review custom rules.', + locked: false, + source: 'extra', + subagentSource: 'user', + reason: 'invalid_tooling', + }, + ], + }; +} + +function buildPreviewWithoutSkippedReviewers(): ReviewTeamRunManifest { + return { + ...buildPreview(), + skippedReviewers: [], + }; +} + +describeWithJsdom('DeepReviewConsentDialog', () => { + let dom: { window: Window & typeof globalThis }; + let container: HTMLDivElement; + let root: Root; + + beforeEach(() => { + mockSaveReviewTeamProjectStrategyOverride.mockResolvedValue(undefined); + dom = new JSDOMCtor!('', { + pretendToBeVisual: true, + url: 'http://localhost', + }); + + const { window } = dom; + vi.stubGlobal('window', window); + vi.stubGlobal('document', window.document); + vi.stubGlobal('navigator', window.navigator); + vi.stubGlobal('HTMLElement', window.HTMLElement); + vi.stubGlobal('Event', window.Event); + vi.stubGlobal('localStorage', window.localStorage); + vi.stubGlobal('IS_REACT_ACT_ENVIRONMENT', true); + + container = document.createElement('div'); + document.body.appendChild(container); + root = createRoot(container); + }); + + afterEach(() => { + act(() => { + root.unmount(); + }); + container.remove(); + dom.window.close(); + vi.unstubAllGlobals(); + }); + + it('shows a compact launch summary with skipped reviewers only when needed', async () => { + const result = vi.fn(); + + await act(async () => { + root.render(); + }); + await act(async () => { + container.querySelector('button')?.dispatchEvent(new window.Event('click', { bubbles: true })); + }); + + expect(container.textContent).toContain('Launch summary'); + expect(container.textContent).toContain('1 file'); + 
expect(container.textContent).toContain('Risk areas: Backend core'); + expect(container.textContent).toContain('3 reviewer calls'); + expect(container.textContent).toContain('1 optional reviewer'); + expect(container.textContent).toContain('2 skipped'); + expect(container.textContent).toContain('Run strategy: Normal'); + expect(container.textContent).toContain('Frontend reviewer'); + expect(container.textContent).toContain('Not applicable to this target'); + expect(container.textContent).toContain('Custom invalid reviewer'); + expect(container.textContent).toContain('Configuration issue'); + expect(container.textContent).not.toContain('Logic reviewer'); + expect(container.textContent).not.toContain('Custom security reviewer'); + }); + + it('still opens when skip preference is set but reviewers are skipped', async () => { + localStorage.setItem('bitfun.deepReview.skipCostConfirmation', 'true'); + const result = vi.fn(); + + await act(async () => { + root.render(); + }); + await act(async () => { + container.querySelector('button')?.dispatchEvent(new window.Event('click', { bubbles: true })); + }); + + expect(container.querySelector('[role="dialog"]')).not.toBeNull(); + expect(result).not.toHaveBeenCalled(); + }); + + it('still opens when skip preference is set but the active session is busy', async () => { + localStorage.setItem('bitfun.deepReview.skipCostConfirmation', 'true'); + const result = vi.fn(); + + await act(async () => { + root.render( + , + ); + }); + await act(async () => { + container.querySelector('button')?.dispatchEvent(new window.Event('click', { bubbles: true })); + }); + + expect(container.querySelector('[role="dialog"]')).not.toBeNull(); + expect(container.textContent).toContain('Active session is busy'); + expect(container.textContent).toContain('2 running subagent tasks'); + expect(result).not.toHaveBeenCalled(); + }); + + it('persists a selected project strategy override before confirming', async () => { + const result = vi.fn(); + + await 
act(async () => { + root.render(); + }); + await act(async () => { + container.querySelector('button')?.dispatchEvent(new window.Event('click', { bubbles: true })); + }); + + const deepStrategyButton = Array.from(container.querySelectorAll('button')) + .find((button) => button.textContent === 'Deep'); + expect(deepStrategyButton).not.toBeUndefined(); + + await act(async () => { + deepStrategyButton?.dispatchEvent(new window.Event('click', { bubbles: true })); + }); + await act(async () => { + Array.from(container.querySelectorAll('button')) + .find((button) => button.textContent === 'Start Deep Review') + ?.dispatchEvent(new window.Event('click', { bubbles: true })); + }); + + expect(mockSaveReviewTeamProjectStrategyOverride).toHaveBeenCalledWith( + '/test-fixtures/project-a', + 'deep', + ); + expect(result).toHaveBeenCalledWith(true); + }); +}); diff --git a/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.tsx b/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.tsx index c743b2706..e44f31d10 100644 --- a/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.tsx +++ b/src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.tsx @@ -1,31 +1,111 @@ import React, { useCallback, useState } from 'react'; -import { Clock, Coins, ShieldCheck, X } from 'lucide-react'; -import { estimateTokenConsumption, formatTokenCount } from '../utils/deepReviewExperience'; +import { AlertTriangle, ShieldCheck, X } from 'lucide-react'; import { useTranslation } from 'react-i18next'; import { Button, Checkbox, Modal } from '@/component-library'; import { createLogger } from '@/shared/utils/logger'; +import type { + ReviewStrategyLevel, + ReviewTeamManifestMember, + ReviewTeamManifestMemberReason, + ReviewTeamRunManifest, +} from '@/shared/services/reviewTeamService'; +import { + REVIEW_STRATEGY_LEVELS, + getReviewStrategyProfile, + saveReviewTeamProjectStrategyOverride, +} from '@/shared/services/reviewTeamService'; +import type { 
DeepReviewSessionConcurrencyGuard } from '../utils/deepReviewCapacityGuard'; import './DeepReviewConsentDialog.scss'; const log = createLogger('DeepReviewConsentDialog'); const SKIP_DEEP_REVIEW_CONFIRMATION_STORAGE_KEY = 'bitfun.deepReview.skipCostConfirmation'; +const MAX_VISIBLE_SKIPPED_REVIEWERS = 3; +const MAX_VISIBLE_TARGET_TAGS = 3; + +const TARGET_TAG_LABELS: Record = { + frontend_ui: { key: 'frontendUi', defaultValue: 'Frontend UI' }, + frontend_style: { key: 'frontendStyle', defaultValue: 'Frontend styles' }, + frontend_i18n: { key: 'frontendI18n', defaultValue: 'Frontend i18n' }, + frontend_contract: { key: 'frontendContract', defaultValue: 'Frontend contract' }, + desktop_contract: { key: 'desktopContract', defaultValue: 'Desktop contract' }, + web_server_contract: { key: 'webServerContract', defaultValue: 'Web server contract' }, + backend_core: { key: 'backendCore', defaultValue: 'Backend core' }, + transport: { key: 'transport', defaultValue: 'Transport' }, + api_layer: { key: 'apiLayer', defaultValue: 'API layer' }, + ai_adapter: { key: 'aiAdapter', defaultValue: 'AI adapter' }, + installer_ui: { key: 'installerUi', defaultValue: 'Installer UI' }, + test: { key: 'test', defaultValue: 'Tests' }, + docs: { key: 'docs', defaultValue: 'Docs' }, + config: { key: 'config', defaultValue: 'Config' }, + generated_or_lock: { key: 'generatedOrLock', defaultValue: 'Generated or lockfile' }, + unknown: { key: 'unknown', defaultValue: 'Unknown area' }, +}; interface PendingConsent { resolve: (confirmed: boolean) => void; + preview?: ReviewTeamRunManifest; + launchContext?: DeepReviewConsentLaunchContext; +} + +export interface DeepReviewConsentLaunchContext { + sessionConcurrencyGuard?: DeepReviewSessionConcurrencyGuard | null; } export interface DeepReviewConsentControls { - confirmDeepReviewLaunch: () => Promise; + confirmDeepReviewLaunch: ( + preview?: ReviewTeamRunManifest, + launchContext?: DeepReviewConsentLaunchContext, + ) => Promise; 
deepReviewConsentDialog: React.ReactNode; } +function hasSkippedReviewers(preview?: ReviewTeamRunManifest): boolean { + return Boolean(preview?.skippedReviewers?.length); +} + +function hasSessionConcurrencyWarning(launchContext?: DeepReviewConsentLaunchContext): boolean { + return Boolean(launchContext?.sessionConcurrencyGuard?.highActivity); +} + +function getReviewerLabel(member: ReviewTeamManifestMember): string { + return member.displayName || member.subagentId; +} + +function getReviewTargetFileCount(preview: ReviewTeamRunManifest): number { + return preview.target.files.filter((file) => { + if (typeof file === 'string') { + return true; + } + return !file.excluded; + }).length; +} + +function getFallbackTargetTagLabel(tag: string): string { + return tag + .split('_') + .filter(Boolean) + .map((part) => part.charAt(0).toUpperCase() + part.slice(1)) + .join(' '); +} + export function useDeepReviewConsent(): DeepReviewConsentControls { const { t } = useTranslation('flow-chat'); const [pendingConsent, setPendingConsent] = useState(null); const [dontShowAgain, setDontShowAgain] = useState(false); + const [selectedStrategyOverride, setSelectedStrategyOverride] = + useState(null); + const [strategySelectionTouched, setStrategySelectionTouched] = useState(false); - const confirmDeepReviewLaunch = useCallback(async () => { + const confirmDeepReviewLaunch = useCallback(async ( + preview?: ReviewTeamRunManifest, + launchContext?: DeepReviewConsentLaunchContext, + ) => { try { - if (localStorage.getItem(SKIP_DEEP_REVIEW_CONFIRMATION_STORAGE_KEY) === 'true') { + if ( + localStorage.getItem(SKIP_DEEP_REVIEW_CONFIRMATION_STORAGE_KEY) === 'true' && + !hasSkippedReviewers(preview) && + !hasSessionConcurrencyWarning(launchContext) + ) { return true; } } catch (error) { @@ -34,7 +114,9 @@ export function useDeepReviewConsent(): DeepReviewConsentControls { return new Promise((resolve) => { setDontShowAgain(false); - setPendingConsent({ resolve }); + 
setSelectedStrategyOverride(null); + setStrategySelectionTouched(false); + setPendingConsent({ resolve, preview, launchContext }); }); }, []); @@ -44,6 +126,21 @@ export function useDeepReviewConsent(): DeepReviewConsentControls { return; } + if ( + confirmed && + strategySelectionTouched && + pending.preview?.workspacePath + ) { + try { + await saveReviewTeamProjectStrategyOverride( + pending.preview.workspacePath, + selectedStrategyOverride ?? undefined, + ); + } catch (error) { + log.warn('Failed to persist Deep Review project strategy override', error); + } + } + if (confirmed && dontShowAgain) { try { localStorage.setItem(SKIP_DEEP_REVIEW_CONFIRMATION_STORAGE_KEY, 'true'); @@ -54,7 +151,229 @@ export function useDeepReviewConsent(): DeepReviewConsentControls { setPendingConsent(null); pending.resolve(confirmed); - }, [dontShowAgain, pendingConsent]); + }, [dontShowAgain, pendingConsent, selectedStrategyOverride, strategySelectionTouched]); + + const selectStrategyOverride = useCallback((strategyLevel: ReviewStrategyLevel | null) => { + setSelectedStrategyOverride(strategyLevel); + setStrategySelectionTouched(true); + }, []); + + const getSkippedReasonLabel = useCallback((reason?: ReviewTeamManifestMemberReason) => { + switch (reason) { + case 'not_applicable': + return t('deepReviewConsent.skippedReasons.notApplicable', { + defaultValue: 'Not applicable to this target', + }); + case 'budget_limited': + return t('deepReviewConsent.skippedReasons.budgetLimited', { + defaultValue: 'Limited by token budget', + }); + case 'invalid_tooling': + return t('deepReviewConsent.skippedReasons.invalidTooling', { + defaultValue: 'Configuration issue', + }); + case 'disabled': + return t('deepReviewConsent.skippedReasons.disabled', { + defaultValue: 'Disabled', + }); + case 'unavailable': + return t('deepReviewConsent.skippedReasons.unavailable', { + defaultValue: 'Unavailable', + }); + default: + return t('deepReviewConsent.skippedReasons.skipped', { + defaultValue: 
'Skipped', + }); + } + }, [t]); + + const renderLaunchSummary = useCallback((preview: ReviewTeamRunManifest) => { + const skippedReviewers = preview.skippedReviewers; + const skippedCount = skippedReviewers.length; + const visibleSkippedReviewers = skippedReviewers.slice(0, MAX_VISIBLE_SKIPPED_REVIEWERS); + const hiddenSkippedCount = Math.max(0, skippedCount - visibleSkippedReviewers.length); + const selectedStrategy = strategySelectionTouched + ? selectedStrategyOverride + : preview.strategyLevel; + const selectedStrategyLabel = selectedStrategy + ? t(`deepReviewConsent.strategyLabels.${selectedStrategy}`, { + defaultValue: getReviewStrategyProfile(selectedStrategy).label, + }) + : t('deepReviewConsent.teamDefaultStrategy', { + defaultValue: 'Team default', + }); + const targetFileCount = getReviewTargetFileCount(preview); + const visibleTargetTags = preview.target.tags.slice(0, MAX_VISIBLE_TARGET_TAGS); + const hiddenTargetTagCount = Math.max(0, preview.target.tags.length - visibleTargetTags.length); + const targetTagLabels = visibleTargetTags.map((tag) => { + const label = TARGET_TAG_LABELS[tag] ?? { + key: 'unknown', + defaultValue: getFallbackTargetTagLabel(tag), + }; + return t(`deepReviewConsent.targetTagLabels.${label.key}`, { + defaultValue: label.defaultValue, + }); + }); + const targetTagSummary = targetTagLabels.length > 0 + ? hiddenTargetTagCount > 0 + ? t('deepReviewConsent.targetTagsWithMore', { + tags: targetTagLabels.join(', '), + count: hiddenTargetTagCount, + defaultValue: '{{tags}} +{{count}} more', + }) + : targetTagLabels.join(', ') + : t('deepReviewConsent.targetTagLabels.unknown', { + defaultValue: 'Unknown area', + }); + const optionalReviewerCount = preview.enabledExtraReviewers.length; + + return ( +
+
+ + {t('deepReviewConsent.summaryTitle', { defaultValue: 'Launch summary' })} + +
+ +
+ + {t('deepReviewConsent.targetFiles', { + count: targetFileCount, + defaultValue: targetFileCount === 1 ? '{{count}} file' : '{{count}} files', + })} + + + {t('deepReviewConsent.targetRiskTags', { + tags: targetTagSummary, + defaultValue: 'Risk areas: {{tags}}', + })} + + + {t('deepReviewConsent.estimatedCalls', { + count: preview.tokenBudget.estimatedReviewerCalls, + defaultValue: '{{count}} reviewer calls', + })} + + {skippedCount > 0 && ( + + {t('deepReviewConsent.skippedReviewers', { + count: skippedCount, + defaultValue: '{{count}} skipped', + })} + + )} + {optionalReviewerCount > 0 && ( + + {t('deepReviewConsent.optionalReviewers', { + count: optionalReviewerCount, + defaultValue: optionalReviewerCount === 1 + ? '{{count}} optional reviewer' + : '{{count}} optional reviewers', + })} + + )} + {preview.tokenBudget.largeDiffSummaryFirst && ( + + {t('deepReviewConsent.summaryFirstReview', { + defaultValue: 'Summary-first coverage', + })} + + )} + + {t('deepReviewConsent.runStrategy', { + strategy: selectedStrategyLabel, + defaultValue: 'Run strategy: {{strategy}}', + })} + +
+ + {preview.workspacePath && ( +
+
+ {t('deepReviewConsent.strategyOverrideTitle', { + defaultValue: 'Run strategy', + })} +
+
+ + {REVIEW_STRATEGY_LEVELS.map((strategyLevel) => { + const isActive = selectedStrategy === strategyLevel; + return ( + + ); + })} +
+
+ )} + + {skippedReviewers.length > 0 && ( +
+
+ + {t('deepReviewConsent.skippedGroupTitle', { defaultValue: 'Skipped reviewers' })} +
+
    + {visibleSkippedReviewers.map((member) => ( +
  • + {getReviewerLabel(member)} + {getSkippedReasonLabel(member.reason)} +
  • + ))} + {hiddenSkippedCount > 0 && ( +
  • + + {t('deepReviewConsent.skippedMore', { + count: hiddenSkippedCount, + defaultValue: '+{{count}} more', + })} + +
  • + )} +
+
+ )} +
+ ); + }, [ + getSkippedReasonLabel, + selectStrategyOverride, + selectedStrategyOverride, + strategySelectionTouched, + t, + ]); const deepReviewConsentDialog = pendingConsent ? ( {t('deepReviewConsent.eyebrow', { defaultValue: 'Code review team' })} -

{t('deepReviewConsent.title')}

+

{t('deepReviewConsent.title', { defaultValue: 'Start Deep Review?' })}

-

{t('deepReviewConsent.body')}

+

+ {t('deepReviewConsent.body', { + defaultValue: 'Deep Review launches multiple reviewers and can take longer or use more tokens than a standard review.', + })} +

@@ -93,51 +416,46 @@ export function useDeepReviewConsent(): DeepReviewConsentControls { {t('deepReviewConsent.readonlyLabel', { defaultValue: 'Read-only first pass' })} -

{t('deepReviewConsent.readonly')}

+

+ {t('deepReviewConsent.readonly', { + defaultValue: 'The first pass reports findings and a remediation plan before any code changes.', + })} +

-
-
-
- + {pendingConsent.launchContext?.sessionConcurrencyGuard?.highActivity && ( +
+
+
- {t('deepReviewConsent.costLabel', { defaultValue: 'Higher token usage' })} + {t('deepReviewConsent.sessionConcurrencyTitle', { + defaultValue: 'Active session is busy', + })} -

{t('deepReviewConsent.cost')}

-

- {(() => { - const est = estimateTokenConsumption(5); - return t('deepReviewConsent.estimatedTokens', { - min: formatTokenCount(est.min), - max: formatTokenCount(est.max), - defaultValue: 'Estimated: {{min}} - {{max}} tokens', - }); - })()} +

+ {t('deepReviewConsent.sessionConcurrencyBody', { + count: pendingConsent.launchContext.sessionConcurrencyGuard.activeSubagentCount, + defaultValue: + 'The target session already has {{count}} running subagent tasks. Choose a lighter strategy, cancel for now, or continue manually when capacity is free.', + })}

-
-
- -
-
- - {t('deepReviewConsent.timeLabel', { defaultValue: 'Longer runtime' })} - -

{t('deepReviewConsent.time')}

-
-
-
+ )} + + {pendingConsent.preview && renderLaunchSummary(pendingConsent.preview)}
setDontShowAgain(event.target.checked)} - label={t('deepReviewConsent.dontShowAgain')} + label={t('deepReviewConsent.dontShowAgain', { + defaultValue: 'Do not show this again', + })} />
diff --git a/src/web-ui/src/flow_chat/components/btw/BtwSessionPanel.tsx b/src/web-ui/src/flow_chat/components/btw/BtwSessionPanel.tsx index 07b14fe76..98d04a60f 100644 --- a/src/web-ui/src/flow_chat/components/btw/BtwSessionPanel.tsx +++ b/src/web-ui/src/flow_chat/components/btw/BtwSessionPanel.tsx @@ -437,6 +437,14 @@ export const BtwSessionPanel: React.FC = ({ completedRemediationIds: store.completedRemediationIds, }); store.minimize(); + } else if (isComplete && store.phase === 'review_waiting_capacity') { + store.showActionBar({ + childSessionId, + parentSessionId: parentSessionId ?? null, + reviewData: latestReviewData, + reviewMode, + phase: 'review_completed', + }); } return; } diff --git a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts index 17bafe44a..af98398db 100644 --- a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts +++ b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts @@ -34,6 +34,15 @@ const REQUIRED_ACTION_BAR_KEYS = [ 'deepReviewActionBar.continueFix', 'deepReviewActionBar.skipRemaining', 'deepReviewActionBar.switchModel', + 'deepReviewActionBar.capacityQueue.title', + 'deepReviewActionBar.capacityQueue.pausedTitle', + 'deepReviewActionBar.capacityQueue.detail', + 'deepReviewActionBar.capacityQueue.sessionBusy', + 'deepReviewActionBar.capacityQueue.pauseQueue', + 'deepReviewActionBar.capacityQueue.continueQueue', + 'deepReviewActionBar.capacityQueue.cancelQueued', + 'deepReviewActionBar.capacityQueue.skipOptionalQueued', + 'deepReviewActionBar.capacityQueue.controlFailed', 'reviewActionBar.noIssuesFound', ]; @@ -59,6 +68,11 @@ const REQUIRED_REVIEW_TEAM_PAGE_KEYS = [ 'reviewTeams.detail.loading', ]; +const REQUIRED_DEEP_REVIEW_CONSENT_KEYS = [ + 'deepReviewConsent.sessionConcurrencyTitle', + 'deepReviewConsent.sessionConcurrencyBody', +]; + function getMessageValue(messages: unknown, key: string): 
unknown { return key .split('.') @@ -103,4 +117,15 @@ describe('DeepReviewActionBar i18n', () => { expect(missingKeys, `${locale} missing keys`).toEqual([]); } }); + + it('keeps Deep Review consent strings available in every bundled locale', () => { + for (const [locale, messages] of Object.entries(LOCALES)) { + const missingKeys = REQUIRED_DEEP_REVIEW_CONSENT_KEYS.filter((key) => { + const value = getMessageValue(messages, key); + return typeof value !== 'string' || value.trim().length === 0; + }); + + expect(missingKeys, `${locale} missing keys`).toEqual([]); + } + }); }); diff --git a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss index 16f30c002..00844d416 100644 --- a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss +++ b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss @@ -594,6 +594,56 @@ font-variant-numeric: tabular-nums; } + /* Capacity queue */ + &__capacity-queue { + display: flex; + flex-wrap: wrap; + align-items: flex-start; + justify-content: space-between; + gap: 10px; + padding: 8px 10px; + border-radius: 6px; + border: 1px solid color-mix(in srgb, var(--color-warning, #f59e0b) 30%, var(--border-base)); + background: color-mix(in srgb, var(--color-warning, #f59e0b) 8%, var(--deep-review-action-bar-surface)); + } + + &__capacity-queue-main { + display: flex; + align-items: flex-start; + gap: 8px; + min-width: 0; + } + + &__capacity-queue-icon { + flex-shrink: 0; + margin-top: 2px; + color: var(--color-warning, #f59e0b); + } + + &__capacity-queue-copy { + display: flex; + flex-direction: column; + gap: 2px; + min-width: 0; + } + + &__capacity-queue-title { + color: var(--color-text-primary); + font-weight: 600; + } + + &__capacity-queue-detail { + color: var(--color-text-secondary); + } + + &__capacity-queue-actions { + display: flex; + flex-wrap: wrap; + justify-content: flex-end; + gap: 6px; + flex-shrink: 0; + } + /* Partial results 
*/ &__partial-summary { display: flex; diff --git a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx index f268716f4..8b5e3e836 100644 --- a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx +++ b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx @@ -13,6 +13,7 @@ const buildRecoveryPlanMock = vi.hoisted(() => vi.fn(() => ({ willSkip: [], summaryText: '1 completed reviewer will be preserved; 1 reviewer will be rerun', }))); +const controlDeepReviewQueueMock = vi.hoisted(() => vi.fn()); vi.mock('react-i18next', () => ({ initReactI18next: { @@ -59,6 +60,12 @@ vi.mock('../../services/FlowChatManager', () => ({ }, })); +vi.mock('@/infrastructure/api/service-api/AgentAPI', () => ({ + agentAPI: { + controlDeepReviewQueue: controlDeepReviewQueueMock, + }, +})); + vi.mock('@/infrastructure/event-bus', () => ({ globalEventBus: { emit: eventBusEmitMock, @@ -360,6 +367,117 @@ describeWithJsdom('DeepReviewActionBar', () => { expect(state.minimized).toBe(true); }); + it('does not show capacity queue controls when there is no queue state', async () => { + const { DeepReviewActionBar } = await import('./DeepReviewActionBar'); + + useReviewActionBarStore.getState().showActionBar({ + childSessionId: 'child-session', + parentSessionId: 'parent-session', + reviewData: { + summary: { recommended_action: 'request_changes' }, + remediation_plan: ['Fix issue 1'], + }, + phase: 'review_completed', + }); + + await act(async () => { + root.render(); + }); + + expect(container.textContent).not.toContain('Reviewers waiting for capacity'); + expect(Array.from(container.querySelectorAll('button')).some((button) => ( + button.textContent?.includes('Pause queue') + ))).toBe(false); + }); + + it('shows compact capacity queue controls and keeps them locally adjustable', async () => { + const { DeepReviewActionBar } = await import('./DeepReviewActionBar'); + + 
useReviewActionBarStore.getState().showActionBar({ + childSessionId: 'child-session', + parentSessionId: 'parent-session', + reviewData: { + summary: { recommended_action: 'request_changes' }, + remediation_plan: ['Fix issue 1'], + }, + phase: 'review_completed', + }); + useReviewActionBarStore.setState({ + capacityQueueState: { + status: 'queued_for_capacity', + queuedReviewerCount: 2, + activeReviewerCount: 1, + optionalReviewerCount: 1, + sessionConcurrencyHigh: true, + }, + } as Partial>); + + await act(async () => { + root.render(); + }); + + expect(container.textContent).toContain('Reviewers waiting for capacity'); + expect(container.textContent).toContain('Queue wait does not count against reviewer runtime.'); + expect(container.textContent).toContain('Your active session is busy.'); + + const pauseButton = Array.from(container.querySelectorAll('button')) + .find((button) => button.textContent?.includes('Pause queue')); + expect(pauseButton).toBeTruthy(); + + await act(async () => { + pauseButton!.dispatchEvent(new dom.window.MouseEvent('click', { bubbles: true })); + await Promise.resolve(); + }); + + expect((useReviewActionBarStore.getState() as unknown as { + capacityQueueState: { status: string }; + }).capacityQueueState.status).toBe('paused_by_user'); + expect(container.textContent).toContain('Queue paused'); + }); + + it('sends backend queue control actions for event-driven capacity waits', async () => { + const { DeepReviewActionBar } = await import('./DeepReviewActionBar'); + controlDeepReviewQueueMock.mockResolvedValue(undefined); + + useReviewActionBarStore.getState().showCapacityQueueBar({ + childSessionId: 'child-session', + parentSessionId: 'parent-session', + capacityQueueState: { + toolId: 'task-queue-1', + subagentType: 'ReviewSecurity', + dialogTurnId: 'turn-queue-1', + status: 'queued_for_capacity', + queuedReviewerCount: 1, + activeReviewerCount: 1, + optionalReviewerCount: 1, + controlMode: 'backend', + }, + }); + + await act(async () => 
{ + root.render(); + }); + + const pauseButton = Array.from(container.querySelectorAll('button')) + .find((button) => button.textContent?.includes('Pause queue')); + expect(pauseButton).toBeTruthy(); + + await act(async () => { + pauseButton!.dispatchEvent(new dom.window.MouseEvent('click', { bubbles: true })); + await Promise.resolve(); + }); + + expect(controlDeepReviewQueueMock).toHaveBeenCalledWith({ + sessionId: 'child-session', + dialogTurnId: 'turn-queue-1', + toolId: 'task-queue-1', + action: 'pause', + }); + expect((useReviewActionBarStore.getState() as unknown as { + capacityQueueState: { status: string }; + }).capacityQueueState.status).toBe('paused_by_user'); + }); + it('shows distinct progress text after starting fix and re-review', async () => { const { DeepReviewActionBar } = await import('./DeepReviewActionBar'); diff --git a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx index a0584bff1..a96641851 100644 --- a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx +++ b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx @@ -10,6 +10,7 @@ import { ChevronUp, MessageSquare, Play, + Pause, Copy, Info, SkipForward, @@ -18,7 +19,11 @@ import { Minus, } from 'lucide-react'; import { Button, Checkbox, Tooltip } from '@/component-library'; -import { useReviewActionBarStore, type ReviewActionPhase } from '../../store/deepReviewActionBarStore'; +import { + useReviewActionBarStore, + type DeepReviewCapacityQueueAction, + type ReviewActionPhase, +} from '../../store/deepReviewActionBarStore'; import type { ReviewRemediationItem } from '../../utils/codeReviewRemediation'; import { buildSelectedReviewRemediationPrompt, REMEDIATION_GROUP_ORDER } from '../../utils/codeReviewRemediation'; import type { RemediationGroupId } from '../../utils/codeReviewReport'; @@ -40,6 +45,7 @@ import { } from '../../utils/deepReviewExperience'; import { flowChatStore } from 
'../../store/FlowChatStore'; import { CodeReviewReportExportActions } from '../../tool-cards/CodeReviewReportExportActions'; +import { agentAPI } from '@/infrastructure/api/service-api/AgentAPI'; import './DeepReviewActionBar.scss'; const log = createLogger('DeepReviewActionBar'); @@ -56,6 +62,7 @@ const PHASE_CONFIG: Record { completedRemediationIds, remainingFixIds, decisionSelections, + capacityQueueState, } = store; const [showCustomInput, setShowCustomInput] = useState(false); @@ -112,6 +120,55 @@ export const ReviewActionBar: React.FC = () => { const isDeepReview = reviewMode === 'deep'; const hasInterruption = isDeepReview && Boolean(interruption); const isResumeRunning = phase === 'resume_running'; + const showCapacityQueueNotice = isDeepReview && + Boolean(capacityQueueState) && + capacityQueueState?.status !== 'running' && + capacityQueueState?.status !== 'capacity_skipped'; + const hasBackendQueueControlTarget = Boolean( + childSessionId && + capacityQueueState?.dialogTurnId && + capacityQueueState?.toolId, + ); + const supportsInlineQueueControls = + capacityQueueState?.controlMode === 'backend' + ? 
hasBackendQueueControlTarget + : capacityQueueState?.controlMode !== 'session_stop_only'; + + const handleCapacityQueueAction = useCallback(async ( + action: DeepReviewCapacityQueueAction, + applyLocalAction: () => void, + ) => { + if (!capacityQueueState) { + return; + } + + if (capacityQueueState.controlMode !== 'backend') { + applyLocalAction(); + return; + } + + if (!childSessionId || !capacityQueueState.dialogTurnId || !capacityQueueState.toolId) { + notificationService.error(t('deepReviewActionBar.capacityQueue.controlFailed', { + defaultValue: 'Queue control is unavailable for this reviewer.', + })); + return; + } + + try { + await agentAPI.controlDeepReviewQueue({ + sessionId: childSessionId, + dialogTurnId: capacityQueueState.dialogTurnId, + toolId: capacityQueueState.toolId, + action, + }); + applyLocalAction(); + } catch (error) { + log.warn('Failed to control DeepReview capacity queue', error); + notificationService.error(t('deepReviewActionBar.capacityQueue.controlFailed', { + defaultValue: 'Queue control failed. Please try again or stop the review.', + })); + } + }, [capacityQueueState, childSessionId, t]); // ---- progress tracking ---- const sessions = flowChatStore.getState().sessions; @@ -533,6 +590,10 @@ export const ReviewActionBar: React.FC = () => { return t('deepReviewActionBar.fixTimeout', { defaultValue: 'Fix timed out', }); + case 'review_waiting_capacity': + return t('deepReviewActionBar.reviewWaitingCapacity', { + defaultValue: 'Review queue waiting', + }); case 'review_interrupted': return t('deepReviewActionBar.reviewInterrupted', { defaultValue: 'Deep review interrupted', @@ -613,6 +674,105 @@ export const ReviewActionBar: React.FC = () => {
)} + {/* Capacity queue notice */} + {showCapacityQueueNotice && capacityQueueState && ( +
+
+ +
+ + {capacityQueueState.status === 'paused_by_user' + ? t('deepReviewActionBar.capacityQueue.pausedTitle', { + defaultValue: 'Queue paused', + }) + : t('deepReviewActionBar.capacityQueue.title', { + defaultValue: 'Reviewers waiting for capacity', + })} + + + {t('deepReviewActionBar.capacityQueue.detail', { + defaultValue: 'Queue wait does not count against reviewer runtime.', + })} + + {capacityQueueState.sessionConcurrencyHigh && ( + + {t('deepReviewActionBar.capacityQueue.sessionBusy', { + defaultValue: 'Your active session is busy. Pause Deep Review or continue later.', + })} + + )} + {!supportsInlineQueueControls && ( + + {t('deepReviewActionBar.capacityQueue.stopHint', { + defaultValue: 'Use Stop to interrupt this review queue.', + })} + + )} +
+
+ {supportsInlineQueueControls && ( +
+ {capacityQueueState.status === 'paused_by_user' ? ( + + ) : ( + + )} + {(capacityQueueState.optionalReviewerCount ?? 0) > 0 && ( + + )} + +
+ )} +
+ )} + {/* Partial results summary on interruption */} {hasInterruption && progressSummary && progressSummary.completed > 0 && (
diff --git a/src/web-ui/src/flow_chat/components/modern/SessionFilesBadge.tsx b/src/web-ui/src/flow_chat/components/modern/SessionFilesBadge.tsx index 01cc6e582..d7f3f32c4 100644 --- a/src/web-ui/src/flow_chat/components/modern/SessionFilesBadge.tsx +++ b/src/web-ui/src/flow_chat/components/modern/SessionFilesBadge.tsx @@ -26,7 +26,8 @@ import { runWithConcurrencyLimit } from '@/shared/utils/runWithConcurrencyLimit' import { createBtwChildSession } from '../../services/BtwThreadService'; import { openBtwSessionInAuxPane } from '../../services/openBtwSession'; import { - buildDeepReviewPromptFromSessionFiles, + buildDeepReviewLaunchFromSessionFiles, + buildDeepReviewPreviewFromSessionFiles, launchDeepReviewSession, } from '../../services/DeepReviewService'; import { insertReviewSessionSummaryMarker } from '../../services/ReviewSessionMarkerService'; @@ -41,6 +42,7 @@ import { useSessionReviewActivity } from '../../hooks/useSessionReviewActivity'; import { useSessionStateMachine } from '../../hooks/useSessionStateMachine'; import { SessionExecutionState } from '../../state-machine/types'; import { isReviewActivityBlocking } from '../../utils/sessionReviewActivity'; +import { deriveDeepReviewSessionConcurrencyGuard } from '../../utils/deepReviewCapacityGuard'; import './SessionFilesBadge.scss'; const log = createLogger('SessionFilesBadge'); @@ -609,24 +611,6 @@ export const SessionFilesBadge: React.FC = ({ return; } - const confirmed = await confirmDeepReviewLaunch(); - if (!confirmed) { - return; - } - setLaunchingReviewMode('deep_review'); - - if (skippedCount > 0) { - notificationService.info( - t('sessionFilesBadge.review.filteredNotice', { - included: reviewableFilePaths.length, - skipped: skippedCount, - defaultValue: - 'Review will analyze {{included}} files and skip {{skipped}} excluded files such as lock, generated, or binary assets.', - }), - { duration: 3500 } - ); - } - const fileList = reviewableFilePaths.map(p => `- ${p}`).join('\n'); const 
displayMessage = skippedCount > 0 ? t('sessionFilesBadge.deepReview.displayMessageFiltered', { @@ -641,7 +625,34 @@ export const SessionFilesBadge: React.FC = ({ }); try { - const prompt = await buildDeepReviewPromptFromSessionFiles( + const preview = await buildDeepReviewPreviewFromSessionFiles( + reviewableFilePaths, + currentWorkspace?.rootPath, + ); + const confirmed = await confirmDeepReviewLaunch(preview, { + sessionConcurrencyGuard: deriveDeepReviewSessionConcurrencyGuard( + flowChatStore.getState(), + sessionId, + ), + }); + if (!confirmed) { + return; + } + setLaunchingReviewMode('deep_review'); + + if (skippedCount > 0) { + notificationService.info( + t('sessionFilesBadge.review.filteredNotice', { + included: reviewableFilePaths.length, + skipped: skippedCount, + defaultValue: + 'Review will analyze {{included}} files and skip {{skipped}} excluded files such as lock, generated, or binary assets.', + }), + { duration: 3500 } + ); + } + + const { prompt, runManifest } = await buildDeepReviewLaunchFromSessionFiles( reviewableFilePaths, undefined, currentWorkspace?.rootPath, @@ -652,6 +663,7 @@ export const SessionFilesBadge: React.FC = ({ workspacePath: currentWorkspace?.rootPath, prompt, displayMessage, + runManifest, childSessionName: t('sessionFilesBadge.deepReview.threadTitle', { defaultValue: 'Deep review', }), diff --git a/src/web-ui/src/flow_chat/services/AgenticEventListener.ts b/src/web-ui/src/flow_chat/services/AgenticEventListener.ts index 6c076e39b..f0627f16f 100644 --- a/src/web-ui/src/flow_chat/services/AgenticEventListener.ts +++ b/src/web-ui/src/flow_chat/services/AgenticEventListener.ts @@ -16,6 +16,7 @@ import type { SessionModelAutoMigratedEvent, ImageAnalysisEvent, UserSteeringInjectedEvent, + DeepReviewQueueStateChangedEvent, } from '@/infrastructure/api/service-api/AgentAPI'; import { createLogger } from '@/shared/utils/logger'; @@ -33,6 +34,7 @@ export interface AgenticEventCallbacks { onModelRoundStarted?: (event: AgenticEvent) => 
void; onTextChunk?: (event: TextChunkEvent) => void; onToolEvent?: (event: ToolEvent) => void; + onDeepReviewQueueStateChanged?: (event: DeepReviewQueueStateChangedEvent) => void; onDialogTurnCompleted?: (event: AgenticEvent) => void; onDialogTurnFailed?: (event: AgenticEvent) => void; onDialogTurnCancelled?: (event: AgenticEvent) => void; @@ -128,6 +130,14 @@ export class AgenticEventListener { this.unlistenFunctions.push(unlisten); } + if (callbacks.onDeepReviewQueueStateChanged) { + const unlisten = agentAPI.onDeepReviewQueueStateChanged((event) => { + logger.debug('Deep Review queue state changed:', event); + callbacks.onDeepReviewQueueStateChanged?.(event); + }); + this.unlistenFunctions.push(unlisten); + } + if (callbacks.onDialogTurnCompleted) { const unlisten = agentAPI.onDialogTurnCompleted((event) => { logger.debug('Dialog turn completed:', event); diff --git a/src/web-ui/src/flow_chat/services/BtwThreadService.ts b/src/web-ui/src/flow_chat/services/BtwThreadService.ts index 1e002b409..2fa0e0ee9 100644 --- a/src/web-ui/src/flow_chat/services/BtwThreadService.ts +++ b/src/web-ui/src/flow_chat/services/BtwThreadService.ts @@ -6,6 +6,7 @@ import { stateMachineManager } from '../state-machine'; import { flowChatManager } from './FlowChatManager'; import type { Session } from '../types/flow-chat'; import type { SessionKind } from '@/shared/types/session-history'; +import type { ReviewTeamRunManifest } from '@/shared/services/reviewTeamService'; import { buildSessionMetadata } from '../utils/sessionMetadata'; const log = createLogger('BtwThreadService'); @@ -93,6 +94,7 @@ export async function createBtwChildSession(params: { requestId?: string; addMarker?: boolean; sessionKind?: Extract; + deepReviewRunManifest?: ReviewTeamRunManifest; }): Promise<{ requestId: string; childSessionId: string; @@ -149,6 +151,7 @@ export async function createBtwChildSession(params: { parentDialogTurnId, parentTurnIndex, }, + deepReviewRunManifest: params.deepReviewRunManifest, 
isTransient: false, }, remoteConnectionId, diff --git a/src/web-ui/src/flow_chat/services/DeepReviewService.test.ts b/src/web-ui/src/flow_chat/services/DeepReviewService.test.ts index a2abd05aa..9b8cdf35b 100644 --- a/src/web-ui/src/flow_chat/services/DeepReviewService.test.ts +++ b/src/web-ui/src/flow_chat/services/DeepReviewService.test.ts @@ -1,11 +1,15 @@ import { describe, expect, it, vi, beforeEach } from 'vitest'; import { DEEP_REVIEW_SLASH_COMMAND, + buildDeepReviewLaunchFromSlashCommand, + buildDeepReviewPreviewFromSessionFiles, + buildDeepReviewPromptFromSessionFiles, buildDeepReviewPromptFromSlashCommand, getDeepReviewLaunchErrorMessage, isDeepReviewSlashCommand, launchDeepReviewSession, } from './DeepReviewService'; +import { buildEffectiveReviewTeamManifest } from '@/shared/services/reviewTeamService'; const mockDeleteSession = vi.fn(); const mockCreateBtwChildSession = vi.fn(); @@ -14,11 +18,23 @@ const mockCloseBtwSessionInAuxPane = vi.fn(); const mockSendMessage = vi.fn(); const mockDiscardLocalSession = vi.fn(); const mockInsertReviewSessionSummaryMarker = vi.fn(); +const mockGitGetStatus = vi.fn(); +const mockGitGetChangedFiles = vi.fn(); +const mockGitGetDiff = vi.fn(); +const mockLoadDefaultReviewTeam = vi.fn(); +const mockPrepareDefaultReviewTeamForLaunch = vi.fn(); +const mockLoadReviewTeamRateLimitStatus = vi.fn(); +const mockLoadReviewTeamProjectStrategyOverride = vi.fn(); vi.mock('@/infrastructure/api', () => ({ agentAPI: { deleteSession: (...args: any[]) => mockDeleteSession(...args), }, + gitAPI: { + getStatus: (...args: any[]) => mockGitGetStatus(...args), + getChangedFiles: (...args: any[]) => mockGitGetChangedFiles(...args), + getDiff: (...args: any[]) => mockGitGetDiff(...args), + }, })); vi.mock('./BtwThreadService', () => ({ @@ -51,12 +67,34 @@ vi.mock('./ReviewSessionMarkerService', () => ({ })); vi.mock('@/shared/services/reviewTeamService', () => ({ - prepareDefaultReviewTeamForLaunch: vi.fn(async () => ({ members: [] })), + 
loadDefaultReviewTeam: (...args: any[]) => mockLoadDefaultReviewTeam(...args), + prepareDefaultReviewTeamForLaunch: (...args: any[]) => mockPrepareDefaultReviewTeamForLaunch(...args), + loadReviewTeamRateLimitStatus: (...args: any[]) => mockLoadReviewTeamRateLimitStatus(...args), + loadReviewTeamProjectStrategyOverride: (...args: any[]) => mockLoadReviewTeamProjectStrategyOverride(...args), buildEffectiveReviewTeamManifest: vi.fn(() => ({ reviewers: [] })), buildReviewTeamPromptBlock: vi.fn(() => 'Review team manifest.'), })); describe('DeepReviewService slash command', () => { + beforeEach(() => { + vi.clearAllMocks(); + mockLoadDefaultReviewTeam.mockResolvedValue({ members: [] }); + mockPrepareDefaultReviewTeamForLaunch.mockResolvedValue({ members: [] }); + mockLoadReviewTeamRateLimitStatus.mockResolvedValue(null); + mockLoadReviewTeamProjectStrategyOverride.mockResolvedValue(undefined); + mockGitGetStatus.mockResolvedValue({ + staged: [], + unstaged: [], + untracked: [], + conflicts: [], + current_branch: 'main', + ahead: 0, + behind: 0, + }); + mockGitGetChangedFiles.mockResolvedValue([]); + mockGitGetDiff.mockResolvedValue(''); + }); + it('uses /DeepReview as the canonical command', () => { expect(DEEP_REVIEW_SLASH_COMMAND).toBe('/DeepReview'); }); @@ -78,6 +116,362 @@ describe('DeepReviewService slash command', () => { expect(prompt).toContain('User-provided focus or target:\nreview commit abc123 for security'); expect(prompt).not.toContain('User-provided focus or target:\n/DeepReview'); }); + + it('classifies explicit slash-command file paths before building the review team manifest', async () => { + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview src/web-ui/src/App.tsx src/crates/core/src/service/config/types.rs for regressions', + 'D:\\workspace\\repo', + ); + + expect(buildEffectiveReviewTeamManifest).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + workspacePath: 'D:\\workspace\\repo', + target: 
expect.objectContaining({ + source: 'slash_command_explicit_files', + resolution: 'resolved', + tags: expect.arrayContaining(['frontend_ui', 'backend_core']), + }), + }), + ); + }); + + it('classifies workspace diff files for a slash command without an explicit target', async () => { + mockGitGetStatus.mockResolvedValueOnce({ + staged: [{ path: 'src/web-ui/src/App.tsx', status: 'modified' }], + unstaged: [{ path: 'src/crates/core/src/service/config/types.rs', status: 'modified' }], + untracked: ['src/web-ui/src/newFeature.tsx'], + conflicts: [], + current_branch: 'main', + ahead: 0, + behind: 0, + }); + + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview', + 'D:\\workspace\\repo', + ); + + expect(mockGitGetStatus).toHaveBeenCalledWith('D:\\workspace\\repo'); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + workspacePath: 'D:\\workspace\\repo', + target: expect.objectContaining({ + source: 'workspace_diff', + resolution: 'resolved', + tags: expect.arrayContaining(['frontend_ui', 'backend_core']), + }), + }), + ); + }); + + it('passes workspace diff line stats into the review manifest', async () => { + mockGitGetStatus.mockResolvedValueOnce({ + staged: [{ path: 'src/web-ui/src/App.tsx', status: 'modified' }], + unstaged: [{ path: 'src/crates/core/src/service/config/types.rs', status: 'modified' }], + untracked: [], + conflicts: [], + current_branch: 'main', + ahead: 0, + behind: 0, + }); + mockGitGetDiff.mockResolvedValueOnce([ + 'diff --git a/src/crates/core/src/service/config/types.rs b/src/crates/core/src/service/config/types.rs', + '@@ -1,2 +1,3 @@', + '-old core line', + '+new core line', + '+another core line', + 'diff --git a/src/web-ui/src/App.tsx b/src/web-ui/src/App.tsx', + '@@ -5,3 +5,2 @@', + '-removed ui line', + '+added ui line', + ].join('\n')); + + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview', + 'D:\\workspace\\repo', + ); + + 
expect(mockGitGetDiff).toHaveBeenCalledWith('D:\\workspace\\repo', { + source: 'HEAD', + }); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + changeStats: expect.objectContaining({ + fileCount: 2, + totalLinesChanged: 5, + lineCountSource: 'diff_stat', + }), + }), + ); + }); + + it('passes cached rate limit status into slash-command launch manifests', async () => { + mockLoadReviewTeamRateLimitStatus.mockResolvedValueOnce({ remaining: 2 }); + + await buildDeepReviewLaunchFromSlashCommand( + '/DeepReview', + 'D:\\workspace\\repo', + ); + + expect(mockLoadReviewTeamRateLimitStatus).toHaveBeenCalled(); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + workspacePath: 'D:\\workspace\\repo', + rateLimitStatus: { remaining: 2 }, + }), + ); + }); + + it('does not block slash-command launch manifests when rate limit status is unavailable', async () => { + mockLoadReviewTeamRateLimitStatus.mockRejectedValueOnce(new Error('rate status unavailable')); + + await buildDeepReviewLaunchFromSlashCommand( + '/DeepReview', + 'D:\\workspace\\repo', + ); + + const lastCall = vi.mocked(buildEffectiveReviewTeamManifest).mock.calls.at(-1); + expect(lastCall?.[1]).not.toHaveProperty('rateLimitStatus'); + }); + + it('passes project strategy overrides into slash-command launch manifests', async () => { + mockLoadReviewTeamProjectStrategyOverride.mockResolvedValueOnce('deep'); + + await buildDeepReviewLaunchFromSlashCommand( + '/DeepReview', + 'D:\\workspace\\repo', + ); + + expect(mockLoadReviewTeamProjectStrategyOverride).toHaveBeenCalledWith( + 'D:\\workspace\\repo', + ); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + workspacePath: 'D:\\workspace\\repo', + strategyOverride: 'deep', + }), + ); + }); + + it('does not block slash-command launch manifests when project strategy 
overrides are unavailable', async () => { + mockLoadReviewTeamProjectStrategyOverride.mockRejectedValueOnce(new Error('strategy unavailable')); + + await buildDeepReviewLaunchFromSlashCommand( + '/DeepReview', + 'D:\\workspace\\repo', + ); + + const lastCall = vi.mocked(buildEffectiveReviewTeamManifest).mock.calls.at(-1); + expect(lastCall?.[1]).not.toHaveProperty('strategyOverride'); + }); + + it('classifies commit target files through the git changed-files API', async () => { + mockGitGetChangedFiles.mockResolvedValueOnce([ + { + path: 'src/web-ui/src/App.tsx', + old_path: undefined, + status: 'modified', + }, + ]); + + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview review commit abc123', + 'D:\\workspace\\repo', + ); + + expect(mockGitGetChangedFiles).toHaveBeenCalledWith('D:\\workspace\\repo', { + source: 'abc123^', + target: 'abc123', + }); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + target: expect.objectContaining({ + source: 'slash_command_git_ref', + resolution: 'resolved', + tags: expect.arrayContaining(['frontend_ui']), + }), + }), + ); + }); + + it('passes git ref diff line stats into the review manifest', async () => { + mockGitGetChangedFiles.mockResolvedValueOnce([ + { + path: 'src/web-ui/src/App.tsx', + old_path: undefined, + status: 'modified', + }, + ]); + mockGitGetDiff.mockResolvedValueOnce([ + 'diff --git a/src/web-ui/src/App.tsx b/src/web-ui/src/App.tsx', + '--- a/src/web-ui/src/App.tsx', + '+++ b/src/web-ui/src/App.tsx', + '@@ -10,2 +10,3 @@', + '-old line', + '+new line', + '+new second line', + ].join('\n')); + + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview review commit abc123', + 'D:\\workspace\\repo', + ); + + expect(mockGitGetDiff).toHaveBeenCalledWith('D:\\workspace\\repo', { + source: 'abc123^', + target: 'abc123', + }); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + 
expect.objectContaining({ + changeStats: expect.objectContaining({ + fileCount: 1, + totalLinesChanged: 3, + lineCountSource: 'diff_stat', + }), + }), + ); + }); + + it('keeps line stats unknown when git diff stats fail', async () => { + mockGitGetChangedFiles.mockResolvedValueOnce([ + { + path: 'src/web-ui/src/App.tsx', + old_path: undefined, + status: 'modified', + }, + ]); + mockGitGetDiff.mockRejectedValueOnce(new Error('diff unavailable')); + + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview review commit abc123', + 'D:\\workspace\\repo', + ); + + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + changeStats: expect.objectContaining({ + fileCount: 1, + lineCountSource: 'unknown', + }), + }), + ); + }); + + it('classifies explicit ref ranges through the git changed-files API', async () => { + mockGitGetChangedFiles.mockResolvedValueOnce([ + { + path: 'src/crates/core/src/service/config/types.rs', + old_path: undefined, + status: 'modified', + }, + ]); + + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview review main..feature/deep-review', + 'D:\\workspace\\repo', + ); + + expect(mockGitGetChangedFiles).toHaveBeenCalledWith('D:\\workspace\\repo', { + source: 'main', + target: 'feature/deep-review', + }); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + target: expect.objectContaining({ + source: 'slash_command_git_ref', + resolution: 'resolved', + tags: expect.arrayContaining(['backend_core']), + }), + }), + ); + }); + + it('keeps git targets conservative when no workspace is available', async () => { + await buildDeepReviewPromptFromSlashCommand( + '/DeepReview review commit abc123', + ); + + expect(mockGitGetChangedFiles).not.toHaveBeenCalled(); + expect(buildEffectiveReviewTeamManifest).toHaveBeenLastCalledWith( + expect.anything(), + expect.objectContaining({ + target: expect.objectContaining({ + 
source: 'slash_command_git_ref', + resolution: 'unknown', + tags: ['unknown'], + }), + }), + ); + }); + + it('returns the run manifest with the slash-command launch prompt', async () => { + const runManifest = { reviewMode: 'deep', skippedReviewers: [] }; + vi.mocked(buildEffectiveReviewTeamManifest).mockReturnValueOnce(runManifest as any); + + const result = await buildDeepReviewLaunchFromSlashCommand( + '/DeepReview review commit abc123', + 'D:\\workspace\\repo', + ); + + expect(result.prompt).toContain('Original command:\n/DeepReview review commit abc123'); + expect(result.runManifest).toBe(runManifest); + }); + + it('classifies session files before building the review team manifest', async () => { + await buildDeepReviewPromptFromSessionFiles( + ['src/web-ui/src/App.tsx'], + undefined, + 'D:\\workspace\\repo', + ); + + expect(buildEffectiveReviewTeamManifest).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + workspacePath: 'D:\\workspace\\repo', + target: expect.objectContaining({ + resolution: 'resolved', + tags: expect.arrayContaining(['frontend_ui']), + }), + }), + ); + }); + + it('builds a read-only session-file preview without preparing launch state', async () => { + const runManifest = { + reviewMode: 'deep', + skippedReviewers: [{ subagentId: 'ReviewFrontend', reason: 'not_applicable' }], + }; + vi.mocked(buildEffectiveReviewTeamManifest).mockReturnValueOnce(runManifest as any); + + const result = await buildDeepReviewPreviewFromSessionFiles( + ['src/crates/core/src/service/config/types.rs'], + 'D:\\workspace\\repo', + ); + + expect(result).toBe(runManifest); + expect(mockLoadDefaultReviewTeam).toHaveBeenCalledWith('D:\\workspace\\repo'); + expect(mockPrepareDefaultReviewTeamForLaunch).not.toHaveBeenCalled(); + expect(buildEffectiveReviewTeamManifest).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + workspacePath: 'D:\\workspace\\repo', + target: expect.objectContaining({ + source: 'session_files', + 
resolution: 'resolved', + tags: expect.arrayContaining(['backend_core']), + }), + }), + ); + }); }); describe('launchDeepReviewSession', () => { @@ -126,6 +520,59 @@ describe('launchDeepReviewSession', () => { ); }); + it('passes the run manifest into child session creation', async () => { + const runManifest = { reviewMode: 'deep', skippedReviewers: [] }; + mockCreateBtwChildSession.mockResolvedValue({ + childSessionId: 'child-123', + parentDialogTurnId: 'turn-456', + }); + mockSendMessage.mockResolvedValue(undefined); + + await launchDeepReviewSession({ + parentSessionId: 'parent-123', + workspacePath: 'D:\\workspace\\repo', + prompt: 'Review these files', + displayMessage: 'Deep review started', + runManifest: runManifest as any, + }); + + expect(mockCreateBtwChildSession).toHaveBeenCalledWith( + expect.objectContaining({ + deepReviewRunManifest: runManifest, + }), + ); + }); + + it('passes the run manifest as first-turn message metadata', async () => { + const runManifest = { reviewMode: 'deep', skippedReviewers: [] }; + mockCreateBtwChildSession.mockResolvedValue({ + childSessionId: 'child-123', + parentDialogTurnId: 'turn-456', + }); + mockSendMessage.mockResolvedValue(undefined); + + await launchDeepReviewSession({ + parentSessionId: 'parent-123', + workspacePath: 'D:\\workspace\\repo', + prompt: 'Review these files', + displayMessage: 'Deep review started', + runManifest: runManifest as any, + }); + + expect(mockSendMessage).toHaveBeenCalledWith( + 'Review these files', + 'child-123', + 'Deep review started', + undefined, + undefined, + { + userMessageMetadata: { + deepReviewRunManifest: runManifest, + }, + }, + ); + }); + it('throws and does not cleanup when createBtwChildSession fails', async () => { mockCreateBtwChildSession.mockRejectedValue(new Error('Session creation failed')); diff --git a/src/web-ui/src/flow_chat/services/DeepReviewService.ts b/src/web-ui/src/flow_chat/services/DeepReviewService.ts index c25d1d221..880403bd6 100644 --- 
a/src/web-ui/src/flow_chat/services/DeepReviewService.ts +++ b/src/web-ui/src/flow_chat/services/DeepReviewService.ts @@ -1,4 +1,10 @@ -import { agentAPI } from '@/infrastructure/api'; +import { agentAPI, gitAPI } from '@/infrastructure/api'; +import type { + GitChangedFile, + GitChangedFilesParams, + GitDiffParams, + GitStatus, +} from '@/infrastructure/api/service-api/GitAPI'; import { createLogger } from '@/shared/utils/logger'; import { createBtwChildSession } from './BtwThreadService'; import { closeBtwSessionInAuxPane, openBtwSessionInAuxPane } from './openBtwSession'; @@ -8,8 +14,19 @@ import { insertReviewSessionSummaryMarker } from './ReviewSessionMarkerService'; import { buildEffectiveReviewTeamManifest, buildReviewTeamPromptBlock, + loadDefaultReviewTeam, + loadReviewTeamProjectStrategyOverride, + loadReviewTeamRateLimitStatus, prepareDefaultReviewTeamForLaunch, + type ReviewTeamChangeStats, + type ReviewTeamRunManifest, } from '@/shared/services/reviewTeamService'; +import { + classifyReviewTargetFromFiles, + createUnknownReviewTargetClassification, + normalizeReviewPath, + type ReviewTargetClassification, +} from '@/shared/services/reviewTargetClassifier'; import { DEEP_REVIEW_COMMAND_RE } from '../utils/deepReviewConstants'; import { classifyLaunchError } from '../utils/deepReviewExperience'; @@ -24,6 +41,17 @@ interface LaunchDeepReviewSessionParams { displayMessage: string; childSessionName?: string; requestedFiles?: string[]; + runManifest?: ReviewTeamRunManifest; +} + +export interface DeepReviewLaunchPrompt { + prompt: string; + runManifest: ReviewTeamRunManifest; +} + +interface ResolvedDeepReviewTarget { + target: ReviewTargetClassification; + changeStats: ReviewTeamChangeStats; } type DeepReviewLaunchStep = @@ -220,24 +248,294 @@ function getDeepReviewCommandFocus(commandText: string): string { return commandText.trim().replace(/^\/DeepReview\b/, '').trim(); } -export async function buildDeepReviewPromptFromSessionFiles( +const 
EXPLICIT_REVIEW_FILE_EXTENSIONS = new Set([ + '.ts', + '.tsx', + '.js', + '.jsx', + '.rs', + '.json', + '.scss', + '.css', + '.md', + '.toml', + '.yaml', + '.yml', +]); + +function cleanPotentialFileToken(token: string): string { + return token + .trim() + .replace(/^[`"']+/, '') + .replace(/[`"',;:]+$/, ''); +} + +function getPathExtension(path: string): string { + const lastSlash = path.lastIndexOf('/'); + const lastDot = path.lastIndexOf('.'); + if (lastDot <= lastSlash) { + return ''; + } + return path.slice(lastDot); +} + +function looksLikeExplicitReviewPath(token: string): boolean { + const normalizedPath = normalizeReviewPath(token); + return ( + normalizedPath.includes('/') && + !normalizedPath.startsWith('-') && + EXPLICIT_REVIEW_FILE_EXTENSIONS.has(getPathExtension(normalizedPath)) + ); +} + +function extractExplicitReviewFilePaths(commandFocus: string): string[] { + const paths = commandFocus + .split(/\s+/) + .map(cleanPotentialFileToken) + .filter(Boolean) + .filter(looksLikeExplicitReviewPath); + + return Array.from(new Set(paths)); +} + +function parseSlashCommandGitTarget(commandFocus: string): GitChangedFilesParams | null { + const tokens = commandFocus + .split(/\s+/) + .map(cleanPotentialFileToken) + .filter(Boolean); + + const commitKeywordIndex = tokens.findIndex((token) => token.toLowerCase() === 'commit'); + const commitRef = commitKeywordIndex >= 0 ? 
tokens[commitKeywordIndex + 1] : undefined; + if (commitRef && !commitRef.startsWith('-')) { + return { + source: `${commitRef}^`, + target: commitRef, + }; + } + + const rangeToken = tokens.find((token) => { + if (token.startsWith('-') || !token.includes('..')) { + return false; + } + + const parts = token.split('..'); + return parts.length === 2 && Boolean(parts[0]) && Boolean(parts[1]); + }); + + if (!rangeToken) { + return null; + } + + const [source, target] = rangeToken.split('..'); + return { source, target }; +} + +function collectChangedFilePaths(changedFiles: GitChangedFile[]): string[] { + return Array.from( + new Set( + changedFiles + .flatMap((file) => [file.path, file.old_path]) + .filter((path): path is string => Boolean(path)), + ), + ); +} + +function collectWorkspaceDiffFilePaths(status: GitStatus): string[] { + return Array.from( + new Set([ + ...status.staged.map((file) => file.path), + ...status.unstaged.map((file) => file.path), + ...status.untracked, + ...status.conflicts, + ].filter(Boolean)), + ); +} + +function countReviewTargetFiles(target: ReviewTargetClassification): number { + return target.files.filter((file) => !file.excluded).length; +} + +function buildUnknownChangeStats(target: ReviewTargetClassification): ReviewTeamChangeStats { + return { + fileCount: countReviewTargetFiles(target), + lineCountSource: 'unknown', + }; +} + +function countChangedLinesFromUnifiedDiff(diff: string): number | undefined { + if (!diff.trim()) { + return undefined; + } + + let changedLines = 0; + for (const line of diff.split(/\r?\n/)) { + if ( + (line.startsWith('+') && !/^\+\+\+\s/.test(line)) || + (line.startsWith('-') && !/^---\s/.test(line)) + ) { + changedLines += 1; + } + } + + return changedLines; +} + +function buildDiffChangeStats( + target: ReviewTargetClassification, + totalLinesChanged: number | undefined, +): ReviewTeamChangeStats { + if (totalLinesChanged === undefined) { + return buildUnknownChangeStats(target); + } + + return { + 
fileCount: countReviewTargetFiles(target), + totalLinesChanged, + lineCountSource: 'diff_stat', + }; +} + +async function resolveGitDiffChangeStats( + workspacePath: string, + params: GitDiffParams, + target: ReviewTargetClassification, +): Promise { + try { + const diff = await gitAPI.getDiff(workspacePath, params); + return buildDiffChangeStats(target, countChangedLinesFromUnifiedDiff(diff)); + } catch (error) { + log.warn('Failed to resolve Git diff stats for Deep Review target', { + workspacePath, + params, + error, + }); + return buildUnknownChangeStats(target); + } +} + +async function resolveWorkspaceDiffChangeStats( + workspacePath: string, + target: ReviewTargetClassification, +): Promise { + return resolveGitDiffChangeStats(workspacePath, { source: 'HEAD' }, target); +} + +async function resolveSlashCommandReviewTarget( + commandFocus: string, + workspacePath?: string, +): Promise { + const explicitFilePaths = extractExplicitReviewFilePaths(commandFocus); + if (explicitFilePaths.length > 0) { + const target = classifyReviewTargetFromFiles( + explicitFilePaths, + 'slash_command_explicit_files', + ); + return { target, changeStats: buildUnknownChangeStats(target) }; + } + + const gitTarget = parseSlashCommandGitTarget(commandFocus); + if (gitTarget) { + if (!workspacePath) { + const target = createUnknownReviewTargetClassification('slash_command_git_ref'); + return { target, changeStats: buildUnknownChangeStats(target) }; + } + + try { + const changedFiles = await gitAPI.getChangedFiles(workspacePath, gitTarget); + const target = classifyReviewTargetFromFiles( + collectChangedFilePaths(changedFiles), + 'slash_command_git_ref', + ); + const changeStats = await resolveGitDiffChangeStats( + workspacePath, + gitTarget, + target, + ); + return { target, changeStats }; + } catch (error) { + log.warn('Failed to resolve Git target for Deep Review target', { + workspacePath, + gitTarget, + error, + }); + const target = 
createUnknownReviewTargetClassification('slash_command_git_ref'); + return { target, changeStats: buildUnknownChangeStats(target) }; + } + } + + if (!commandFocus && workspacePath) { + try { + const status = await gitAPI.getStatus(workspacePath); + const target = classifyReviewTargetFromFiles( + collectWorkspaceDiffFilePaths(status), + 'workspace_diff', + ); + const changeStats = await resolveWorkspaceDiffChangeStats( + workspacePath, + target, + ); + return { target, changeStats }; + } catch (error) { + log.warn('Failed to resolve workspace diff for Deep Review target', { + workspacePath, + error, + }); + } + } + + const target = createUnknownReviewTargetClassification( + commandFocus ? 'manual_prompt' : 'unknown', + ); + return { target, changeStats: buildUnknownChangeStats(target) }; +} + +async function buildReviewTeamManifestWithRuntimeSignals( + team: Parameters[0], + options: Parameters[1], +): Promise { + const manifestOptions = options ?? {}; + const [rateLimitStatus, strategyOverride] = await Promise.all([ + loadReviewTeamRateLimitStatus().catch((error) => { + log.warn('Failed to load Deep Review rate limit status', { error }); + return null; + }), + manifestOptions.workspacePath + ? loadReviewTeamProjectStrategyOverride(manifestOptions.workspacePath).catch((error) => { + log.warn('Failed to load Deep Review project strategy override', { error }); + return undefined; + }) + : Promise.resolve(undefined), + ]); + + return buildEffectiveReviewTeamManifest(team, { + ...manifestOptions, + ...(rateLimitStatus ? { rateLimitStatus } : {}), + ...(strategyOverride ? 
{ strategyOverride } : {}), + }); +} + +export async function buildDeepReviewLaunchFromSessionFiles( filePaths: string[], extraContext?: string, workspacePath?: string, -): Promise { +): Promise { + const target = classifyReviewTargetFromFiles(filePaths, 'session_files'); + const changeStats = buildUnknownChangeStats(target); const team = await prepareDefaultReviewTeamForLaunch(workspacePath, { reviewTargetFilePaths: filePaths, + target, }); - const manifest = buildEffectiveReviewTeamManifest(team, { + const manifest = await buildReviewTeamManifestWithRuntimeSignals(team, { workspacePath, - reviewTargetFilePaths: filePaths, + target, + changeStats, }); const fileList = formatFileList(filePaths); const contextBlock = extraContext?.trim() ? `User-provided focus:\n${extraContext.trim()}` : 'User-provided focus:\nNone.'; - return [ + const prompt = [ 'Run a deep code review using the parallel Code Review Team.', 'Review scope: ONLY inspect the following files modified in this session.', fileList, @@ -245,21 +543,54 @@ export async function buildDeepReviewPromptFromSessionFiles( buildReviewTeamPromptBlock(team, manifest), 'Keep the scope tight to the listed files unless a directly-related dependency must be read to confirm a finding.', ].join('\n\n'); + + return { prompt, runManifest: manifest }; } -export async function buildDeepReviewPromptFromSlashCommand( - commandText: string, +export async function buildDeepReviewPreviewFromSessionFiles( + filePaths: string[], + workspacePath?: string, +): Promise { + const team = await loadDefaultReviewTeam(workspacePath); + const target = classifyReviewTargetFromFiles(filePaths, 'session_files'); + const changeStats = buildUnknownChangeStats(target); + return buildReviewTeamManifestWithRuntimeSignals(team, { + workspacePath, + target, + changeStats, + }); +} + +export async function buildDeepReviewPromptFromSessionFiles( + filePaths: string[], + extraContext?: string, workspacePath?: string, ): Promise { + return (await 
buildDeepReviewLaunchFromSessionFiles( + filePaths, + extraContext, + workspacePath, + )).prompt; +} + +export async function buildDeepReviewLaunchFromSlashCommand( + commandText: string, + workspacePath?: string, +): Promise { const team = await prepareDefaultReviewTeamForLaunch(workspacePath); - const manifest = buildEffectiveReviewTeamManifest(team, { workspacePath }); const trimmed = commandText.trim(); const extraContext = getDeepReviewCommandFocus(trimmed); + const { target, changeStats } = await resolveSlashCommandReviewTarget(extraContext, workspacePath); + const manifest = await buildReviewTeamManifestWithRuntimeSignals(team, { + workspacePath, + target, + changeStats, + }); const contextBlock = extraContext ? `User-provided focus or target:\n${extraContext}` : 'User-provided focus or target:\nNone. If no explicit target is given, review the current workspace changes relative to HEAD.'; - return [ + const prompt = [ 'Run a deep code review using the parallel Code Review Team.', 'Interpret the user command below to determine the review target.', 'If the user mentions a commit, ref, branch, or explicit file set, review that target.', @@ -268,6 +599,30 @@ export async function buildDeepReviewPromptFromSlashCommand( contextBlock, buildReviewTeamPromptBlock(team, manifest), ].join('\n\n'); + + return { prompt, runManifest: manifest }; +} + +export async function buildDeepReviewPreviewFromSlashCommand( + commandText: string, + workspacePath?: string, +): Promise { + const team = await loadDefaultReviewTeam(workspacePath); + const trimmed = commandText.trim(); + const extraContext = getDeepReviewCommandFocus(trimmed); + const { target, changeStats } = await resolveSlashCommandReviewTarget(extraContext, workspacePath); + return buildReviewTeamManifestWithRuntimeSignals(team, { + workspacePath, + target, + changeStats, + }); +} + +export async function buildDeepReviewPromptFromSlashCommand( + commandText: string, + workspacePath?: string, +): Promise { + return 
(await buildDeepReviewLaunchFromSlashCommand(commandText, workspacePath)).prompt; } export async function launchDeepReviewSession({ @@ -277,6 +632,7 @@ export async function launchDeepReviewSession({ displayMessage, childSessionName = 'Deep review', requestedFiles = [], + runManifest, }: LaunchDeepReviewSessionParams): Promise<{ childSessionId: string }> { let childSessionId: string | null = null; let launchStep: DeepReviewLaunchStep = 'create_child_session'; @@ -293,6 +649,7 @@ export async function launchDeepReviewSession({ autoCompact: true, enableContextCompression: true, addMarker: false, + deepReviewRunManifest: runManifest, }); childSessionId = created.childSessionId; @@ -306,11 +663,26 @@ export async function launchDeepReviewSession({ launchStep = 'send_start_message'; const flowChatManager = FlowChatManager.getInstance(); - await flowChatManager.sendMessage( - prompt, - childSessionId, - displayMessage, - ); + if (runManifest) { + await flowChatManager.sendMessage( + prompt, + childSessionId, + displayMessage, + undefined, + undefined, + { + userMessageMetadata: { + deepReviewRunManifest: runManifest, + }, + }, + ); + } else { + await flowChatManager.sendMessage( + prompt, + childSessionId, + displayMessage, + ); + } insertReviewSessionSummaryMarker({ parentSessionId, diff --git a/src/web-ui/src/flow_chat/services/FlowChatManager.ts b/src/web-ui/src/flow_chat/services/FlowChatManager.ts index c90cd1c58..a7c050cb4 100644 --- a/src/web-ui/src/flow_chat/services/FlowChatManager.ts +++ b/src/web-ui/src/flow_chat/services/FlowChatManager.ts @@ -368,6 +368,7 @@ export class FlowChatManager { options?: { imageContexts?: import('@/infrastructure/api/service-api/ImageContextTypes').ImageContextData[]; imageDisplayData?: Array<{ id: string; name: string; dataUrl?: string; imagePath?: string; mimeType?: string }>; + userMessageMetadata?: Record; } ): Promise { const targetSessionId = sessionId || this.context.flowChatStore.getState().activeSessionId; diff --git 
a/src/web-ui/src/flow_chat/services/flow-chat-manager/EventHandlerModule.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/EventHandlerModule.ts index 61f7465e3..9bd7ad215 100644 --- a/src/web-ui/src/flow_chat/services/flow-chat-manager/EventHandlerModule.ts +++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/EventHandlerModule.ts @@ -21,6 +21,7 @@ import { notificationService } from '../../../shared/notification-system/service import type { NotificationAction } from '../../../shared/notification-system/types'; import { createLogger } from '@/shared/utils/logger'; import type { + DeepReviewQueueStateChangedEvent, ImageAnalysisEvent, SessionModelAutoMigratedEvent, } from '@/infrastructure/api/service-api/AgentAPI'; @@ -39,6 +40,8 @@ import { type AiErrorPresentation, type AiErrorDetail, } from '@/shared/ai-errors/aiErrorPresenter'; +import { useReviewActionBarStore } from '../../store/deepReviewActionBarStore'; +import { buildDeepReviewCapacityQueueStateFromEvent } from '../../utils/deepReviewQueueStateEvents'; const pendingImageAnalysisTurns = new Map(); // `restore_session` and assistant bootstrap can race on the same historical @@ -106,6 +109,34 @@ function logDroppedDataEvent( }); } +function handleDeepReviewQueueStateChanged(event: DeepReviewQueueStateChangedEvent): void { + const store = FlowChatStore.getInstance(); + const session = store.getState().sessions.get(event.sessionId); + const queueState = buildDeepReviewCapacityQueueStateFromEvent(event, session); + if (!queueState) { + return; + } + + const actionBar = useReviewActionBarStore.getState(); + if (actionBar.childSessionId === event.sessionId) { + actionBar.setCapacityQueueState(queueState); + if (actionBar.phase === 'idle') { + actionBar.updatePhase('review_waiting_capacity'); + } + return; + } + + if (queueState.status === 'running' || queueState.status === 'capacity_skipped') { + return; + } + + actionBar.showCapacityQueueBar({ + childSessionId: event.sessionId, + parentSessionId: 
session?.parentSessionId ?? null, + capacityQueueState: queueState, + }); +} + function attachSubagentSessionToParentTool( parentInfo: SubagentParentInfo, subagentSessionId: string, @@ -391,6 +422,9 @@ export async function initializeEventListeners( onToolEvent: (event) => { handleToolEvent(context, event, onTodoWriteResult); }, + onDeepReviewQueueStateChanged: (event) => { + handleDeepReviewQueueStateChanged(event); + }, onModelRoundStarted: (event) => { handleModelRoundStart(context, event); }, diff --git a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts index 34cdb3260..7f08be09e 100644 --- a/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts +++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/MessageModule.ts @@ -122,6 +122,7 @@ export async function sendMessage( * Callers should not set this directly. */ bypassPendingQueue?: boolean; + userMessageMetadata?: Record; } ): Promise { const session = context.flowChatStore.getState().sessions.get(sessionId); @@ -235,6 +236,7 @@ export async function sendMessage( timestamp: Date.now(), hasImages, images: options?.imageDisplayData, + metadata: options?.userMessageMetadata, }, modelRounds: [], // Images are attached for multimodal primary models or reduced to text placeholders for text-only models. 
@@ -304,6 +306,8 @@ export async function sendMessage( originalUserInput: displayMessage || message, turnId: dialogTurnId, workspacePath, + imageContexts: options?.imageContexts, + userMessageMetadata: options?.userMessageMetadata, remoteConnectionId: updatedSession.remoteConnectionId, remoteSshHost: updatedSession.remoteSshHost, }); @@ -317,6 +321,7 @@ export async function sendMessage( agentType: currentAgentType, workspacePath, imageContexts: options?.imageContexts, + userMessageMetadata: options?.userMessageMetadata, }); } catch (error: any) { if (error?.message?.includes('Session does not exist') || error?.message?.includes('Not found')) { diff --git a/src/web-ui/src/flow_chat/services/flow-chat-manager/TextChunkModule.test.ts b/src/web-ui/src/flow_chat/services/flow-chat-manager/TextChunkModule.test.ts index 86dff03ad..4881667a5 100644 --- a/src/web-ui/src/flow_chat/services/flow-chat-manager/TextChunkModule.test.ts +++ b/src/web-ui/src/flow_chat/services/flow-chat-manager/TextChunkModule.test.ts @@ -54,6 +54,8 @@ function makeContext(session: Session): any { turnSaveInFlight: new Map(), turnSavePending: new Set(), runtimeStatusTimers: new Map(), + userCancelledSessionIds: new Set(), + currentWorkspacePath: null, }; } diff --git a/src/web-ui/src/flow_chat/store/FlowChatStore.ts b/src/web-ui/src/flow_chat/store/FlowChatStore.ts index 5af10c993..b271080be 100644 --- a/src/web-ui/src/flow_chat/store/FlowChatStore.ts +++ b/src/web-ui/src/flow_chat/store/FlowChatStore.ts @@ -286,6 +286,7 @@ export class FlowChatStore { sessionKind?: SessionKind; btwOrigin?: Session['btwOrigin']; isTransient?: boolean; + deepReviewRunManifest?: Session['deepReviewRunManifest']; }, remoteConnectionId?: string, remoteSshHost?: string @@ -324,6 +325,7 @@ export class FlowChatStore { sessionKind: relationship.sessionKind, btwThreads: [], btwOrigin: relationship.btwOrigin, + deepReviewRunManifest: meta?.deepReviewRunManifest, isTransient: meta?.isTransient ?? 
false, }; @@ -1683,6 +1685,7 @@ export class FlowChatStore { btwOrigin: relationship.btwOrigin, hasUnreadCompletion: metadata.unreadCompletion, needsUserAttention: metadata.needsUserAttention, + deepReviewRunManifest: metadata.deepReviewRunManifest, isTransient: false, }; diff --git a/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts b/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts index f52e9e29b..f6884ddb0 100644 --- a/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts +++ b/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts @@ -188,6 +188,97 @@ describe('deepReviewActionBarStore', () => { }); }); + describe('capacity queue controls', () => { + it('can bind a visible queue state before the review report is available', () => { + bar().showCapacityQueueBar({ + childSessionId: 'child-1', + parentSessionId: 'parent-1', + capacityQueueState: { + status: 'queued_for_capacity', + queuedReviewerCount: 2, + activeReviewerCount: 1, + }, + }); + + expect(bar().childSessionId).toBe('child-1'); + expect(bar().reviewMode).toBe('deep'); + expect(bar().phase).toBe('review_waiting_capacity'); + expect(bar().reviewData).toBeNull(); + expect(bar().capacityQueueState?.queuedReviewerCount).toBe(2); + }); + + it('pauses and resumes capacity queue state without clearing completed remediation', () => { + bar().showActionBar({ + childSessionId: 'child-1', + parentSessionId: 'parent-1', + reviewData: { + summary: { recommended_action: 'request_changes' }, + remediation_plan: ['Fix issue 1', 'Fix issue 2'], + }, + completedRemediationIds: new Set(['remediation-0']), + }); + + const queueActions = bar() as unknown as { + setCapacityQueueState: (state: { + status: string; + queuedReviewerCount: number; + optionalReviewerCount: number; + }) => void; + pauseCapacityQueue: () => void; + continueCapacityQueue: () => void; + }; + + queueActions.setCapacityQueueState({ + status: 'queued_for_capacity', + queuedReviewerCount: 2, + 
optionalReviewerCount: 1, + }); + queueActions.pauseCapacityQueue(); + + expect((bar() as unknown as { capacityQueueState: { status: string } }).capacityQueueState.status).toBe('paused_by_user'); + expect(bar().completedRemediationIds.has('remediation-0')).toBe(true); + + queueActions.continueCapacityQueue(); + + expect((bar() as unknown as { capacityQueueState: { status: string } }).capacityQueueState.status).toBe('queued_for_capacity'); + expect(bar().completedRemediationIds.has('remediation-0')).toBe(true); + }); + + it('can skip optional queued reviewers without cancelling required queued work', () => { + bar().showActionBar({ + childSessionId: 'child-1', + parentSessionId: 'parent-1', + reviewData: { + summary: { recommended_action: 'request_changes' }, + remediation_plan: ['Fix issue 1'], + }, + }); + + const queueActions = bar() as unknown as { + setCapacityQueueState: (state: { + status: string; + queuedReviewerCount: number; + optionalReviewerCount: number; + }) => void; + skipOptionalQueuedReviewers: () => void; + }; + + queueActions.setCapacityQueueState({ + status: 'queued_for_capacity', + queuedReviewerCount: 3, + optionalReviewerCount: 2, + }); + queueActions.skipOptionalQueuedReviewers(); + + const state = (bar() as unknown as { + capacityQueueState: { status: string; queuedReviewerCount: number; optionalReviewerCount: number }; + }).capacityQueueState; + expect(state.status).toBe('queued_for_capacity'); + expect(state.queuedReviewerCount).toBe(1); + expect(state.optionalReviewerCount).toBe(0); + }); + }); + describe('toggleRemediation with completed items', () => { it('does not allow toggling completed items', () => { bar().showActionBar({ diff --git a/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts b/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts index be6a2d855..cdf984a29 100644 --- a/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts +++ b/src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts @@ -28,6 +28,7 @@ 
export type ReviewActionPhase = | 'fix_failed' | 'fix_timeout' | 'fix_interrupted' + | 'review_waiting_capacity' | 'review_interrupted' | 'resume_blocked' | 'resume_running' @@ -36,6 +37,34 @@ export type ReviewActionPhase = export type DeepReviewActionPhase = ReviewActionPhase; +export type DeepReviewCapacityQueueStatus = + | 'queued_for_capacity' + | 'paused_by_user' + | 'running' + | 'capacity_skipped'; + +export type DeepReviewCapacityQueueAction = + | 'pause' + | 'continue' + | 'cancel' + | 'skip_optional'; + +export interface DeepReviewCapacityQueueState { + toolId?: string; + subagentType?: string; + dialogTurnId?: string; + status: DeepReviewCapacityQueueStatus; + queuedReviewerCount: number; + activeReviewerCount?: number; + effectiveParallelInstances?: number; + optionalReviewerCount?: number; + queueElapsedMs?: number; + runElapsedMs?: number; + maxQueueWaitSeconds?: number; + sessionConcurrencyHigh?: boolean; + controlMode?: 'local' | 'session_stop_only' | 'backend'; +} + export interface ReviewActionBarState { /** Which child session this bar belongs to */ childSessionId: string | null; @@ -73,6 +102,10 @@ export interface ReviewActionBarState { remainingFixIds: string[]; /** User's option choice for needs_decision items: map of item id -> option index */ decisionSelections: Record; + /** Visible Deep Review capacity queue state. Automatic queue execution is not enabled here. 
*/ + capacityQueueState: DeepReviewCapacityQueueState | null; + /** Last local queue-control action selected by the user */ + lastCapacityQueueAction: DeepReviewCapacityQueueAction | null; // ---- actions ---- showActionBar: (params: { @@ -89,6 +122,11 @@ export interface ReviewActionBarState { interruption: DeepReviewInterruption; phase?: Extract; }) => void; + showCapacityQueueBar: (params: { + childSessionId: string; + parentSessionId: string | null; + capacityQueueState: DeepReviewCapacityQueueState; + }) => void; updatePhase: (phase: ReviewActionPhase, errorMessage?: string | null) => void; toggleRemediation: (id: string) => void; toggleAllRemediation: () => void; @@ -100,6 +138,11 @@ export interface ReviewActionBarState { minimize: () => void; restore: () => void; skipRemainingFixes: () => void; + setCapacityQueueState: (state: DeepReviewCapacityQueueState | null) => void; + pauseCapacityQueue: () => void; + continueCapacityQueue: () => void; + cancelQueuedReviewers: () => void; + skipOptionalQueuedReviewers: () => void; setDecisionSelection: (itemId: string, optionIndex: number) => void; reset: () => void; } @@ -125,6 +168,8 @@ const initialState = { fixingRemediationIds: new Set(), remainingFixIds: [] as string[], decisionSelections: {} as Record, + capacityQueueState: null as DeepReviewCapacityQueueState | null, + lastCapacityQueueAction: null as DeepReviewCapacityQueueAction | null, }; export const useReviewActionBarStore = create((set, get) => ({ @@ -164,6 +209,8 @@ export const useReviewActionBarStore = create((set, get) = fixingRemediationIds: new Set(), remainingFixIds: [], decisionSelections: {}, + capacityQueueState: null, + lastCapacityQueueAction: null, }); }, @@ -187,6 +234,35 @@ export const useReviewActionBarStore = create((set, get) = fixingRemediationIds: new Set(), remainingFixIds: [], decisionSelections: {}, + capacityQueueState: null, + lastCapacityQueueAction: null, + }); + }, + + showCapacityQueueBar: ({ childSessionId, parentSessionId, 
capacityQueueState }) => { + set({ + childSessionId, + parentSessionId, + reviewMode: 'deep', + reviewData: null, + remediationItems: [], + selectedRemediationIds: new Set(), + phase: 'review_waiting_capacity', + dismissed: false, + minimized: false, + activeAction: null, + lastSubmittedAction: null, + customInstructions: '', + errorMessage: null, + interruption: null, + completedRemediationIds: get().childSessionId === childSessionId + ? get().completedRemediationIds + : new Set(), + fixingRemediationIds: new Set(), + remainingFixIds: [], + decisionSelections: {}, + capacityQueueState, + lastCapacityQueueAction: null, }); }, @@ -283,6 +359,57 @@ export const useReviewActionBarStore = create((set, get) = activeAction: null, lastSubmittedAction: null, }), + setCapacityQueueState: (capacityQueueState) => set({ + capacityQueueState, + lastCapacityQueueAction: null, + }), + pauseCapacityQueue: () => { + const current = get().capacityQueueState; + if (!current || current.status === 'capacity_skipped') return; + set({ + capacityQueueState: { ...current, status: 'paused_by_user' }, + lastCapacityQueueAction: 'pause', + }); + }, + continueCapacityQueue: () => { + const current = get().capacityQueueState; + if (!current || current.status !== 'paused_by_user') return; + set({ + capacityQueueState: { ...current, status: 'queued_for_capacity' }, + lastCapacityQueueAction: 'continue', + }); + }, + cancelQueuedReviewers: () => { + const current = get().capacityQueueState; + if (!current) return; + set({ + capacityQueueState: { + ...current, + status: 'capacity_skipped', + queuedReviewerCount: 0, + optionalReviewerCount: 0, + }, + lastCapacityQueueAction: 'cancel', + }); + }, + skipOptionalQueuedReviewers: () => { + const current = get().capacityQueueState; + if (!current) return; + const optionalCount = current.optionalReviewerCount ?? 
0; + if (optionalCount <= 0) return; + + const skippedCount = Math.min(optionalCount, current.queuedReviewerCount); + const queuedReviewerCount = Math.max(0, current.queuedReviewerCount - skippedCount); + set({ + capacityQueueState: { + ...current, + status: queuedReviewerCount > 0 ? current.status : 'capacity_skipped', + queuedReviewerCount, + optionalReviewerCount: 0, + }, + lastCapacityQueueAction: 'skip_optional', + }); + }, reset: () => set({ ...initialState, selectedRemediationIds: new Set() }), })); diff --git a/src/web-ui/src/flow_chat/tool-cards/CodeReviewReportExportActions.tsx b/src/web-ui/src/flow_chat/tool-cards/CodeReviewReportExportActions.tsx index b8efcd7f6..0e0173c76 100644 --- a/src/web-ui/src/flow_chat/tool-cards/CodeReviewReportExportActions.tsx +++ b/src/web-ui/src/flow_chat/tool-cards/CodeReviewReportExportActions.tsx @@ -9,9 +9,11 @@ import { type CodeReviewReportData, type CodeReviewReportMarkdownLabels, } from '../utils/codeReviewReport'; +import type { ReviewTeamRunManifest } from '@/shared/services/reviewTeamService'; interface CodeReviewReportExportActionsProps { reviewData: CodeReviewReportData; + runManifest?: ReviewTeamRunManifest; } function timestampForFileName(): string { @@ -38,6 +40,7 @@ function downloadMarkdownInBrowser(fileName: string, markdown: string): void { export const CodeReviewReportExportActions: React.FC = ({ reviewData, + runManifest, }) => { const { t } = useTranslation('flow-chat'); const [copied, setCopied] = useState(false); @@ -48,16 +51,25 @@ export const CodeReviewReportExportActions: React.FC formatCodeReviewReportMarkdown(reviewData, markdownLabels), - [markdownLabels, reviewData], + () => formatCodeReviewReportMarkdown( + reviewData, + markdownLabels, + { runManifest }, + ), + [markdownLabels, reviewData, runManifest], ); const fileName = useMemo(() => { diff --git a/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.scss b/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.scss index 
87eb6f1e6..db4e83f72 100644 --- a/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.scss +++ b/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.scss @@ -79,6 +79,72 @@ flex-direction: column; gap: 10px; + .review-reliability-status { + display: grid; + gap: 7px; + padding: 2px 0 8px; + border-bottom: 1px dashed var(--border-base); + } + + .review-reliability-status__title { + color: var(--color-text-muted); + font-size: 11px; + font-weight: 650; + } + + .review-reliability-status__items { + display: flex; + flex-wrap: wrap; + gap: 6px; + } + + .review-reliability-status__item { + display: grid; + grid-template-columns: auto minmax(0, 1fr); + align-items: start; + gap: 6px; + max-width: 100%; + padding: 6px 8px; + border-left: 2px solid var(--color-accent-500, #60a5fa); + border-radius: 4px; + background: color-mix(in srgb, var(--color-bg-elevated) 70%, transparent); + color: var(--color-text-secondary); + font-size: 11px; + line-height: 1.4; + + &--warning { + border-left-color: var(--color-warning, #f59e0b); + } + + &--action { + border-left-color: var(--color-danger, #ef4444); + } + } + + .review-reliability-status__icon { + display: inline-flex; + align-items: center; + justify-content: center; + margin-top: 1px; + color: var(--color-text-muted); + } + + .review-reliability-status__text { + display: grid; + gap: 1px; + min-width: 0; + } + + .review-reliability-status__label { + color: var(--color-text-primary); + font-weight: 650; + } + + .review-reliability-status__detail { + color: var(--color-text-muted); + overflow-wrap: anywhere; + } + /* ---------- Summary section — vertical layout ---------- */ .review-summary { padding-bottom: 10px; @@ -278,6 +344,140 @@ font-style: italic; } + .run-manifest { + display: grid; + gap: 10px; + } + + .run-manifest__facts { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 6px; + } + + .run-manifest__fact { + display: grid; + gap: 3px; + min-width: 0; + padding: 8px; + border: 1px solid 
var(--border-base); + border-radius: 6px; + background: var(--color-bg-elevated, rgba(255, 255, 255, 0.03)); + + span { + color: var(--color-text-muted); + font-size: 11px; + } + + strong { + overflow: hidden; + color: var(--color-text-primary); + font-size: 12px; + font-weight: 650; + text-overflow: ellipsis; + white-space: nowrap; + } + } + + .run-manifest__group { + display: grid; + gap: 6px; + } + + .run-manifest__group-title { + color: var(--color-text-muted); + font-size: 11px; + font-weight: 650; + + &--warning { + color: var(--color-warning, #f59e0b); + } + } + + .run-manifest__chips { + display: flex; + flex-wrap: wrap; + gap: 6px; + } + + .run-manifest__chip { + display: inline-grid; + max-width: 100%; + gap: 2px; + padding: 6px 8px; + border: 1px solid var(--border-base); + border-radius: 6px; + background: var(--color-bg-elevated, rgba(255, 255, 255, 0.03)); + } + + .run-manifest__chip-name { + overflow: hidden; + color: var(--color-text-primary); + font-size: 12px; + font-weight: 600; + text-overflow: ellipsis; + white-space: nowrap; + } + + .run-manifest__chip-meta { + overflow: hidden; + color: var(--color-text-muted); + font-size: 11px; + text-overflow: ellipsis; + white-space: nowrap; + } + + .run-manifest__skipped-list { + display: grid; + gap: 5px; + margin: 0; + padding: 0; + list-style: none; + + li { + display: grid; + grid-template-columns: minmax(0, 1fr) auto; + align-items: center; + gap: 8px; + min-width: 0; + padding: 6px 8px; + border: 1px solid var(--border-base); + border-radius: 6px; + background: var(--color-bg-elevated, rgba(255, 255, 255, 0.03)); + color: var(--color-text-secondary); + font-size: 12px; + } + + span { + overflow: hidden; + min-width: 0; + text-overflow: ellipsis; + white-space: nowrap; + } + + strong { + color: var(--color-text-muted); + font-size: 11px; + font-weight: 650; + white-space: nowrap; + } + } + + @media (max-width: 560px) { + .run-manifest__facts { + grid-template-columns: 1fr; + } + + 
.run-manifest__skipped-list li { + grid-template-columns: 1fr; + align-items: start; + } + + .run-manifest__skipped-list strong { + white-space: normal; + } + } + .team-list { display: flex; flex-direction: column; diff --git a/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.test.tsx b/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.test.tsx new file mode 100644 index 000000000..d66a5eb8b --- /dev/null +++ b/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.test.tsx @@ -0,0 +1,381 @@ +import React, { act } from 'react'; +import { createRoot, type Root } from 'react-dom/client'; +import { JSDOM } from 'jsdom'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; + +import { CodeReviewToolCard } from './CodeReviewToolCard'; +import type { FlowToolItem, ToolCardConfig } from '../types/flow-chat'; +import type { ReviewTeamManifestMember, ReviewTeamRunManifest } from '@/shared/services/reviewTeamService'; + +globalThis.IS_REACT_ACT_ENVIRONMENT = true; + +const flowState = vi.hoisted(() => ({ + current: { + sessions: new Map(), + activeSessionId: null, + }, + listeners: new Set<(state: { sessions: Map; activeSessionId: string | null }) => void>(), +})); + +vi.mock('react-i18next', () => ({ + initReactI18next: { + type: '3rdParty', + init: vi.fn(), + }, + useTranslation: () => ({ + t: (key: string, options?: Record) => { + const value = typeof options?.defaultValue === 'string' ? options.defaultValue : key; + return value.replace(/\{\{(\w+)\}\}/g, (_match, name) => String(options?.[name] ?? 
'')); + }, + }), +})); + +vi.mock('@/component-library', () => ({ + Tooltip: ({ children }: { children: React.ReactNode }) => <>{children}, +})); + +vi.mock('./CodeReviewReportExportActions', () => ({ + CodeReviewReportExportActions: () => null, +})); + +vi.mock('../store/FlowChatStore', () => ({ + flowChatStore: { + getState: () => flowState.current, + subscribe: (listener: (state: typeof flowState.current) => void) => { + flowState.listeners.add(listener); + return () => flowState.listeners.delete(listener); + }, + }, + FlowChatStore: { + getInstance: () => ({ + getState: () => flowState.current, + subscribe: (listener: (state: typeof flowState.current) => void) => { + flowState.listeners.add(listener); + return () => flowState.listeners.delete(listener); + }, + }), + }, +})); + +function buildManifestMember( + subagentId: string, + displayName: string, + source: ReviewTeamManifestMember['source'], + reason?: ReviewTeamManifestMember['reason'], +): ReviewTeamManifestMember { + return { + subagentId, + displayName, + roleName: displayName, + model: 'fast-model', + configuredModel: 'fast-model', + defaultModelSlot: 'fast', + strategyLevel: 'normal', + strategySource: 'team', + strategyDirective: 'Review the target.', + locked: source === 'core', + source, + subagentSource: source === 'extra' ? 'user' : 'builtin', + ...(reason ? { reason } : {}), + }; +} + +function buildManifest(): ReviewTeamRunManifest { + return { + reviewMode: 'deep', + workspacePath: 'C:/repo/project', + policySource: 'default-review-team-config', + target: { + source: 'session_files', + resolution: 'resolved', + tags: ['frontend'], + files: ['src/App.tsx'], + warnings: [], + }, + strategyLevel: 'normal', + strategyRecommendation: { + strategyLevel: 'deep', + score: 24, + rationale: 'Large/high-risk change (8 files, 900 lines; 2 security-sensitive files, 3 workspace areas). 
Deep review recommended.', + factors: { + fileCount: 8, + totalLinesChanged: 900, + lineCountSource: 'diff_stat', + securityFileCount: 2, + workspaceAreaCount: 3, + contractSurfaceChanged: true, + }, + }, + executionPolicy: { + reviewerTimeoutSeconds: 300, + judgeTimeoutSeconds: 240, + reviewerFileSplitThreshold: 20, + maxSameRoleInstances: 3, + }, + tokenBudget: { + mode: 'balanced', + estimatedReviewerCalls: 3, + maxReviewerCalls: 4, + maxExtraReviewers: 1, + largeDiffSummaryFirst: false, + skippedReviewerIds: ['CustomInvalid'], + warnings: [], + }, + coreReviewers: [ + buildManifestMember('ReviewBusinessLogic', 'Logic reviewer', 'core'), + ], + qualityGateReviewer: buildManifestMember('ReviewJudge', 'Quality inspector', 'core'), + enabledExtraReviewers: [ + buildManifestMember('CustomSecurity', 'Custom security reviewer', 'extra'), + ], + skippedReviewers: [ + buildManifestMember('ReviewFrontend', 'Frontend reviewer', 'core', 'not_applicable'), + buildManifestMember('CustomInvalid', 'Custom invalid reviewer', 'extra', 'invalid_tooling'), + ], + }; +} + +function notifyFlowState(): void { + for (const listener of flowState.listeners) { + listener(flowState.current); + } +} + +describe('CodeReviewToolCard', () => { + let dom: JSDOM; + let container: HTMLDivElement; + let root: Root; + + beforeEach(() => { + dom = new JSDOM('
', { + pretendToBeVisual: true, + url: 'http://localhost', + }); + vi.stubGlobal('window', dom.window); + vi.stubGlobal('document', dom.window.document); + vi.stubGlobal('navigator', dom.window.navigator); + vi.stubGlobal('HTMLElement', dom.window.HTMLElement); + vi.stubGlobal('CustomEvent', dom.window.CustomEvent); + + flowState.current = { + sessions: new Map([ + ['review-session', { id: 'review-session', deepReviewRunManifest: buildManifest() }], + ]), + activeSessionId: 'review-session', + }; + flowState.listeners.clear(); + container = dom.window.document.getElementById('root') as HTMLDivElement; + root = createRoot(container); + }); + + afterEach(() => { + act(() => { + root.unmount(); + }); + vi.unstubAllGlobals(); + dom.window.close(); + }); + + it('echoes the deep review run manifest from the review session', () => { + const toolItem: FlowToolItem = { + id: 'tool-1', + type: 'tool', + timestamp: Date.now(), + toolName: 'submit_code_review', + status: 'completed', + toolCall: { + id: 'call-1', + input: {}, + }, + toolResult: { + success: true, + result: { + review_mode: 'deep', + summary: { + overall_assessment: 'No validated issues.', + risk_level: 'low', + recommended_action: 'approve', + }, + issues: [], + reviewers: [], + }, + }, + }; + const config: ToolCardConfig = { + toolName: 'submit_code_review', + displayName: 'Code Review', + icon: 'REVIEW', + requiresConfirmation: false, + resultDisplayType: 'detailed', + }; + + act(() => { + root.render( + , + ); + }); + act(() => { + container.querySelector('.preview-toggle-btn')?.dispatchEvent( + new window.Event('click', { bubbles: true }), + ); + }); + + expect(container.textContent).toContain('Run manifest'); + expect(container.textContent).toContain('3 active'); + expect(container.textContent).toContain('2 skipped'); + + const manifestSectionButton = Array.from(container.querySelectorAll('button')) + .find((button) => button.textContent?.includes('Run manifest')); + + act(() => { + 
manifestSectionButton?.dispatchEvent(new window.Event('click', { bubbles: true })); + }); + + expect(container.textContent).toContain('Logic reviewer'); + expect(container.textContent).toContain('Quality inspector'); + expect(container.textContent).toContain('Custom security reviewer'); + expect(container.textContent).toContain('Frontend reviewer'); + expect(container.textContent).toContain('Not applicable to this target'); + expect(container.textContent).toContain('Custom invalid reviewer'); + expect(container.textContent).toContain('Configuration issue'); + expect(container.textContent).toContain('Recommended strategy'); + expect(container.textContent).toContain('deep'); + expect(container.textContent).toContain('Large/high-risk change'); + }); + + it('updates the manifest echo when session metadata arrives after render', () => { + flowState.current = { + sessions: new Map([ + ['review-session', { id: 'review-session' }], + ]), + activeSessionId: 'review-session', + }; + + const toolItem: FlowToolItem = { + id: 'tool-1', + type: 'tool', + timestamp: Date.now(), + toolName: 'submit_code_review', + status: 'completed', + toolCall: { + id: 'call-1', + input: {}, + }, + toolResult: { + success: true, + result: { + review_mode: 'deep', + summary: { + overall_assessment: 'No validated issues.', + risk_level: 'low', + recommended_action: 'approve', + }, + issues: [], + reviewers: [], + }, + }, + }; + const config: ToolCardConfig = { + toolName: 'submit_code_review', + displayName: 'Code Review', + icon: 'REVIEW', + requiresConfirmation: false, + resultDisplayType: 'detailed', + }; + + act(() => { + root.render( + , + ); + }); + act(() => { + container.querySelector('.preview-toggle-btn')?.dispatchEvent( + new window.Event('click', { bubbles: true }), + ); + }); + + expect(container.textContent).not.toContain('Run manifest'); + + act(() => { + flowState.current = { + sessions: new Map([ + ['review-session', { id: 'review-session', deepReviewRunManifest: buildManifest() 
}], + ]), + activeSessionId: 'review-session', + }; + notifyFlowState(); + }); + + expect(container.textContent).toContain('Run manifest'); + expect(container.textContent).toContain('3 active'); + }); + + it('renders compact reliability status when a reviewer timed out with partial evidence', () => { + const toolItem: FlowToolItem = { + id: 'tool-1', + type: 'tool', + timestamp: Date.now(), + toolName: 'submit_code_review', + status: 'completed', + toolCall: { + id: 'call-1', + input: {}, + }, + toolResult: { + success: true, + result: { + review_mode: 'deep', + summary: { + overall_assessment: 'Review completed with reduced confidence.', + risk_level: 'medium', + recommended_action: 'request_changes', + }, + issues: [], + reviewers: [ + { + name: 'Security Reviewer', + specialty: 'security', + status: 'partial_timeout', + summary: 'Timed out after producing partial evidence.', + partial_output: 'Found likely token logging in src/auth.ts before timeout.', + }, + ], + }, + }, + }; + const config: ToolCardConfig = { + toolName: 'submit_code_review', + displayName: 'Code Review', + icon: 'REVIEW', + requiresConfirmation: false, + resultDisplayType: 'detailed', + }; + + act(() => { + root.render( + , + ); + }); + act(() => { + container.querySelector('.preview-toggle-btn')?.dispatchEvent( + new window.Event('click', { bubbles: true }), + ); + }); + + expect(container.textContent).toContain('Review status'); + expect(container.textContent).toContain('Reviewer timed out with partial result'); + expect(container.textContent).toContain('1 reviewer result is partial; confidence is reduced.'); + }); +}); diff --git a/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.tsx b/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.tsx index 2cb9cb03a..fc8092541 100644 --- a/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.tsx +++ b/src/web-ui/src/flow_chat/tool-cards/CodeReviewToolCard.tsx @@ -17,6 +17,7 @@ import { import { useTranslation } from 'react-i18next'; 
import { Tooltip } from '@/component-library'; import type { ToolCardProps } from '../types/flow-chat'; +import { flowChatStore } from '../store/FlowChatStore'; import { BaseToolCard, ToolCardHeader } from './BaseToolCard'; import { createLogger } from '@/shared/utils/logger'; import { useToolCardHeightContract } from './useToolCardHeightContract'; @@ -24,10 +25,12 @@ import { buildReviewRemediationItems, } from '../utils/codeReviewRemediation'; import { + buildCodeReviewReliabilityNotices, buildCodeReviewReportSections, getDefaultExpandedCodeReviewSectionIds, type CodeReviewReportData, type CodeReviewReviewer, + type ReviewReliabilityNotice, type RemediationGroupId, type ReviewReportGroup, type ReviewSectionId, @@ -35,8 +38,14 @@ import { } from '../utils/codeReviewReport'; import { CodeReviewReportExportActions } from './CodeReviewReportExportActions'; import { DEEP_REVIEW_SCROLL_TO_EVENT, type DeepReviewScrollToRequest } from '../events/flowchatNavigation'; -import { globalEventBus } from '@/infrastructure'; +import { globalEventBus } from '@/infrastructure/event-bus'; import { normalizeDecisionEntry, type DecisionContext } from '../utils/codeReviewReport'; +import { + getActiveReviewTeamManifestMembers, + type ReviewTeamManifestMember, + type ReviewTeamManifestMemberReason, + type ReviewTeamRunManifest, +} from '@/shared/services/reviewTeamService'; import './CodeReviewToolCard.scss'; const log = createLogger('CodeReviewToolCard'); @@ -127,6 +136,122 @@ function formatReviewerStatus(status: string, t: Translate): string { }); } +function getReliabilityNoticeLabel(notice: ReviewReliabilityNotice, t: Translate): string { + return t(`toolCards.codeReview.reliabilityStatus.${notice.kind}.label`, { + defaultValue: { + context_pressure: 'Context pressure rising', + compression_preserved: 'Compression preserved key facts', + cache_hit: 'Incremental cache reused reviewer output', + cache_miss: 'Incremental cache missed or refreshed', + concurrency_limited: 'Reviewer 
launch was concurrency-limited', + partial_reviewer: 'Reviewer timed out with partial result', + retry_guidance: 'Retry guidance emitted', + skipped_reviewers: 'Skipped reviewers', + token_budget_limited: 'Token budget limited reviewer coverage', + user_decision: 'User decision needed', + }[notice.kind], + }); +} + +function getReliabilityNoticeDetail(notice: ReviewReliabilityNotice, t: Translate): string { + if (notice.detail?.trim()) { + return notice.detail.trim(); + } + + return t(`toolCards.codeReview.reliabilityStatus.${notice.kind}.detail`, { + count: notice.count ?? 0, + defaultValue: { + context_pressure: '{{count}} reviewer calls planned for a large or constrained target.', + compression_preserved: 'Coverage notes include preserved context from compression.', + cache_hit: '{{count}} reviewer packet reused matching cached output.', + cache_miss: '{{count}} reviewer packet ran fresh or refreshed stale cache.', + concurrency_limited: '{{count}} reviewer launch hit a concurrency cap.', + partial_reviewer: '{{count}} reviewer result is partial; confidence is reduced.', + retry_guidance: '{{count}} retry guidance item was emitted for partial review coverage.', + skipped_reviewers: '{{count}} reviewer was skipped by applicability, configuration, or budget.', + token_budget_limited: '{{count}} reviewer was skipped by token budget mode.', + user_decision: '{{count}} review item needs your decision before fixing.', + }[notice.kind], + }); +} + +function getReliabilityNoticeIcon(notice: ReviewReliabilityNotice): React.ReactNode { + if (notice.kind === 'partial_reviewer' || notice.kind === 'retry_guidance') { + return ; + } + if ( + notice.kind === 'user_decision' || + notice.kind === 'concurrency_limited' || + notice.kind === 'token_budget_limited' + ) { + return ; + } + return ; +} + +function getDeepReviewRunManifestForSession(sessionId?: string): ReviewTeamRunManifest | undefined { + if (!sessionId) { + return undefined; + } + + return 
flowChatStore.getState().sessions.get(sessionId)?.deepReviewRunManifest; +} + +function getReviewerLabel(member: ReviewTeamManifestMember): string { + return member.displayName || member.subagentId; +} + +function getSkippedReasonLabel( + reason: ReviewTeamManifestMemberReason | undefined, + t: Translate, +): string { + switch (reason) { + case 'not_applicable': + return t('toolCards.codeReview.runManifest.skippedReasons.notApplicable', { + defaultValue: 'Not applicable to this target', + }); + case 'budget_limited': + return t('toolCards.codeReview.runManifest.skippedReasons.budgetLimited', { + defaultValue: 'Limited by token budget', + }); + case 'invalid_tooling': + return t('toolCards.codeReview.runManifest.skippedReasons.invalidTooling', { + defaultValue: 'Configuration issue', + }); + case 'disabled': + return t('toolCards.codeReview.runManifest.skippedReasons.disabled', { + defaultValue: 'Disabled', + }); + case 'unavailable': + return t('toolCards.codeReview.runManifest.skippedReasons.unavailable', { + defaultValue: 'Unavailable', + }); + default: + return t('toolCards.codeReview.runManifest.skippedReasons.skipped', { + defaultValue: 'Skipped', + }); + } +} + +function formatRunManifestSummary( + manifest: ReviewTeamRunManifest, + activeReviewers: ReviewTeamManifestMember[], + t: Translate, +): string { + return t('toolCards.codeReview.runManifest.summary', { + active: activeReviewers.length, + skipped: manifest.skippedReviewers.length, + calls: manifest.tokenBudget.estimatedReviewerCalls, + defaultValue: '{{active}} active / {{skipped}} skipped / {{calls}} calls', + }); +} + +function formatRunManifestTarget(manifest: ReviewTeamRunManifest): string { + return manifest.target.tags.length > 0 + ? 
manifest.target.tags.join(', ') + : manifest.target.source; +} + function renderReportGroupList( groups: Array>, titleForGroup: (id: TId) => string, @@ -145,7 +270,7 @@ function renderReportGroupList export const CodeReviewToolCard: React.FC = React.memo(({ toolItem, - sessionId: _sessionId, + sessionId, }) => { const { t } = useTranslation('flow-chat'); const { toolResult, status } = toolItem; @@ -158,6 +283,21 @@ export const CodeReviewToolCard: React.FC = React.memo(({ toolId, toolName: toolItem.toolName, }); + const [sessionRunManifest, setSessionRunManifest] = useState( + () => getDeepReviewRunManifestForSession(sessionId), + ); + + useEffect(() => { + setSessionRunManifest(getDeepReviewRunManifestForSession(sessionId)); + + if (!sessionId) { + return undefined; + } + + return flowChatStore.subscribe((state) => { + setSessionRunManifest(state.sessions.get(sessionId)?.deepReviewRunManifest); + }); + }, [sessionId]); const getStatusIcon = () => { switch (status) { @@ -446,7 +586,10 @@ export const CodeReviewToolCard: React.FC = React.memo(({ extra={( <> {hasData && reviewData && ( - + )} {hasData && ( = React.memo(({ const review_mode = reviewData.review_mode; const review_scope = reviewData.review_scope; const reviewers = reviewData.reviewers ?? []; + const runManifest = review_mode === 'deep' + ? sessionRunManifest + : undefined; + const activeRunManifestReviewers = runManifest + ? getActiveReviewTeamManifestMembers(runManifest) + : []; const reportSections = buildCodeReviewReportSections(reviewData); + const reliabilityNotices = buildCodeReviewReliabilityNotices(reviewData, runManifest); const riskLevel = summary.risk_level ?? 'low'; const recommendedAction = summary.recommended_action ?? 
'approve'; const remediationItemCount = reportSections.remediationGroups @@ -486,11 +636,47 @@ export const CodeReviewToolCard: React.FC = React.memo(({ const remediationExpanded = expandedReportSectionIds.has('remediation'); const issuesExpanded = expandedReportSectionIds.has('issues'); const strengthsExpanded = expandedReportSectionIds.has('strengths'); + const runManifestExpanded = expandedReportSectionIds.has('runManifest'); const teamExpanded = expandedReportSectionIds.has('team'); const coverageExpanded = expandedReportSectionIds.has('coverage'); return (
+ {reliabilityNotices.length > 0 && ( +
+
+ {t('toolCards.codeReview.reliabilityStatus.title', { + defaultValue: 'Review status', + })} +
+
+ {reliabilityNotices.map((notice) => ( +
+ + {getReliabilityNoticeIcon(notice)} + + + + {getReliabilityNoticeLabel(notice, t)} + + + {getReliabilityNoticeDetail(notice, t)} + + +
+ ))} +
+
+ )} +
{t('toolCards.codeReview.overallAssessment')}
@@ -539,6 +725,85 @@ export const CodeReviewToolCard: React.FC = React.memo(({
+ {runManifest && ( + +
+
+
+ {t('toolCards.codeReview.runManifest.target', { defaultValue: 'Target' })} + {formatRunManifestTarget(runManifest)} +
+
+ {t('toolCards.codeReview.runManifest.budget', { defaultValue: 'Budget' })} + {runManifest.tokenBudget.mode} +
+
+ {t('toolCards.codeReview.runManifest.estimatedCalls', { defaultValue: 'Estimated calls' })} + {runManifest.tokenBudget.estimatedReviewerCalls} +
+ {runManifest.strategyRecommendation && ( +
+ + {t('toolCards.codeReview.runManifest.recommendedStrategy', { + defaultValue: 'Recommended strategy', + })} + + {runManifest.strategyRecommendation.strategyLevel} +
+ )} +
+ + {runManifest.strategyRecommendation && ( +
+
+ {t('toolCards.codeReview.runManifest.riskRecommendationTitle', { + defaultValue: 'Risk recommendation', + })} +
+

{runManifest.strategyRecommendation.rationale}

+
+ )} + + {activeRunManifestReviewers.length > 0 && ( +
+
+ {t('toolCards.codeReview.runManifest.activeGroupTitle', { defaultValue: 'Will run' })} +
+
+ {activeRunManifestReviewers.map((member) => ( + + {getReviewerLabel(member)} + {member.roleName} + + ))} +
+
+ )} + + {runManifest.skippedReviewers.length > 0 && ( +
+
+ {t('toolCards.codeReview.runManifest.skippedGroupTitle', { defaultValue: 'Skipped reviewers' })} +
+
    + {runManifest.skippedReviewers.map((member) => ( +
  • + {getReviewerLabel(member)} + {getSkippedReasonLabel(member.reason, t)} +
  • + ))} +
+
+ )} +
+
+ )} + {issues.length > 0 && ( = React.memo(({ handleToggleReportSection, remediationItems, reviewData, + sessionRunManifest, t, ]); diff --git a/src/web-ui/src/flow_chat/types/flow-chat.ts b/src/web-ui/src/flow_chat/types/flow-chat.ts index 3299a41a7..c5d2094a2 100644 --- a/src/web-ui/src/flow_chat/types/flow-chat.ts +++ b/src/web-ui/src/flow_chat/types/flow-chat.ts @@ -8,6 +8,7 @@ import type { SessionKind, SessionTitleSource, } from '@/shared/types/session-history'; +import type { ReviewTeamRunManifest } from '@/shared/services/reviewTeamService'; // Base type for streaming items. export interface FlowItem { @@ -317,6 +318,9 @@ export interface Session { */ needsUserAttention?: 'ask_user' | 'tool_confirm'; + /** Per-run reviewer manifest for Deep Review child sessions. */ + deepReviewRunManifest?: ReviewTeamRunManifest; + /** * Runtime-only session that should stay in memory but never be persisted or * shown in the main session navigation. diff --git a/src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts b/src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts index df17cfe39..824f75e3a 100644 --- a/src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts +++ b/src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts @@ -1,9 +1,148 @@ import { describe, expect, it } from 'vitest'; import { buildCodeReviewReportSections, + buildCodeReviewReliabilityNotices, formatCodeReviewReportMarkdown, getDefaultExpandedCodeReviewSectionIds, } from './codeReviewReport'; +import type { ReviewTeamManifestMember, ReviewTeamRunManifest } from '@/shared/services/reviewTeamService'; + +function manifestMember( + subagentId: string, + displayName: string, + reason?: ReviewTeamManifestMember['reason'], +): ReviewTeamManifestMember { + return { + subagentId, + displayName, + roleName: displayName, + model: 'fast', + configuredModel: 'fast', + defaultModelSlot: 'fast', + strategyLevel: 'normal', + strategySource: 'team', + strategyDirective: 'Review the target.', + locked: 
!subagentId.startsWith('Custom'), + source: subagentId.startsWith('Custom') ? 'extra' : 'core', + subagentSource: subagentId.startsWith('Custom') ? 'user' : 'builtin', + ...(reason ? { reason } : {}), + }; +} + +function buildRunManifest(): ReviewTeamRunManifest { + return { + reviewMode: 'deep', + workspacePath: '/test-fixtures/project-a', + policySource: 'default-review-team-config', + target: { + source: 'session_files', + resolution: 'resolved', + tags: ['frontend'], + files: ['src/App.tsx'], + warnings: [], + }, + strategyLevel: 'normal', + strategyRecommendation: { + strategyLevel: 'deep', + score: 24, + rationale: 'Large/high-risk change (8 files, 900 lines; 2 security-sensitive files, 3 workspace areas). Deep review recommended.', + factors: { + fileCount: 8, + totalLinesChanged: 900, + lineCountSource: 'diff_stat', + securityFileCount: 2, + workspaceAreaCount: 3, + contractSurfaceChanged: true, + }, + }, + executionPolicy: { + reviewerTimeoutSeconds: 300, + judgeTimeoutSeconds: 240, + reviewerFileSplitThreshold: 20, + maxSameRoleInstances: 3, + maxRetriesPerRole: 1, + }, + concurrencyPolicy: { + maxParallelInstances: 4, + staggerSeconds: 0, + batchExtrasSeparately: true, + }, + preReviewSummary: { + source: 'target_manifest', + summary: '1 file, 12 changed lines across 1 workspace area: web-ui (1)', + fileCount: 1, + excludedFileCount: 0, + lineCount: 12, + lineCountSource: 'diff_stat', + targetTags: ['frontend'], + workspaceAreas: [ + { + key: 'web-ui', + fileCount: 1, + sampleFiles: ['src/App.tsx'], + }, + ], + warnings: [], + }, + sharedContextCache: { + source: 'work_packets', + strategy: 'reuse_readonly_file_context_by_cache_key', + entries: [ + { + cacheKey: 'shared-context:1', + path: 'src/App.tsx', + workspaceArea: 'web-ui', + recommendedTools: ['GetFileDiff', 'Read'], + consumerPacketIds: [ + 'reviewer:ReviewBusinessLogic', + 'reviewer:CustomSecurity', + ], + }, + ], + omittedEntryCount: 0, + }, + incrementalReviewCache: { + source: 
'target_manifest', + strategy: 'reuse_completed_packets_when_fingerprint_matches', + cacheKey: 'incremental-review:abc12345', + fingerprint: 'abc12345', + filePaths: ['src/App.tsx'], + workspaceAreas: ['web-ui'], + targetTags: ['frontend'], + reviewerPacketIds: [ + 'reviewer:ReviewBusinessLogic', + 'reviewer:CustomSecurity', + ], + lineCount: 12, + lineCountSource: 'diff_stat', + invalidatesOn: [ + 'target_file_set_changed', + 'target_line_count_changed', + 'reviewer_roster_changed', + ], + }, + tokenBudget: { + mode: 'balanced', + estimatedReviewerCalls: 3, + maxReviewerCalls: 4, + maxExtraReviewers: 1, + largeDiffSummaryFirst: false, + skippedReviewerIds: ['CustomInvalid'], + warnings: [], + }, + coreReviewers: [ + manifestMember('ReviewBusinessLogic', 'Logic reviewer'), + ], + qualityGateReviewer: manifestMember('ReviewJudge', 'Quality inspector'), + enabledExtraReviewers: [ + manifestMember('CustomSecurity', 'Custom security reviewer'), + ], + skippedReviewers: [ + manifestMember('ReviewFrontend', 'Frontend reviewer', 'not_applicable'), + manifestMember('CustomInvalid', 'Custom invalid reviewer', 'invalid_tooling'), + ], + }; +} describe('codeReviewReport', () => { it('uses structured report sections when present', () => { @@ -99,6 +238,256 @@ describe('codeReviewReport', () => { ]); }); + it('surfaces partial reviewer output in coverage notes', () => { + const sections = buildCodeReviewReportSections({ + summary: { + overall_assessment: 'Review completed with reduced confidence.', + risk_level: 'medium' as const, + recommended_action: 'request_changes' as const, + }, + reviewers: [ + { + name: 'Security Reviewer', + specialty: 'security', + status: 'partial_timeout', + summary: 'Timed out after finding one likely issue.', + partial_output: 'Found likely token logging in src/auth.ts before timeout.', + }, + ], + }); + + expect(sections.reviewerStats).toMatchObject({ total: 1, completed: 0, degraded: 1 }); + expect(sections.coverageNotes).toEqual([ + 'Security 
Reviewer timed out after producing partial output: Found likely token logging in src/auth.ts before timeout.', + ]); + }); + + it('builds compact reliability notices only when review attention is needed', () => { + expect(buildCodeReviewReliabilityNotices({ + summary: { + overall_assessment: 'No issues found.', + risk_level: 'low' as const, + recommended_action: 'approve' as const, + }, + reviewers: [{ name: 'Reviewer', specialty: 'logic', status: 'completed', summary: 'Done.' }], + })).toEqual([]); + + const manifest = { + ...buildRunManifest(), + tokenBudget: { + ...buildRunManifest().tokenBudget, + largeDiffSummaryFirst: true, + warnings: ['Large target; reviewers will receive compact scopes.'], + }, + }; + const notices = buildCodeReviewReliabilityNotices({ + summary: { + overall_assessment: 'Review completed with reduced confidence.', + risk_level: 'medium' as const, + recommended_action: 'request_changes' as const, + }, + reviewers: [ + { + name: 'Security Reviewer', + specialty: 'security', + status: 'partial_timeout', + summary: 'Timed out after producing partial evidence.', + partial_output: 'Found likely token logging in src/auth.ts before timeout.', + }, + ], + report_sections: { + coverage_notes: ['Context compression preserved key file and test facts.'], + remediation_groups: { + needs_decision: ['Decide whether to block the release or isolate the feature.'], + }, + }, + }, manifest); + + expect(notices.map((notice) => notice.kind)).toEqual([ + 'context_pressure', + 'skipped_reviewers', + 'token_budget_limited', + 'compression_preserved', + 'partial_reviewer', + 'retry_guidance', + 'user_decision', + ]); + expect(notices.find((notice) => notice.kind === 'partial_reviewer')).toMatchObject({ + severity: 'warning', + count: 1, + }); + }); + + it('prefers structured reliability signals for status and markdown export', () => { + const report = { + summary: { + overall_assessment: 'Review completed with runtime reliability signals.', + risk_level: 'medium' 
as const, + recommended_action: 'request_changes' as const, + }, + review_mode: 'deep' as const, + reviewers: [ + { + name: 'Security Reviewer', + specialty: 'security', + status: 'completed', + summary: 'Completed.', + }, + ], + reliability_signals: [ + { + kind: 'context_pressure', + severity: 'warning', + count: 7, + source: 'runtime', + detail: 'Runtime profile capped reviewer fan-out for this large target.', + }, + { + kind: 'compression_preserved', + severity: 'info', + source: 'runtime', + detail: 'Compression contract retained modified files and failed commands.', + }, + { + kind: 'cache_hit', + severity: 'info', + count: 2, + source: 'runtime', + detail: 'Two reviewer packets reused matching cached output.', + }, + { + kind: 'cache_miss', + severity: 'info', + count: 1, + source: 'runtime', + detail: 'One reviewer packet ran fresh and updated the cache.', + }, + { + kind: 'concurrency_limited', + severity: 'warning', + count: 1, + source: 'runtime', + detail: 'One reviewer launch hit the configured concurrency cap.', + }, + { + kind: 'retry_guidance', + severity: 'warning', + count: 1, + source: 'runtime', + detail: 'Retry guidance was emitted for a partial reviewer.', + }, + ], + }; + + const notices = buildCodeReviewReliabilityNotices(report); + + expect(notices).toEqual([ + { + kind: 'context_pressure', + severity: 'warning', + count: 7, + source: 'runtime', + detail: 'Runtime profile capped reviewer fan-out for this large target.', + }, + { + kind: 'compression_preserved', + severity: 'info', + source: 'runtime', + detail: 'Compression contract retained modified files and failed commands.', + }, + { + kind: 'cache_hit', + severity: 'info', + count: 2, + source: 'runtime', + detail: 'Two reviewer packets reused matching cached output.', + }, + { + kind: 'cache_miss', + severity: 'info', + count: 1, + source: 'runtime', + detail: 'One reviewer packet ran fresh and updated the cache.', + }, + { + kind: 'concurrency_limited', + severity: 'warning', + 
count: 1, + source: 'runtime', + detail: 'One reviewer launch hit the configured concurrency cap.', + }, + { + kind: 'retry_guidance', + severity: 'warning', + count: 1, + source: 'runtime', + detail: 'Retry guidance was emitted for a partial reviewer.', + }, + ]); + + const markdown = formatCodeReviewReportMarkdown(report); + + expect(markdown).toContain('## Review Reliability'); + expect(markdown).toContain( + '- Context pressure rising [warning/runtime]: Runtime profile capped reviewer fan-out for this large target.', + ); + expect(markdown).toContain( + '- Compression preserved key facts [info/runtime]: Compression contract retained modified files and failed commands.', + ); + expect(markdown).toContain( + '- Incremental cache reused reviewer output [info/runtime]: Two reviewer packets reused matching cached output.', + ); + expect(markdown).toContain( + '- Incremental cache missed or refreshed [info/runtime]: One reviewer packet ran fresh and updated the cache.', + ); + expect(markdown).toContain( + '- Reviewer launch was concurrency-limited [warning/runtime]: One reviewer launch hit the configured concurrency cap.', + ); + expect(markdown).toContain( + '- Retry guidance emitted [warning/runtime]: Retry guidance was emitted for a partial reviewer.', + ); + }); + + it('summarizes skipped reviewer and token budget tradeoffs from the run manifest', () => { + const report = { + summary: { + overall_assessment: 'Review completed with one skipped reviewer.', + risk_level: 'medium' as const, + recommended_action: 'request_changes' as const, + }, + review_mode: 'deep' as const, + reviewers: [ + { + name: 'Business Logic Reviewer', + specialty: 'logic', + status: 'completed', + summary: 'Done.', + }, + ], + }; + const notices = buildCodeReviewReliabilityNotices(report, buildRunManifest()); + + expect(notices).toEqual([ + { + kind: 'skipped_reviewers', + severity: 'info', + count: 2, + source: 'manifest', + }, + { + kind: 'token_budget_limited', + severity: 'warning', + 
count: 1, + source: 'manifest', + }, + ]); + + const markdown = formatCodeReviewReportMarkdown(report, undefined, { runManifest: buildRunManifest() }); + + expect(markdown).toContain('- Skipped reviewers [info/manifest]: Count: 2'); + expect(markdown).toContain('- Token budget limited reviewer coverage [warning/manifest]: Count: 1'); + }); + it('keeps team and issue details collapsed by default while leaving remediation visible', () => { const report = { summary: { @@ -149,4 +538,94 @@ describe('codeReviewReport', () => { expect(markdown).toContain('## Remediation Plan'); expect(markdown).toContain('## Code Review Team'); }); + + it('exports partial reviewer output in markdown', () => { + const markdown = formatCodeReviewReportMarkdown({ + summary: { + overall_assessment: 'Review completed with partial security evidence.', + risk_level: 'medium' as const, + recommended_action: 'request_changes' as const, + }, + review_mode: 'deep' as const, + issues: [], + reviewers: [ + { + name: 'Security Reviewer', + specialty: 'security', + status: 'partial_timeout', + summary: 'Timed out after producing partial evidence.', + partial_output: 'Found likely token logging in src/auth.ts before timeout.', + }, + ], + }); + + expect(markdown).toContain('Security Reviewer (security; Status: partial_timeout)'); + expect(markdown).toContain('Partial output: Found likely token logging in src/auth.ts before timeout.'); + expect(markdown).toContain( + 'Security Reviewer timed out after producing partial output: Found likely token logging in src/auth.ts before timeout.', + ); + }); + + it('exports reviewer packet fallback metadata in markdown', () => { + const markdown = formatCodeReviewReportMarkdown({ + summary: { + overall_assessment: 'Review completed with inferred packet metadata.', + risk_level: 'low' as const, + recommended_action: 'approve' as const, + }, + review_mode: 'deep' as const, + issues: [], + reviewers: [ + { + name: 'Security Reviewer', + specialty: 'security', + status: 
'completed', + summary: 'Checked the first security split.', + packet_id: 'reviewer:ReviewSecurity:group-1-of-3', + packet_status_source: 'inferred', + }, + ], + }); + + expect(markdown).toContain('Packet: reviewer:ReviewSecurity:group-1-of-3 (inferred)'); + }); + + it('includes the run manifest when exporting a deep review report', () => { + const markdown = formatCodeReviewReportMarkdown( + { + summary: { + overall_assessment: 'No validated issues.', + risk_level: 'low' as const, + recommended_action: 'approve' as const, + }, + review_mode: 'deep' as const, + issues: [], + reviewers: [], + }, + undefined, + { runManifest: buildRunManifest() }, + ); + + expect(markdown).toContain('## Run manifest'); + expect(markdown).toContain('- Target: frontend'); + expect(markdown).toContain('- Budget: balanced'); + expect(markdown).toContain('- Estimated calls: 3'); + expect(markdown).toContain('- Recommended strategy: deep'); + expect(markdown).toContain('- Recommendation score: 24'); + expect(markdown).toContain('- Recommendation rationale: Large/high-risk change'); + expect(markdown).toContain('- Logic reviewer (ReviewBusinessLogic)'); + expect(markdown).toContain('- Custom security reviewer (CustomSecurity)'); + expect(markdown).toContain('- Quality inspector (ReviewJudge)'); + expect(markdown).toContain('- Frontend reviewer (ReviewFrontend): not_applicable'); + expect(markdown).toContain('- Custom invalid reviewer (CustomInvalid): invalid_tooling'); + expect(markdown).toContain('### Pre-review summary'); + expect(markdown).toContain('- 1 file, 12 changed lines across 1 workspace area: web-ui (1)'); + expect(markdown).toContain('- web-ui: 1 file (src/App.tsx)'); + expect(markdown).toContain('### Shared context cache'); + expect(markdown).toContain('- shared-context:1: src/App.tsx -> reviewer:ReviewBusinessLogic, reviewer:CustomSecurity'); + expect(markdown).toContain('### Incremental review cache'); + expect(markdown).toContain('- Cache key: incremental-review:abc12345'); 
+ expect(markdown).toContain('- Fingerprint: abc12345'); + expect(markdown).toContain('- Invalidates on: target_file_set_changed, target_line_count_changed, reviewer_roster_changed'); + }); }); diff --git a/src/web-ui/src/flow_chat/utils/codeReviewReport.ts b/src/web-ui/src/flow_chat/utils/codeReviewReport.ts index f34c57889..0c70e082b 100644 --- a/src/web-ui/src/flow_chat/utils/codeReviewReport.ts +++ b/src/web-ui/src/flow_chat/utils/codeReviewReport.ts @@ -1,9 +1,23 @@ +import { + getActiveReviewTeamManifestMembers, + type ReviewTeamManifestMember, + type ReviewTeamRunManifest, +} from '@/shared/services/reviewTeamService'; + export type ReviewRiskLevel = 'low' | 'medium' | 'high' | 'critical'; export type ReviewAction = 'approve' | 'approve_with_suggestions' | 'request_changes' | 'block'; export type ReviewMode = 'standard' | 'deep'; export type ReviewIssueSeverity = 'critical' | 'high' | 'medium' | 'low' | 'info'; export type ReviewIssueCertainty = 'confirmed' | 'likely' | 'possible'; -export type ReviewSectionId = 'summary' | 'issues' | 'remediation' | 'strengths' | 'team' | 'coverage'; +export type ReviewPacketStatusSource = 'reported' | 'inferred' | 'missing'; +export type ReviewSectionId = + | 'summary' + | 'issues' + | 'remediation' + | 'strengths' + | 'runManifest' + | 'team' + | 'coverage'; export type RemediationGroupId = 'must_fix' | 'should_improve' | 'needs_decision' | 'verification'; export type StrengthGroupId = | 'architecture' @@ -39,6 +53,9 @@ export interface CodeReviewReviewer { specialty: string; status: string; summary: string; + partial_output?: string; + packet_id?: string; + packet_status_source?: ReviewPacketStatusSource; issue_count?: number; } @@ -80,6 +97,7 @@ export interface CodeReviewReportData { reviewers?: CodeReviewReviewer[]; remediation_plan?: string[]; report_sections?: CodeReviewReportSectionsData; + reliability_signals?: CodeReviewReliabilitySignal[]; } export interface ReviewReportGroup { @@ -111,27 +129,72 @@ export 
interface ReviewReportSections { reviewerStats: ReviewReviewerStats; } +export type ReviewReliabilityNoticeKind = + | 'context_pressure' + | 'compression_preserved' + | 'cache_hit' + | 'cache_miss' + | 'concurrency_limited' + | 'partial_reviewer' + | 'retry_guidance' + | 'skipped_reviewers' + | 'token_budget_limited' + | 'user_decision'; + +export type ReviewReliabilityNoticeSeverity = 'info' | 'warning' | 'action'; +export type ReviewReliabilitySignalSource = 'runtime' | 'manifest' | 'report' | 'inferred'; + +export interface ReviewReliabilityNotice { + kind: ReviewReliabilityNoticeKind; + severity: ReviewReliabilityNoticeSeverity; + count?: number; + source?: ReviewReliabilitySignalSource; + detail?: string; +} + +export interface CodeReviewReliabilitySignal { + kind: ReviewReliabilityNoticeKind; + severity?: ReviewReliabilityNoticeSeverity; + count?: number; + source?: ReviewReliabilitySignalSource; + detail?: string; +} + export interface CodeReviewReportMarkdownLabels { titleStandard: string; titleDeep: string; executiveSummary: string; reviewDecision: string; + runManifest: string; riskLevel: string; recommendedAction: string; scope: string; + target: string; + budget: string; + estimatedCalls: string; + activeReviewers: string; + skippedReviewers: string; issues: string; noIssues: string; remediationPlan: string; strengths: string; reviewTeam: string; + reliabilitySignals: string; coverageNotes: string; status: string; + packet: string; + partialOutput: string; findings: string; validation: string; suggestion: string; source: string; noItems: string; groupTitles: Record; + reliabilityNoticeLabels: Record; +} + +export interface CodeReviewReportMarkdownOptions { + runManifest?: ReviewTeamRunManifest; } const REMEDIATION_GROUP_ORDER: RemediationGroupId[] = [ @@ -152,27 +215,74 @@ const STRENGTH_GROUP_ORDER: StrengthGroupId[] = [ ]; const DEGRADED_REVIEWER_STATUSES = new Set(['timed_out', 'cancelled_by_user', 'failed', 'skipped']); +const 
PARTIAL_TIMEOUT_REVIEWER_STATUSES = new Set(['partial_timeout', 'timed_out', 'cancelled_by_user']); +const RELIABILITY_NOTICE_ORDER: ReviewReliabilityNoticeKind[] = [ + 'context_pressure', + 'skipped_reviewers', + 'token_budget_limited', + 'compression_preserved', + 'cache_hit', + 'cache_miss', + 'concurrency_limited', + 'partial_reviewer', + 'retry_guidance', + 'user_decision', +]; +const RELIABILITY_NOTICE_FALLBACK_LABELS: Record = { + context_pressure: 'Context pressure rising', + compression_preserved: 'Compression preserved key facts', + cache_hit: 'Incremental cache reused reviewer output', + cache_miss: 'Incremental cache missed or refreshed', + concurrency_limited: 'Reviewer launch was concurrency-limited', + partial_reviewer: 'Reviewer timed out with partial result', + retry_guidance: 'Retry guidance emitted', + skipped_reviewers: 'Skipped reviewers', + token_budget_limited: 'Token budget limited reviewer coverage', + user_decision: 'User decision needed', +}; +const RELIABILITY_NOTICE_SEVERITY_BY_KIND: Record = { + context_pressure: 'info', + compression_preserved: 'info', + cache_hit: 'info', + cache_miss: 'info', + concurrency_limited: 'warning', + partial_reviewer: 'warning', + retry_guidance: 'warning', + skipped_reviewers: 'info', + token_budget_limited: 'warning', + user_decision: 'action', +}; export const DEFAULT_CODE_REVIEW_MARKDOWN_LABELS: CodeReviewReportMarkdownLabels = { titleStandard: 'Code Review Report', titleDeep: 'Deep Review Report', executiveSummary: 'Executive Summary', reviewDecision: 'Review Decision', + runManifest: 'Run manifest', riskLevel: 'Risk Level', recommendedAction: 'Recommended Action', scope: 'Scope', + target: 'Target', + budget: 'Budget', + estimatedCalls: 'Estimated calls', + activeReviewers: 'Active reviewers', + skippedReviewers: 'Skipped reviewers', issues: 'Issues', noIssues: 'No validated issues.', remediationPlan: 'Remediation Plan', strengths: 'Strengths', reviewTeam: 'Code Review Team', + reliabilitySignals: 
'Review Reliability', coverageNotes: 'Coverage Notes', status: 'Status', + packet: 'Packet', + partialOutput: 'Partial output', findings: 'Findings', validation: 'Validation', suggestion: 'Suggestion', source: 'Source', noItems: 'None.', + reliabilityNoticeLabels: RELIABILITY_NOTICE_FALLBACK_LABELS, groupTitles: { must_fix: 'Must Fix', should_improve: 'Should Improve', @@ -259,7 +369,10 @@ function buildReviewerStats(reviewers: CodeReviewReviewer[] = []): ReviewReviewe for (const reviewer of reviewers) { if (reviewer.status === 'completed') { completed += 1; - } else if (DEGRADED_REVIEWER_STATUSES.has(reviewer.status)) { + } else if ( + DEGRADED_REVIEWER_STATUSES.has(reviewer.status) || + reviewer.status === 'partial_timeout' + ) { degraded += 1; } } @@ -271,6 +384,251 @@ function buildReviewerStats(reviewers: CodeReviewReviewer[] = []): ReviewReviewe }; } +function buildPartialReviewerCoverageNotes(reviewers: CodeReviewReviewer[] = []): string[] { + return reviewers + .map((reviewer) => { + const partialOutput = reviewer.partial_output?.trim(); + if (!partialOutput || !PARTIAL_TIMEOUT_REVIEWER_STATUSES.has(reviewer.status)) { + return null; + } + return `${reviewer.name} timed out after producing partial output: ${partialOutput}`; + }) + .filter((note): note is string => Boolean(note)); +} + +function hasCompressionPreservationNote(report: CodeReviewReportData): boolean { + const notes = [ + ...(report.report_sections?.coverage_notes ?? []), + report.summary?.confidence_note, + ]; + + return notes.some((note) => { + const normalized = note?.toLowerCase() ?? 
''; + return normalized.includes('compress') && normalized.includes('preserv'); + }); +} + +function countPartialReviewers(reviewers: CodeReviewReviewer[] = []): number { + return reviewers.filter((reviewer) => + reviewer.status === 'partial_timeout' || + ( + PARTIAL_TIMEOUT_REVIEWER_STATUSES.has(reviewer.status) && + Boolean(reviewer.partial_output?.trim()) + ) + ).length; +} + +function countSkippedReviewers(runManifest?: ReviewTeamRunManifest): number { + return runManifest?.skippedReviewers.length ?? 0; +} + +function countTokenBudgetLimitedReviewers(runManifest?: ReviewTeamRunManifest): number { + if (!runManifest) { + return 0; + } + const skippedByBudget = new Set(runManifest.tokenBudget.skippedReviewerIds); + for (const reviewer of runManifest.skippedReviewers) { + if (reviewer.reason === 'budget_limited') { + skippedByBudget.add(reviewer.subagentId); + } + } + return skippedByBudget.size; +} + +function countDecisionItems(report: CodeReviewReportData): number { + const structuredDecisionItems = report.report_sections?.remediation_groups?.needs_decision ?? []; + if (structuredDecisionItems.length > 0) { + const stringItems = structuredDecisionItems.filter((item): item is string => typeof item === 'string'); + return nonEmpty(stringItems).length; + } + + return report.summary?.recommended_action === 'block' ? 
1 : 0; +} + +function isReliabilityNoticeKind(value: string): value is ReviewReliabilityNoticeKind { + return RELIABILITY_NOTICE_ORDER.includes(value as ReviewReliabilityNoticeKind); +} + +function isReliabilitySeverity(value: string): value is ReviewReliabilityNoticeSeverity { + return value === 'info' || value === 'warning' || value === 'action'; +} + +function isReliabilitySignalSource(value: string): value is ReviewReliabilitySignalSource { + return value === 'runtime' || value === 'manifest' || value === 'report' || value === 'inferred'; +} + +function normalizeStructuredReliabilityNotice( + signal: CodeReviewReliabilitySignal, +): ReviewReliabilityNotice | null { + if (!isReliabilityNoticeKind(signal.kind)) { + return null; + } + + const detail = signal.detail?.trim(); + return { + kind: signal.kind, + severity: signal.severity && isReliabilitySeverity(signal.severity) + ? signal.severity + : RELIABILITY_NOTICE_SEVERITY_BY_KIND[signal.kind], + ...(typeof signal.count === 'number' ? { count: signal.count } : {}), + ...(signal.source && isReliabilitySignalSource(signal.source) + ? { source: signal.source } + : {}), + ...(detail ? { detail } : {}), + }; +} + +function structuredReliabilityNoticeMap( + report: CodeReviewReportData, +): Map { + const notices = new Map(); + for (const signal of report.reliability_signals ?? []) { + const notice = normalizeStructuredReliabilityNotice(signal); + if (notice && !notices.has(notice.kind)) { + notices.set(notice.kind, notice); + } + } + return notices; +} + +function reliabilityNoticeLabel( + kind: ReviewReliabilityNoticeKind, + labels: CodeReviewReportMarkdownLabels, +): string { + return labels.reliabilityNoticeLabels[kind] ?? 
RELIABILITY_NOTICE_FALLBACK_LABELS[kind]; +} + +function reliabilityNoticeMarkdownDetail(notice: ReviewReliabilityNotice): string { + if (notice.detail?.trim()) { + return notice.detail.trim(); + } + if (typeof notice.count === 'number') { + return `Count: ${notice.count}`; + } + return ''; +} + +function reliabilityNoticeMarkdownLine( + notice: ReviewReliabilityNotice, + labels: CodeReviewReportMarkdownLabels, +): string { + const tags = [notice.severity, notice.source].filter(Boolean).join('/'); + const detail = reliabilityNoticeMarkdownDetail(notice); + const tagText = tags ? ` [${tags}]` : ''; + return detail + ? `- ${reliabilityNoticeLabel(notice.kind, labels)}${tagText}: ${detail}` + : `- ${reliabilityNoticeLabel(notice.kind, labels)}${tagText}`; +} + +export function buildCodeReviewReliabilityNotices( + report: CodeReviewReportData, + runManifest?: ReviewTeamRunManifest, +): ReviewReliabilityNotice[] { + const notices: ReviewReliabilityNotice[] = []; + const structuredNotices = structuredReliabilityNoticeMap(report); + const hasContextPressure = runManifest + ? 
runManifest.tokenBudget.largeDiffSummaryFirst || runManifest.tokenBudget.warnings.length > 0 + : false; + + const structuredContextPressure = structuredNotices.get('context_pressure'); + if (structuredContextPressure) { + notices.push(structuredContextPressure); + } else if (hasContextPressure && runManifest) { + notices.push({ + kind: 'context_pressure', + severity: 'info', + count: runManifest.tokenBudget.estimatedReviewerCalls, + source: 'manifest', + }); + } + + const structuredCompressionPreserved = structuredNotices.get('compression_preserved'); + if (structuredCompressionPreserved) { + notices.push(structuredCompressionPreserved); + } else if (hasCompressionPreservationNote(report)) { + notices.push({ + kind: 'compression_preserved', + severity: 'info', + source: 'inferred', + }); + } + + for (const kind of ['cache_hit', 'cache_miss', 'concurrency_limited'] as const) { + const structuredNotice = structuredNotices.get(kind); + if (structuredNotice) { + notices.push(structuredNotice); + } + } + + const partialReviewerCount = countPartialReviewers(report.reviewers); + const structuredPartialReviewer = structuredNotices.get('partial_reviewer'); + if (structuredPartialReviewer) { + notices.push(structuredPartialReviewer); + } else if (partialReviewerCount > 0) { + notices.push({ + kind: 'partial_reviewer', + severity: 'warning', + count: partialReviewerCount, + source: 'runtime', + }); + } + + const structuredRetryGuidance = structuredNotices.get('retry_guidance'); + if (structuredRetryGuidance) { + notices.push(structuredRetryGuidance); + } else if (partialReviewerCount > 0) { + notices.push({ + kind: 'retry_guidance', + severity: 'warning', + count: partialReviewerCount, + source: 'runtime', + }); + } + + const skippedReviewerCount = countSkippedReviewers(runManifest); + const structuredSkippedReviewers = structuredNotices.get('skipped_reviewers'); + if (structuredSkippedReviewers) { + notices.push(structuredSkippedReviewers); + } else if (skippedReviewerCount 
> 0) { + notices.push({ + kind: 'skipped_reviewers', + severity: 'info', + count: skippedReviewerCount, + source: 'manifest', + }); + } + + const tokenBudgetLimitedReviewerCount = countTokenBudgetLimitedReviewers(runManifest); + const structuredTokenBudgetLimited = structuredNotices.get('token_budget_limited'); + if (structuredTokenBudgetLimited) { + notices.push(structuredTokenBudgetLimited); + } else if (tokenBudgetLimitedReviewerCount > 0) { + notices.push({ + kind: 'token_budget_limited', + severity: 'warning', + count: tokenBudgetLimitedReviewerCount, + source: 'manifest', + }); + } + + const decisionItemCount = countDecisionItems(report); + const structuredUserDecision = structuredNotices.get('user_decision'); + if (structuredUserDecision) { + notices.push(structuredUserDecision); + } else if (decisionItemCount > 0) { + notices.push({ + kind: 'user_decision', + severity: 'action', + count: decisionItemCount, + source: 'report', + }); + } + + return RELIABILITY_NOTICE_ORDER + .map((kind) => notices.find((notice) => notice.kind === kind)) + .filter((notice): notice is ReviewReliabilityNotice => Boolean(notice)); +} + export function buildCodeReviewReportSections(report: CodeReviewReportData): ReviewReportSections { const structuredSections = report.report_sections; @@ -291,6 +649,7 @@ export function buildCodeReviewReportSections(report: CodeReviewReportData): Rev const strengthGroups = buildGroups(STRENGTH_GROUP_ORDER, structuredSections?.strength_groups); const executiveSummary = nonEmpty(structuredSections?.executive_summary); const coverageNotes = nonEmpty(structuredSections?.coverage_notes); + const partialReviewerCoverageNotes = buildPartialReviewerCoverageNotes(report.reviewers); const confidenceNote = report.summary?.confidence_note?.trim(); return { @@ -304,8 +663,8 @@ export function buildCodeReviewReportSections(report: CodeReviewReportData): Rev ? strengthGroups : buildLegacyStrengthGroups(report), coverageNotes: coverageNotes.length > 0 - ? 
coverageNotes - : nonEmpty([confidenceNote]), + ? nonEmpty([...coverageNotes, ...partialReviewerCoverageNotes]) + : nonEmpty([confidenceNote, ...partialReviewerCoverageNotes]), issueStats: buildIssueStats(report.issues), reviewerStats: buildReviewerStats(report.reviewers), }; @@ -330,6 +689,10 @@ function mergeLabels(labels?: Partial): CodeRevi ...DEFAULT_CODE_REVIEW_MARKDOWN_LABELS.groupTitles, ...labels?.groupTitles, }, + reliabilityNoticeLabels: { + ...DEFAULT_CODE_REVIEW_MARKDOWN_LABELS.reliabilityNoticeLabels, + ...labels?.reliabilityNoticeLabels, + }, }; } @@ -352,9 +715,137 @@ function issueLocation(issue: CodeReviewIssue): string { return issue.line ? `${issue.file}:${issue.line}` : issue.file; } +function manifestTarget(manifest: ReviewTeamRunManifest): string { + return manifest.target.tags.length > 0 + ? manifest.target.tags.join(', ') + : manifest.target.source; +} + +function manifestMemberLabel(member: ReviewTeamManifestMember): string { + return member.displayName || member.subagentId; +} + +function manifestMemberLine(member: ReviewTeamManifestMember): string { + return `${manifestMemberLabel(member)} (${member.subagentId})`; +} + +function pluralize(count: number, singular: string): string { + return `${count} ${singular}${count === 1 ? '' : 's'}`; +} + +function pushPreReviewSummarySection( + lines: string[], + manifest: ReviewTeamRunManifest, +): void { + const summary = manifest.preReviewSummary; + if (!summary) { + return; + } + + lines.push(`### Pre-review summary`); + lines.push(`- ${summary.summary}`); + lines.push(`- Files: ${summary.fileCount}`); + if (summary.lineCount !== undefined) { + lines.push(`- Lines changed: ${summary.lineCount} (${summary.lineCountSource})`); + } else { + lines.push(`- Lines changed: unknown (${summary.lineCountSource})`); + } + if (summary.workspaceAreas.length > 0) { + for (const area of summary.workspaceAreas) { + const sampleFiles = area.sampleFiles.length > 0 + ? 
` (${area.sampleFiles.join(', ')})` + : ''; + lines.push(`- ${area.key}: ${pluralize(area.fileCount, 'file')}${sampleFiles}`); + } + } + lines.push(''); +} + +function pushSharedContextCacheSection( + lines: string[], + manifest: ReviewTeamRunManifest, +): void { + const cachePlan = manifest.sharedContextCache; + if (!cachePlan) { + return; + } + + lines.push(`### Shared context cache`); + if (cachePlan.entries.length === 0) { + lines.push('- None.'); + } else { + for (const entry of cachePlan.entries) { + lines.push( + `- ${entry.cacheKey}: ${entry.path} -> ${entry.consumerPacketIds.join(', ')}`, + ); + } + } + if (cachePlan.omittedEntryCount > 0) { + lines.push(`- Omitted entries: ${cachePlan.omittedEntryCount}`); + } + lines.push(''); +} + +function pushIncrementalReviewCacheSection( + lines: string[], + manifest: ReviewTeamRunManifest, +): void { + const cachePlan = manifest.incrementalReviewCache; + if (!cachePlan) { + return; + } + + lines.push(`### Incremental review cache`); + lines.push(`- Cache key: ${cachePlan.cacheKey}`); + lines.push(`- Fingerprint: ${cachePlan.fingerprint}`); + lines.push(`- Strategy: ${cachePlan.strategy}`); + lines.push(`- Reviewer packets: ${cachePlan.reviewerPacketIds.join(', ') || 'none'}`); + lines.push(`- Invalidates on: ${cachePlan.invalidatesOn.join(', ') || 'none'}`); + lines.push(''); +} + +function pushRunManifestSection( + lines: string[], + manifest: ReviewTeamRunManifest, + labels: CodeReviewReportMarkdownLabels, +): void { + const activeReviewers = getActiveReviewTeamManifestMembers(manifest); + + lines.push(`## ${labels.runManifest}`); + lines.push(`- ${labels.target}: ${manifestTarget(manifest)}`); + lines.push(`- ${labels.budget}: ${manifest.tokenBudget.mode}`); + lines.push(`- ${labels.estimatedCalls}: ${manifest.tokenBudget.estimatedReviewerCalls}`); + if (manifest.strategyRecommendation) { + lines.push(`- Recommended strategy: ${manifest.strategyRecommendation.strategyLevel}`); + lines.push(`- Recommendation 
score: ${manifest.strategyRecommendation.score}`); + lines.push(`- Recommendation rationale: ${manifest.strategyRecommendation.rationale}`); + } + lines.push(''); + lines.push(`### ${labels.activeReviewers}`); + pushList( + lines, + activeReviewers.map((member) => manifestMemberLine(member)), + labels.noItems, + ); + lines.push(''); + lines.push(`### ${labels.skippedReviewers}`); + pushList( + lines, + manifest.skippedReviewers.map((member) => + `${manifestMemberLine(member)}: ${member.reason ?? 'skipped'}`, + ), + labels.noItems, + ); + lines.push(''); + pushPreReviewSummarySection(lines, manifest); + pushSharedContextCacheSection(lines, manifest); + pushIncrementalReviewCacheSection(lines, manifest); +} + export function formatCodeReviewReportMarkdown( report: CodeReviewReportData, labels?: Partial, + options?: CodeReviewReportMarkdownOptions, ): string { const mergedLabels = mergeLabels(labels); const sections = buildCodeReviewReportSections(report); @@ -374,6 +865,17 @@ export function formatCodeReviewReportMarkdown( lines.push(`- ${mergedLabels.scope}: ${report.review_scope.trim()}`); } lines.push(''); + if (report.review_mode === 'deep' && options?.runManifest) { + pushRunManifestSection(lines, options.runManifest, mergedLabels); + } + const reliabilityNotices = buildCodeReviewReliabilityNotices(report, options?.runManifest); + if (reliabilityNotices.length > 0) { + lines.push(`## ${mergedLabels.reliabilitySignals}`); + reliabilityNotices.forEach((notice) => { + lines.push(reliabilityNoticeMarkdownLine(notice, mergedLabels)); + }); + lines.push(''); + } lines.push(`## ${mergedLabels.issues}`); if (issues.length === 0) { lines.push(`- ${mergedLabels.noIssues}`); @@ -438,6 +940,17 @@ export function formatCodeReviewReportMarkdown( if (reviewer.summary) { lines.push(` - ${reviewer.summary}`); } + const packetId = reviewer.packet_id?.trim(); + if (packetId || reviewer.packet_status_source) { + const packetLabel = packetId || 'missing'; + const sourceLabel = 
reviewer.packet_status_source + ? ` (${reviewer.packet_status_source})` + : ''; + lines.push(` - ${mergedLabels.packet}: ${packetLabel}${sourceLabel}`); + } + if (reviewer.partial_output?.trim()) { + lines.push(` - ${mergedLabels.partialOutput}: ${reviewer.partial_output.trim()}`); + } } } lines.push(''); diff --git a/src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.test.ts b/src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.test.ts new file mode 100644 index 000000000..855400492 --- /dev/null +++ b/src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.test.ts @@ -0,0 +1,86 @@ +import { describe, expect, it } from 'vitest'; +import { + DEEP_REVIEW_SESSION_CONCURRENCY_WARNING_THRESHOLD, + deriveDeepReviewSessionConcurrencyGuard, +} from './deepReviewCapacityGuard'; +import type { FlowChatState, FlowToolItem, Session } from '../types/flow-chat'; + +function createTaskItem(id: string, status: FlowToolItem['status']): FlowToolItem { + return { + id, + type: 'tool', + toolName: 'Task', + timestamp: 1000, + status, + toolCall: { + id, + input: { subagent_type: 'ReviewSecurity' }, + }, + }; +} + +function createSession(items: FlowToolItem[]): Session { + return { + sessionId: 'parent-session', + sessionKind: 'normal', + status: 'active', + createdAt: 1000, + updatedAt: 2000, + lastActiveAt: 2000, + dialogTurns: [ + { + id: 'turn-1', + status: 'processing', + modelRounds: [ + { + id: 'round-1', + items, + }, + ], + } as any, + ], + } as Session; +} + +function createState(session: Session): FlowChatState { + return { + sessions: new Map([[session.sessionId, session]]), + activeSessionId: session.sessionId, + } as FlowChatState; +} + +describe('deriveDeepReviewSessionConcurrencyGuard', () => { + it('warns when the target session already has multiple active Task subagents', () => { + const state = createState(createSession([ + createTaskItem('task-1', 'running'), + createTaskItem('task-2', 'streaming'), + ])); + + const guard = 
deriveDeepReviewSessionConcurrencyGuard(state, 'parent-session'); + + expect(guard.activeSubagentCount).toBe(DEEP_REVIEW_SESSION_CONCURRENCY_WARNING_THRESHOLD); + expect(guard.highActivity).toBe(true); + }); + + it('ignores completed Task subagents and unrelated sessions', () => { + const targetSession = createSession([ + createTaskItem('task-1', 'completed'), + ]); + const unrelatedSession = { + ...createSession([createTaskItem('task-2', 'running')]), + sessionId: 'unrelated-session', + } as Session; + const state = { + sessions: new Map([ + [targetSession.sessionId, targetSession], + [unrelatedSession.sessionId, unrelatedSession], + ]), + activeSessionId: targetSession.sessionId, + } as FlowChatState; + + const guard = deriveDeepReviewSessionConcurrencyGuard(state, 'parent-session'); + + expect(guard.activeSubagentCount).toBe(0); + expect(guard.highActivity).toBe(false); + }); +}); diff --git a/src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.ts b/src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.ts new file mode 100644 index 000000000..8a8a8b0ba --- /dev/null +++ b/src/web-ui/src/flow_chat/utils/deepReviewCapacityGuard.ts @@ -0,0 +1,67 @@ +import type { FlowChatState, FlowToolItem, Session } from '../types/flow-chat'; + +export const DEEP_REVIEW_SESSION_CONCURRENCY_WARNING_THRESHOLD = 2; + +export interface DeepReviewSessionConcurrencyGuard { + activeSubagentCount: number; + highActivity: boolean; +} + +const ACTIVE_TOOL_STATUSES = new Set([ + 'pending', + 'preparing', + 'running', + 'streaming', + 'receiving', + 'analyzing', +]); + +function isActiveSubagentTask(item: unknown): item is FlowToolItem { + if (!item || typeof item !== 'object') { + return false; + } + const toolItem = item as FlowToolItem; + if ( + toolItem.type !== 'tool' || + toolItem.toolName !== 'Task' || + !ACTIVE_TOOL_STATUSES.has(toolItem.status) + ) { + return false; + } + + const input = toolItem.toolCall?.input ?? {}; + const subagentType = input.subagent_type ?? 
input.subagentType ?? input.agent_type ?? input.agentType; + return typeof subagentType === 'string' && subagentType.trim().length > 0; +} + +function countActiveSubagentTasks(session?: Session): number { + if (!session) { + return 0; + } + + let count = 0; + for (const turn of session.dialogTurns ?? []) { + for (const round of turn.modelRounds ?? []) { + for (const item of round.items ?? []) { + if (isActiveSubagentTask(item)) { + count += 1; + } + } + } + } + return count; +} + +export function deriveDeepReviewSessionConcurrencyGuard( + state: FlowChatState, + parentSessionId?: string | null, +): DeepReviewSessionConcurrencyGuard { + const activeSubagentCount = countActiveSubagentTasks( + parentSessionId ? state.sessions.get(parentSessionId) : undefined, + ); + + return { + activeSubagentCount, + highActivity: activeSubagentCount >= DEEP_REVIEW_SESSION_CONCURRENCY_WARNING_THRESHOLD, + }; +} diff --git a/src/web-ui/src/flow_chat/utils/deepReviewContinuation.test.ts b/src/web-ui/src/flow_chat/utils/deepReviewContinuation.test.ts index 7080526a5..e41f83849 100644 --- a/src/web-ui/src/flow_chat/utils/deepReviewContinuation.test.ts +++ b/src/web-ui/src/flow_chat/utils/deepReviewContinuation.test.ts @@ -168,6 +168,72 @@ describe('deepReviewContinuation', () => { expect(prompt).toContain('ReviewSecurity: timed_out'); }); + it('tracks reviewer partial timeout output when available', () => { + const session = createDeepReviewSession({ + error: 'Timeout', + dialogTurns: [ + { + id: 'turn-1', + sessionId: 'deep-review-session', + timestamp: 1, + status: 'error', + userMessage: { + id: 'user-1', + content: 'Original command:\n/DeepReview review latest commit', + timestamp: 1, + }, + startTime: 1, + modelRounds: [ + { + id: 'round-1', + index: 0, + startTime: 1, + isStreaming: false, + isComplete: true, + status: 'completed', + items: [ + { + id: 'tool-1', + type: 'tool', + toolName: 'Task', + toolCall: { + id: 'call-security', + input: { subagent_type: 'ReviewSecurity' }, + 
}, + toolResult: { + result: { + status: 'partial_timeout', + partial_output: 'Found one likely token logging issue before timeout.', + }, + success: true, + resultForAssistant: + "Subagent 'ReviewSecurity' timed out with partial result.", + }, + startTime: 1, + timestamp: 1, + status: 'completed', + }, + ], + }, + ], + }, + ], + }); + + const interruption = deriveDeepReviewInterruption(session, { category: 'timeout' }); + const prompt = buildDeepReviewContinuationPrompt(interruption!); + + expect(interruption?.reviewers).toEqual([ + expect.objectContaining({ + reviewer: 'ReviewSecurity', + status: 'partial_timeout', + partialOutput: 'Found one likely token logging issue before timeout.', + }), + ]); + expect(prompt).toContain('ReviewSecurity: partial_timeout'); + expect(prompt).toContain('partial output: Found one likely token logging issue before timeout.'); + }); + it('marks policy-ineligible reviewers as skipped so continuation does not re-run them', () => { const session = createDeepReviewSession({ dialogTurns: [ @@ -287,4 +353,160 @@ describe('deepReviewContinuation', () => { expect(actionCodes).not.toContain('switch_model'); expect(actionCodes).not.toContain('wait_and_retry'); }); + + it('includes retry budget constraints from the persisted run manifest', () => { + const session = createDeepReviewSession({ + error: 'Timeout', + deepReviewRunManifest: { + executionPolicy: { + maxRetriesPerRole: 1, + }, + skippedReviewers: [], + }, + dialogTurns: [ + { + id: 'turn-1', + sessionId: 'deep-review-session', + timestamp: 1, + status: 'error', + userMessage: { + id: 'user-1', + content: 'Original command:\n/DeepReview review latest commit', + timestamp: 1, + }, + startTime: 1, + modelRounds: [ + { + id: 'round-1', + index: 0, + startTime: 1, + isStreaming: false, + isComplete: true, + status: 'completed', + items: [ + { + id: 'tool-1', + type: 'tool', + toolName: 'Task', + toolCall: { + id: 'call-security', + input: { subagent_type: 'ReviewSecurity' }, + }, + 
toolResult: { + result: { status: 'timed_out' }, + success: false, + error: 'Reviewer timed out', + }, + startTime: 1, + timestamp: 1, + status: 'error', + }, + ], + }, + ], + error: 'Timeout', + }, + ], + } as Partial); + + const interruption = deriveDeepReviewInterruption(session, { category: 'timeout' }); + const prompt = buildDeepReviewContinuationPrompt(interruption!); + + expect(prompt).toContain('max_retries_per_role = 1'); + expect(prompt).toContain('retry = true'); + expect(prompt).toContain('reduce the scope'); + }); + + it('includes persisted manifest skips when continuing an interrupted review', () => { + const session = createDeepReviewSession({ + error: 'Timeout', + deepReviewRunManifest: { + skippedReviewers: [ + { + subagentId: 'ReviewFrontend', + displayName: 'Frontend Reviewer', + reason: 'not_applicable', + }, + ], + }, + dialogTurns: [ + { + id: 'turn-1', + sessionId: 'deep-review-session', + timestamp: 1, + status: 'error', + userMessage: { + id: 'user-1', + content: 'Original command:\n/DeepReview review latest commit', + timestamp: 1, + }, + startTime: 1, + modelRounds: [], + error: 'Timeout', + }, + ], + } as Partial); + + const interruption = deriveDeepReviewInterruption(session, { category: 'timeout' }); + const prompt = buildDeepReviewContinuationPrompt(interruption!); + + expect(prompt).toContain('Do not run reviewers skipped as not_applicable.'); + expect(prompt).toContain('ReviewFrontend: skipped (not_applicable)'); + }); + + it('includes incremental cache guidance from the persisted run manifest', () => { + const session = createDeepReviewSession({ + error: 'Timeout', + deepReviewRunManifest: { + incrementalReviewCache: { + source: 'target_manifest', + strategy: 'reuse_completed_packets_when_fingerprint_matches', + cacheKey: 'incremental-review:abc12345', + fingerprint: 'abc12345', + filePaths: [ + 'src/web-ui/src/shared/services/reviewTeamService.ts', + ], + workspaceAreas: ['web-ui'], + reviewerPacketIds: [ + 
'reviewer:ReviewBusinessLogic', + 'reviewer:ReviewSecurity', + ], + lineCount: 128, + lineCountSource: 'diff_stat', + invalidatesOn: [ + 'target_file_set_changed', + 'target_line_count_changed', + 'reviewer_roster_changed', + ], + }, + skippedReviewers: [], + }, + dialogTurns: [ + { + id: 'turn-1', + sessionId: 'deep-review-session', + timestamp: 1, + status: 'error', + userMessage: { + id: 'user-1', + content: 'Original command:\n/DeepReview review latest commit', + timestamp: 1, + }, + startTime: 1, + modelRounds: [], + error: 'Timeout', + }, + ], + } as Partial); + + const interruption = deriveDeepReviewInterruption(session, { category: 'timeout' }); + const prompt = buildDeepReviewContinuationPrompt(interruption!); + + expect(prompt).toContain('Incremental review cache guidance:'); + expect(prompt).toContain('cache_key: incremental-review:abc12345'); + expect(prompt).toContain('fingerprint: abc12345'); + expect(prompt).toContain('Only reuse completed reviewer outputs when the current review target fingerprint still matches.'); + expect(prompt).toContain('reviewer:ReviewBusinessLogic'); + expect(prompt).toContain('target_file_set_changed'); + }); }); diff --git a/src/web-ui/src/flow_chat/utils/deepReviewContinuation.ts b/src/web-ui/src/flow_chat/utils/deepReviewContinuation.ts index 0bdc62579..c30d53206 100644 --- a/src/web-ui/src/flow_chat/utils/deepReviewContinuation.ts +++ b/src/web-ui/src/flow_chat/utils/deepReviewContinuation.ts @@ -6,13 +6,21 @@ import { import type { FlowToolItem, Session } from '../types/flow-chat'; export type DeepReviewContinuationPhase = 'review_interrupted' | 'resume_blocked'; -export type DeepReviewReviewerStatus = 'completed' | 'timed_out' | 'failed' | 'cancelled' | 'skipped' | 'unknown'; +export type DeepReviewReviewerStatus = + | 'completed' + | 'partial_timeout' + | 'timed_out' + | 'failed' + | 'cancelled' + | 'skipped' + | 'unknown'; export interface DeepReviewReviewerProgress { reviewer: string; status: 
DeepReviewReviewerStatus; toolCallId?: string; error?: string; + partialOutput?: string; } export interface DeepReviewInterruption { @@ -24,6 +32,7 @@ export interface DeepReviewInterruption { canResume: boolean; recommendedActions: AiErrorAction[]; reviewers: DeepReviewReviewerProgress[]; + runManifest?: Session['deepReviewRunManifest']; } const RESUME_BLOCKING_CATEGORIES = new Set([ @@ -75,6 +84,7 @@ export function deriveDeepReviewInterruption( canResume, recommendedActions: presentation.actions, reviewers: collectReviewerProgress(session), + runManifest: session.deepReviewRunManifest, }; } @@ -83,10 +93,31 @@ export function buildDeepReviewContinuationPrompt(interruption: DeepReviewInterr ? interruption.reviewers .map((reviewer) => { const suffix = reviewer.error ? ` (${reviewer.error})` : ''; - return `- ${reviewer.reviewer}: ${reviewer.status}${suffix}`; + const partialOutput = reviewer.partialOutput + ? `; partial output: ${reviewer.partialOutput}` + : ''; + return `- ${reviewer.reviewer}: ${reviewer.status}${suffix}${partialOutput}`; }) .join('\n') : '- No reliable reviewer progress was detected. Reconstruct progress from this session before deciding what to rerun.'; + const skippedReviewers = interruption.runManifest?.skippedReviewers ?? []; + const manifestSkippedReviewers = formatManifestSkippedReviewers(skippedReviewers); + const manifestRules = skippedReviewers.some((reviewer) => reviewer.reason === 'not_applicable') + ? [ + '- Do not run reviewers skipped as not_applicable.', + ] + : []; + const manifestBlock = manifestSkippedReviewers.length + ? 
[ + '', + 'Run manifest reviewer skips:', + manifestSkippedReviewers.join('\n'), + ] + : []; + const retryBudgetRules = formatRetryBudgetRules(interruption.runManifest); + const incrementalCacheBlock = formatIncrementalReviewCacheGuidance( + interruption.runManifest, + ); return [ 'Continue the interrupted Deep Review in this same session.', @@ -94,6 +125,8 @@ export function buildDeepReviewContinuationPrompt(interruption: DeepReviewInterr 'Recovery rules:', '- Do not restart completed reviewer work unless the existing result is clearly incomplete or unusable.', '- Do not re-run skipped, non-applicable, or policy-ineligible reviewers; keep them recorded as skipped coverage.', + ...retryBudgetRules, + ...manifestRules, '- Re-run only missing, failed, timed-out, or cancelled reviewers when enough context exists.', '- If reviewer coverage remains incomplete, say that explicitly and mark the final report as lower confidence.', '- Run ReviewJudge before the final submit_code_review result when reviewer findings exist.', @@ -103,6 +136,8 @@ export function buildDeepReviewContinuationPrompt(interruption: DeepReviewInterr '', 'Known reviewer progress:', reviewerLines, + ...manifestBlock, + ...incrementalCacheBlock, '', 'Last error:', `- category: ${interruption.errorDetail.category ?? 
'unknown'}`, @@ -111,6 +146,66 @@ export function buildDeepReviewContinuationPrompt(interruption: DeepReviewInterr ].join('\n'); } +function formatIncrementalReviewCacheGuidance( + runManifest: Session['deepReviewRunManifest'] | undefined, +): string[] { + const cachePlan = runManifest?.incrementalReviewCache; + if (!cachePlan) { + return []; + } + + return [ + '', + 'Incremental review cache guidance:', + `- cache_key: ${cachePlan.cacheKey}`, + `- fingerprint: ${cachePlan.fingerprint}`, + `- strategy: ${cachePlan.strategy}`, + `- reviewer_packet_ids: ${cachePlan.reviewerPacketIds.join(', ') || 'none'}`, + `- invalidates_on: ${cachePlan.invalidatesOn.join(', ') || 'none'}`, + '- Only reuse completed reviewer outputs when the current review target fingerprint still matches.', + '- If any invalidates_on condition changed, rerun affected reviewer packets and explain the fresh review boundary.', + ]; +} + +function formatRetryBudgetRules( + runManifest: Session['deepReviewRunManifest'] | undefined, +): string[] { + const maxRetriesPerRole = runManifest?.executionPolicy?.maxRetriesPerRole; + const baseRules = [ + '- Treat partial_timeout reviewers as preserved partial evidence. Re-run them only when useful evidence is missing or unusable.', + ]; + + if (typeof maxRetriesPerRole !== 'number') { + return [ + ...baseRules, + '- Respect the original retry budget if it is recoverable from context; do not retry the same reviewer repeatedly.', + ]; + } + + if (maxRetriesPerRole <= 0) { + return [ + ...baseRules, + '- Retry budget from manifest: max_retries_per_role = 0. 
Do not re-run failed, timed-out, or partial reviewers automatically; report remaining gaps instead.', + ]; + } + + return [ + ...baseRules, + `- Retry budget from manifest: max_retries_per_role = ${maxRetriesPerRole}.`, + '- For each retry, use the same subagent_type with retry = true, reduce the scope to missing evidence, downgrade strategy when possible, and use a shorter timeout.', + ]; +} + +function formatManifestSkippedReviewers( + skippedReviewers: NonNullable['skippedReviewers'], +): string[] { + return skippedReviewers.map((reviewer) => { + const reviewerName = reviewer.subagentId || reviewer.displayName; + const reason = reviewer.reason ?? 'unknown'; + return `- ${reviewerName}: skipped (${reason})`; + }); +} + function findOriginalTarget(session: Session): string { const firstTurn = session.dialogTurns[0]; return firstTurn?.userMessage?.content?.trim() || 'Unknown Deep Review target.'; @@ -151,8 +246,12 @@ function getReviewerProgressFromTask(item: FlowToolItem): DeepReviewReviewerProg } const error = item.toolResult?.error; + const resultStatus = String(item.toolResult?.result?.status ?? '').trim(); + const partialOutput = getPartialOutput(item); let status: DeepReviewReviewerStatus = 'unknown'; - if (item.toolResult?.success === true || item.status === 'completed') { + if (resultStatus === 'partial_timeout' || /partial[_ -]?timeout/i.test(error ?? '')) { + status = 'partial_timeout'; + } else if (item.toolResult?.success === true || item.status === 'completed') { status = 'completed'; } else if (/timeout|timed out/i.test(error ?? 
'')) { status = 'timed_out'; @@ -172,6 +271,7 @@ function getReviewerProgressFromTask(item: FlowToolItem): DeepReviewReviewerProg status, toolCallId: item.toolCall.id, error, + partialOutput, }; } @@ -181,3 +281,9 @@ function isPolicyIneligibleReviewerError(error?: string): boolean { } return /DeepReview Task policy violation|deep_review_subagent_(?:not_review|not_allowed|not_readonly)/i.test(error); } + +function getPartialOutput(item: FlowToolItem): string | undefined { + const result = item.toolResult?.result; + const value = result?.partial_output ?? result?.partialOutput; + return typeof value === 'string' && value.trim() ? value.trim() : undefined; +} diff --git a/src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.test.ts b/src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.test.ts new file mode 100644 index 000000000..39b3bfb95 --- /dev/null +++ b/src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it } from 'vitest'; +import type { DeepReviewQueueStateChangedEvent } from '@/infrastructure/api/service-api/AgentAPI'; +import type { Session } from '../types/flow-chat'; +import { buildDeepReviewCapacityQueueStateFromEvent } from './deepReviewQueueStateEvents'; + +function createQueueEvent( + overrides: Partial = {}, +): DeepReviewQueueStateChangedEvent { + return { + sessionId: 'review-child', + turnId: 'turn-1', + queueState: { + toolId: 'task-1', + subagentType: 'ReviewSecurity', + status: 'queued_for_capacity', + reason: 'provider_concurrency_limit', + queuedReviewerCount: 2, + activeReviewerCount: 1, + effectiveParallelInstances: 2, + optionalReviewerCount: 1, + queueElapsedMs: 1200, + maxQueueWaitSeconds: 60, + sessionConcurrencyHigh: true, + }, + ...overrides, + }; +} + +function createSession(sessionKind: Session['sessionKind']): Session { + return { + sessionId: 'review-child', + sessionKind, + status: 'active', + createdAt: 1000, + updatedAt: 1000, + lastActiveAt: 1000, + 
dialogTurns: [], + } as Session; +} + +describe('buildDeepReviewCapacityQueueStateFromEvent', () => { + it('maps backend queue events into the action bar queue state for Deep Review sessions', () => { + const state = buildDeepReviewCapacityQueueStateFromEvent( + createQueueEvent(), + createSession('deep_review'), + ); + + expect(state).toEqual({ + toolId: 'task-1', + subagentType: 'ReviewSecurity', + dialogTurnId: 'turn-1', + status: 'queued_for_capacity', + queuedReviewerCount: 2, + activeReviewerCount: 1, + effectiveParallelInstances: 2, + optionalReviewerCount: 1, + queueElapsedMs: 1200, + runElapsedMs: undefined, + maxQueueWaitSeconds: 60, + sessionConcurrencyHigh: true, + controlMode: 'backend', + }); + }); + + it('ignores queue events for non-Deep Review sessions', () => { + const state = buildDeepReviewCapacityQueueStateFromEvent( + createQueueEvent(), + createSession('normal'), + ); + + expect(state).toBeNull(); + }); +}); diff --git a/src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.ts b/src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.ts new file mode 100644 index 000000000..4d4019a7c --- /dev/null +++ b/src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.ts @@ -0,0 +1,33 @@ +import type { DeepReviewQueueStateChangedEvent } from '@/infrastructure/api/service-api/AgentAPI'; +import type { DeepReviewCapacityQueueState } from '../store/deepReviewActionBarStore'; +import type { Session } from '../types/flow-chat'; + +export function buildDeepReviewCapacityQueueStateFromEvent( + event: DeepReviewQueueStateChangedEvent, + session: Session | undefined, +): DeepReviewCapacityQueueState | null { + if (session?.sessionKind !== 'deep_review') { + return null; + } + + const queueState = event.queueState; + if (!queueState) { + return null; + } + + return { + toolId: queueState.toolId, + subagentType: queueState.subagentType, + dialogTurnId: event.turnId, + status: queueState.status, + queuedReviewerCount: Math.max(0, 
queueState.queuedReviewerCount ?? 0), + activeReviewerCount: queueState.activeReviewerCount, + effectiveParallelInstances: queueState.effectiveParallelInstances, + optionalReviewerCount: queueState.optionalReviewerCount, + queueElapsedMs: queueState.queueElapsedMs, + runElapsedMs: queueState.runElapsedMs, + maxQueueWaitSeconds: queueState.maxQueueWaitSeconds, + sessionConcurrencyHigh: queueState.sessionConcurrencyHigh, + controlMode: 'backend', + }; +} diff --git a/src/web-ui/src/flow_chat/utils/sessionMetadata.test.ts b/src/web-ui/src/flow_chat/utils/sessionMetadata.test.ts index dd4c721a6..3d1f5ff5f 100644 --- a/src/web-ui/src/flow_chat/utils/sessionMetadata.test.ts +++ b/src/web-ui/src/flow_chat/utils/sessionMetadata.test.ts @@ -357,6 +357,27 @@ describe('sessionMetadata', () => { }); }); + it('persists the Deep Review run manifest from the runtime session', () => { + const runManifest = { + reviewMode: 'deep', + skippedReviewers: [ + { + subagentId: 'ReviewFrontend', + displayName: 'Frontend Reviewer', + reason: 'not_applicable', + }, + ], + }; + const session = createSession({ + sessionKind: 'deep_review', + deepReviewRunManifest: runManifest, + } as Partial); + + const metadata = buildSessionMetadata(session); + + expect(metadata.deepReviewRunManifest).toBe(runManifest); + }); + describe('unread completion persistence', () => { it('persists unreadCompletion from session to metadata', () => { const session = createSession({ diff --git a/src/web-ui/src/flow_chat/utils/sessionMetadata.ts b/src/web-ui/src/flow_chat/utils/sessionMetadata.ts index 84dc33b43..f32882734 100644 --- a/src/web-ui/src/flow_chat/utils/sessionMetadata.ts +++ b/src/web-ui/src/flow_chat/utils/sessionMetadata.ts @@ -261,6 +261,7 @@ export function buildSessionMetadata( | 'titleI18nParams' | 'hasUnreadCompletion' | 'needsUserAttention' + | 'deepReviewRunManifest' >, existingMetadata?: SessionMetadata | null ): SessionMetadata { @@ -317,5 +318,7 @@ export function buildSessionMetadata( // 
`undefined ?? existingMetadata.unreadCompletion` would restore the old value. unreadCompletion: session.hasUnreadCompletion, needsUserAttention: session.needsUserAttention, + deepReviewRunManifest: + session.deepReviewRunManifest ?? existingMetadata?.deepReviewRunManifest, }; } diff --git a/src/web-ui/src/infrastructure/api/service-api/ACPClientAPI.ts b/src/web-ui/src/infrastructure/api/service-api/ACPClientAPI.ts index 9db847132..05a9d1d89 100644 --- a/src/web-ui/src/infrastructure/api/service-api/ACPClientAPI.ts +++ b/src/web-ui/src/infrastructure/api/service-api/ACPClientAPI.ts @@ -1,4 +1,5 @@ import { api } from './ApiClient'; +import type { ImageContextData as ImageInputContextData } from './ImageContextTypes'; export type AcpClientPermissionMode = 'ask' | 'allow_once' | 'reject_once'; export type AcpClientStatus = 'configured' | 'starting' | 'running' | 'stopped' | 'failed'; @@ -60,6 +61,8 @@ export interface StartAcpDialogTurnRequest { remoteConnectionId?: string; remoteSshHost?: string; timeoutSeconds?: number; + imageContexts?: ImageInputContextData[]; + userMessageMetadata?: Record; } export interface CancelAcpDialogTurnRequest { diff --git a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts index 91fa656ba..9d7189fc2 100644 --- a/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts +++ b/src/web-ui/src/infrastructure/api/service-api/AgentAPI.ts @@ -62,6 +62,7 @@ export interface StartDialogTurnRequest { workspacePath?: string; /** Optional multimodal image contexts (snake_case fields, aligned with backend ImageContextData). 
*/ imageContexts?: ImageInputContextData[]; + userMessageMetadata?: Record; } export interface CompactSessionRequest { @@ -155,6 +156,51 @@ export interface ToolEvent extends AgenticEvent { subagentParentInfo?: SubagentParentInfo; } +export type DeepReviewQueueStatus = + | 'queued_for_capacity' + | 'paused_by_user' + | 'running' + | 'capacity_skipped'; + +export type DeepReviewQueueReason = + | 'provider_rate_limit' + | 'provider_concurrency_limit' + | 'retry_after' + | 'local_concurrency_cap' + | 'temporary_overload'; + +export interface DeepReviewQueueStateEventData { + toolId: string; + subagentType: string; + status: DeepReviewQueueStatus; + reason?: DeepReviewQueueReason; + queuedReviewerCount: number; + activeReviewerCount?: number; + effectiveParallelInstances?: number; + optionalReviewerCount?: number; + queueElapsedMs?: number; + runElapsedMs?: number; + maxQueueWaitSeconds?: number; + sessionConcurrencyHigh?: boolean; +} + +export interface DeepReviewQueueStateChangedEvent extends AgenticEvent { + queueState: DeepReviewQueueStateEventData; +} + +export type DeepReviewQueueControlAction = + | 'pause' + | 'continue' + | 'cancel' + | 'skip_optional'; + +export interface DeepReviewQueueControlRequest { + sessionId: string; + dialogTurnId: string; + toolId: string; + action: DeepReviewQueueControlAction; +} + export interface ImageAnalysisEvent extends AgenticEvent { imageCount?: number; @@ -267,6 +313,14 @@ export class AgentAPI { } } + async controlDeepReviewQueue(request: DeepReviewQueueControlRequest): Promise { + try { + await api.invoke('control_deep_review_queue', { request }); + } catch (error) { + throw createTauriCommandError('control_deep_review_queue', error, request); + } + } + async deleteSession( sessionId: string, @@ -419,6 +473,15 @@ export class AgentAPI { return api.listen('agentic://tool-event', callback); } + onDeepReviewQueueStateChanged( + callback: (event: DeepReviewQueueStateChangedEvent) => void + ): () => void { + return api.listen( 
+ 'agentic://deep-review-queue-state-changed', + callback + ); + } + onDialogTurnCompleted(callback: (event: AgenticEvent) => void): () => void { return api.listen('agentic://dialog-turn-completed', callback); @@ -477,7 +540,14 @@ export class AgentAPI { } } - + async getDefaultReviewTeamDefinition(): Promise { + try { + return await api.invoke('get_default_review_team_definition'); + } catch (error) { + throw createTauriCommandError('get_default_review_team_definition', error); + } + } + async generateSessionTitle( sessionId: string, userMessage: string, diff --git a/src/web-ui/src/infrastructure/api/service-api/GitAPI.ts b/src/web-ui/src/infrastructure/api/service-api/GitAPI.ts index 543971783..fd5591fae 100644 --- a/src/web-ui/src/infrastructure/api/service-api/GitAPI.ts +++ b/src/web-ui/src/infrastructure/api/service-api/GitAPI.ts @@ -89,11 +89,35 @@ export interface GitPullParams { } export interface GitDiffParams { + source?: string; + target?: string; + files?: string[]; + stat?: boolean; filePath?: string; staged?: boolean; commit?: string; } +export interface GitChangedFilesParams { + source?: string; + target?: string; + staged?: boolean; +} + +export type GitChangedFileStatus = + | 'added' + | 'modified' + | 'deleted' + | 'renamed' + | 'copied' + | 'unknown'; + +export interface GitChangedFile { + path: string; + old_path?: string; + status: GitChangedFileStatus; +} + export interface GitLogParams { maxCount?: number; since?: string; @@ -330,6 +354,17 @@ export class GitAPI { } + async getChangedFiles(repositoryPath: string, params: GitChangedFilesParams): Promise { + try { + return await api.invoke('git_get_changed_files', { + request: { repositoryPath, params } + }); + } catch (error) { + throw createTauriCommandError('git_get_changed_files', error, { repositoryPath, params }); + } + } + + async resetFiles(repositoryPath: string, files: string[], staged: boolean = false): Promise { try { return await api.invoke('git_reset_files', { @@ -435,4 +470,4 @@ 
export class GitAPI { } -export const gitAPI = new GitAPI(); \ No newline at end of file +export const gitAPI = new GitAPI(); diff --git a/src/web-ui/src/locales/en-US/flow-chat.json b/src/web-ui/src/locales/en-US/flow-chat.json index a4e397b81..e9fa69c44 100644 --- a/src/web-ui/src/locales/en-US/flow-chat.json +++ b/src/web-ui/src/locales/en-US/flow-chat.json @@ -396,6 +396,7 @@ "fixFailed": "Fix failed", "fixTimeout": "Fix timed out", "fixInterrupted": "Fix was interrupted. {{count}} items remain.", + "reviewWaitingCapacity": "Review queue waiting", "continueFix": "Continue fixing {{count}} items", "skipRemaining": "Skip remaining", "reviewError": "Review error", @@ -457,6 +458,18 @@ "recoveryPreserve": "{{count}} completed reviewers will be preserved", "recoveryRerun": "{{count}} reviewers will be rerun", "recoverySkip": "{{count}} reviewers will be skipped", + "capacityQueue": { + "title": "Reviewers waiting for capacity", + "pausedTitle": "Queue paused", + "detail": "Queue wait does not count against reviewer runtime.", + "sessionBusy": "Your active session is busy. Pause Deep Review or continue later.", + "stopHint": "Use Stop to interrupt this review queue.", + "pauseQueue": "Pause queue", + "continueQueue": "Continue queue", + "cancelQueued": "Cancel queued reviewers", + "skipOptionalQueued": "Skip optional extras", + "controlFailed": "Queue control failed. Please try again or stop the review." + }, "degradation": { "reduceReviewers": "Run with core reviewers only", "reduceReviewersDesc": "Skip extra reviewers and keep only core reviewers", @@ -487,9 +500,11 @@ "windowTitle": "Deep Review", "eyebrow": "Code review team", "title": "Start Deep Review?", - "body": "Deep Review launches a parallel code review team. 
It can take longer and consume noticeably more tokens than a standard review.", + "body": "Deep Review launches multiple reviewers and can take longer or use more tokens than a standard review.", "readonlyLabel": "Read-only first pass", - "readonly": "The first pass is read-only. It will produce findings and a remediation plan before any code changes.", + "readonly": "The first pass reports findings and a remediation plan before any code changes.", + "sessionConcurrencyTitle": "Active session is busy", + "sessionConcurrencyBody": "The target session already has {{count}} running subagent tasks. Choose a lighter strategy, cancel for now, or continue manually when capacity is free.", "costLabel": "Higher token usage", "cost": "Expected cost: multiple reviewers plus a quality gate, so runtime and token use may be several times higher than a standard review.", "timeLabel": "Longer runtime", @@ -497,7 +512,71 @@ "dontShowAgain": "Do not show this again", "cancel": "Cancel", "confirm": "Start Deep Review", - "estimatedTokens": "Estimated: {{min}} - {{max}} tokens" + "estimatedTokens": "Estimated: {{min}} - {{max}} tokens", + "runStrategy": "Run strategy: {{strategy}}", + "recommendedStrategy": "Recommended strategy: {{strategy}}", + "recommendationTitle": "Risk recommendation", + "strategyOverrideTitle": "Run strategy", + "strategyOverrideBody": "Choose a project-specific strategy for this launch.", + "teamDefaultStrategy": "Team default", + "strategyLabels": { + "quick": "Quick", + "normal": "Normal", + "deep": "Deep" + }, + "lineupTitle": "Review lineup", + "summaryTitle": "Launch summary", + "targetFiles": "{{count}} files", + "targetFiles_one": "{{count}} file", + "targetFiles_other": "{{count}} files", + "targetRiskTags": "Risk areas: {{tags}}", + "targetTagsWithMore": "{{tags}} +{{count}} more", + "targetTagsWithMore_one": "{{tags}} +{{count}} more", + "targetTagsWithMore_other": "{{tags}} +{{count}} more", + "optionalReviewers": "{{count}} optional reviewers", + 
"optionalReviewers_one": "{{count}} optional reviewer", + "optionalReviewers_other": "{{count}} optional reviewers", + "summaryFirstReview": "Summary-first coverage", + "targetTagLabels": { + "frontendUi": "Frontend UI", + "frontendStyle": "Frontend styles", + "frontendI18n": "Frontend i18n", + "frontendContract": "Frontend contract", + "desktopContract": "Desktop contract", + "webServerContract": "Web server contract", + "backendCore": "Backend core", + "transport": "Transport", + "apiLayer": "API layer", + "aiAdapter": "AI adapter", + "installerUi": "Installer UI", + "test": "Tests", + "docs": "Docs", + "config": "Config", + "generatedOrLock": "Generated or lockfile", + "unknown": "Unknown area" + }, + "estimatedCalls": "{{count}} reviewer calls", + "estimatedCalls_one": "{{count}} reviewer call", + "estimatedCalls_other": "{{count}} reviewer calls", + "activeReviewers": "{{count}} active", + "activeReviewers_one": "{{count}} active", + "activeReviewers_other": "{{count}} active", + "skippedReviewers": "{{count}} skipped", + "skippedReviewers_one": "{{count}} skipped", + "skippedReviewers_other": "{{count}} skipped", + "skippedMore": "+{{count}} more", + "skippedMore_one": "+{{count}} more", + "skippedMore_other": "+{{count}} more", + "activeGroupTitle": "Will run", + "skippedGroupTitle": "Skipped reviewers", + "skippedReasons": { + "notApplicable": "Not applicable to this target", + "budgetLimited": "Limited by token budget", + "invalidTooling": "Configuration issue", + "disabled": "Disabled", + "unavailable": "Unavailable", + "skipped": "Skipped" + } }, "flowChatHeader": { "turnList": "Turn list", @@ -1180,8 +1259,55 @@ "partial": "partial", "unknown": "unknown" }, + "runManifest": { + "recommendedStrategy": "Recommended strategy", + "riskRecommendationTitle": "Risk recommendation" + }, "sectionItemCount": "{{count}} items", "remediationPlan": "Remediation Plan", + "reliabilityStatus": { + "title": "Review status", + "context_pressure": { + "label": "Context 
pressure rising", + "detail": "{{count}} reviewer calls planned for a large or constrained target." + }, + "compression_preserved": { + "label": "Compression preserved key facts", + "detail": "Coverage notes include preserved context from compression." + }, + "cache_hit": { + "label": "Incremental cache reused reviewer output", + "detail": "{{count}} reviewer packet reused matching cached output." + }, + "cache_miss": { + "label": "Incremental cache missed or refreshed", + "detail": "{{count}} reviewer packet ran fresh or refreshed stale cache." + }, + "concurrency_limited": { + "label": "Reviewer launch was concurrency-limited", + "detail": "{{count}} reviewer launch hit a concurrency cap." + }, + "partial_reviewer": { + "label": "Reviewer timed out with partial result", + "detail": "{{count}} reviewer result is partial; confidence is reduced." + }, + "retry_guidance": { + "label": "Retry guidance emitted", + "detail": "{{count}} retry guidance item was emitted for partial review coverage." + }, + "skipped_reviewers": { + "label": "Skipped reviewers", + "detail": "{{count}} reviewer was skipped by applicability, configuration, or budget." + }, + "token_budget_limited": { + "label": "Token budget limited reviewer coverage", + "detail": "{{count}} reviewer was skipped by token budget mode." + }, + "user_decision": { + "label": "User decision needed", + "detail": "{{count}} review item needs your decision before fixing." + } + }, "sections": { "summary": "Executive Summary", "issues": "Issues", @@ -1210,8 +1336,11 @@ "noIssues": "No validated issues.", "status": "Status", "findings": "Findings", + "packet": "Packet", + "partialOutput": "Partial output", "validation": "Validation", "source": "Source", + "reliabilitySignals": "Review Reliability", "noItems": "None." 
}, "export": { diff --git a/src/web-ui/src/locales/zh-CN/flow-chat.json b/src/web-ui/src/locales/zh-CN/flow-chat.json index ad99158c0..25476ec84 100644 --- a/src/web-ui/src/locales/zh-CN/flow-chat.json +++ b/src/web-ui/src/locales/zh-CN/flow-chat.json @@ -396,6 +396,7 @@ "fixFailed": "修复失败", "fixTimeout": "修复超时", "fixInterrupted": "修复已中断,还剩 {{count}} 项。", + "reviewWaitingCapacity": "审核队列等待中", "continueFix": "继续修复 {{count}} 项", "skipRemaining": "跳过剩余项", "reviewError": "审核出错", @@ -457,6 +458,18 @@ "recoveryPreserve": "{{count}} 个已完成的 reviewer 将保留", "recoveryRerun": "{{count}} 个 reviewer 将重新运行", "recoverySkip": "{{count}} 个 reviewer 将被跳过", + "capacityQueue": { + "title": "审核员正在等待容量", + "pausedTitle": "队列已暂停", + "detail": "排队等待时间不会计入审核员运行时长。", + "sessionBusy": "当前会话较忙,可暂停 Deep Review 或稍后继续。", + "stopHint": "可使用停止按钮中断当前审核队列。", + "pauseQueue": "暂停队列", + "continueQueue": "继续队列", + "cancelQueued": "取消排队审核员", + "skipOptionalQueued": "跳过可选扩展", + "controlFailed": "队列控制失败,请重试或停止审核。" + }, "degradation": { "reduceReviewers": "仅运行核心 reviewer", "reduceReviewersDesc": "跳过额外 reviewer,只保留核心 reviewer", @@ -487,9 +500,11 @@ "windowTitle": "深度审核", "eyebrow": "代码审核团队", "title": "开始深度审核?", - "body": "深度审核会启动并行代码审核团队,通常比普通审核耗时更久,也会消耗更多 Token。", + "body": "深度审核会启动多个审核员,通常比普通审核耗时更久,也会消耗更多 Token。", "readonlyLabel": "首次只读", - "readonly": "首次审核默认只读:它会先输出问题和修复计划,不会直接修改代码。", + "readonly": "首次审核会先输出问题和修复计划,不会直接修改代码。", + "sessionConcurrencyTitle": "当前会话较忙", + "sessionConcurrencyBody": "目标会话已有 {{count}} 个 subagent 任务在运行。可选择更轻量策略、暂时取消,或在容量空闲后手动继续。", "costLabel": "Token 消耗更高", "cost": "预期消耗:多个审核角色加质量门禁,运行时间和 Token 可能达到普通审核的数倍。", "timeLabel": "耗时更久", @@ -497,7 +512,71 @@ "dontShowAgain": "下次不再提示", "cancel": "取消", "confirm": "开始深度审核", - "estimatedTokens": "预计消耗:{{min}} - {{max}} tokens" + "estimatedTokens": "预计消耗:{{min}} - {{max}} tokens", + "runStrategy": "运行策略:{{strategy}}", + "recommendedStrategy": "推荐策略:{{strategy}}",
"recommendationTitle": "风险推荐", + "strategyOverrideTitle": "运行策略", + "strategyOverrideBody": "为本项目的本次启动选择审核策略。", + "teamDefaultStrategy": "团队默认", + "strategyLabels": { + "quick": "快速", + "normal": "标准", + "deep": "深度" + }, + "lineupTitle": "审核阵容", + "summaryTitle": "启动摘要", + "targetFiles": "{{count}} 个文件", + "targetFiles_one": "{{count}} 个文件", + "targetFiles_other": "{{count}} 个文件", + "targetRiskTags": "风险区域:{{tags}}", + "targetTagsWithMore": "{{tags}},另有 {{count}} 个", + "targetTagsWithMore_one": "{{tags}},另有 {{count}} 个", + "targetTagsWithMore_other": "{{tags}},另有 {{count}} 个", + "optionalReviewers": "{{count}} 个可选审核者", + "optionalReviewers_one": "{{count}} 个可选审核者", + "optionalReviewers_other": "{{count}} 个可选审核者", + "summaryFirstReview": "先摘要再覆盖", + "targetTagLabels": { + "frontendUi": "前端 UI", + "frontendStyle": "前端样式", + "frontendI18n": "前端国际化", + "frontendContract": "前端契约", + "desktopContract": "桌面契约", + "webServerContract": "Web 服务契约", + "backendCore": "后端核心", + "transport": "传输层", + "apiLayer": "API 层", + "aiAdapter": "AI 适配器", + "installerUi": "安装器 UI", + "test": "测试", + "docs": "文档", + "config": "配置", + "generatedOrLock": "生成文件或锁文件", + "unknown": "未知区域" + }, + "estimatedCalls": "{{count}} 次审核调用", + "estimatedCalls_one": "{{count}} 次审核调用", + "estimatedCalls_other": "{{count}} 次审核调用", + "activeReviewers": "{{count}} 个已启用", + "activeReviewers_one": "{{count}} 个已启用", + "activeReviewers_other": "{{count}} 个已启用", + "skippedReviewers": "{{count}} 个已跳过", + "skippedReviewers_one": "{{count}} 个已跳过", + "skippedReviewers_other": "{{count}} 个已跳过", + "skippedMore": "另有 {{count}} 个", + "skippedMore_one": "另有 {{count}} 个", + "skippedMore_other": "另有 {{count}} 个", + "activeGroupTitle": "将运行", + "skippedGroupTitle": "已跳过的审核者", + "skippedReasons": { + "notApplicable": "不适用于本次目标", + "budgetLimited": "受 Token 预算限制", + "invalidTooling": "配置问题", + "disabled": "已停用", + "unavailable": "不可用", + "skipped": "已跳过" + } }, "flowChatHeader": { "turnList": "轮次列表", @@ -1180,8 +1259,55 @@ 
"partial": "部分完成", "unknown": "未知" }, + "runManifest": { + "recommendedStrategy": "推荐策略", + "riskRecommendationTitle": "风险推荐" + }, "sectionItemCount": "{{count}} 项", "remediationPlan": "修复计划", + "reliabilityStatus": { + "title": "审核状态", + "context_pressure": { + "label": "上下文压力上升", + "detail": "大型或受限目标预计会调用 {{count}} 次审核者。" + }, + "compression_preserved": { + "label": "压缩已保留关键事实", + "detail": "覆盖说明包含压缩保留下来的上下文。" + }, + "cache_hit": { + "label": "增量缓存复用了审核结果", + "detail": "{{count}} 个审核包复用了匹配的缓存输出。" + }, + "cache_miss": { + "label": "增量缓存未命中或已刷新", + "detail": "{{count}} 个审核包重新执行或刷新了过期缓存。" + }, + "concurrency_limited": { + "label": "审核员启动受到并发限制", + "detail": "{{count}} 次审核员启动触发了并发上限。" + }, + "partial_reviewer": { + "label": "审核者超时但有部分结果", + "detail": "{{count}} 个审核者结果是部分结果,置信度已降低。" + }, + "retry_guidance": { + "label": "已给出重试指引", + "detail": "{{count}} 条重试指引用于补足部分审核覆盖。" + }, + "skipped_reviewers": { + "label": "已跳过的审核员", + "detail": "{{count}} 个审核员因适用性、配置或预算被跳过。" + }, + "token_budget_limited": { + "label": "Token 预算限制了审核覆盖", + "detail": "{{count}} 个审核员因 Token 预算模式被跳过。" + }, + "user_decision": { + "label": "需要用户决策", + "detail": "{{count}} 个审核项需要先决策再修复。" + } + }, "sections": { "summary": "结论摘要", "issues": "问题列表", @@ -1210,8 +1336,11 @@ "noIssues": "没有已验证问题。", "status": "状态", "findings": "问题数", + "packet": "分包", + "partialOutput": "部分输出", "validation": "验证说明", "source": "来源", + "reliabilitySignals": "审核可靠性", "noItems": "无。" }, "export": { diff --git a/src/web-ui/src/locales/zh-TW/flow-chat.json b/src/web-ui/src/locales/zh-TW/flow-chat.json index 2eda5ee9e..644471964 100644 --- a/src/web-ui/src/locales/zh-TW/flow-chat.json +++ b/src/web-ui/src/locales/zh-TW/flow-chat.json @@ -396,6 +396,7 @@ "fixFailed": "修復失敗", "fixTimeout": "修復超時", "fixInterrupted": "修復已中斷,還剩 {{count}} 項。", + "reviewWaitingCapacity": "審核佇列等待中", "continueFix": "繼續修復 {{count}} 項", "skipRemaining": "跳過剩餘項", "reviewError": "審核出錯", @@ -457,6 +458,18 @@ "recoveryPreserve": "{{count}} 個已完成的 reviewer 將保留", 
"recoveryRerun": "{{count}} 個 reviewer 將重新運行", + "recoverySkip": "{{count}} 個 reviewer 將被跳過", + "capacityQueue": { + "title": "審核員正在等待容量", + "pausedTitle": "佇列已暫停", + "detail": "排隊等待時間不會計入審核員執行時長。", + "sessionBusy": "目前會話較忙,可暫停 Deep Review 或稍後繼續。", + "stopHint": "可使用停止按鈕中斷目前審核佇列。", + "pauseQueue": "暫停佇列", + "continueQueue": "繼續佇列", + "cancelQueued": "取消排隊審核員", + "skipOptionalQueued": "略過選用擴充", + "controlFailed": "佇列控制失敗,請重試或停止審核。" + }, "degradation": { "reduceReviewers": "僅運行核心 reviewer", "reduceReviewersDesc": "跳過額外 reviewer,只保留核心 reviewer", @@ -487,9 +500,11 @@ "windowTitle": "深度審核", "eyebrow": "程式碼審核團隊", "title": "開始深度審核?", - "body": "深度審核會啟動並行程式碼審核團隊,通常比普通審核耗時更久,也會消耗更多 Token。", + "body": "深度審核會啟動多個審核員,通常比普通審核耗時更久,也會消耗更多 Token。", "readonlyLabel": "首次只讀", - "readonly": "首次審核默認只讀:它會先輸出問題和修復計劃,不會直接修改代碼。", + "readonly": "首次審核會先輸出問題和修復計劃,不會直接修改程式碼。", + "sessionConcurrencyTitle": "目前會話較忙", + "sessionConcurrencyBody": "目標會話已有 {{count}} 個 subagent 任務在執行。可選擇更輕量策略、暫時取消,或在容量空閒後手動繼續。", "costLabel": "Token 消耗更高", "cost": "預期消耗:多個審核角色加質量門禁,運行時間和 Token 可能達到普通審核的數倍。", "timeLabel": "耗時更久", @@ -497,7 +512,71 @@ "dontShowAgain": "下次不再提示", "cancel": "取消", "confirm": "開始深度審核", - "estimatedTokens": "預計消耗:{{min}} - {{max}} tokens" + "estimatedTokens": "預計消耗:{{min}} - {{max}} tokens", + "runStrategy": "運行策略:{{strategy}}", + "recommendedStrategy": "推薦策略:{{strategy}}", + "recommendationTitle": "風險推薦", + "strategyOverrideTitle": "運行策略", + "strategyOverrideBody": "為本專案的本次啟動選擇審核策略。", + "teamDefaultStrategy": "團隊預設", + "strategyLabels": { + "quick": "快速", + "normal": "標準", + "deep": "深度" + }, + "lineupTitle": "審核陣容", + "summaryTitle": "啟動摘要", + "targetFiles": "{{count}} 個檔案", + "targetFiles_one": "{{count}} 個檔案", + "targetFiles_other": "{{count}} 個檔案", + "targetRiskTags": "風險區域:{{tags}}", + "targetTagsWithMore": "{{tags}},另有 {{count}} 個", + "targetTagsWithMore_one": "{{tags}},另有 {{count}} 個",
"targetTagsWithMore_other": "{{tags}},另有 {{count}} 個", + "optionalReviewers": "{{count}} 個可選審核者", + "optionalReviewers_one": "{{count}} 個可選審核者", + "optionalReviewers_other": "{{count}} 個可選審核者", + "summaryFirstReview": "先摘要再覆蓋", + "targetTagLabels": { + "frontendUi": "前端 UI", + "frontendStyle": "前端樣式", + "frontendI18n": "前端國際化", + "frontendContract": "前端契約", + "desktopContract": "桌面契約", + "webServerContract": "Web 服務契約", + "backendCore": "後端核心", + "transport": "傳輸層", + "apiLayer": "API 層", + "aiAdapter": "AI 介接器", + "installerUi": "安裝器 UI", + "test": "測試", + "docs": "文件", + "config": "設定", + "generatedOrLock": "生成檔或鎖定檔", + "unknown": "未知區域" + }, + "estimatedCalls": "{{count}} 次審核調用", + "estimatedCalls_one": "{{count}} 次審核調用", + "estimatedCalls_other": "{{count}} 次審核調用", + "activeReviewers": "{{count}} 個已啟用", + "activeReviewers_one": "{{count}} 個已啟用", + "activeReviewers_other": "{{count}} 個已啟用", + "skippedReviewers": "{{count}} 個已跳過", + "skippedReviewers_one": "{{count}} 個已跳過", + "skippedReviewers_other": "{{count}} 個已跳過", + "skippedMore": "另有 {{count}} 個", + "skippedMore_one": "另有 {{count}} 個", + "skippedMore_other": "另有 {{count}} 個", + "activeGroupTitle": "將運行", + "skippedGroupTitle": "已跳過的審核者", + "skippedReasons": { + "notApplicable": "不適用於本次目標", + "budgetLimited": "受 Token 預算限制", + "invalidTooling": "設定問題", + "disabled": "已停用", + "unavailable": "不可用", + "skipped": "已跳過" + } }, "flowChatHeader": { "turnList": "輪次列表", @@ -1180,8 +1259,55 @@ "partial": "部分完成", "unknown": "未知" }, + "runManifest": { + "recommendedStrategy": "推薦策略", + "riskRecommendationTitle": "風險推薦" + }, "sectionItemCount": "{{count}} 項", "remediationPlan": "修復計劃", + "reliabilityStatus": { + "title": "審核狀態", + "context_pressure": { + "label": "上下文壓力上升", + "detail": "大型或受限目標預計會呼叫 {{count}} 次審核者。" + }, + "compression_preserved": { + "label": "壓縮已保留關鍵事實", + "detail": "覆蓋說明包含壓縮保留下來的上下文。" + }, + "cache_hit": { + "label": "增量快取重用了審核結果", + "detail": "{{count}} 個審核包重用了相符的快取輸出。" + }, + "cache_miss": { + 
"label": "增量快取未命中或已刷新", + "detail": "{{count}} 個審核包重新執行或刷新了過期快取。" + }, + "concurrency_limited": { + "label": "審核員啟動受到並行限制", + "detail": "{{count}} 次審核員啟動觸發了並行上限。" + }, + "partial_reviewer": { + "label": "審核者逾時但有部分結果", + "detail": "{{count}} 個審核者結果是部分結果,信心已降低。" + }, + "retry_guidance": { + "label": "已給出重試指引", + "detail": "{{count}} 條重試指引用於補足部分審核覆蓋。" + }, + "skipped_reviewers": { + "label": "已略過的審核員", + "detail": "{{count}} 個審核員因適用性、設定或預算被略過。" + }, + "token_budget_limited": { + "label": "Token 預算限制了審核覆蓋", + "detail": "{{count}} 個審核員因 Token 預算模式被略過。" + }, + "user_decision": { + "label": "需要使用者決策", + "detail": "{{count}} 個審核項需要先決策再修復。" + } + }, "sections": { "summary": "結論摘要", "issues": "問題列表", @@ -1210,8 +1336,11 @@ "noIssues": "沒有已驗證問題。", "status": "狀態", "findings": "問題數", + "packet": "分包", + "partialOutput": "部分輸出", "validation": "驗證說明", "source": "來源", + "reliabilitySignals": "審核可靠性", "noItems": "無。" }, "export": { diff --git a/src/web-ui/src/shared/services/reviewSubagentCapabilities.ts b/src/web-ui/src/shared/services/reviewSubagentCapabilities.ts new file mode 100644 index 000000000..9301a38cc --- /dev/null +++ b/src/web-ui/src/shared/services/reviewSubagentCapabilities.ts @@ -0,0 +1,47 @@ +export const REVIEW_SUBAGENT_REQUIRED_TOOLS = ['GetFileDiff', 'Read'] as const; +export const REVIEW_SUBAGENT_RECOMMENDED_TOOLS = [ + 'GetFileDiff', + 'Read', + 'Grep', + 'Glob', + 'LS', +] as const; +export const REVIEW_SUBAGENT_OPTIONAL_TOOLS = ['Git'] as const; + +export type ReviewSubagentToolReadiness = 'ready' | 'degraded' | 'invalid'; + +export interface ReviewSubagentToolReadinessResult { + readiness: ReviewSubagentToolReadiness; + requiredTools: string[]; + recommendedTools: string[]; + optionalTools: string[]; + missingRequiredTools: string[]; + missingRecommendedTools: string[]; +} + +export function evaluateReviewSubagentToolReadiness( + selectedTools: Iterable, +): ReviewSubagentToolReadinessResult { + const selectedToolNames = new Set(selectedTools); + const 
missingRequiredTools = REVIEW_SUBAGENT_REQUIRED_TOOLS.filter( + (toolName) => !selectedToolNames.has(toolName), + ); + const missingRecommendedTools = REVIEW_SUBAGENT_RECOMMENDED_TOOLS.filter( + (toolName) => !selectedToolNames.has(toolName), + ); + const readiness: ReviewSubagentToolReadiness = + missingRequiredTools.length > 0 + ? 'invalid' + : missingRecommendedTools.length > 0 + ? 'degraded' + : 'ready'; + + return { + readiness, + requiredTools: [...REVIEW_SUBAGENT_REQUIRED_TOOLS], + recommendedTools: [...REVIEW_SUBAGENT_RECOMMENDED_TOOLS], + optionalTools: [...REVIEW_SUBAGENT_OPTIONAL_TOOLS], + missingRequiredTools, + missingRecommendedTools, + }; +} diff --git a/src/web-ui/src/shared/services/reviewTargetClassifier.test.ts b/src/web-ui/src/shared/services/reviewTargetClassifier.test.ts new file mode 100644 index 000000000..0f6e383da --- /dev/null +++ b/src/web-ui/src/shared/services/reviewTargetClassifier.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from 'vitest'; +import { + classifyReviewTargetFromFiles, + createUnknownReviewTargetClassification, + getReviewerApplicabilityRule, + normalizeReviewPath, + shouldRunReviewerForTarget, +} from './reviewTargetClassifier'; + +describe('reviewTargetClassifier', () => { + it('normalizes Windows and relative paths for review classification', () => { + expect(normalizeReviewPath('.\\src\\web-ui\\src\\App.tsx')).toBe( + 'src/web-ui/src/App.tsx', + ); + }); + + it('classifies frontend source, style, locale, and contract files', () => { + const target = classifyReviewTargetFromFiles( + [ + 'src/web-ui/src/App.tsx', + 'src/web-ui/src/app/App.scss', + 'src/web-ui/src/locales/en-US/flow-chat.json', + 'src/apps/desktop/src/api/agentic_api.rs', + ], + 'session_files', + ); + + expect(target.resolution).toBe('resolved'); + expect(target.tags).toEqual( + expect.arrayContaining([ + 'frontend_ui', + 'frontend_style', + 'frontend_i18n', + 'desktop_contract', + 'frontend_contract', + ]), + ); + 
expect(target.files[0]).toMatchObject({ + path: 'src/web-ui/src/App.tsx', + normalizedPath: 'src/web-ui/src/App.tsx', + source: 'session_files', + tags: expect.arrayContaining(['frontend_ui']), + }); + }); + + it('classifies backend core files without frontend tags', () => { + const target = classifyReviewTargetFromFiles( + ['src/crates/core/src/service/config/types.rs'], + 'session_files', + ); + + expect(target.resolution).toBe('resolved'); + expect(target.tags).toEqual(['backend_core']); + }); + + it('returns an unknown target when no file list is available', () => { + const target = createUnknownReviewTargetClassification('unknown'); + + expect(target.resolution).toBe('unknown'); + expect(target.tags).toEqual(['unknown']); + expect(target.warnings).toEqual([ + expect.objectContaining({ code: 'target_unknown' }), + ]); + }); + + it('keeps frontend reviewer applicability in a reusable registry', () => { + const rule = getReviewerApplicabilityRule('ReviewFrontend'); + + expect(rule).toEqual( + expect.objectContaining({ + subagentId: 'ReviewFrontend', + runWhenTargetUnknown: true, + matchingTags: expect.arrayContaining([ + 'frontend_ui', + 'frontend_contract', + ]), + }), + ); + }); + + it('evaluates conditional reviewer applicability from registry tags', () => { + const backendTarget = classifyReviewTargetFromFiles( + ['src/crates/core/src/service/config/types.rs'], + 'session_files', + ); + const frontendTarget = classifyReviewTargetFromFiles( + ['src/web-ui/src/App.tsx'], + 'session_files', + ); + const unknownTarget = createUnknownReviewTargetClassification('manual_prompt'); + + expect(shouldRunReviewerForTarget('ReviewFrontend', backendTarget)).toBe(false); + expect(shouldRunReviewerForTarget('ReviewFrontend', frontendTarget)).toBe(true); + expect(shouldRunReviewerForTarget('ReviewFrontend', unknownTarget)).toBe(true); + expect(shouldRunReviewerForTarget('ReviewSecurity', backendTarget)).toBe(true); + }); +}); diff --git 
a/src/web-ui/src/shared/services/reviewTargetClassifier.ts b/src/web-ui/src/shared/services/reviewTargetClassifier.ts new file mode 100644 index 000000000..fc85fedcf --- /dev/null +++ b/src/web-ui/src/shared/services/reviewTargetClassifier.ts @@ -0,0 +1,344 @@ +export type ReviewTargetSource = + | 'session_files' + | 'slash_command_explicit_files' + | 'slash_command_git_ref' + | 'workspace_diff' + | 'manual_prompt' + | 'unknown'; + +export type ReviewDomainTag = + | 'frontend_ui' + | 'frontend_style' + | 'frontend_i18n' + | 'frontend_contract' + | 'desktop_contract' + | 'web_server_contract' + | 'backend_core' + | 'transport' + | 'api_layer' + | 'ai_adapter' + | 'installer_ui' + | 'test' + | 'docs' + | 'config' + | 'generated_or_lock' + | 'unknown'; + +export interface ReviewTargetFile { + path: string; + normalizedPath: string; + oldPath?: string; + normalizedOldPath?: string; + status: 'added' | 'modified' | 'deleted' | 'renamed' | 'copied' | 'unknown'; + source: ReviewTargetSource; + tags: ReviewDomainTag[]; + excluded?: boolean; + excludeReason?: 'lockfile' | 'generated' | 'binary' | 'too_large' | 'unsupported'; +} + +export interface ReviewTargetWarning { + code: + | 'target_unknown' + | 'git_ref_unresolved' + | 'file_list_empty' + | 'remote_resolution_unavailable' + | 'excluded_files_present' + | 'contract_surface_detected' + | 'classification_partial'; + message: string; +} + +export interface ReviewTargetClassification { + source: ReviewTargetSource; + resolution: 'resolved' | 'partial' | 'unknown'; + files: ReviewTargetFile[]; + tags: ReviewDomainTag[]; + evidence: string[]; + warnings: ReviewTargetWarning[]; +} + +interface PathTagRule { + id: string; + tags: ReviewDomainTag[]; + match: { + pathPrefixes?: string[]; + extensions?: string[]; + exactFiles?: string[]; + }; + evidence: string; +} + +export const FRONTEND_REVIEW_DOMAIN_TAGS: ReviewDomainTag[] = [ + 'frontend_ui', + 'frontend_style', + 'frontend_i18n', + 'frontend_contract', + 
'desktop_contract', + 'web_server_contract', +]; + +export interface ReviewerApplicabilityRule { + subagentId: string; + matchingTags: ReviewDomainTag[]; + runWhenTargetUnknown: boolean; +} + +const REVIEWER_APPLICABILITY_RULES: ReviewerApplicabilityRule[] = [ + { + subagentId: 'ReviewFrontend', + matchingTags: FRONTEND_REVIEW_DOMAIN_TAGS, + runWhenTargetUnknown: true, + }, +]; + +export function getReviewerApplicabilityRule( + subagentId: string, +): ReviewerApplicabilityRule | undefined { + return REVIEWER_APPLICABILITY_RULES.find((rule) => rule.subagentId === subagentId); +} + +export function shouldRunReviewerForTarget( + subagentId: string, + target: ReviewTargetClassification, +): boolean { + const rule = getReviewerApplicabilityRule(subagentId); + if (!rule) { + return true; + } + if (target.resolution === 'unknown') { + return rule.runWhenTargetUnknown; + } + return rule.matchingTags.some((tag) => target.tags.includes(tag)); +} + +const PATH_TAG_RULES: PathTagRule[] = [ + { + id: 'web-ui-locales', + tags: ['frontend_i18n'], + match: { pathPrefixes: ['src/web-ui/src/locales/'] }, + evidence: 'Frontend locale file changed', + }, + { + id: 'web-ui-style', + tags: ['frontend_style'], + match: { + pathPrefixes: ['src/web-ui/'], + extensions: ['.scss', '.css', '.sass', '.less'], + }, + evidence: 'Frontend stylesheet changed', + }, + { + id: 'web-ui-source', + tags: ['frontend_ui'], + match: { + pathPrefixes: ['src/web-ui/src/'], + extensions: ['.ts', '.tsx', '.js', '.jsx'], + }, + evidence: 'File is under src/web-ui/src', + }, + { + id: 'desktop-api-contract', + tags: ['desktop_contract', 'frontend_contract'], + match: { pathPrefixes: ['src/apps/desktop/src/api/'] }, + evidence: 'Desktop API surface may affect frontend invoke contract', + }, + { + id: 'api-layer-contract', + tags: ['api_layer', 'frontend_contract'], + match: { pathPrefixes: ['src/crates/api-layer/'] }, + evidence: 'API layer may affect frontend/backend contract', + }, + { + id: 'server-contract', 
+ tags: ['web_server_contract', 'frontend_contract'], + match: { pathPrefixes: ['src/apps/server/src/routes/'] }, + evidence: 'Server route surface may affect frontend communication contract', + }, + { + id: 'transport', + tags: ['transport'], + match: { pathPrefixes: ['src/crates/transport/'] }, + evidence: 'Transport layer changed', + }, + { + id: 'core', + tags: ['backend_core'], + match: { pathPrefixes: ['src/crates/core/'] }, + evidence: 'Core product logic changed', + }, + { + id: 'ai-adapter', + tags: ['ai_adapter'], + match: { pathPrefixes: ['src/crates/ai-adapters/'] }, + evidence: 'AI adapter changed', + }, + { + id: 'installer-ui', + tags: ['installer_ui'], + match: { pathPrefixes: ['BitFun-Installer/'] }, + evidence: 'Installer UI changed', + }, + { + id: 'docs', + tags: ['docs'], + match: { + pathPrefixes: ['docs/'], + extensions: ['.md'], + }, + evidence: 'Documentation changed', + }, + { + id: 'lockfile', + tags: ['generated_or_lock'], + match: { + exactFiles: ['pnpm-lock.yaml', 'package-lock.json', 'yarn.lock', 'Cargo.lock'], + }, + evidence: 'Lockfile changed', + }, +]; + +export function normalizeReviewPath(path: string): string { + return path.trim().replace(/\\/g, '/').replace(/^\.\/+/, ''); +} + +function dedupe<T>(values: T[]): T[] { + return Array.from(new Set(values)); +} + +function getExtension(path: string): string { + const lastSlash = path.lastIndexOf('/'); + const lastDot = path.lastIndexOf('.'); + if (lastDot <= lastSlash) { + return ''; + } + return path.slice(lastDot); +} + +function matchesRule(path: string, rule: PathTagRule): boolean { + const { pathPrefixes, extensions, exactFiles } = rule.match; + const extension = getExtension(path); + return Boolean( + exactFiles?.includes(path) || + pathPrefixes?.some((prefix) => path.startsWith(prefix)) && + (!extensions || extensions.includes(extension)) || + !pathPrefixes && + extensions?.includes(extension), + ); +} + +function inferSupplementalTags(path: string): ReviewDomainTag[] {
const tags: ReviewDomainTag[] = []; + if ( + path.includes('/tests/') || + path.endsWith('.test.ts') || + path.endsWith('.test.tsx') || + path.endsWith('.spec.ts') || + path.endsWith('.spec.tsx') + ) { + tags.push('test'); + } + if ( + path === 'package.json' || + path.endsWith('/package.json') || + path.endsWith('.config.ts') || + path.endsWith('.config.js') || + path.startsWith('.github/workflows/') + ) { + tags.push('config'); + } + return tags; +} + +function classifyPath( + originalPath: string, + source: ReviewTargetSource, +): { file: ReviewTargetFile; evidence: string[] } { + const normalizedPath = normalizeReviewPath(originalPath); + const matchedRules = PATH_TAG_RULES.filter((rule) => + matchesRule(normalizedPath, rule), + ); + const ruleTags = matchedRules.flatMap((rule) => rule.tags); + const tags = dedupe([...ruleTags, ...inferSupplementalTags(normalizedPath)]); + const finalTags = tags.length > 0 ? tags : ['unknown' as const]; + + return { + file: { + path: originalPath, + normalizedPath, + status: 'unknown', + source, + tags: finalTags, + }, + evidence: matchedRules.map((rule) => rule.evidence), + }; +} + +export function createUnknownReviewTargetClassification( + source: ReviewTargetSource, +): ReviewTargetClassification { + return { + source, + resolution: 'unknown', + files: [], + tags: ['unknown'], + evidence: ['Review target could not be resolved before launch.'], + warnings: [ + { + code: 'target_unknown', + message: 'Review target could not be resolved before launch.', + }, + ], + }; +} + +export function classifyReviewTargetFromFiles( + filePaths: string[], + source: ReviewTargetSource, +): ReviewTargetClassification { + const normalizedInputs = filePaths + .map((path) => path.trim()) + .filter(Boolean); + + if (normalizedInputs.length === 0) { + return { + ...createUnknownReviewTargetClassification(source), + warnings: [ + { + code: 'file_list_empty', + message: 'No reviewable files were provided for target classification.', + }, + ], + }; + 
} + + const classified = normalizedInputs.map((path) => classifyPath(path, source)); + const files = classified.map((item) => item.file); + const tags = dedupe(files.flatMap((file) => file.tags)); + const hasUnknown = tags.includes('unknown'); + const hasKnown = tags.some((tag) => tag !== 'unknown'); + const resolution = hasUnknown ? (hasKnown ? 'partial' : 'unknown') : 'resolved'; + const warnings: ReviewTargetWarning[] = []; + + if (resolution === 'partial') { + warnings.push({ + code: 'classification_partial', + message: 'Some review target files could not be classified.', + }); + } + + if (tags.includes('frontend_contract')) { + warnings.push({ + code: 'contract_surface_detected', + message: 'A frontend-facing contract surface changed.', + }); + } + + return { + source, + resolution, + files, + tags, + evidence: dedupe(classified.flatMap((item) => item.evidence)), + warnings, + }; +} diff --git a/src/web-ui/src/shared/services/reviewTeamLocaleCompleteness.test.ts b/src/web-ui/src/shared/services/reviewTeamLocaleCompleteness.test.ts new file mode 100644 index 000000000..4f370aed4 --- /dev/null +++ b/src/web-ui/src/shared/services/reviewTeamLocaleCompleteness.test.ts @@ -0,0 +1,83 @@ +import { readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; +import { describe, expect, it } from 'vitest'; +import { FALLBACK_REVIEW_TEAM_DEFINITION } from './reviewTeamService'; + +const REVIEW_TEAM_LOCALES = ['en-US', 'zh-CN', 'zh-TW'] as const; + +type Locale = (typeof REVIEW_TEAM_LOCALES)[number]; +type JsonObject = Record; + +const REVIEW_TEAM_FLOW_CHAT_KEYS = [ + 'deepReviewConsent.runStrategy', + 'deepReviewConsent.recommendedStrategy', + 'deepReviewConsent.recommendationTitle', + 'deepReviewConsent.strategyOverrideTitle', + 'deepReviewConsent.strategyOverrideBody', + 'deepReviewConsent.teamDefaultStrategy', + 'deepReviewConsent.strategyLabels.quick', + 'deepReviewConsent.strategyLabels.normal', + 'deepReviewConsent.strategyLabels.deep', + 
'toolCards.codeReview.runManifest.recommendedStrategy', + 'toolCards.codeReview.runManifest.riskRecommendationTitle', +] as const; + +function readLocaleJson( + locale: Locale, + namespace: 'flow-chat.json' | 'scenes/agents.json' | 'settings/review.json', +) { + const filePath = fileURLToPath(new URL(`../../locales/${locale}/${namespace}`, import.meta.url)); + return JSON.parse(readFileSync(filePath, 'utf8')) as JsonObject; +} + +function getPathValue(source: JsonObject, path: string): unknown { + return path.split('.').reduce((current, segment) => { + if (!current || typeof current !== 'object') { + return undefined; + } + return (current as JsonObject)[segment]; + }, source); +} + +function expectNonEmptyLocaleString(source: JsonObject, path: string) { + const value = getPathValue(source, path); + expect(value, path).toEqual(expect.any(String)); + expect((value as string).trim(), path).not.toBe(''); +} + +describe('review team locale completeness', () => { + it.each(REVIEW_TEAM_LOCALES)( + 'keeps core review roles translated in %s settings and agents namespaces', + (locale) => { + const settingsReview = readLocaleJson(locale, 'settings/review.json'); + const scenesAgents = readLocaleJson(locale, 'scenes/agents.json'); + + for (const role of FALLBACK_REVIEW_TEAM_DEFINITION.coreRoles) { + expectNonEmptyLocaleString(settingsReview, `members.${role.key}.name`); + expectNonEmptyLocaleString(settingsReview, `members.${role.key}.role`); + + expectNonEmptyLocaleString(scenesAgents, `reviewTeams.members.${role.key}.funName`); + expectNonEmptyLocaleString(scenesAgents, `reviewTeams.members.${role.key}.role`); + expectNonEmptyLocaleString(scenesAgents, `reviewTeams.members.${role.key}.description`); + + role.responsibilities.forEach((_, index) => { + expectNonEmptyLocaleString( + scenesAgents, + `reviewTeams.members.${role.key}.responsibilities.${index}`, + ); + }); + } + }, + ); + + it.each(REVIEW_TEAM_LOCALES)( + 'keeps Deep Review strategy recommendation UI translated in 
%s flow chat namespace', + (locale) => { + const flowChat = readLocaleJson(locale, 'flow-chat.json'); + + for (const path of REVIEW_TEAM_FLOW_CHAT_KEYS) { + expectNonEmptyLocaleString(flowChat, path); + } + }, + ); +}); diff --git a/src/web-ui/src/shared/services/reviewTeamService.test.ts b/src/web-ui/src/shared/services/reviewTeamService.test.ts index 073ebbeb4..958a917bd 100644 --- a/src/web-ui/src/shared/services/reviewTeamService.test.ts +++ b/src/web-ui/src/shared/services/reviewTeamService.test.ts @@ -3,19 +3,29 @@ import { configAPI } from '@/infrastructure/api/service-api/ConfigAPI'; import { DEFAULT_REVIEW_TEAM_EXECUTION_POLICY, DEFAULT_REVIEW_TEAM_STRATEGY_LEVEL, + FALLBACK_REVIEW_TEAM_DEFINITION, REVIEW_STRATEGY_DEFINITIONS, buildEffectiveReviewTeamManifest, buildReviewTeamPromptBlock, canUseSubagentAsReviewTeamMember, + loadDefaultReviewTeamDefinition, loadDefaultReviewTeamConfig, + loadReviewTeamProjectStrategyOverride, + loadReviewTeamRateLimitStatus, prepareDefaultReviewTeamForLaunch, resolveDefaultReviewTeam, + saveReviewTeamProjectStrategyOverride, type ReviewTeamStoredConfig, } from './reviewTeamService'; +import { agentAPI } from '@/infrastructure/api/service-api/AgentAPI'; import { SubagentAPI, type SubagentInfo, } from '@/infrastructure/api/service-api/SubagentAPI'; +import { + classifyReviewTargetFromFiles, + createUnknownReviewTargetClassification, +} from './reviewTargetClassifier'; vi.mock('@/infrastructure/api/service-api/ConfigAPI', () => ({ configAPI: { @@ -31,11 +41,19 @@ vi.mock('@/infrastructure/api/service-api/SubagentAPI', () => ({ }, })); +vi.mock('@/infrastructure/api/service-api/AgentAPI', () => ({ + agentAPI: { + getDefaultReviewTeamDefinition: vi.fn(), + }, +})); + describe('reviewTeamService', () => { beforeEach(() => { vi.clearAllMocks(); }); + const WORKSPACE_PATH = '/test-fixtures/project-a'; + const storedConfigWithExtra = ( extraSubagentIds: string[] = [], overrides: Partial = {}, @@ -47,6 +65,7 @@ 
describe('reviewTeamService', () => { judge_timeout_seconds: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.judgeTimeoutSeconds, reviewer_file_split_threshold: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.reviewerFileSplitThreshold, max_same_role_instances: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxSameRoleInstances, + max_retries_per_role: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxRetriesPerRole, ...overrides, }); @@ -57,14 +76,15 @@ describe('reviewTeamService', () => { model = 'fast', isReadonly = true, isReview = id.startsWith('Review'), + defaultTools = ['GetFileDiff', 'Read', 'Grep', 'Glob', 'LS'], ): SubagentInfo => ({ id, name: id, description: `${id} description`, isReadonly, isReview, - toolCount: 1, - defaultTools: ['Read'], + toolCount: defaultTools.length, + defaultTools, enabled, subagentSource, model, @@ -92,6 +112,7 @@ describe('reviewTeamService', () => { judge_timeout_seconds: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.judgeTimeoutSeconds, reviewer_file_split_threshold: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.reviewerFileSplitThreshold, max_same_role_instances: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxSameRoleInstances, + max_retries_per_role: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxRetriesPerRole, }); }); @@ -134,6 +155,78 @@ describe('reviewTeamService', () => { await expect(loadDefaultReviewTeamConfig()).rejects.toThrow(error.message); }); + it('loads cached review team rate limit status when available', async () => { + vi.mocked(configAPI.getConfig).mockResolvedValueOnce({ + remaining: 3.8, + }); + + await expect(loadReviewTeamRateLimitStatus()).resolves.toEqual({ + remaining: 3, + }); + expect(configAPI.getConfig).toHaveBeenCalledWith( + 'ai.review_teams.rate_limit_status', + { skipRetryOnNotFound: true }, + ); + }); + + it('ignores missing or invalid cached review team rate limit status', async () => { + vi.mocked(configAPI.getConfig) + .mockResolvedValueOnce(undefined) + .mockResolvedValueOnce({ remaining: 'not-a-number' }) + .mockRejectedValueOnce(new Error('rate status 
unavailable')); + + await expect(loadReviewTeamRateLimitStatus()).resolves.toBeNull(); + await expect(loadReviewTeamRateLimitStatus()).resolves.toBeNull(); + await expect(loadReviewTeamRateLimitStatus()).resolves.toBeNull(); + }); + + it('loads project strategy overrides by normalized workspace path', async () => { + vi.mocked(configAPI.getConfig).mockResolvedValueOnce({ + 'd:/workspace/repo': 'deep', + '/test-fixtures/project-a': 'quick', + invalid: 'invalid', + }); + + await expect( + loadReviewTeamProjectStrategyOverride('D:\\workspace\\repo'), + ).resolves.toBe('deep'); + expect(configAPI.getConfig).toHaveBeenCalledWith( + 'ai.review_teams.project_strategy_overrides', + { skipRetryOnNotFound: true }, + ); + }); + + it('saves and clears project strategy overrides by normalized workspace path', async () => { + vi.mocked(configAPI.getConfig) + .mockResolvedValueOnce({ + 'd:/workspace/repo': 'quick', + '/test-fixtures/project-a': 'normal', + }) + .mockResolvedValueOnce({ + 'd:/workspace/repo': 'deep', + '/test-fixtures/project-a': 'normal', + }); + + await saveReviewTeamProjectStrategyOverride('D:\\workspace\\repo', 'deep'); + expect(configAPI.setConfig).toHaveBeenNthCalledWith( + 1, + 'ai.review_teams.project_strategy_overrides', + { + 'd:/workspace/repo': 'deep', + '/test-fixtures/project-a': 'normal', + }, + ); + + await saveReviewTeamProjectStrategyOverride('D:\\workspace\\repo'); + expect(configAPI.setConfig).toHaveBeenNthCalledWith( + 2, + 'ai.review_teams.project_strategy_overrides', + { + '/test-fixtures/project-a': 'normal', + }, + ); + }); + it('only force-enables locked core members before launch', async () => { vi.mocked(configAPI.getConfig).mockResolvedValue( storedConfigWithExtra(['ExtraEnabled', 'ExtraDisabled']), @@ -144,38 +237,38 @@ describe('reviewTeamService', () => { subagent('ExtraDisabled', false, 'project', 'fast', true, true), ]); - await prepareDefaultReviewTeamForLaunch('D:/workspace/project-a'); + await 
prepareDefaultReviewTeamForLaunch(WORKSPACE_PATH); expect(SubagentAPI.updateSubagentConfig).toHaveBeenCalledTimes(6); expect(SubagentAPI.updateSubagentConfig).toHaveBeenCalledWith({ subagentId: 'ReviewBusinessLogic', enabled: true, - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }); expect(SubagentAPI.updateSubagentConfig).toHaveBeenCalledWith({ subagentId: 'ReviewPerformance', enabled: true, - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }); expect(SubagentAPI.updateSubagentConfig).toHaveBeenCalledWith({ subagentId: 'ReviewSecurity', enabled: true, - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }); expect(SubagentAPI.updateSubagentConfig).toHaveBeenCalledWith({ subagentId: 'ReviewArchitecture', enabled: true, - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }); expect(SubagentAPI.updateSubagentConfig).toHaveBeenCalledWith({ subagentId: 'ReviewFrontend', enabled: true, - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }); expect(SubagentAPI.updateSubagentConfig).toHaveBeenCalledWith({ subagentId: 'ReviewJudge', enabled: true, - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }); expect(SubagentAPI.updateSubagentConfig).not.toHaveBeenCalledWith( expect.objectContaining({ subagentId: 'ExtraEnabled' }), @@ -199,11 +292,142 @@ describe('reviewTeamService', () => { expect(promptBlock).toContain('subagent_type: ExtraEnabled'); expect(promptBlock).not.toContain('subagent_type: ExtraDisabled'); - expect(promptBlock).toContain('Always run the four locked core reviewer roles'); + expect(promptBlock).toContain('Run the active core reviewer roles first'); expect(promptBlock).not.toContain('Always run the three locked reviewer roles'); }); - it('requires extra members to be explicitly marked for review and readonly', () => { + it('can resolve the team from a backend-provided reviewer definition', () => { + const 
team = resolveDefaultReviewTeam( + [ + ...coreSubagents(), + subagent('ReviewDocs'), + ], + storedConfigWithExtra(['ReviewDocs']), + { + definition: { + id: 'default-review-team', + name: 'Code Review Team', + description: 'Backend-defined team', + warning: 'Review may take longer.', + defaultModel: 'fast', + defaultStrategyLevel: 'normal', + defaultExecutionPolicy: { + reviewerTimeoutSeconds: 300, + judgeTimeoutSeconds: 240, + reviewerFileSplitThreshold: 20, + maxSameRoleInstances: 3, + maxRetriesPerRole: 1, + }, + disallowedExtraSubagentIds: [ + 'ReviewBusinessLogic', + 'ReviewPerformance', + 'ReviewSecurity', + 'ReviewArchitecture', + 'ReviewFrontend', + 'ReviewDocs', + 'ReviewJudge', + 'DeepReview', + 'ReviewFixer', + ], + hiddenAgentIds: [ + 'DeepReview', + 'ReviewBusinessLogic', + 'ReviewPerformance', + 'ReviewSecurity', + 'ReviewArchitecture', + 'ReviewFrontend', + 'ReviewDocs', + 'ReviewJudge', + ], + coreRoles: [ + ...[ + 'ReviewBusinessLogic', + 'ReviewPerformance', + 'ReviewSecurity', + 'ReviewArchitecture', + 'ReviewFrontend', + 'ReviewJudge', + ].map((id) => ({ + key: id === 'ReviewJudge' ? 
'judge' : id.replace(/^Review/, '').replace(/^BusinessLogic$/, 'businessLogic').toLowerCase(), + subagentId: id, + funName: id, + roleName: id, + description: `${id} description`, + responsibilities: [`${id} responsibility`], + accentColor: '#64748b', + conditional: id === 'ReviewFrontend', + })), + { + key: 'docs', + subagentId: 'ReviewDocs', + funName: 'Docs Reviewer', + roleName: 'Documentation Reviewer', + description: 'Checks docs and release notes.', + responsibilities: ['Verify documentation stays aligned.'], + accentColor: '#0f766e', + }, + ], + strategyProfiles: { + ...REVIEW_STRATEGY_DEFINITIONS, + quick: { + ...REVIEW_STRATEGY_DEFINITIONS.quick, + roleDirectives: { + ...REVIEW_STRATEGY_DEFINITIONS.quick.roleDirectives, + ReviewDocs: 'Only check changed docs.', + }, + }, + }, + }, + }, + ); + + expect(team.coreMembers.map((member) => member.subagentId)).toContain('ReviewDocs'); + expect(team.extraMembers.map((member) => member.subagentId)).not.toContain('ReviewDocs'); + + const manifest = buildEffectiveReviewTeamManifest(team, { + tokenBudgetMode: 'balanced', + }); + expect(manifest.coreReviewers).toContainEqual( + expect.objectContaining({ + subagentId: 'ReviewDocs', + strategyDirective: REVIEW_STRATEGY_DEFINITIONS.normal.promptDirective, + }), + ); + }); + + it('falls back safely when backend reviewer definition fields are malformed', async () => { + vi.mocked(agentAPI.getDefaultReviewTeamDefinition).mockResolvedValue({ + id: 42, + name: null, + description: ['bad'], + warning: {}, + defaultModel: 99, + defaultStrategyLevel: 'normal', + defaultExecutionPolicy: { + reviewerTimeoutSeconds: 300, + judgeTimeoutSeconds: 240, + reviewerFileSplitThreshold: 20, + maxSameRoleInstances: 3, + maxRetriesPerRole: 1, + }, + coreRoles: [], + strategyProfiles: {}, + disallowedExtraSubagentIds: ['ReviewDocs', 42], + hiddenAgentIds: ['ReviewDocs', null], + }); + + await expect(loadDefaultReviewTeamDefinition()).resolves.toMatchObject({ + id: 
FALLBACK_REVIEW_TEAM_DEFINITION.id, + name: FALLBACK_REVIEW_TEAM_DEFINITION.name, + description: FALLBACK_REVIEW_TEAM_DEFINITION.description, + warning: FALLBACK_REVIEW_TEAM_DEFINITION.warning, + defaultModel: FALLBACK_REVIEW_TEAM_DEFINITION.defaultModel, + disallowedExtraSubagentIds: ['ReviewDocs'], + hiddenAgentIds: ['ReviewDocs'], + }); + }); + + it('keeps invalid configured extra members explainable in the run manifest', () => { const readonlyReviewExtra = subagent('ExtraReadonlyReview', true, 'user', 'fast', true, true); const readonlyPlainExtra = subagent('ExtraReadonlyPlain', true, 'user', 'fast', true, false); const writableReviewExtra = subagent('ExtraWritableReview', true, 'project', 'fast', false, true); @@ -219,15 +443,113 @@ describe('reviewTeamService', () => { readonlyPlainExtra, writableReviewExtra, ], - storedConfigWithExtra(['ExtraReadonlyReview', 'ExtraReadonlyPlain', 'ExtraWritableReview']), + storedConfigWithExtra([ + 'ExtraReadonlyReview', + 'ExtraReadonlyPlain', + 'ExtraWritableReview', + 'ExtraMissingReviewer', + ]), ); - expect(team.extraMembers.map((member) => member.subagentId)).toEqual(['ExtraReadonlyReview']); + expect( + team.extraMembers + .filter((member) => member.available) + .map((member) => member.subagentId), + ).toEqual(['ExtraReadonlyReview']); - const promptBlock = buildReviewTeamPromptBlock(team); + const manifest = buildEffectiveReviewTeamManifest(team); + + expect(manifest.skippedReviewers).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + subagentId: 'ExtraReadonlyPlain', + reason: 'invalid_tooling', + }), + expect.objectContaining({ + subagentId: 'ExtraWritableReview', + reason: 'invalid_tooling', + }), + expect.objectContaining({ + subagentId: 'ExtraMissingReviewer', + reason: 'unavailable', + }), + ]), + ); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); expect(promptBlock).toContain('subagent_type: ExtraReadonlyReview'); - expect(promptBlock).not.toContain('ExtraReadonlyPlain'); - 
expect(promptBlock).not.toContain('ExtraWritableReview'); + expect(promptBlock).toContain('- ExtraReadonlyPlain: invalid_tooling'); + expect(promptBlock).toContain('- ExtraWritableReview: invalid_tooling'); + expect(promptBlock).toContain('- ExtraMissingReviewer: unavailable'); + expect(promptBlock).not.toContain('subagent_type: ExtraReadonlyPlain'); + expect(promptBlock).not.toContain('subagent_type: ExtraWritableReview'); + expect(promptBlock).not.toContain('subagent_type: ExtraMissingReviewer'); + }); + + it('requires extra review members to have the minimum review tools', () => { + const readyReviewExtra = subagent('ExtraReadyReview', true, 'user', 'fast', true, true); + const missingDiffExtra = subagent( + 'ExtraMissingDiff', + true, + 'user', + 'fast', + true, + true, + ['Read', 'Grep'], + ); + const missingReadExtra = subagent( + 'ExtraMissingRead', + true, + 'project', + 'fast', + true, + true, + ['GetFileDiff', 'Grep'], + ); + + expect(canUseSubagentAsReviewTeamMember(readyReviewExtra)).toBe(true); + expect(canUseSubagentAsReviewTeamMember(missingDiffExtra)).toBe(false); + expect(canUseSubagentAsReviewTeamMember(missingReadExtra)).toBe(false); + + const team = resolveDefaultReviewTeam( + [ + ...coreSubagents(), + readyReviewExtra, + missingDiffExtra, + missingReadExtra, + ], + storedConfigWithExtra(['ExtraReadyReview', 'ExtraMissingDiff', 'ExtraMissingRead']), + ); + + expect( + team.extraMembers + .filter((member) => member.available) + .map((member) => member.subagentId), + ).toEqual(['ExtraReadyReview']); + + const manifest = buildEffectiveReviewTeamManifest(team); + + expect(manifest.enabledExtraReviewers.map((member) => member.subagentId)).toEqual([ + 'ExtraReadyReview', + ]); + expect(manifest.skippedReviewers).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + subagentId: 'ExtraMissingDiff', + reason: 'invalid_tooling', + }), + expect.objectContaining({ + subagentId: 'ExtraMissingRead', + reason: 'invalid_tooling', + }), + ]), + ); + 
+ const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- ExtraMissingDiff: invalid_tooling'); + expect(promptBlock).toContain('- ExtraMissingRead: invalid_tooling'); + expect(promptBlock).not.toContain('subagent_type: ExtraMissingDiff'); + expect(promptBlock).not.toContain('subagent_type: ExtraMissingRead'); }); it('builds an explicit run manifest for enabled, skipped, and quality-gate reviewers', () => { @@ -241,13 +563,13 @@ describe('reviewTeamService', () => { ); const manifest = buildEffectiveReviewTeamManifest(team, { - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, policySource: 'default-review-team-config', }); expect(manifest.reviewMode).toBe('deep'); expect(manifest.strategyLevel).toBe('normal'); - expect(manifest.workspacePath).toBe('D:/workspace/project-a'); + expect(manifest.workspacePath).toBe(WORKSPACE_PATH); expect(manifest.policySource).toBe('default-review-team-config'); expect(manifest.coreReviewers.map((member) => member.subagentId)).toEqual([ 'ReviewBusinessLogic', @@ -268,14 +590,481 @@ describe('reviewTeamService', () => { ]); }); - it('skips the conditional frontend reviewer when an explicit target has no frontend files', () => { + it('generates structured work packets for active reviewers and the judge', () => { + const team = resolveDefaultReviewTeam( + [ + ...coreSubagents(), + subagent('ExtraEnabled', true, 'user', 'fast', true, true), + ], + storedConfigWithExtra(['ExtraEnabled']), + ); + const target = classifyReviewTargetFromFiles( + ['src/web-ui/src/components/ReviewPanel.tsx'], + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + workspacePath: WORKSPACE_PATH, + target, + }); + + const logicPacket = manifest.workPackets?.find( + (packet) => packet.subagentId === 'ReviewBusinessLogic', + ); + const judgePacket = manifest.workPackets?.find( + (packet) => packet.subagentId === 'ReviewJudge', + ); + + 
expect(logicPacket).toMatchObject({ + packetId: 'reviewer:ReviewBusinessLogic', + phase: 'reviewer', + subagentId: 'ReviewBusinessLogic', + roleName: 'Business Logic Reviewer', + assignedScope: { + kind: 'review_target', + fileCount: 1, + files: ['src/web-ui/src/components/ReviewPanel.tsx'], + }, + allowedTools: ['GetFileDiff', 'Read', 'Grep', 'Glob', 'LS', 'Git'], + timeoutSeconds: manifest.executionPolicy.reviewerTimeoutSeconds, + requiredOutputFields: expect.arrayContaining([ + 'packet_id', + 'status', + 'findings', + ]), + }); + expect(judgePacket).toMatchObject({ + packetId: 'judge:ReviewJudge', + phase: 'judge', + subagentId: 'ReviewJudge', + timeoutSeconds: manifest.executionPolicy.judgeTimeoutSeconds, + requiredOutputFields: expect.arrayContaining([ + 'packet_id', + 'status', + 'validated_findings', + ]), + }); + expect(manifest.workPackets?.map((packet) => packet.subagentId)).not.toContain( + 'ExtraDisabled', + ); + expect(manifest.executionPolicy.maxRetriesPerRole).toBe(1); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('Review work packets:'); + expect(promptBlock).toContain('"packet_id": "reviewer:ReviewBusinessLogic"'); + expect(promptBlock).toContain('"allowed_tools"'); + expect(promptBlock).toContain('- max_retries_per_role: 1'); + expect(promptBlock).toContain('set retry to true'); + expect(promptBlock).toContain('Each reviewer Task prompt must include the matching work packet verbatim.'); + expect(promptBlock).toContain('If the reviewer omits packet_id but the Task was launched from a packet, infer the packet_id from the Task description or work packet and mark packet_status_source as inferred.'); + }); + + it('pre-generates a compact diff summary for reviewer orientation', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + const target = classifyReviewTargetFromFiles( + [ + 'src/web-ui/src/shared/services/reviewTeamService.ts', + 
'src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx', + 'src/web-ui/src/locales/en-US/scenes/agents.json', + 'src/crates/core/src/agentic/deep_review_policy.rs', + 'src/crates/core/src/agentic/tools/implementations/task_tool.rs', + ], + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + changeStats: { + totalLinesChanged: 420, + lineCountSource: 'diff_stat', + }, + }); + + expect(manifest.preReviewSummary).toMatchObject({ + source: 'target_manifest', + fileCount: 5, + lineCount: 420, + lineCountSource: 'diff_stat', + workspaceAreas: [ + { + key: 'web-ui', + fileCount: 3, + sampleFiles: [ + 'src/web-ui/src/shared/services/reviewTeamService.ts', + 'src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx', + 'src/web-ui/src/locales/en-US/scenes/agents.json', + ], + }, + { + key: 'crate:core', + fileCount: 2, + sampleFiles: [ + 'src/crates/core/src/agentic/deep_review_policy.rs', + 'src/crates/core/src/agentic/tools/implementations/task_tool.rs', + ], + }, + ], + }); + expect(manifest.preReviewSummary.summary).toContain( + '5 files, 420 changed lines across 2 workspace areas', + ); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('Pre-generated diff summary:'); + expect(promptBlock).toContain('"key": "web-ui"'); + expect(promptBlock).toContain('Use the pre-generated diff summary'); + }); + + it('builds a shared context cache plan for files consumed by multiple reviewers', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + const target = classifyReviewTargetFromFiles( + [ + 'src/web-ui/src/shared/services/reviewTeamService.ts', + 'src/crates/core/src/agentic/deep_review_policy.rs', + ], + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { target }); + const webUiCacheEntry = manifest.sharedContextCache.entries.find( + (entry) => entry.path === 
'src/web-ui/src/shared/services/reviewTeamService.ts', + ); + + expect(manifest.sharedContextCache).toMatchObject({ + source: 'work_packets', + strategy: 'reuse_readonly_file_context_by_cache_key', + omittedEntryCount: 0, + }); + expect(webUiCacheEntry).toMatchObject({ + cacheKey: 'shared-context:1', + workspaceArea: 'web-ui', + recommendedTools: ['GetFileDiff', 'Read'], + consumerPacketIds: expect.arrayContaining([ + 'reviewer:ReviewBusinessLogic', + 'reviewer:ReviewPerformance', + 'reviewer:ReviewSecurity', + 'reviewer:ReviewArchitecture', + 'reviewer:ReviewFrontend', + ]), + }); + expect(webUiCacheEntry?.consumerPacketIds).not.toContain('judge:ReviewJudge'); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('Shared context cache plan:'); + expect(promptBlock).toContain('"cache_key": "shared-context:1"'); + expect(promptBlock).toContain('Use shared_context_cache entries'); + }); + + it('builds an incremental review cache plan for follow-up reviews', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + const target = classifyReviewTargetFromFiles( + [ + 'src/web-ui/src/shared/services/reviewTeamService.ts', + 'src/crates/core/src/agentic/deep_review_policy.rs', + ], + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + changeStats: { + totalLinesChanged: 128, + lineCountSource: 'diff_stat', + }, + }); + + expect(manifest.incrementalReviewCache).toMatchObject({ + source: 'target_manifest', + strategy: 'reuse_completed_packets_when_fingerprint_matches', + filePaths: [ + 'src/crates/core/src/agentic/deep_review_policy.rs', + 'src/web-ui/src/shared/services/reviewTeamService.ts', + ], + workspaceAreas: ['crate:core', 'web-ui'], + lineCount: 128, + lineCountSource: 'diff_stat', + reviewerPacketIds: expect.arrayContaining([ + 'reviewer:ReviewBusinessLogic', + 'reviewer:ReviewSecurity', + 'reviewer:ReviewFrontend', + ]), + 
invalidatesOn: expect.arrayContaining([ + 'target_file_set_changed', + 'target_line_count_changed', + 'reviewer_roster_changed', + ]), + }); + expect(manifest.incrementalReviewCache.cacheKey).toMatch(/^incremental-review:/); + expect(manifest.incrementalReviewCache.fingerprint).toHaveLength(8); + expect(manifest.incrementalReviewCache.reviewerPacketIds).not.toContain('judge:ReviewJudge'); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('Incremental review cache plan:'); + expect(promptBlock).toContain('"strategy": "reuse_completed_packets_when_fingerprint_matches"'); + expect(promptBlock).toContain('Use incremental_review_cache only when the target fingerprint matches'); + }); + + it('splits reviewer work packets across file groups for large targets', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra([], { + reviewer_file_split_threshold: 10, + max_same_role_instances: 3, + }), + ); + const target = classifyReviewTargetFromFiles( + Array.from( + { length: 25 }, + (_, index) => `src/web-ui/src/components/ReviewPanel${index}.tsx`, + ), + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + concurrencyPolicy: { + maxParallelInstances: 16, + }, + }); + const logicPackets = manifest.workPackets?.filter( + (packet) => packet.subagentId === 'ReviewBusinessLogic', + ); + const judgePackets = manifest.workPackets?.filter( + (packet) => packet.subagentId === 'ReviewJudge', + ); + + expect(logicPackets).toHaveLength(3); + expect(logicPackets?.map((packet) => packet.packetId)).toEqual([ + 'reviewer:ReviewBusinessLogic:group-1-of-3', + 'reviewer:ReviewBusinessLogic:group-2-of-3', + 'reviewer:ReviewBusinessLogic:group-3-of-3', + ]); + expect(logicPackets?.map((packet) => packet.assignedScope.fileCount)).toEqual([ + 9, + 8, + 8, + ]); + expect(logicPackets?.[0].assignedScope).toMatchObject({ + groupIndex: 1, + groupCount: 3, + }); + 
expect(logicPackets?.[0].assignedScope.files.slice(0, 2)).toEqual([ + 'src/web-ui/src/components/ReviewPanel0.tsx', + 'src/web-ui/src/components/ReviewPanel1.tsx', + ]); + expect(logicPackets?.[0].assignedScope.files.at(-1)).toBe( + 'src/web-ui/src/components/ReviewPanel8.tsx', + ); + expect(judgePackets).toHaveLength(1); + expect(judgePackets?.[0].assignedScope).toMatchObject({ + fileCount: 25, + }); + expect(judgePackets?.[0].assignedScope.groupCount).toBeUndefined(); + expect(manifest.tokenBudget).toMatchObject({ + estimatedReviewerCalls: 16, + maxFilesPerReviewer: 10, + largeDiffSummaryFirst: true, + }); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('"packet_id": "reviewer:ReviewBusinessLogic:group-1-of-3"'); + expect(promptBlock).toContain('"group_index": 1'); + expect(promptBlock).toContain('"group_count": 3'); + }); + + it('keeps split reviewer work packets grouped by workspace area when possible', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra([], { + reviewer_file_split_threshold: 4, + max_same_role_instances: 3, + }), + ); + const target = classifyReviewTargetFromFiles( + [ + 'src/web-ui/src/components/ReviewPanel.tsx', + 'src/crates/core/src/agentic/deep_review_policy.rs', + 'src/apps/desktop/src/api/review.rs', + 'src/web-ui/src/shared/services/reviewTeamService.ts', + 'src/crates/core/src/agentic/tools/implementations/task_tool.rs', + 'src/apps/desktop/src/api/agent.rs', + 'src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx', + 'src/crates/core/src/agentic/agents/deep_review_agent.rs', + 'src/apps/desktop/src/api/config.rs', + 'src/web-ui/src/locales/en-US/scenes/agents.json', + 'src/crates/core/src/agentic/agents/prompts/deep_review_agent.md', + 'src/apps/desktop/src/api/subagent.rs', + ], + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + concurrencyPolicy: { + maxParallelInstances: 16, + }, + 
}); + const logicPackets = manifest.workPackets?.filter( + (packet) => packet.subagentId === 'ReviewBusinessLogic', + ); + + expect(logicPackets).toHaveLength(3); + expect(logicPackets?.map((packet) => packet.assignedScope.files)).toEqual([ + [ + 'src/web-ui/src/components/ReviewPanel.tsx', + 'src/web-ui/src/shared/services/reviewTeamService.ts', + 'src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx', + 'src/web-ui/src/locales/en-US/scenes/agents.json', + ], + [ + 'src/crates/core/src/agentic/deep_review_policy.rs', + 'src/crates/core/src/agentic/tools/implementations/task_tool.rs', + 'src/crates/core/src/agentic/agents/deep_review_agent.rs', + 'src/crates/core/src/agentic/agents/prompts/deep_review_agent.md', + ], + [ + 'src/apps/desktop/src/api/review.rs', + 'src/apps/desktop/src/api/agent.rs', + 'src/apps/desktop/src/api/config.rs', + 'src/apps/desktop/src/api/subagent.rs', + ], + ]); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('Prefer module/workspace-area coherent file groups'); + }); + + it('caps file splitting and launch batches by concurrency policy', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra([], { + reviewer_file_split_threshold: 10, + max_same_role_instances: 3, + }), + ); + const target = classifyReviewTargetFromFiles( + Array.from( + { length: 25 }, + (_, index) => `src/web-ui/src/components/ReviewPanel${index}.tsx`, + ), + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { target }); + const reviewerPackets = manifest.workPackets?.filter( + (packet) => packet.phase === 'reviewer', + ) ?? 
[]; + const logicPackets = reviewerPackets.filter( + (packet) => packet.subagentId === 'ReviewBusinessLogic', + ); + + expect(manifest.concurrencyPolicy).toMatchObject({ + maxParallelInstances: 4, + staggerSeconds: 0, + maxQueueWaitSeconds: 60, + batchExtrasSeparately: true, + }); + expect(logicPackets).toHaveLength(1); + expect(logicPackets[0].assignedScope.groupCount).toBeUndefined(); + expect(reviewerPackets).toHaveLength(5); + expect(reviewerPackets.slice(0, 4).map((packet) => packet.launchBatch)).toEqual([1, 1, 1, 1]); + expect(reviewerPackets[4].launchBatch).toBe(2); + expect(manifest.qualityGateReviewer && manifest.workPackets?.find( + (packet) => packet.subagentId === manifest.qualityGateReviewer?.subagentId, + )?.launchBatch).toBe(3); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- max_parallel_instances: 4'); + expect(promptBlock).toContain('- max_queue_wait_seconds: 60'); + expect(promptBlock).toContain('Launch reviewer Tasks by launch_batch'); + expect(promptBlock).toContain('"launch_batch": 2'); + }); + + it('reduces reviewer concurrency when rate limit remaining is tight', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra([], { + reviewer_file_split_threshold: 10, + max_same_role_instances: 3, + }), + ); + const target = classifyReviewTargetFromFiles( + Array.from( + { length: 25 }, + (_, index) => `src/web-ui/src/components/ReviewPanel${index}.tsx`, + ), + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + rateLimitStatus: { remaining: 2 }, + }); + const reviewerPackets = manifest.workPackets?.filter( + (packet) => packet.phase === 'reviewer', + ) ?? 
[]; + + expect(manifest.concurrencyPolicy).toMatchObject({ + maxParallelInstances: 2, + staggerSeconds: 10, + batchExtrasSeparately: true, + }); + expect(reviewerPackets.map((packet) => packet.launchBatch)).toEqual([1, 1, 2, 2, 3]); + expect(manifest.qualityGateReviewer && manifest.workPackets?.find( + (packet) => packet.subagentId === manifest.qualityGateReviewer?.subagentId, + )?.launchBatch).toBe(4); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- max_parallel_instances: 2'); + expect(promptBlock).toContain('- stagger_seconds: 10'); + }); + + it('skips the frontend reviewer when the resolved target has no frontend tags', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target: classifyReviewTargetFromFiles( + ['src/crates/core/src/service/config/types.rs'], + 'session_files', + ), + }); + + expect(manifest.target.resolution).toBe('resolved'); + expect(manifest.target.tags).toEqual(['backend_core']); + expect(manifest.coreReviewers.map((member) => member.subagentId)).toEqual([ + 'ReviewBusinessLogic', + 'ReviewPerformance', + 'ReviewSecurity', + 'ReviewArchitecture', + ]); + expect(manifest.skippedReviewers).toEqual([ + expect.objectContaining({ + subagentId: 'ReviewFrontend', + reason: 'not_applicable', + }), + ]); + }); + + it('keeps explicit file-path targets compatible with conditional frontend reviewer gating', () => { const team = resolveDefaultReviewTeam( coreSubagents(), storedConfigWithExtra(), ); const manifest = buildEffectiveReviewTeamManifest(team, { - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, reviewTargetFilePaths: ['src/crates/core/src/agentic/deep_review_policy.rs'], }); @@ -288,11 +1077,432 @@ describe('reviewTeamService', () => { expect(manifest.skippedReviewers).toEqual([ expect.objectContaining({ subagentId: 'ReviewFrontend', - reason: 
'non_applicable', + reason: 'not_applicable', }), ]); }); + it('runs the frontend reviewer for frontend and contract targets', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target: classifyReviewTargetFromFiles( + ['src/apps/desktop/src/api/agentic_api.rs'], + 'session_files', + ), + }); + + expect(manifest.target.tags).toEqual( + expect.arrayContaining(['desktop_contract', 'frontend_contract']), + ); + expect(manifest.coreReviewers.map((member) => member.subagentId)).toContain( + 'ReviewFrontend', + ); + expect(manifest.skippedReviewers).not.toEqual([ + expect.objectContaining({ subagentId: 'ReviewFrontend' }), + ]); + }); + + it('runs conditional reviewers conservatively for unknown targets', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target: createUnknownReviewTargetClassification('manual_prompt'), + }); + + expect(manifest.target.resolution).toBe('unknown'); + expect(manifest.coreReviewers.map((member) => member.subagentId)).toContain( + 'ReviewFrontend', + ); + }); + + it('adds a balanced token budget to the run manifest by default', () => { + const team = resolveDefaultReviewTeam( + [ + ...coreSubagents(), + subagent('ExtraEnabled', true, 'user', 'fast', true, true), + ], + storedConfigWithExtra(['ExtraEnabled']), + ); + + const manifest = buildEffectiveReviewTeamManifest(team); + + expect(manifest.tokenBudget).toMatchObject({ + mode: 'balanced', + estimatedReviewerCalls: 7, + maxExtraReviewers: 1, + skippedReviewerIds: [], + }); + }); + + it('enables summary-first from prompt-byte pressure without hiding assigned files', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + const files = Array.from( + { length: 6 }, + (_, index) => 
`src/crates/core/src/agentic/large_change_${index}.rs`, + ); + const target = classifyReviewTargetFromFiles(files, 'workspace_diff'); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + changeStats: { + fileCount: files.length, + totalLinesChanged: 5000, + lineCountSource: 'diff_stat', + }, + }); + + expect(manifest.tokenBudget).toMatchObject({ + maxPromptBytesPerReviewer: 96_000, + promptByteEstimateSource: 'manifest_heuristic', + promptByteLimitExceeded: true, + largeDiffSummaryFirst: true, + }); + expect(manifest.tokenBudget.estimatedPromptBytesPerReviewer).toBeGreaterThan( + manifest.tokenBudget.maxPromptBytesPerReviewer ?? 0, + ); + expect(manifest.tokenBudget.decisions).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + kind: 'summary_first_full_scope', + reason: 'prompt_bytes_exceeded', + }), + ]), + ); + const reviewerPackets = manifest.workPackets.filter( + (packet) => packet.phase === 'reviewer', + ); + expect(reviewerPackets).not.toHaveLength(0); + for (const packet of reviewerPackets) { + expect(packet.assignedScope.files).toEqual(files); + } + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- max_prompt_bytes_per_reviewer: 96000'); + expect(promptBlock).toContain('- prompt_byte_limit_exceeded: yes'); + expect(promptBlock).toContain('- token_budget_decisions: summary_first_full_scope'); + expect(promptBlock).toContain('Do not remove files from assigned_scope'); + }); + + it('keeps summary-first disabled when split guardrails fit the prompt-byte budget', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra([], { + reviewer_file_split_threshold: 4, + max_same_role_instances: 2, + }), + ); + const files = Array.from( + { length: 5 }, + (_, index) => `src/crates/core/src/agentic/small_${index}.rs`, + ); + const target = classifyReviewTargetFromFiles(files, 'workspace_diff'); + + const manifest = 
buildEffectiveReviewTeamManifest(team, { + target, + tokenBudgetMode: 'thorough', + concurrencyPolicy: { + maxParallelInstances: 8, + }, + changeStats: { + fileCount: files.length, + totalLinesChanged: 25, + lineCountSource: 'diff_stat', + }, + }); + + expect(manifest.tokenBudget).toMatchObject({ + maxFilesPerReviewer: 4, + maxPromptBytesPerReviewer: 192_000, + promptByteLimitExceeded: false, + largeDiffSummaryFirst: false, + }); + expect(manifest.workPackets.filter((packet) => packet.phase === 'reviewer')) + .toEqual( + expect.arrayContaining([ + expect.objectContaining({ + assignedScope: expect.objectContaining({ + groupCount: 2, + }), + }), + ]), + ); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- prompt_byte_limit_exceeded: no'); + expect(promptBlock).toContain('- token_budget_decisions: none'); + }); + + it('predicts manifest timeouts from resolved target size', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + const target = classifyReviewTargetFromFiles( + Array.from( + { length: 25 }, + (_, index) => `src/web-ui/src/components/ReviewPanel${index}.tsx`, + ), + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { target }); + + expect(manifest.changeStats).toMatchObject({ + fileCount: 25, + lineCountSource: 'unknown', + }); + expect(manifest.executionPolicy).toMatchObject({ + reviewerTimeoutSeconds: 675, + judgeTimeoutSeconds: 1350, + }); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- target_file_count: 25'); + expect(promptBlock).toContain('- target_line_count: unknown'); + expect(promptBlock).toContain('- reviewer_timeout_seconds: 675'); + expect(promptBlock).toContain('- judge_timeout_seconds: 1350'); + }); + + it('includes diff line stats in predictive manifest timeouts', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + 
storedConfigWithExtra(), + ); + const target = classifyReviewTargetFromFiles( + Array.from( + { length: 25 }, + (_, index) => `src/web-ui/src/components/ReviewPanel${index}.tsx`, + ), + 'workspace_diff', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + changeStats: { + fileCount: 25, + totalLinesChanged: 800, + lineCountSource: 'diff_stat', + }, + }); + + expect(manifest.changeStats).toMatchObject({ + fileCount: 25, + totalLinesChanged: 800, + lineCountSource: 'diff_stat', + }); + expect(manifest.executionPolicy).toMatchObject({ + reviewerTimeoutSeconds: 915, + judgeTimeoutSeconds: 1830, + }); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- target_line_count: 800'); + expect(promptBlock).toContain('- target_line_count_source: diff_stat'); + expect(promptBlock).toContain('- reviewer_timeout_seconds: 915'); + expect(promptBlock).toContain('- judge_timeout_seconds: 1830'); + }); + + it('adds an advisory risk-based strategy recommendation to the manifest and prompt', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + const target = classifyReviewTargetFromFiles( + [ + 'src/crates/core/src/service/auth/token_store.rs', + 'src/apps/desktop/src/api/agentic_api.rs', + ...Array.from( + { length: 18 }, + (_, index) => `src/web-ui/src/components/ReviewPanel${index}.tsx`, + ), + ], + 'workspace_diff', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + changeStats: { + fileCount: 20, + totalLinesChanged: 1400, + lineCountSource: 'diff_stat', + }, + }); + + expect(manifest.strategyLevel).toBe('normal'); + expect(manifest.strategyRecommendation).toMatchObject({ + strategyLevel: 'deep', + factors: { + fileCount: 20, + totalLinesChanged: 1400, + securityFileCount: 1, + }, + }); + expect(manifest.strategyRecommendation?.rationale).toContain('Large/high-risk change'); + expect(manifest.strategyDecision).toMatchObject({ + 
authority: 'mismatch_warning', + teamDefaultStrategy: 'normal', + finalStrategy: 'normal', + mismatch: true, + mismatchSeverity: 'medium', + frontendRecommendation: { + strategyLevel: 'deep', + }, + backendRecommendation: { + strategyLevel: 'deep', + factors: { + fileCount: 20, + totalLinesChanged: 1400, + filesInSecurityPaths: 1, + maxCyclomaticComplexityDelta: 0, + maxCyclomaticComplexityDeltaSource: 'not_measured', + }, + }, + }); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- recommended_strategy: deep'); + expect(promptBlock).toContain('- frontend_recommended_strategy: deep'); + expect(promptBlock).toContain('- backend_recommended_strategy: deep'); + expect(promptBlock).toContain('- strategy_authority: mismatch_warning'); + expect(promptBlock).toContain('- strategy_mismatch: yes'); + expect(promptBlock).toContain('- max_cyclomatic_complexity_delta_source: not_measured'); + expect(promptBlock).toContain('- strategy_recommendation_rationale: Large/high-risk change'); + expect(promptBlock).toContain('Risk recommendation is advisory'); + }); + + it('records explicit strategy override as final strategy metadata without expanding reviewer roster', () => { + const team = resolveDefaultReviewTeam( + [ + ...coreSubagents(), + subagent('ExtraEnabled', true, 'user', 'fast', true, true), + ], + storedConfigWithExtra(['ExtraEnabled']), + ); + const target = classifyReviewTargetFromFiles( + [ + ...Array.from( + { length: 24 }, + (_, index) => `src/crates/core/src/review/module_${index}.rs`, + ), + ], + 'workspace_diff', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target, + strategyOverride: 'quick', + changeStats: { + fileCount: 24, + totalLinesChanged: 1800, + lineCountSource: 'diff_stat', + }, + }); + + expect(manifest.strategyDecision).toMatchObject({ + authority: 'mismatch_warning', + teamDefaultStrategy: 'normal', + userOverride: 'quick', + finalStrategy: 'quick', + mismatch: true, + 
mismatchSeverity: 'high', + backendRecommendation: { + strategyLevel: 'deep', + }, + }); + expect(manifest.coreReviewers).toHaveLength(4); + expect(manifest.enabledExtraReviewers).toHaveLength(1); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- final_strategy: quick'); + expect(promptBlock).toContain('- strategy_user_override: quick'); + expect(promptBlock).toContain('- strategy_mismatch_severity: high'); + }); + + it('keeps unknown targets at a conservative normal recommendation', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra(), + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + target: createUnknownReviewTargetClassification('manual_prompt'), + }); + + expect(manifest.strategyRecommendation).toMatchObject({ + strategyLevel: 'normal', + score: 0, + }); + expect(manifest.strategyRecommendation?.rationale).toContain('unresolved target'); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- recommended_strategy: normal'); + }); + + it('preserves explicit zero timeout policy when predicting manifest timeouts', () => { + const team = resolveDefaultReviewTeam( + coreSubagents(), + storedConfigWithExtra([], { + reviewer_timeout_seconds: 0, + judge_timeout_seconds: 0, + }), + ); + const target = classifyReviewTargetFromFiles( + ['src/web-ui/src/components/ReviewPanel.tsx'], + 'session_files', + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { target }); + + expect(manifest.executionPolicy).toMatchObject({ + reviewerTimeoutSeconds: 0, + judgeTimeoutSeconds: 0, + }); + }); + + it('marks excess extra reviewers as budget-limited in economy mode', () => { + const team = resolveDefaultReviewTeam( + [ + ...coreSubagents(), + subagent('ExtraOne', true, 'user', 'fast', true, true), + subagent('ExtraTwo', true, 'user', 'fast', true, true), + ], + storedConfigWithExtra(['ExtraOne', 'ExtraTwo']), + ); 
+ + const manifest = buildEffectiveReviewTeamManifest(team, { + tokenBudgetMode: 'economy', + }); + + expect(manifest.enabledExtraReviewers).toEqual([]); + expect(manifest.skippedReviewers).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + subagentId: 'ExtraOne', + reason: 'budget_limited', + }), + expect.objectContaining({ + subagentId: 'ExtraTwo', + reason: 'budget_limited', + }), + ]), + ); + expect(manifest.tokenBudget).toMatchObject({ + mode: 'economy', + maxExtraReviewers: 0, + skippedReviewerIds: ['ExtraOne', 'ExtraTwo'], + }); + }); + it('applies per-member strategy overrides in the launch manifest and prompt', () => { const team = resolveDefaultReviewTeam( [ @@ -309,7 +1519,7 @@ describe('reviewTeamService', () => { ); const manifest = buildEffectiveReviewTeamManifest(team, { - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }); expect(manifest.strategyLevel).toBe('quick'); @@ -369,6 +1579,55 @@ describe('reviewTeamService', () => { expect(promptBlock).toContain('Token/time impact: approximately 1.8-2.5x token usage and 1.5-2.5x runtime.'); }); + it('applies a project strategy override to the launch manifest without changing member overrides', () => { + const team = resolveDefaultReviewTeam( + [ + ...coreSubagents(), + subagent('ExtraEnabled', true, 'user', 'fast', true, true), + ], + storedConfigWithExtra(['ExtraEnabled'], { + strategy_level: 'normal', + member_strategy_overrides: { + ReviewSecurity: 'quick', + }, + }), + ); + + const manifest = buildEffectiveReviewTeamManifest(team, { + workspacePath: WORKSPACE_PATH, + strategyOverride: 'deep', + }); + + expect(manifest.strategyLevel).toBe('deep'); + expect(manifest.coreReviewers).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + subagentId: 'ReviewBusinessLogic', + strategyLevel: 'deep', + strategySource: 'team', + defaultModelSlot: 'primary', + }), + expect.objectContaining({ + subagentId: 'ReviewSecurity', + strategyLevel: 'quick', + 
strategySource: 'member', + defaultModelSlot: 'fast', + }), + ]), + ); + expect(manifest.enabledExtraReviewers[0]).toMatchObject({ + subagentId: 'ExtraEnabled', + strategyLevel: 'deep', + strategySource: 'team', + defaultModelSlot: 'primary', + }); + + const promptBlock = buildReviewTeamPromptBlock(team, manifest); + expect(promptBlock).toContain('- team_strategy: deep'); + expect(promptBlock).toContain('subagent_type: ReviewSecurity'); + expect(promptBlock).toContain('strategy: quick'); + }); + it('falls back removed concrete reviewer models to the strategy default model slot', () => { const team = resolveDefaultReviewTeam( [ @@ -416,18 +1675,21 @@ describe('reviewTeamService', () => { const promptBlock = buildReviewTeamPromptBlock( team, buildEffectiveReviewTeamManifest(team, { - workspacePath: 'D:/workspace/project-a', + workspacePath: WORKSPACE_PATH, }), ); expect(promptBlock).toContain('Run manifest:'); + expect(promptBlock).toContain('target_resolution: unknown'); expect(promptBlock).toContain('- team_strategy: normal'); - expect(promptBlock).toContain('- workspace_path: D:/workspace/project-a'); + expect(promptBlock).toContain(`- workspace_path: ${WORKSPACE_PATH}`); expect(promptBlock).toContain('quality_gate_reviewer: ReviewJudge'); expect(promptBlock).toContain('enabled_extra_reviewers: ExtraEnabled'); expect(promptBlock).toContain('skipped_reviewers:'); expect(promptBlock).toContain('- ExtraDisabled: disabled'); expect(promptBlock).not.toContain('subagent_type: ExtraDisabled'); + expect(promptBlock).toContain('Run only reviewers listed in core_reviewers and enabled_extra_reviewers.'); + expect(promptBlock).not.toContain('run it in parallel with the locked reviewers whenever the change contains frontend files'); }); it('tells DeepReview to wait for user approval before running ReviewFixer', () => { diff --git a/src/web-ui/src/shared/services/reviewTeamService.ts b/src/web-ui/src/shared/services/reviewTeamService.ts index 1c588808f..d14b7d56d 100644 --- 
a/src/web-ui/src/shared/services/reviewTeamService.ts
+++ b/src/web-ui/src/shared/services/reviewTeamService.ts
@@ -1,12 +1,25 @@
 import { configAPI } from '@/infrastructure/api/service-api/ConfigAPI';
+import { agentAPI } from '@/infrastructure/api/service-api/AgentAPI';
 import {
   SubagentAPI,
   type SubagentInfo,
   type SubagentSource,
 } from '@/infrastructure/api/service-api/SubagentAPI';
+import {
+  classifyReviewTargetFromFiles,
+  createUnknownReviewTargetClassification,
+  shouldRunReviewerForTarget,
+  type ReviewDomainTag,
+  type ReviewTargetClassification,
+} from './reviewTargetClassifier';
+import { evaluateReviewSubagentToolReadiness } from './reviewSubagentCapabilities';
 export const DEFAULT_REVIEW_TEAM_ID = 'default-review-team';
 export const DEFAULT_REVIEW_TEAM_CONFIG_PATH = 'ai.review_teams.default';
+export const DEFAULT_REVIEW_TEAM_RATE_LIMIT_STATUS_CONFIG_PATH =
+  'ai.review_teams.rate_limit_status';
+export const DEFAULT_REVIEW_TEAM_PROJECT_STRATEGY_OVERRIDES_CONFIG_PATH =
+  'ai.review_teams.project_strategy_overrides';
 export const DEFAULT_REVIEW_TEAM_MODEL = 'fast';
 export const DEFAULT_REVIEW_TEAM_STRATEGY_LEVEL = 'normal' as const;
 export const DEFAULT_REVIEW_MEMBER_STRATEGY_LEVEL = 'inherit' as const;
@@ -15,7 +28,33 @@
   judgeTimeoutSeconds: 600,
   reviewerFileSplitThreshold: 20,
   maxSameRoleInstances: 3,
+  maxRetriesPerRole: 1,
+} as const;
+export const DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY = {
+  maxParallelInstances: 4,
+  staggerSeconds: 0,
+  maxQueueWaitSeconds: 60,
+  batchExtrasSeparately: true,
 } as const;
+const MAX_PREDICTIVE_TIMEOUT_SECONDS = 3600;
+const MAX_PARALLEL_REVIEWER_INSTANCES = 16;
+const MAX_QUEUE_WAIT_SECONDS = 600;
+const PREDICTIVE_TIMEOUT_PER_FILE_SECONDS = 15;
+const PREDICTIVE_TIMEOUT_PER_100_LINES_SECONDS = 30;
+const PREDICTIVE_TIMEOUT_BASE_SECONDS: Record<ReviewStrategyLevel, number> = {
+  quick: 180,
+  normal: 300,
+  deep: 600,
+};
+const TOKEN_BUDGET_PROMPT_BYTE_LIMIT_BY_MODE: Record<ReviewTokenBudgetMode, number> = {
economy: 64_000, + balanced: 96_000, + thorough: 192_000, +}; +const PROMPT_BYTE_ESTIMATE_BASE_BYTES = 12_000; +const PROMPT_BYTE_ESTIMATE_PER_FILE_BYTES = 1_800; +const PROMPT_BYTE_ESTIMATE_PER_CHANGED_LINE_BYTES = 120; +const PROMPT_BYTE_ESTIMATE_UNKNOWN_LINES_PER_FILE = 80; export type ReviewStrategyLevel = 'quick' | 'normal' | 'deep'; export type ReviewMemberStrategyLevel = ReviewStrategyLevel | 'inherit'; @@ -26,13 +65,7 @@ export interface ReviewStrategyCommonRules { reviewerPromptRules: string[]; } -export type ReviewRoleDirectiveKey = - | 'ReviewBusinessLogic' - | 'ReviewPerformance' - | 'ReviewSecurity' - | 'ReviewArchitecture' - | 'ReviewFrontend' - | 'ReviewJudge'; +export type ReviewRoleDirectiveKey = string; export interface ReviewStrategyProfile { level: ReviewStrategyLevel; @@ -44,7 +77,7 @@ export interface ReviewStrategyProfile { promptDirective: string; /** Per-role strategy directives. When a role key is present, its directive * overrides `promptDirective` for that reviewer or the judge. 
*/
-  roleDirectives: Partial<Record<ReviewRoleDirectiveKey, string>>;
+  roleDirectives: Record<ReviewRoleDirectiveKey, string>;
 }
 export const REVIEW_STRATEGY_LEVELS: ReviewStrategyLevel[] = [
@@ -151,13 +184,7 @@ export function getReviewStrategyProfile(
   return REVIEW_STRATEGY_PROFILES[strategyLevel];
 }
-export type ReviewTeamCoreRoleKey =
-  | 'businessLogic'
-  | 'performance'
-  | 'security'
-  | 'architecture'
-  | 'frontend'
-  | 'judge';
+export type ReviewTeamCoreRoleKey = string;
 export interface ReviewTeamCoreRoleDefinition {
   key: ReviewTeamCoreRoleKey;
@@ -171,6 +198,20 @@ export interface ReviewTeamCoreRoleDefinition {
   conditional?: boolean;
 }
+export interface ReviewTeamDefinition {
+  id: string;
+  name: string;
+  description: string;
+  warning: string;
+  defaultModel: string;
+  defaultStrategyLevel: ReviewStrategyLevel;
+  defaultExecutionPolicy: ReviewTeamExecutionPolicy;
+  coreRoles: ReviewTeamCoreRoleDefinition[];
+  strategyProfiles: Record<ReviewStrategyLevel, ReviewStrategyProfile>;
+  disallowedExtraSubagentIds: string[];
+  hiddenAgentIds: string[];
+}
+
 export interface ReviewTeamStoredConfig {
   extra_subagent_ids: string[];
   strategy_level: ReviewStrategyLevel;
@@ -179,6 +220,7 @@ export interface ReviewTeamStoredConfig {
   judge_timeout_seconds: number;
   reviewer_file_split_threshold: number;
   max_same_role_instances: number;
+  max_retries_per_role: number;
 }
 export interface ReviewTeamExecutionPolicy {
@@ -186,6 +228,196 @@ export interface ReviewTeamExecutionPolicy {
   judgeTimeoutSeconds: number;
   reviewerFileSplitThreshold: number;
   maxSameRoleInstances: number;
+  maxRetriesPerRole: number;
+}
+
+export interface ReviewTeamConcurrencyPolicy {
+  maxParallelInstances: number;
+  staggerSeconds: number;
+  maxQueueWaitSeconds: number;
+  batchExtrasSeparately: boolean;
+}
+
+export interface ReviewTeamRateLimitStatus {
+  remaining: number;
+}
+
+export type ReviewTeamManifestMemberReason =
+  | 'disabled'
+  | 'unavailable'
+  | 'not_applicable'
+  | 'budget_limited'
+  | 'invalid_tooling';
+
+export type ReviewTokenBudgetMode = 'economy' | 'balanced' | 'thorough';
+export type 
ReviewPromptByteEstimateSource = 'manifest_heuristic'; +export type ReviewTeamTokenBudgetDecisionKind = + | 'summary_first_full_scope' + | 'skip_extra_reviewers'; +export type ReviewTeamTokenBudgetDecisionReason = + | 'prompt_bytes_exceeded' + | 'extra_reviewers_skipped'; + +export interface ReviewTeamTokenBudgetDecision { + kind: ReviewTeamTokenBudgetDecisionKind; + reason: ReviewTeamTokenBudgetDecisionReason; + detail: string; + affectedReviewerIds?: string[]; +} + +export interface ReviewTeamTokenBudgetPlan { + mode: ReviewTokenBudgetMode; + estimatedReviewerCalls: number; + maxReviewerCalls: number; + maxExtraReviewers: number; + maxFilesPerReviewer?: number; + maxPromptBytesPerReviewer?: number; + estimatedPromptBytesPerReviewer?: number; + promptByteEstimateSource?: ReviewPromptByteEstimateSource; + promptByteLimitExceeded?: boolean; + largeDiffSummaryFirst: boolean; + decisions?: ReviewTeamTokenBudgetDecision[]; + skippedReviewerIds: string[]; + warnings: string[]; +} + +export interface ReviewTeamChangeStats { + fileCount: number; + totalLinesChanged?: number; + lineCountSource: 'unknown' | 'diff_stat' | 'estimated'; +} + +export interface ReviewTeamRiskFactors { + fileCount: number; + totalLinesChanged?: number; + lineCountSource: ReviewTeamChangeStats['lineCountSource']; + securityFileCount: number; + workspaceAreaCount: number; + contractSurfaceChanged: boolean; +} + +export interface ReviewTeamStrategyRecommendation { + strategyLevel: ReviewStrategyLevel; + score: number; + rationale: string; + factors: ReviewTeamRiskFactors; +} + +export type ReviewTeamStrategyAuthority = 'mismatch_warning'; +export type ReviewTeamStrategyMismatchSeverity = 'none' | 'low' | 'medium' | 'high'; + +export interface ReviewTeamBackendRiskFactors { + fileCount: number; + totalLinesChanged: number; + lineCountSource: ReviewTeamChangeStats['lineCountSource']; + filesInSecurityPaths: number; + crossCrateChanges: number; + maxCyclomaticComplexityDelta: number; + 
maxCyclomaticComplexityDeltaSource: 'not_measured'; +} + +export interface ReviewTeamBackendStrategyRecommendation { + strategyLevel: ReviewStrategyLevel; + score: number; + rationale: string; + factors: ReviewTeamBackendRiskFactors; +} + +export interface ReviewTeamStrategyDecision { + authority: ReviewTeamStrategyAuthority; + teamDefaultStrategy: ReviewStrategyLevel; + userOverride?: ReviewStrategyLevel; + finalStrategy: ReviewStrategyLevel; + frontendRecommendation: ReviewTeamStrategyRecommendation; + backendRecommendation: ReviewTeamBackendStrategyRecommendation; + mismatch: boolean; + mismatchSeverity: ReviewTeamStrategyMismatchSeverity; + rationale: string; +} + +export interface ReviewTeamPreReviewSummaryArea { + key: string; + fileCount: number; + sampleFiles: string[]; +} + +export interface ReviewTeamPreReviewSummary { + source: 'target_manifest'; + summary: string; + fileCount: number; + excludedFileCount: number; + lineCount?: number; + lineCountSource: ReviewTeamChangeStats['lineCountSource']; + targetTags: ReviewDomainTag[]; + workspaceAreas: ReviewTeamPreReviewSummaryArea[]; + warnings: ReviewTargetClassification['warnings'][number]['code'][]; +} + +export type ReviewTeamSharedContextTool = 'GetFileDiff' | 'Read'; + +export interface ReviewTeamSharedContextCacheEntry { + cacheKey: string; + path: string; + workspaceArea: string; + recommendedTools: ReviewTeamSharedContextTool[]; + consumerPacketIds: string[]; +} + +export interface ReviewTeamSharedContextCachePlan { + source: 'work_packets'; + strategy: 'reuse_readonly_file_context_by_cache_key'; + entries: ReviewTeamSharedContextCacheEntry[]; + omittedEntryCount: number; +} + +export type ReviewTeamIncrementalReviewCacheInvalidation = + | 'target_file_set_changed' + | 'target_line_count_changed' + | 'target_tag_changed' + | 'target_warning_changed' + | 'reviewer_roster_changed' + | 'strategy_changed'; + +export interface ReviewTeamIncrementalReviewCachePlan { + source: 'target_manifest'; + strategy: 
'reuse_completed_packets_when_fingerprint_matches';
+  cacheKey: string;
+  fingerprint: string;
+  filePaths: string[];
+  workspaceAreas: string[];
+  targetTags: ReviewDomainTag[];
+  reviewerPacketIds: string[];
+  lineCount?: number;
+  lineCountSource: ReviewTeamChangeStats['lineCountSource'];
+  invalidatesOn: ReviewTeamIncrementalReviewCacheInvalidation[];
+}
+
+export interface ReviewTeamWorkPacketScope {
+  kind: 'review_target';
+  targetSource: ReviewTargetClassification['source'];
+  targetResolution: ReviewTargetClassification['resolution'];
+  targetTags: ReviewDomainTag[];
+  fileCount: number;
+  files: string[];
+  excludedFileCount: number;
+  groupIndex?: number;
+  groupCount?: number;
+}
+
+export interface ReviewTeamWorkPacket {
+  packetId: string;
+  phase: 'reviewer' | 'judge';
+  launchBatch: number;
+  subagentId: string;
+  displayName: string;
+  roleName: string;
+  assignedScope: ReviewTeamWorkPacketScope;
+  allowedTools: string[];
+  timeoutSeconds: number;
+  requiredOutputFields: string[];
+  strategyLevel: ReviewStrategyLevel;
+  strategyDirective: string;
+  model: string;
 }
 export interface ReviewTeamMember {
@@ -209,6 +441,10 @@
   source: 'core' | 'extra';
   subagentSource: SubagentSource;
   accentColor: string;
+  allowedTools: string[];
+  defaultModelSlot?: ReviewStrategyProfile['defaultModelSlot'];
+  strategyDirective?: string;
+  skipReason?: ReviewTeamManifestMemberReason;
 }
 export interface ReviewTeam {
@@ -219,6 +455,7 @@
   strategyLevel: ReviewStrategyLevel;
   memberStrategyOverrides: Record<string, ReviewMemberStrategyLevel>;
   executionPolicy: ReviewTeamExecutionPolicy;
+  definition: ReviewTeamDefinition;
   members: ReviewTeamMember[];
   coreMembers: ReviewTeamMember[];
   extraMembers: ReviewTeamMember[];
@@ -238,19 +475,39 @@
   locked: boolean;
   source: ReviewTeamMember['source'];
   subagentSource: ReviewTeamMember['subagentSource'];
-  reason?: 'disabled' | 'unavailable' | 'non_applicable';
+ 
reason?: ReviewTeamManifestMemberReason; } export interface ReviewTeamRunManifest { reviewMode: 'deep'; workspacePath?: string; policySource: 'default-review-team-config'; + target: ReviewTargetClassification; strategyLevel: ReviewStrategyLevel; + strategyRecommendation?: ReviewTeamStrategyRecommendation; + strategyDecision: ReviewTeamStrategyDecision; executionPolicy: ReviewTeamExecutionPolicy; + concurrencyPolicy: ReviewTeamConcurrencyPolicy; + changeStats?: ReviewTeamChangeStats; + preReviewSummary: ReviewTeamPreReviewSummary; + sharedContextCache: ReviewTeamSharedContextCachePlan; + incrementalReviewCache: ReviewTeamIncrementalReviewCachePlan; + tokenBudget: ReviewTeamTokenBudgetPlan; coreReviewers: ReviewTeamManifestMember[]; qualityGateReviewer?: ReviewTeamManifestMember; enabledExtraReviewers: ReviewTeamManifestMember[]; skippedReviewers: ReviewTeamManifestMember[]; + workPackets?: ReviewTeamWorkPacket[]; +} + +export function getActiveReviewTeamManifestMembers( + manifest: ReviewTeamRunManifest, +): ReviewTeamManifestMember[] { + return [ + ...manifest.coreReviewers, + ...manifest.enabledExtraReviewers, + ...(manifest.qualityGateReviewer ? 
[manifest.qualityGateReviewer] : []), + ]; } const EXTRA_MEMBER_DEFAULTS = { @@ -265,6 +522,32 @@ const EXTRA_MEMBER_DEFAULTS = { accentColor: '#64748b', }; +const REVIEW_WORK_PACKET_ALLOWED_TOOLS = [ + 'GetFileDiff', + 'Read', + 'Grep', + 'Glob', + 'LS', + 'Git', +] as const; + +const REVIEWER_WORK_PACKET_REQUIRED_OUTPUT_FIELDS = [ + 'packet_id', + 'status', + 'verdict', + 'findings', + 'reviewer_summary', +] as const; + +const JUDGE_WORK_PACKET_REQUIRED_OUTPUT_FIELDS = [ + 'packet_id', + 'status', + 'decision_summary', + 'validated_findings', + 'rejected_or_downgraded_notes', + 'coverage_notes', +] as const; + export const DEFAULT_REVIEW_TEAM_CORE_ROLES: ReviewTeamCoreRoleDefinition[] = [ { key: 'businessLogic', @@ -363,6 +646,166 @@ const DISALLOWED_REVIEW_TEAM_MEMBER_IDS = new Set([ 'ReviewFixer', ]); +export const FALLBACK_REVIEW_TEAM_DEFINITION: ReviewTeamDefinition = { + id: DEFAULT_REVIEW_TEAM_ID, + name: 'Code Review Team', + description: + 'A multi-reviewer team for deep code review with mandatory logic, performance, security, architecture, conditional frontend, and quality-gate roles.', + warning: + 'Deep review may take longer and usually consumes more tokens than a standard review.', + defaultModel: DEFAULT_REVIEW_TEAM_MODEL, + defaultStrategyLevel: DEFAULT_REVIEW_TEAM_STRATEGY_LEVEL, + defaultExecutionPolicy: { + ...DEFAULT_REVIEW_TEAM_EXECUTION_POLICY, + }, + coreRoles: DEFAULT_REVIEW_TEAM_CORE_ROLES, + strategyProfiles: REVIEW_STRATEGY_PROFILES, + disallowedExtraSubagentIds: [...DISALLOWED_REVIEW_TEAM_MEMBER_IDS], + hiddenAgentIds: [ + 'DeepReview', + ...DEFAULT_REVIEW_TEAM_CORE_ROLES.map((role) => role.subagentId), + ], +}; + +function isReviewTeamCoreRoleDefinition(value: unknown): value is ReviewTeamCoreRoleDefinition { + if (!value || typeof value !== 'object') return false; + const role = value as Partial; + return ( + typeof role.key === 'string' && + typeof role.subagentId === 'string' && + typeof role.funName === 'string' && + typeof 
role.roleName === 'string' && + typeof role.description === 'string' && + Array.isArray(role.responsibilities) && + role.responsibilities.every((item) => typeof item === 'string') && + typeof role.accentColor === 'string' + ); +} + +function isReviewStrategyProfile(value: unknown): value is ReviewStrategyProfile { + if (!value || typeof value !== 'object') return false; + const profile = value as Partial; + return ( + isReviewStrategyLevel(profile.level) && + typeof profile.label === 'string' && + typeof profile.summary === 'string' && + typeof profile.tokenImpact === 'string' && + typeof profile.runtimeImpact === 'string' && + (profile.defaultModelSlot === 'fast' || profile.defaultModelSlot === 'primary') && + typeof profile.promptDirective === 'string' && + Boolean(profile.roleDirectives) && + typeof profile.roleDirectives === 'object' + ); +} + +function nonEmptyStringOrFallback(value: unknown, fallback: string): string { + if (typeof value !== 'string') { + return fallback; + } + + return value.trim() || fallback; +} + +function normalizeReviewTeamDefinition(raw: unknown): ReviewTeamDefinition { + if (!raw || typeof raw !== 'object') { + return FALLBACK_REVIEW_TEAM_DEFINITION; + } + + const source = raw as Partial; + const coreRoles = Array.isArray(source.coreRoles) + ? source.coreRoles.filter(isReviewTeamCoreRoleDefinition) + : []; + const strategyProfiles = REVIEW_STRATEGY_LEVELS.reduce< + Partial> + >((profiles, level) => { + const profile = source.strategyProfiles?.[level]; + profiles[level] = isReviewStrategyProfile(profile) + ? profile + : FALLBACK_REVIEW_TEAM_DEFINITION.strategyProfiles[level]; + return profiles; + }, {}) as Record; + const disallowedExtraSubagentIds = Array.isArray(source.disallowedExtraSubagentIds) + ? dedupeIds(source.disallowedExtraSubagentIds.filter((id): id is string => typeof id === 'string')) + : []; + const hiddenAgentIds = Array.isArray(source.hiddenAgentIds) + ? 
dedupeIds(source.hiddenAgentIds.filter((id): id is string => typeof id === 'string')) + : []; + + return { + id: nonEmptyStringOrFallback(source.id, FALLBACK_REVIEW_TEAM_DEFINITION.id), + name: nonEmptyStringOrFallback(source.name, FALLBACK_REVIEW_TEAM_DEFINITION.name), + description: nonEmptyStringOrFallback( + source.description, + FALLBACK_REVIEW_TEAM_DEFINITION.description, + ), + warning: nonEmptyStringOrFallback( + source.warning, + FALLBACK_REVIEW_TEAM_DEFINITION.warning, + ), + defaultModel: nonEmptyStringOrFallback( + source.defaultModel, + FALLBACK_REVIEW_TEAM_DEFINITION.defaultModel, + ), + defaultStrategyLevel: isReviewStrategyLevel(source.defaultStrategyLevel) + ? source.defaultStrategyLevel + : FALLBACK_REVIEW_TEAM_DEFINITION.defaultStrategyLevel, + defaultExecutionPolicy: source.defaultExecutionPolicy + ? { + reviewerTimeoutSeconds: clampInteger( + source.defaultExecutionPolicy.reviewerTimeoutSeconds, + 0, + 3600, + FALLBACK_REVIEW_TEAM_DEFINITION.defaultExecutionPolicy.reviewerTimeoutSeconds, + ), + judgeTimeoutSeconds: clampInteger( + source.defaultExecutionPolicy.judgeTimeoutSeconds, + 0, + 3600, + FALLBACK_REVIEW_TEAM_DEFINITION.defaultExecutionPolicy.judgeTimeoutSeconds, + ), + reviewerFileSplitThreshold: clampInteger( + source.defaultExecutionPolicy.reviewerFileSplitThreshold, + 0, + 9999, + FALLBACK_REVIEW_TEAM_DEFINITION.defaultExecutionPolicy.reviewerFileSplitThreshold, + ), + maxSameRoleInstances: clampInteger( + source.defaultExecutionPolicy.maxSameRoleInstances, + 1, + 8, + FALLBACK_REVIEW_TEAM_DEFINITION.defaultExecutionPolicy.maxSameRoleInstances, + ), + maxRetriesPerRole: clampInteger( + source.defaultExecutionPolicy.maxRetriesPerRole, + 0, + 3, + FALLBACK_REVIEW_TEAM_DEFINITION.defaultExecutionPolicy.maxRetriesPerRole, + ), + } + : FALLBACK_REVIEW_TEAM_DEFINITION.defaultExecutionPolicy, + coreRoles: coreRoles.length > 0 ? 
coreRoles : FALLBACK_REVIEW_TEAM_DEFINITION.coreRoles, + strategyProfiles, + disallowedExtraSubagentIds: + disallowedExtraSubagentIds.length > 0 + ? disallowedExtraSubagentIds + : FALLBACK_REVIEW_TEAM_DEFINITION.disallowedExtraSubagentIds, + hiddenAgentIds: + hiddenAgentIds.length > 0 + ? hiddenAgentIds + : FALLBACK_REVIEW_TEAM_DEFINITION.hiddenAgentIds, + }; +} + +export async function loadDefaultReviewTeamDefinition(): Promise { + try { + return normalizeReviewTeamDefinition( + await agentAPI.getDefaultReviewTeamDefinition(), + ); + } catch { + return FALLBACK_REVIEW_TEAM_DEFINITION; + } +} + function dedupeIds(ids: string[]): string[] { return Array.from( new Set( @@ -411,6 +854,42 @@ function normalizeMemberStrategyOverrides( }, {}); } +function normalizeProjectStrategyOverrideKey(workspacePath?: string): string | undefined { + const normalized = workspacePath?.trim().replace(/\\/g, '/'); + if (!normalized) { + return undefined; + } + if (normalized === '/' || /^[a-zA-Z]:\/$/.test(normalized)) { + return normalized.toLowerCase(); + } + return normalized.replace(/\/+$/, '').toLowerCase(); +} + +function normalizeProjectStrategyOverrideStore( + raw: unknown, +): Record { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + return {}; + } + + return Object.entries(raw as Record).reduce< + Record + >((result, [workspacePath, value]) => { + const key = normalizeProjectStrategyOverrideKey(workspacePath); + if (!key) { + return result; + } + if (isReviewStrategyLevel(value)) { + result[key] = value; + } else { + console.warn( + `[ReviewTeamService] Ignoring invalid project strategy override for '${key}': expected one of ${REVIEW_STRATEGY_LEVELS.join(', ')}, got '${value}'`, + ); + } + return result; + }, {}); +} + function clampInteger( value: unknown, min: number, @@ -425,6 +904,80 @@ function clampInteger( return Math.min(max, Math.max(min, Math.floor(numeric))); } +function normalizeConcurrencyPolicy( + raw?: Partial, +): ReviewTeamConcurrencyPolicy { + 
return { + maxParallelInstances: clampInteger( + raw?.maxParallelInstances, + 1, + MAX_PARALLEL_REVIEWER_INSTANCES, + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxParallelInstances, + ), + staggerSeconds: clampInteger( + raw?.staggerSeconds, + 0, + 60, + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.staggerSeconds, + ), + maxQueueWaitSeconds: clampInteger( + raw?.maxQueueWaitSeconds, + 0, + MAX_QUEUE_WAIT_SECONDS, + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxQueueWaitSeconds, + ), + batchExtrasSeparately: + typeof raw?.batchExtrasSeparately === 'boolean' + ? raw.batchExtrasSeparately + : DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.batchExtrasSeparately, + }; +} + +function applyRateLimitToConcurrencyPolicy( + policy: ReviewTeamConcurrencyPolicy, + rateLimitStatus?: ReviewTeamRateLimitStatus | null, +): ReviewTeamConcurrencyPolicy { + const remaining = Math.floor(Number(rateLimitStatus?.remaining)); + if (!Number.isFinite(remaining)) { + return policy; + } + + if (remaining > policy.maxParallelInstances * 2) { + return policy; + } + + if (remaining > policy.maxParallelInstances) { + return { + ...policy, + staggerSeconds: Math.max(policy.staggerSeconds, 5), + }; + } + + return { + ...policy, + maxParallelInstances: Math.max( + 1, + Math.min(policy.maxParallelInstances, Math.max(2, remaining)), + ), + staggerSeconds: Math.max(policy.staggerSeconds, 10), + }; +} + +function normalizeRateLimitStatus(raw: unknown): ReviewTeamRateLimitStatus | null { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + return null; + } + + const remaining = Math.floor(Number((raw as { remaining?: unknown }).remaining)); + if (!Number.isFinite(remaining)) { + return null; + } + + return { + remaining: Math.max(0, remaining), + }; +} + function normalizeExecutionPolicy( raw: unknown, ): Pick< @@ -433,6 +986,7 @@ function normalizeExecutionPolicy( | 'judge_timeout_seconds' | 'reviewer_file_split_threshold' | 'max_same_role_instances' + | 'max_retries_per_role' > { const config = raw as Partial 
| undefined; @@ -461,6 +1015,12 @@ function normalizeExecutionPolicy( 8, DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxSameRoleInstances, ), + max_retries_per_role: clampInteger( + config?.max_retries_per_role, + 0, + 3, + DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxRetriesPerRole, + ), }; } @@ -472,6 +1032,7 @@ function executionPolicyFromStoredConfig( judgeTimeoutSeconds: config.judge_timeout_seconds, reviewerFileSplitThreshold: config.reviewer_file_split_threshold, maxSameRoleInstances: config.max_same_role_instances, + maxRetriesPerRole: config.max_retries_per_role, }; } @@ -530,9 +1091,70 @@ export async function saveDefaultReviewTeamConfig( judge_timeout_seconds: normalizedConfig.judge_timeout_seconds, reviewer_file_split_threshold: normalizedConfig.reviewer_file_split_threshold, max_same_role_instances: normalizedConfig.max_same_role_instances, + max_retries_per_role: normalizedConfig.max_retries_per_role, }); } +export async function loadReviewTeamRateLimitStatus(): Promise { + try { + const raw = await configAPI.getConfig( + DEFAULT_REVIEW_TEAM_RATE_LIMIT_STATUS_CONFIG_PATH, + { skipRetryOnNotFound: true }, + ); + return normalizeRateLimitStatus(raw); + } catch (error) { + console.warn('[ReviewTeamService] Failed to load review team rate limit status', error); + return null; + } +} + +export async function loadReviewTeamProjectStrategyOverride( + workspacePath?: string, +): Promise { + const key = normalizeProjectStrategyOverrideKey(workspacePath); + if (!key) { + return undefined; + } + + try { + const raw = await configAPI.getConfig( + DEFAULT_REVIEW_TEAM_PROJECT_STRATEGY_OVERRIDES_CONFIG_PATH, + { skipRetryOnNotFound: true }, + ); + return normalizeProjectStrategyOverrideStore(raw)[key]; + } catch (error) { + console.warn('[ReviewTeamService] Failed to load project review strategy override', error); + return undefined; + } +} + +export async function saveReviewTeamProjectStrategyOverride( + workspacePath: string | undefined, + strategyLevel?: ReviewStrategyLevel, 
+): Promise { + const key = normalizeProjectStrategyOverrideKey(workspacePath); + if (!key) { + return; + } + + const raw = await configAPI.getConfig( + DEFAULT_REVIEW_TEAM_PROJECT_STRATEGY_OVERRIDES_CONFIG_PATH, + { skipRetryOnNotFound: true }, + ).catch(() => undefined); + const nextOverrides = normalizeProjectStrategyOverrideStore(raw); + + if (strategyLevel) { + nextOverrides[key] = normalizeTeamStrategyLevel(strategyLevel); + } else { + delete nextOverrides[key]; + } + + await configAPI.setConfig( + DEFAULT_REVIEW_TEAM_PROJECT_STRATEGY_OVERRIDES_CONFIG_PATH, + nextOverrides, + ); +} + export async function addDefaultReviewTeamMember(subagentId: string): Promise { const current = await loadDefaultReviewTeamConfig(); await saveDefaultReviewTeamConfig({ @@ -559,6 +1181,7 @@ export async function saveDefaultReviewTeamExecutionPolicy( judge_timeout_seconds: policy.judgeTimeoutSeconds, reviewer_file_split_threshold: policy.reviewerFileSplitThreshold, max_same_role_instances: policy.maxSameRoleInstances, + max_retries_per_role: policy.maxRetriesPerRole, }); } @@ -597,6 +1220,7 @@ export async function saveDefaultReviewTeamMemberStrategyOverride( export interface ResolveDefaultReviewTeamOptions { availableModelIds?: string[]; + definition?: ReviewTeamDefinition; } function extractAvailableModelIds(rawModels: unknown): string[] | undefined { @@ -646,13 +1270,14 @@ function resolveMemberModel( configuredModel: string | undefined, strategyLevel: ReviewStrategyLevel, availableModelIds?: Set, + strategyProfiles: Record = REVIEW_STRATEGY_PROFILES, ): { model: string; configuredModel: string; modelFallbackReason?: ReviewModelFallbackReason; } { const normalizedConfiguredModel = configuredModel?.trim() || ''; - const defaultModelSlot = getReviewStrategyProfile(strategyLevel).defaultModelSlot; + const defaultModelSlot = strategyProfiles[strategyLevel].defaultModelSlot; if ( !normalizedConfiguredModel || @@ -684,13 +1309,16 @@ function buildCoreMember( info: SubagentInfo | 
undefined, storedConfig: ReviewTeamStoredConfig, availableModelIds?: Set, + strategyProfiles: Record = REVIEW_STRATEGY_PROFILES, ): ReviewTeamMember { const strategy = resolveMemberStrategy(storedConfig, definition.subagentId); const model = resolveMemberModel( info?.model || DEFAULT_REVIEW_TEAM_MODEL, strategy.strategyLevel, availableModelIds, + strategyProfiles, ); + const strategyProfile = strategyProfiles[strategy.strategyLevel]; return { id: `core:${definition.subagentId}`, @@ -713,6 +1341,11 @@ function buildCoreMember( source: 'core', subagentSource: info?.subagentSource ?? 'builtin', accentColor: definition.accentColor, + allowedTools: [...REVIEW_WORK_PACKET_ALLOWED_TOOLS], + defaultModelSlot: strategyProfile.defaultModelSlot, + strategyDirective: + strategyProfile.roleDirectives[definition.subagentId] || + strategyProfile.promptDirective, }; } @@ -720,13 +1353,21 @@ function buildExtraMember( info: SubagentInfo, storedConfig: ReviewTeamStoredConfig, availableModelIds?: Set, + options: { + available?: boolean; + skipReason?: ReviewTeamManifestMemberReason; + strategyProfiles?: Record; + } = {}, ): ReviewTeamMember { const strategy = resolveMemberStrategy(storedConfig, info.id); + const strategyProfiles = options.strategyProfiles ?? REVIEW_STRATEGY_PROFILES; const model = resolveMemberModel( info.model || DEFAULT_REVIEW_TEAM_MODEL, strategy.strategyLevel, availableModelIds, + strategyProfiles, ); + const strategyProfile = strategyProfiles[strategy.strategyLevel]; return { id: `extra:${info.id}`, @@ -742,11 +1383,59 @@ function buildExtraMember( : {}), ...strategy, enabled: info.enabled, - available: true, + available: options.available ?? true, locked: false, source: 'extra', subagentSource: info.subagentSource ?? 'builtin', accentColor: EXTRA_MEMBER_DEFAULTS.accentColor, + allowedTools: + info.defaultTools && info.defaultTools.length > 0 + ? 
[...info.defaultTools] + : [...REVIEW_WORK_PACKET_ALLOWED_TOOLS], + defaultModelSlot: strategyProfile.defaultModelSlot, + strategyDirective: strategyProfile.promptDirective, + ...(options.skipReason ? { skipReason: options.skipReason } : {}), + }; +} + +function buildUnavailableExtraMember( + subagentId: string, + storedConfig: ReviewTeamStoredConfig, + availableModelIds?: Set, + strategyProfiles: Record = REVIEW_STRATEGY_PROFILES, +): ReviewTeamMember { + const strategy = resolveMemberStrategy(storedConfig, subagentId); + const model = resolveMemberModel( + DEFAULT_REVIEW_TEAM_MODEL, + strategy.strategyLevel, + availableModelIds, + strategyProfiles, + ); + const strategyProfile = strategyProfiles[strategy.strategyLevel]; + + return { + id: `extra:${subagentId}`, + subagentId, + displayName: subagentId, + roleName: EXTRA_MEMBER_DEFAULTS.roleName, + description: EXTRA_MEMBER_DEFAULTS.description, + responsibilities: EXTRA_MEMBER_DEFAULTS.responsibilities, + model: model.model, + configuredModel: model.configuredModel, + ...(model.modelFallbackReason + ? 
{ modelFallbackReason: model.modelFallbackReason } + : {}), + ...strategy, + enabled: true, + available: false, + locked: false, + source: 'extra', + subagentSource: 'user', + accentColor: EXTRA_MEMBER_DEFAULTS.accentColor, + allowedTools: [], + defaultModelSlot: strategyProfile.defaultModelSlot, + strategyDirective: strategyProfile.promptDirective, + skipReason: 'unavailable', }; } @@ -791,10 +1480,23 @@ export function canAddSubagentToReviewTeam(subagentId: string): boolean { return !DISALLOWED_REVIEW_TEAM_MEMBER_IDS.has(subagentId); } -export function canUseSubagentAsReviewTeamMember( +function hasReviewTeamExtraMemberShape( subagent: Pick, ): boolean { - return subagent.isReview && subagent.isReadonly && canAddSubagentToReviewTeam(subagent.id); + return ( + subagent.isReview && + subagent.isReadonly && + canAddSubagentToReviewTeam(subagent.id) + ); +} + +export function canUseSubagentAsReviewTeamMember( + subagent: Pick, +): boolean { + return ( + hasReviewTeamExtraMemberShape(subagent) && + evaluateReviewSubagentToolReadiness(subagent.defaultTools ?? []).readiness !== 'invalid' + ); } export function resolveDefaultReviewTeam( @@ -802,34 +1504,66 @@ export function resolveDefaultReviewTeam( storedConfig: ReviewTeamStoredConfig, options: ResolveDefaultReviewTeamOptions = {}, ): ReviewTeam { + const definition = options.definition ?? FALLBACK_REVIEW_TEAM_DEFINITION; const byId = new Map(subagents.map((subagent) => [subagent.id, subagent])); const availableModelIds = options.availableModelIds ? 
new Set(options.availableModelIds) : undefined; - const coreMembers = DEFAULT_REVIEW_TEAM_CORE_ROLES.map((definition) => + const coreMembers = definition.coreRoles.map((roleDefinition) => buildCoreMember( - definition, - byId.get(definition.subagentId), + roleDefinition, + byId.get(roleDefinition.subagentId), storedConfig, availableModelIds, + definition.strategyProfiles, ), ); + const disallowedExtraSubagentIds = new Set(definition.disallowedExtraSubagentIds); const extraMembers = storedConfig.extra_subagent_ids - .map((subagentId) => byId.get(subagentId)) - .filter((subagent): subagent is SubagentInfo => Boolean(subagent)) - .filter(canUseSubagentAsReviewTeamMember) - .map((subagent) => buildExtraMember(subagent, storedConfig, availableModelIds)); + .filter((subagentId) => !disallowedExtraSubagentIds.has(subagentId)) + .map((subagentId) => { + const subagent = byId.get(subagentId); + if (!subagent) { + return buildUnavailableExtraMember( + subagentId, + storedConfig, + availableModelIds, + definition.strategyProfiles, + ); + } + if (!hasReviewTeamExtraMemberShape(subagent)) { + return buildExtraMember(subagent, storedConfig, availableModelIds, { + available: false, + skipReason: 'invalid_tooling', + strategyProfiles: definition.strategyProfiles, + }); + } + const toolingReadiness = evaluateReviewSubagentToolReadiness( + subagent.defaultTools ?? [], + ); + return buildExtraMember( + subagent, + storedConfig, + availableModelIds, + toolingReadiness.readiness === 'invalid' + ? 
{ + available: false, + skipReason: 'invalid_tooling', + strategyProfiles: definition.strategyProfiles, + } + : { strategyProfiles: definition.strategyProfiles }, + ); + }); return { - id: DEFAULT_REVIEW_TEAM_ID, - name: 'Code Review Team', - description: - 'A multi-reviewer team for deep code review with mandatory logic, performance, security, architecture, conditional frontend, and quality-gate roles.', - warning: - 'Deep review may take longer and usually consumes more tokens than a standard review.', + id: definition.id, + name: definition.name, + description: definition.description, + warning: definition.warning, strategyLevel: storedConfig.strategy_level, memberStrategyOverrides: storedConfig.member_strategy_overrides, executionPolicy: executionPolicyFromStoredConfig(storedConfig), + definition, members: [...coreMembers, ...extraMembers], coreMembers, extraMembers, @@ -839,24 +1573,33 @@ export function resolveDefaultReviewTeam( export async function loadDefaultReviewTeam( workspacePath?: string, ): Promise { - const [storedConfig, subagents, rawModels] = await Promise.all([ + const [definition, storedConfig, subagents, rawModels] = await Promise.all([ + loadDefaultReviewTeamDefinition(), loadDefaultReviewTeamConfig(), SubagentAPI.listSubagents({ workspacePath }), configAPI.getConfig('ai.models').catch(() => undefined), ]); return resolveDefaultReviewTeam(subagents, storedConfig, { + definition, availableModelIds: extractAvailableModelIds(rawModels), }); } interface ReviewTeamLaunchOptions { + target?: ReviewTargetClassification; reviewTargetFilePaths?: string[]; } interface ReviewTeamManifestOptions { workspacePath?: string; policySource?: ReviewTeamRunManifest['policySource']; + target?: ReviewTargetClassification; + changeStats?: Partial; + tokenBudgetMode?: ReviewTokenBudgetMode; + concurrencyPolicy?: Partial; + rateLimitStatus?: ReviewTeamRateLimitStatus | null; + strategyOverride?: ReviewStrategyLevel; reviewTargetFilePaths?: string[]; } @@ -864,33 
+1607,32 @@ function hasExplicitReviewTarget(filePaths?: string[]): boolean { return Boolean(filePaths?.some((filePath) => filePath.trim().length > 0)); } -function isFrontendReviewTarget(filePath: string): boolean { - const normalizedPath = filePath.replace(/\\/g, '/').toLowerCase(); - return ( - normalizedPath.startsWith('src/web-ui/') || - normalizedPath.includes('/src/web-ui/') || - normalizedPath.includes('/locales/') || - normalizedPath.startsWith('locales/') || - /\.(tsx|jsx|scss|css)$/.test(normalizedPath) - ); +function resolveReviewTargetForOptions( + target: ReviewTargetClassification | undefined, + reviewTargetFilePaths: string[] | undefined, + fallbackSource: Parameters[0], +): ReviewTargetClassification { + if (target) { + return target; + } + if (hasExplicitReviewTarget(reviewTargetFilePaths)) { + return classifyReviewTargetFromFiles(reviewTargetFilePaths ?? [], 'session_files'); + } + return createUnknownReviewTargetClassification(fallbackSource); } -function isConditionalCoreMemberApplicable( +function isCoreMemberApplicableForLaunch( member: ReviewTeamMember, - reviewTargetFilePaths?: string[], + options: ReviewTeamLaunchOptions, ): boolean { - if (!member.conditional) { - return true; - } - if (!hasExplicitReviewTarget(reviewTargetFilePaths)) { - return true; - } - if (member.definitionKey === 'frontend') { - // The frontend reviewer is optional. Only include it for explicit targets - // that match the same frontend/i18n signal described in the DeepReview prompt. - return (reviewTargetFilePaths ?? 
[]).some(isFrontendReviewTarget); - } - return true; + return shouldRunCoreReviewerForTarget( + member, + resolveReviewTargetForOptions( + options.target, + options.reviewTargetFilePaths, + 'unknown', + ), + ); } export async function prepareDefaultReviewTeamForLaunch( @@ -901,7 +1643,7 @@ export async function prepareDefaultReviewTeamForLaunch( const missingCoreMembers = team.coreMembers.filter( (member) => !member.available && - isConditionalCoreMemberApplicable(member, options.reviewTargetFilePaths), + isCoreMemberApplicableForLaunch(member, options), ); if (missingCoreMembers.length > 0) { @@ -916,7 +1658,7 @@ export async function prepareDefaultReviewTeamForLaunch( (member) => member.available && !member.enabled && - isConditionalCoreMemberApplicable(member, options.reviewTargetFilePaths), + isCoreMemberApplicableForLaunch(member, options), ); if (coreMembersToEnable.length > 0) { @@ -960,10 +1702,11 @@ function toManifestMember( model: member.model || DEFAULT_REVIEW_TEAM_MODEL, configuredModel: member.configuredModel || member.model || DEFAULT_REVIEW_TEAM_MODEL, modelFallbackReason: member.modelFallbackReason, - defaultModelSlot: strategyProfile.defaultModelSlot, + defaultModelSlot: member.defaultModelSlot ?? 
strategyProfile.defaultModelSlot, strategyLevel: member.strategyLevel, strategySource: member.strategySource, - strategyDirective: roleDirective || strategyProfile.promptDirective, + strategyDirective: + member.strategyDirective || roleDirective || strategyProfile.promptDirective, locked: member.locked, source: member.source, subagentSource: member.subagentSource, @@ -971,38 +1714,1120 @@ function toManifestMember( }; } -export function buildEffectiveReviewTeamManifest( - team: ReviewTeam, - options: ReviewTeamManifestOptions = {}, -): ReviewTeamRunManifest { - const availableCoreMembers = team.coreMembers.filter((member) => member.available); - const unavailableCoreMembers = team.coreMembers.filter((member) => !member.available); - const inapplicableCoreMembers = availableCoreMembers.filter( - (member) => !isConditionalCoreMemberApplicable(member, options.reviewTargetFilePaths), +function resolveManifestMemberModelForStrategy( + member: ReviewTeamMember, + strategyLevel: ReviewStrategyLevel, +): { + model: string; + configuredModel: string; + modelFallbackReason?: ReviewModelFallbackReason; +} { + if (member.modelFallbackReason === 'model_removed') { + return { + model: getReviewStrategyProfile(strategyLevel).defaultModelSlot, + configuredModel: member.configuredModel, + modelFallbackReason: member.modelFallbackReason, + }; + } + + return resolveMemberModel( + member.configuredModel || member.model || DEFAULT_REVIEW_TEAM_MODEL, + strategyLevel, ); - const applicableCoreMembers = availableCoreMembers.filter( - (member) => isConditionalCoreMemberApplicable(member, options.reviewTargetFilePaths), +} + +function applyTeamStrategyOverrideToMember( + member: ReviewTeamMember, + strategyLevel: ReviewStrategyLevel, +): ReviewTeamMember { + if (member.strategySource === 'member' || member.strategyLevel === strategyLevel) { + return member; + } + + const strategyProfile = getReviewStrategyProfile(strategyLevel); + const model = resolveManifestMemberModelForStrategy(member, 
strategyLevel); + return { + ...member, + model: model.model, + configuredModel: model.configuredModel, + modelFallbackReason: model.modelFallbackReason, + strategyOverride: DEFAULT_REVIEW_MEMBER_STRATEGY_LEVEL, + strategyLevel, + strategySource: 'team', + defaultModelSlot: strategyProfile.defaultModelSlot, + strategyDirective: + strategyProfile.roleDirectives[member.subagentId as ReviewRoleDirectiveKey] || + strategyProfile.promptDirective, + }; +} + +function shouldRunCoreReviewerForTarget( + member: ReviewTeamMember, + target: ReviewTargetClassification, +): boolean { + return shouldRunReviewerForTarget(member.subagentId, target); +} + +function resolveMaxExtraReviewers( + mode: ReviewTokenBudgetMode, + eligibleExtraReviewerCount: number, +): number { + if (mode === 'economy') { + return 0; + } + return eligibleExtraReviewerCount; +} + +function resolveChangeStats( + target: ReviewTargetClassification, + stats?: Partial, +): ReviewTeamChangeStats { + const fileCount = Math.max( + 0, + Math.floor( + stats?.fileCount ?? + target.files.filter((file) => !file.excluded).length, + ), + ); + const totalLinesChanged = + typeof stats?.totalLinesChanged === 'number' && + Number.isFinite(stats.totalLinesChanged) + ? Math.max(0, Math.floor(stats.totalLinesChanged)) + : undefined; + + return { + fileCount, + ...(totalLinesChanged !== undefined ? { totalLinesChanged } : {}), + lineCountSource: + totalLinesChanged !== undefined + ? stats?.lineCountSource ?? 
'diff_stat' + : 'unknown', + }; +} + +const SECURITY_SENSITIVE_PATH_PATTERN = + /(^|[/._-])(auth|oauth|crypto|security|permission|permissions|secret|secrets|token|tokens|credential|credentials)([/._-]|$)/; + +function isSecuritySensitiveReviewPath(normalizedPath: string): boolean { + return SECURITY_SENSITIVE_PATH_PATTERN.test(normalizedPath.toLowerCase()); +} + +function workspaceAreaForReviewPath(normalizedPath: string): string { + const crateMatch = normalizedPath.match(/^src\/crates\/([^/]+)/); + if (crateMatch) { + return `crate:${crateMatch[1]}`; + } + + const appMatch = normalizedPath.match(/^src\/apps\/([^/]+)/); + if (appMatch) { + return `app:${appMatch[1]}`; + } + + if (normalizedPath.startsWith('src/web-ui/')) { + return 'web-ui'; + } + + if (normalizedPath.startsWith('BitFun-Installer/')) { + return 'installer'; + } + + const [root] = normalizedPath.split('/'); + return root || 'unknown'; +} + +function pluralize(count: number, singular: string): string { + return `${count} ${singular}${count === 1 ? '' : 's'}`; +} + +const PRE_REVIEW_SUMMARY_SAMPLE_FILE_LIMIT = 3; +const PRE_REVIEW_SUMMARY_AREA_LIMIT = 8; + +function buildPreReviewSummary( + target: ReviewTargetClassification, + changeStats: ReviewTeamChangeStats, +): ReviewTeamPreReviewSummary { + const includedFiles = target.files + .filter((file) => !file.excluded) + .map((file) => file.normalizedPath); + const excludedFileCount = target.files.length - includedFiles.length; + const allWorkspaceAreas = groupFilesByWorkspaceArea(includedFiles) + .sort((a, b) => b.files.length - a.files.length || a.index - b.index); + const workspaceAreas = allWorkspaceAreas + .slice(0, PRE_REVIEW_SUMMARY_AREA_LIMIT) + .map((area) => ({ + key: area.key, + fileCount: area.files.length, + sampleFiles: area.files.slice(0, PRE_REVIEW_SUMMARY_SAMPLE_FILE_LIMIT), + })); + const lineCount = changeStats.totalLinesChanged; + const lineCountLabel = + lineCount === undefined + ? 
'unknown changed lines' + : `${lineCount} changed lines`; + const areaLabel = workspaceAreas.length > 0 + ? workspaceAreas.map((area) => `${area.key} (${area.fileCount})`).join(', ') + : 'no resolved workspace area'; + const targetTags = [...target.tags]; + const tagLabel = targetTags.filter((tag) => tag !== 'unknown').join(', ') || 'unknown'; + const omittedAreaCount = Math.max( + 0, + allWorkspaceAreas.length - workspaceAreas.length, + ); + const summaryParts = [ + `${pluralize(changeStats.fileCount, 'file')}, ${lineCountLabel} across ${pluralize(allWorkspaceAreas.length, 'workspace area')}: ${areaLabel}`, + `tags: ${tagLabel}`, + omittedAreaCount > 0 ? `${pluralize(omittedAreaCount, 'workspace area')} omitted from summary` : undefined, + ].filter(Boolean); + + return { + source: 'target_manifest', + summary: summaryParts.join('; '), + fileCount: changeStats.fileCount, + excludedFileCount, + ...(lineCount !== undefined ? { lineCount } : {}), + lineCountSource: changeStats.lineCountSource, + targetTags, + workspaceAreas, + warnings: target.warnings.map((warning) => warning.code), + }; +} + +export function recommendReviewStrategyForTarget( + target: ReviewTargetClassification, + changeStats: ReviewTeamChangeStats, +): ReviewTeamStrategyRecommendation { + const includedFiles = target.files.filter((file) => !file.excluded); + const securityFileCount = includedFiles.filter((file) => + isSecuritySensitiveReviewPath(file.normalizedPath), + ).length; + const workspaceAreaCount = new Set( + includedFiles.map((file) => workspaceAreaForReviewPath(file.normalizedPath)), + ).size; + const contractSurfaceChanged = target.tags.includes('frontend_contract') || + target.tags.includes('desktop_contract') || + target.tags.includes('web_server_contract') || + target.tags.includes('api_layer') || + target.tags.includes('transport'); + const totalLinesChanged = changeStats.totalLinesChanged; + const factors: ReviewTeamRiskFactors = { + fileCount: changeStats.fileCount, + 
...(totalLinesChanged !== undefined ? { totalLinesChanged } : {}), + lineCountSource: changeStats.lineCountSource, + securityFileCount, + workspaceAreaCount, + contractSurfaceChanged, + }; + + if (target.resolution === 'unknown' || changeStats.fileCount === 0) { + return { + strategyLevel: 'normal', + score: 0, + rationale: 'unresolved target; keep a conservative normal review recommendation.', + factors, + }; + } + + const lineScore = + totalLinesChanged === undefined + ? 0 + : Math.floor(totalLinesChanged / 100); + const crossAreaScore = Math.max(0, workspaceAreaCount - 1) * 2; + const score = + changeStats.fileCount + + lineScore + + securityFileCount * 3 + + crossAreaScore + + (contractSurfaceChanged ? 2 : 0); + const strategyLevel: ReviewStrategyLevel = + score <= 5 + ? 'quick' + : score <= 20 + ? 'normal' + : 'deep'; + const sizeLabel = totalLinesChanged === undefined + ? `${changeStats.fileCount} files, unknown lines` + : `${changeStats.fileCount} files, ${totalLinesChanged} lines`; + const riskDetails = [ + pluralize(securityFileCount, 'security-sensitive file'), + pluralize(workspaceAreaCount, 'workspace area'), + contractSurfaceChanged ? 'contract surface changed' : undefined, + ].filter(Boolean).join(', '); + const rationale = + strategyLevel === 'quick' + ? `Small change (${sizeLabel}). Quick scan sufficient.` + : strategyLevel === 'normal' + ? `Medium change (${sizeLabel}; ${riskDetails}). Standard review recommended.` + : `Large/high-risk change (${sizeLabel}; ${riskDetails}). 
Deep review recommended.`; + + return { + strategyLevel, + score, + rationale, + factors, + }; +} + +const REVIEW_STRATEGY_RANK: Record = { + quick: 0, + normal: 1, + deep: 2, +}; + +function crossCrateChangeCountForReviewTarget( + target: ReviewTargetClassification, +): number { + const crateNames = new Set( + target.files + .filter((file) => !file.excluded) + .map((file) => /^src\/crates\/([^/]+)/.exec(file.normalizedPath)?.[1]) + .filter((crateName): crateName is string => Boolean(crateName)), + ); + + return Math.max(0, crateNames.size - 1); +} + +function buildBackendCompatibleRiskFactors( + target: ReviewTargetClassification, + changeStats: ReviewTeamChangeStats, +): ReviewTeamBackendRiskFactors { + const includedFiles = target.files.filter((file) => !file.excluded); + + return { + fileCount: changeStats.fileCount, + totalLinesChanged: changeStats.totalLinesChanged ?? 0, + lineCountSource: changeStats.lineCountSource, + filesInSecurityPaths: includedFiles.filter((file) => + isSecuritySensitiveReviewPath(file.normalizedPath), + ).length, + crossCrateChanges: crossCrateChangeCountForReviewTarget(target), + maxCyclomaticComplexityDelta: 0, + maxCyclomaticComplexityDeltaSource: 'not_measured', + }; +} + +function recommendBackendCompatibleStrategyForTarget( + target: ReviewTargetClassification, + changeStats: ReviewTeamChangeStats, +): ReviewTeamBackendStrategyRecommendation { + const factors = buildBackendCompatibleRiskFactors(target, changeStats); + const score = + factors.fileCount + + Math.floor(factors.totalLinesChanged / 100) + + factors.filesInSecurityPaths * 3 + + factors.crossCrateChanges * 2; + const strategyLevel: ReviewStrategyLevel = + score <= 5 + ? 'quick' + : score <= 20 + ? 'normal' + : 'deep'; + const rationale = + strategyLevel === 'quick' + ? `Backend-compatible policy sees a small change (${factors.fileCount} files, ${factors.totalLinesChanged} lines).` + : strategyLevel === 'normal' + ? 
`Backend-compatible policy sees a medium change (${factors.fileCount} files, ${factors.totalLinesChanged} lines).` + : `Backend-compatible policy sees a large/high-risk change (${factors.fileCount} files, ${factors.totalLinesChanged} lines, ${factors.filesInSecurityPaths} security files).`; + + return { + strategyLevel, + score, + rationale, + factors, + }; +} + +function resolveStrategyMismatchSeverity(params: { + finalStrategy: ReviewStrategyLevel; + frontendRecommendation: ReviewStrategyLevel; + backendRecommendation: ReviewStrategyLevel; +}): ReviewTeamStrategyMismatchSeverity { + const finalRank = REVIEW_STRATEGY_RANK[params.finalStrategy]; + const recommendedRank = Math.max( + REVIEW_STRATEGY_RANK[params.frontendRecommendation], + REVIEW_STRATEGY_RANK[params.backendRecommendation], + ); + const distance = Math.abs(finalRank - recommendedRank); + + if (distance === 0) { + return 'none'; + } + if (distance >= 2) { + return 'high'; + } + return finalRank < recommendedRank ? 'medium' : 'low'; +} + +function buildReviewStrategyDecision(params: { + teamDefaultStrategy: ReviewStrategyLevel; + finalStrategy: ReviewStrategyLevel; + userOverride?: ReviewStrategyLevel; + frontendRecommendation: ReviewTeamStrategyRecommendation; + backendRecommendation: ReviewTeamBackendStrategyRecommendation; +}): ReviewTeamStrategyDecision { + const mismatch = + params.finalStrategy !== params.frontendRecommendation.strategyLevel || + params.finalStrategy !== params.backendRecommendation.strategyLevel; + const mismatchSeverity = resolveStrategyMismatchSeverity({ + finalStrategy: params.finalStrategy, + frontendRecommendation: params.frontendRecommendation.strategyLevel, + backendRecommendation: params.backendRecommendation.strategyLevel, + }); + const recommendationSummary = [ + `frontend=${params.frontendRecommendation.strategyLevel}`, + `backend=${params.backendRecommendation.strategyLevel}`, + ].join(', '); + + return { + authority: 'mismatch_warning', + teamDefaultStrategy: 
params.teamDefaultStrategy, + ...(params.userOverride ? { userOverride: params.userOverride } : {}), + finalStrategy: params.finalStrategy, + frontendRecommendation: params.frontendRecommendation, + backendRecommendation: params.backendRecommendation, + mismatch, + mismatchSeverity, + rationale: mismatch + ? `Final strategy ${params.finalStrategy} differs from advisory recommendations (${recommendationSummary}); keep this as non-blocking launch/report metadata.` + : `Final strategy ${params.finalStrategy} matches advisory recommendations (${recommendationSummary}).`, + }; +} + +function buildWorkPacketScopeFromFiles( + target: ReviewTargetClassification, + files: string[], + group?: { index: number; count: number }, +): ReviewTeamWorkPacketScope { + return { + kind: 'review_target', + targetSource: target.source, + targetResolution: target.resolution, + targetTags: [...target.tags], + fileCount: files.length, + files, + excludedFileCount: + target.files.length - target.files.filter((file) => !file.excluded).length, + ...(group ? { groupIndex: group.index, groupCount: group.count } : {}), + }; +} + +function buildWorkPacket(params: { + member: ReviewTeamMember; + phase: ReviewTeamWorkPacket['phase']; + launchBatch: number; + scope: ReviewTeamWorkPacketScope; + timeoutSeconds: number; +}): ReviewTeamWorkPacket { + const manifestMember = toManifestMember(params.member); + const packetGroupSuffix = + params.phase === 'reviewer' && + params.scope.groupIndex !== undefined && + params.scope.groupCount !== undefined + ? 
`:group-${params.scope.groupIndex}-of-${params.scope.groupCount}` + : ''; + + return { + packetId: `${params.phase}:${manifestMember.subagentId}${packetGroupSuffix}`, + phase: params.phase, + launchBatch: params.launchBatch, + subagentId: manifestMember.subagentId, + displayName: manifestMember.displayName, + roleName: manifestMember.roleName, + assignedScope: params.scope, + allowedTools: [...params.member.allowedTools], + timeoutSeconds: params.timeoutSeconds, + requiredOutputFields: + params.phase === 'judge' + ? [...JUDGE_WORK_PACKET_REQUIRED_OUTPUT_FIELDS] + : [...REVIEWER_WORK_PACKET_REQUIRED_OUTPUT_FIELDS], + strategyLevel: manifestMember.strategyLevel, + strategyDirective: manifestMember.strategyDirective, + model: manifestMember.model || DEFAULT_REVIEW_TEAM_MODEL, + }; +} + +function splitFilesIntoGroups(files: string[], groupCount: number): string[][] { + if (groupCount <= 1) { + return [files]; + } + + const groups: string[][] = []; + let cursor = 0; + for (let index = 0; index < groupCount; index += 1) { + const remainingFiles = files.length - cursor; + const remainingGroups = groupCount - index; + const groupSize = Math.ceil(remainingFiles / remainingGroups); + groups.push(files.slice(cursor, cursor + groupSize)); + cursor += groupSize; + } + return groups; +} + +interface WorkspaceAreaFileBucket { + key: string; + index: number; + files: string[]; +} + +function groupFilesByWorkspaceArea(files: string[]): WorkspaceAreaFileBucket[] { + const buckets: WorkspaceAreaFileBucket[] = []; + const bucketByKey = new Map(); + + for (const file of files) { + const key = workspaceAreaForReviewPath(file); + let bucket = bucketByKey.get(key); + if (!bucket) { + bucket = { + key, + index: buckets.length, + files: [], + }; + buckets.push(bucket); + bucketByKey.set(key, bucket); + } + bucket.files.push(file); + } + + return buckets; +} + +function splitFilesIntoModuleAwareGroups( + files: string[], + groupCount: number, +): string[][] { + if (groupCount <= 1) { + 
return [files]; + } + + const buckets = groupFilesByWorkspaceArea(files); + if (buckets.length <= 1) { + return splitFilesIntoGroups(files, groupCount); + } + + if (buckets.length >= groupCount) { + const groups = Array.from({ length: groupCount }, () => [] as string[]); + const sortedBuckets = [...buckets].sort( + (a, b) => b.files.length - a.files.length || a.index - b.index, + ); + + for (const bucket of sortedBuckets) { + let targetIndex = 0; + for (let index = 1; index < groups.length; index += 1) { + if (groups[index].length < groups[targetIndex].length) { + targetIndex = index; + } + } + groups[targetIndex].push(...bucket.files); + } + + return groups.filter((group) => group.length > 0); + } + + const chunkCounts = buckets.map(() => 1); + let remainingChunks = groupCount - buckets.length; + while (remainingChunks > 0) { + let targetBucketIndex = -1; + let largestAverageChunkSize = 0; + + for (let index = 0; index < buckets.length; index += 1) { + if (chunkCounts[index] >= buckets[index].files.length) { + continue; + } + const averageChunkSize = buckets[index].files.length / chunkCounts[index]; + if (averageChunkSize > largestAverageChunkSize) { + largestAverageChunkSize = averageChunkSize; + targetBucketIndex = index; + } + } + + if (targetBucketIndex === -1) { + break; + } + + chunkCounts[targetBucketIndex] += 1; + remainingChunks -= 1; + } + + return buckets.flatMap((bucket, index) => + splitFilesIntoGroups(bucket.files, chunkCounts[index]), ); - const coreReviewers = applicableCoreMembers +} + +function effectiveMaxSameRoleInstances(params: { + executionPolicy: ReviewTeamExecutionPolicy; + concurrencyPolicy: ReviewTeamConcurrencyPolicy; + reviewerMemberCount: number; +}): number { + const reviewerMemberCount = Math.max(1, params.reviewerMemberCount); + const maxPerRole = Math.floor( + params.concurrencyPolicy.maxParallelInstances / reviewerMemberCount, + ); + + return Math.max( + 1, + Math.min(params.executionPolicy.maxSameRoleInstances, Math.max(1, 
maxPerRole)), + ); +} + +function resolveReviewerPacketScopes( + target: ReviewTargetClassification, + executionPolicy: ReviewTeamExecutionPolicy, + concurrencyPolicy: ReviewTeamConcurrencyPolicy, + reviewerMemberCount: number, +): ReviewTeamWorkPacketScope[] { + const includedFiles = target.files + .filter((file) => !file.excluded) + .map((file) => file.normalizedPath); + const shouldSplit = + executionPolicy.reviewerFileSplitThreshold > 0 && + executionPolicy.maxSameRoleInstances > 1 && + includedFiles.length > executionPolicy.reviewerFileSplitThreshold; + + if (!shouldSplit) { + return [buildWorkPacketScopeFromFiles(target, includedFiles)]; + } + + const maxSameRoleInstances = effectiveMaxSameRoleInstances({ + executionPolicy, + concurrencyPolicy, + reviewerMemberCount, + }); + const groupCount = Math.min( + maxSameRoleInstances, + Math.ceil(includedFiles.length / executionPolicy.reviewerFileSplitThreshold), + ); + if (groupCount <= 1) { + return [buildWorkPacketScopeFromFiles(target, includedFiles)]; + } + + const fileGroups = splitFilesIntoModuleAwareGroups(includedFiles, groupCount); + return fileGroups.map((files, index) => + buildWorkPacketScopeFromFiles(target, files, { + index: index + 1, + count: fileGroups.length, + }), + ); +} + +function buildWorkPackets(params: { + reviewerMembers: ReviewTeamMember[]; + judgeMember?: ReviewTeamMember; + target: ReviewTargetClassification; + executionPolicy: ReviewTeamExecutionPolicy; + concurrencyPolicy: ReviewTeamConcurrencyPolicy; +}): ReviewTeamWorkPacket[] { + const reviewerScopes = resolveReviewerPacketScopes( + params.target, + params.executionPolicy, + params.concurrencyPolicy, + params.reviewerMembers.length, + ); + const fullScope = buildWorkPacketScopeFromFiles( + params.target, + params.target.files + .filter((file) => !file.excluded) + .map((file) => file.normalizedPath), + ); + const reviewerSeeds = params.reviewerMembers.flatMap((member) => + reviewerScopes.map((scope) => ({ member, scope })), + ); + 
const orderedReviewerSeeds = params.concurrencyPolicy.batchExtrasSeparately + ? [ + ...reviewerSeeds.filter((seed) => seed.member.source === 'core'), + ...reviewerSeeds.filter((seed) => seed.member.source === 'extra'), + ] + : reviewerSeeds; + const reviewerPackets = orderedReviewerSeeds.map((seed, index) => + buildWorkPacket({ + member: seed.member, + phase: 'reviewer', + launchBatch: + Math.floor(index / params.concurrencyPolicy.maxParallelInstances) + 1, + scope: seed.scope, + timeoutSeconds: params.executionPolicy.reviewerTimeoutSeconds, + }), + ); + const finalReviewerBatch = reviewerPackets.reduce( + (maxBatch, packet) => Math.max(maxBatch, packet.launchBatch), + 0, + ); + const judgePacket = params.judgeMember + ? [ + buildWorkPacket({ + member: params.judgeMember, + phase: 'judge', + launchBatch: finalReviewerBatch + 1, + scope: fullScope, + timeoutSeconds: params.executionPolicy.judgeTimeoutSeconds, + }), + ] + : []; + + return [...reviewerPackets, ...judgePacket]; +} + +const SHARED_CONTEXT_CACHE_ENTRY_LIMIT = 80; +const SHARED_CONTEXT_CACHE_RECOMMENDED_TOOLS: ReviewTeamSharedContextTool[] = [ + 'GetFileDiff', + 'Read', +]; + +function buildSharedContextCachePlan( + workPackets: ReviewTeamWorkPacket[] = [], +): ReviewTeamSharedContextCachePlan { + const fileContextByPath = new Map< + string, + { + path: string; + workspaceArea: string; + consumerPacketIds: string[]; + firstSeenIndex: number; + } + >(); + let nextSeenIndex = 0; + + for (const packet of workPackets) { + if (packet.phase !== 'reviewer') { + continue; + } + + for (const path of packet.assignedScope.files) { + let entry = fileContextByPath.get(path); + if (!entry) { + entry = { + path, + workspaceArea: workspaceAreaForReviewPath(path), + consumerPacketIds: [], + firstSeenIndex: nextSeenIndex, + }; + nextSeenIndex += 1; + fileContextByPath.set(path, entry); + } + if (!entry.consumerPacketIds.includes(packet.packetId)) { + entry.consumerPacketIds.push(packet.packetId); + } + } + } + + const 
repeatedFileContexts = Array.from(fileContextByPath.values()) + .filter((entry) => entry.consumerPacketIds.length > 1) + .sort((a, b) => a.firstSeenIndex - b.firstSeenIndex); + const entries = repeatedFileContexts + .slice(0, SHARED_CONTEXT_CACHE_ENTRY_LIMIT) + .map((entry, index) => ({ + cacheKey: `shared-context:${index + 1}`, + path: entry.path, + workspaceArea: entry.workspaceArea, + recommendedTools: [...SHARED_CONTEXT_CACHE_RECOMMENDED_TOOLS], + consumerPacketIds: entry.consumerPacketIds, + })); + + return { + source: 'work_packets', + strategy: 'reuse_readonly_file_context_by_cache_key', + entries, + omittedEntryCount: Math.max( + 0, + repeatedFileContexts.length - SHARED_CONTEXT_CACHE_ENTRY_LIMIT, + ), + }; +} + +const INCREMENTAL_REVIEW_CACHE_INVALIDATIONS: ReviewTeamIncrementalReviewCacheInvalidation[] = [ + 'target_file_set_changed', + 'target_line_count_changed', + 'target_tag_changed', + 'target_warning_changed', + 'reviewer_roster_changed', + 'strategy_changed', +]; + +function stableFingerprint(input: unknown): string { + const serialized = JSON.stringify(input); + let hash = 0x811c9dc5; + for (let index = 0; index < serialized.length; index += 1) { + hash ^= serialized.charCodeAt(index); + hash = Math.imul(hash, 0x01000193); + } + return (hash >>> 0).toString(16).padStart(8, '0'); +} + +function buildIncrementalReviewCachePlan(params: { + target: ReviewTargetClassification; + changeStats: ReviewTeamChangeStats; + strategyLevel: ReviewStrategyLevel; + workPackets: ReviewTeamWorkPacket[]; +}): ReviewTeamIncrementalReviewCachePlan { + const filePaths = params.target.files + .filter((file) => !file.excluded) + .map((file) => file.normalizedPath) + .sort((a, b) => a.localeCompare(b)); + const workspaceAreas = Array.from( + new Set(filePaths.map((file) => workspaceAreaForReviewPath(file))), + ).sort((a, b) => a.localeCompare(b)); + const targetTags = [...params.target.tags].sort((a, b) => a.localeCompare(b)); + const targetWarnings = 
params.target.warnings + .map((warning) => warning.code) + .sort((a, b) => a.localeCompare(b)); + const reviewerPacketIds = params.workPackets + .filter((packet) => packet.phase === 'reviewer') + .map((packet) => packet.packetId) + .sort((a, b) => a.localeCompare(b)); + const fingerprint = stableFingerprint({ + source: params.target.source, + resolution: params.target.resolution, + filePaths, + workspaceAreas, + targetTags, + targetWarnings, + lineCount: params.changeStats.totalLinesChanged ?? null, + lineCountSource: params.changeStats.lineCountSource, + reviewerPacketIds, + strategyLevel: params.strategyLevel, + }); + + return { + source: 'target_manifest', + strategy: 'reuse_completed_packets_when_fingerprint_matches', + cacheKey: `incremental-review:${fingerprint}`, + fingerprint, + filePaths, + workspaceAreas, + targetTags, + reviewerPacketIds, + ...(params.changeStats.totalLinesChanged !== undefined + ? { lineCount: params.changeStats.totalLinesChanged } + : {}), + lineCountSource: params.changeStats.lineCountSource, + invalidatesOn: [...INCREMENTAL_REVIEW_CACHE_INVALIDATIONS], + }; +} + +function predictTimeoutSeconds(params: { + role: 'reviewer' | 'judge'; + strategyLevel: ReviewStrategyLevel; + changeStats: ReviewTeamChangeStats; + reviewerCount: number; +}): number { + const totalLinesChanged = params.changeStats.totalLinesChanged ?? 0; + const base = PREDICTIVE_TIMEOUT_BASE_SECONDS[params.strategyLevel]; + const raw = + base + + params.changeStats.fileCount * PREDICTIVE_TIMEOUT_PER_FILE_SECONDS + + Math.floor(totalLinesChanged / 100) * + PREDICTIVE_TIMEOUT_PER_100_LINES_SECONDS; + const reviewerCount = Math.max(1, params.reviewerCount); + const multiplier = + params.role === 'judge' + ? 
1 + Math.floor((reviewerCount - 1) / 3) + : 1; + + return Math.min(raw * multiplier, MAX_PREDICTIVE_TIMEOUT_SECONDS); +} + +function buildEffectiveExecutionPolicy(params: { + basePolicy: ReviewTeamExecutionPolicy; + strategyLevel: ReviewStrategyLevel; + target: ReviewTargetClassification; + changeStats: ReviewTeamChangeStats; + reviewerCount: number; +}): ReviewTeamExecutionPolicy { + if ( + params.target.resolution === 'unknown' && + params.changeStats.fileCount === 0 && + params.changeStats.totalLinesChanged === undefined + ) { + return params.basePolicy; + } + + const reviewerTimeoutSeconds = predictTimeoutSeconds({ + role: 'reviewer', + strategyLevel: params.strategyLevel, + changeStats: params.changeStats, + reviewerCount: params.reviewerCount, + }); + const judgeTimeoutSeconds = predictTimeoutSeconds({ + role: 'judge', + strategyLevel: params.strategyLevel, + changeStats: params.changeStats, + reviewerCount: params.reviewerCount, + }); + + return { + ...params.basePolicy, + reviewerTimeoutSeconds: + params.basePolicy.reviewerTimeoutSeconds === 0 + ? 0 + : Math.max( + params.basePolicy.reviewerTimeoutSeconds, + reviewerTimeoutSeconds, + ), + judgeTimeoutSeconds: + params.basePolicy.judgeTimeoutSeconds === 0 + ? 
0 + : Math.max( + params.basePolicy.judgeTimeoutSeconds, + judgeTimeoutSeconds, + ), + }; +} + +function estimateChangedLinesForScope(params: { + scope: ReviewTeamWorkPacketScope; + changeStats: ReviewTeamChangeStats; + totalIncludedFileCount: number; +}): number { + if (params.changeStats.totalLinesChanged === undefined) { + return params.scope.fileCount * PROMPT_BYTE_ESTIMATE_UNKNOWN_LINES_PER_FILE; + } + + if (params.totalIncludedFileCount <= 0) { + return params.changeStats.totalLinesChanged; + } + + return Math.ceil( + params.changeStats.totalLinesChanged * + (params.scope.fileCount / params.totalIncludedFileCount), + ); +} + +function estimateReviewerPromptBytes(params: { + packet: ReviewTeamWorkPacket; + changeStats: ReviewTeamChangeStats; + totalIncludedFileCount: number; +}): number { + const pathBytes = params.packet.assignedScope.files.reduce( + (total, filePath) => total + filePath.length + 1, + 0, + ); + const estimatedChangedLines = estimateChangedLinesForScope({ + scope: params.packet.assignedScope, + changeStats: params.changeStats, + totalIncludedFileCount: params.totalIncludedFileCount, + }); + + return Math.ceil( + PROMPT_BYTE_ESTIMATE_BASE_BYTES + + pathBytes + + params.packet.assignedScope.fileCount * PROMPT_BYTE_ESTIMATE_PER_FILE_BYTES + + estimatedChangedLines * PROMPT_BYTE_ESTIMATE_PER_CHANGED_LINE_BYTES, + ); +} + +function estimateMaxReviewerPromptBytes(params: { + workPackets: ReviewTeamWorkPacket[]; + target: ReviewTargetClassification; + changeStats: ReviewTeamChangeStats; +}): number { + const reviewerPackets = params.workPackets.filter( + (packet) => packet.phase === 'reviewer', + ); + const totalIncludedFileCount = params.target.files.filter( + (file) => !file.excluded, + ).length; + + if (reviewerPackets.length === 0) { + return PROMPT_BYTE_ESTIMATE_BASE_BYTES; + } + + return Math.max( + ...reviewerPackets.map((packet) => + estimateReviewerPromptBytes({ + packet, + changeStats: params.changeStats, + totalIncludedFileCount, + }), + 
), + ); +} + +function buildTokenBudgetPlan(params: { + mode: ReviewTokenBudgetMode; + activeReviewerCalls: number; + eligibleExtraReviewerCount: number; + maxExtraReviewers: number; + skippedReviewerIds: string[]; + target: ReviewTargetClassification; + changeStats: ReviewTeamChangeStats; + executionPolicy: ReviewTeamExecutionPolicy; + workPackets: ReviewTeamWorkPacket[]; +}): ReviewTeamTokenBudgetPlan { + const includedFileCount = params.target.files.filter( + (file) => !file.excluded, + ).length; + const fileSplitGuardrailActive = + params.executionPolicy.reviewerFileSplitThreshold > 0 && + includedFileCount > params.executionPolicy.reviewerFileSplitThreshold; + const maxPromptBytesPerReviewer = + TOKEN_BUDGET_PROMPT_BYTE_LIMIT_BY_MODE[params.mode]; + const estimatedPromptBytesPerReviewer = estimateMaxReviewerPromptBytes({ + workPackets: params.workPackets, + target: params.target, + changeStats: params.changeStats, + }); + const promptByteLimitExceeded = + estimatedPromptBytesPerReviewer > maxPromptBytesPerReviewer; + const largeDiffSummaryFirst = promptByteLimitExceeded; + const decisions: ReviewTeamTokenBudgetDecision[] = []; + const warnings: string[] = []; + + if (promptByteLimitExceeded) { + decisions.push({ + kind: 'summary_first_full_scope', + reason: 'prompt_bytes_exceeded', + detail: + `Estimated reviewer prompt ${estimatedPromptBytesPerReviewer} bytes exceeds ${maxPromptBytesPerReviewer} bytes for ${params.mode} budget; use summary-first while keeping every assigned_scope file visible.`, + }); + warnings.push( + 'Estimated reviewer prompt exceeds the selected token budget; use summary-first without hiding assigned files.', + ); + } + + if (params.skippedReviewerIds.length > 0) { + decisions.push({ + kind: 'skip_extra_reviewers', + reason: 'extra_reviewers_skipped', + detail: + 'Some extra reviewers were skipped by the selected token budget mode.', + affectedReviewerIds: [...params.skippedReviewerIds], + }); + warnings.push( + 'Some extra reviewers 
were skipped by the selected token budget mode.', + ); + } + + return { + mode: params.mode, + estimatedReviewerCalls: params.activeReviewerCalls, + maxReviewerCalls: + params.activeReviewerCalls + + Math.max(0, params.eligibleExtraReviewerCount - params.maxExtraReviewers), + maxExtraReviewers: params.maxExtraReviewers, + ...(fileSplitGuardrailActive + ? { maxFilesPerReviewer: params.executionPolicy.reviewerFileSplitThreshold } + : {}), + maxPromptBytesPerReviewer, + estimatedPromptBytesPerReviewer, + promptByteEstimateSource: 'manifest_heuristic', + promptByteLimitExceeded, + largeDiffSummaryFirst, + decisions, + skippedReviewerIds: params.skippedReviewerIds, + warnings, + }; +} + +export function buildEffectiveReviewTeamManifest( + team: ReviewTeam, + options: ReviewTeamManifestOptions = {}, +): ReviewTeamRunManifest { + const target = resolveReviewTargetForOptions( + options.target, + options.reviewTargetFilePaths, + 'unknown', + ); + const tokenBudgetMode = options.tokenBudgetMode ?? 'balanced'; + const changeStats = resolveChangeStats(target, options.changeStats); + const concurrencyPolicy = applyRateLimitToConcurrencyPolicy( + normalizeConcurrencyPolicy(options.concurrencyPolicy), + options.rateLimitStatus, + ); + const strategyLevel = options.strategyOverride ?? team.strategyLevel; + const strategyRecommendation = recommendReviewStrategyForTarget(target, changeStats); + const backendStrategyRecommendation = recommendBackendCompatibleStrategyForTarget( + target, + changeStats, + ); + const strategyDecision = buildReviewStrategyDecision({ + teamDefaultStrategy: team.strategyLevel, + finalStrategy: strategyLevel, + ...(options.strategyOverride ? 
{ userOverride: options.strategyOverride } : {}), + frontendRecommendation: strategyRecommendation, + backendRecommendation: backendStrategyRecommendation, + }); + const preReviewSummary = buildPreReviewSummary(target, changeStats); + const coreMembers = team.coreMembers.map((member) => + applyTeamStrategyOverrideToMember(member, strategyLevel), + ); + const extraMembers = team.extraMembers.map((member) => + applyTeamStrategyOverrideToMember(member, strategyLevel), + ); + const availableCoreMembers = coreMembers.filter((member) => member.available); + const unavailableCoreMembers = coreMembers.filter((member) => !member.available); + const notApplicableCoreMembers = availableCoreMembers.filter( + (member) => + member.definitionKey !== 'judge' && + !shouldRunCoreReviewerForTarget(member, target), + ); + const coreReviewerMembers = availableCoreMembers .filter((member) => member.definitionKey !== 'judge') - .map((member) => toManifestMember(member)); - const qualityGateReviewer = applicableCoreMembers.find( + .filter((member) => shouldRunCoreReviewerForTarget(member, target)); + const coreReviewers = coreReviewerMembers.map((member) => toManifestMember(member)); + const qualityGateReviewerMember = availableCoreMembers.find( (member) => member.definitionKey === 'judge', ); - const enabledExtraReviewers = team.extraMembers - .filter((member) => member.available && member.enabled) + const qualityGateReviewer = qualityGateReviewerMember + ? 
toManifestMember(qualityGateReviewerMember) + : undefined; + const eligibleExtraMembers = extraMembers + .filter((member) => member.available && member.enabled); + const maxExtraReviewers = resolveMaxExtraReviewers( + tokenBudgetMode, + eligibleExtraMembers.length, + ); + const enabledExtraMembers = eligibleExtraMembers.slice(0, maxExtraReviewers); + const budgetLimitedExtraMembers = eligibleExtraMembers.slice(maxExtraReviewers); + const enabledExtraReviewers = enabledExtraMembers .map((member) => toManifestMember(member)); + const reviewerCount = coreReviewers.length + enabledExtraReviewers.length; + const executionPolicy = buildEffectiveExecutionPolicy({ + basePolicy: team.executionPolicy, + strategyLevel, + target, + changeStats, + reviewerCount, + }); + const workPackets = buildWorkPackets({ + reviewerMembers: [...coreReviewerMembers, ...enabledExtraMembers], + judgeMember: qualityGateReviewerMember, + target, + executionPolicy, + concurrencyPolicy, + }); + const sharedContextCache = buildSharedContextCachePlan(workPackets); + const incrementalReviewCache = buildIncrementalReviewCachePlan({ + target, + changeStats, + strategyLevel, + workPackets, + }); + const tokenBudget = buildTokenBudgetPlan({ + mode: tokenBudgetMode, + activeReviewerCalls: workPackets.length, + eligibleExtraReviewerCount: eligibleExtraMembers.length, + maxExtraReviewers, + skippedReviewerIds: budgetLimitedExtraMembers.map((member) => member.subagentId), + target, + changeStats, + executionPolicy, + workPackets, + }); const skippedReviewers = [ - ...team.extraMembers + ...extraMembers .filter((member) => !member.available || !member.enabled) .map((member) => - toManifestMember(member, member.available ? 'disabled' : 'unavailable'), + toManifestMember( + member, + member.skipReason ?? (member.available ? 
'disabled' : 'unavailable'), + ), ), + ...budgetLimitedExtraMembers.map((member) => + toManifestMember(member, 'budget_limited'), + ), ...unavailableCoreMembers.map((member) => toManifestMember(member, 'unavailable'), ), - ...inapplicableCoreMembers.map((member) => - toManifestMember(member, 'non_applicable'), + ...notApplicableCoreMembers.map((member) => + toManifestMember(member, 'not_applicable'), ), ]; @@ -1010,14 +2835,22 @@ export function buildEffectiveReviewTeamManifest( reviewMode: 'deep', ...(options.workspacePath ? { workspacePath: options.workspacePath } : {}), policySource: options.policySource ?? 'default-review-team-config', - strategyLevel: team.strategyLevel, - executionPolicy: team.executionPolicy, + target, + strategyLevel, + strategyRecommendation, + strategyDecision, + executionPolicy, + concurrencyPolicy, + changeStats, + preReviewSummary, + sharedContextCache, + incrementalReviewCache, + tokenBudget, coreReviewers, - ...(qualityGateReviewer - ? { qualityGateReviewer: toManifestMember(qualityGateReviewer) } - : {}), + ...(qualityGateReviewer ? 
{ qualityGateReviewer } : {}), enabledExtraReviewers, skippedReviewers, + workPackets, }; } @@ -1025,8 +2858,11 @@ function formatResponsibilities(items: string[]): string { return items.map((item) => ` - ${item}`).join('\n'); } -function formatStrategyImpact(strategyLevel: ReviewStrategyLevel): string { - const definition = getReviewStrategyProfile(strategyLevel); +function formatStrategyImpact( + strategyLevel: ReviewStrategyLevel, + strategyProfiles: Record = REVIEW_STRATEGY_PROFILES, +): string { + const definition = strategyProfiles[strategyLevel]; return `Token/time impact: approximately ${definition.tokenImpact} token usage and ${definition.runtimeImpact} runtime.`; } @@ -1047,6 +2883,116 @@ function formatManifestList( .join(', '); } +function workPacketToPromptPayload(packet: ReviewTeamWorkPacket) { + return { + packet_id: packet.packetId, + phase: packet.phase, + launch_batch: packet.launchBatch, + subagent_type: packet.subagentId, + display_name: packet.displayName, + role: packet.roleName, + assigned_scope: { + kind: packet.assignedScope.kind, + target_source: packet.assignedScope.targetSource, + target_resolution: packet.assignedScope.targetResolution, + target_tags: packet.assignedScope.targetTags, + file_count: packet.assignedScope.fileCount, + files: packet.assignedScope.files, + excluded_file_count: packet.assignedScope.excludedFileCount, + ...(packet.assignedScope.groupIndex !== undefined + ? { group_index: packet.assignedScope.groupIndex } + : {}), + ...(packet.assignedScope.groupCount !== undefined + ? 
{ group_count: packet.assignedScope.groupCount } + : {}), + }, + allowed_tools: packet.allowedTools, + timeout_seconds: packet.timeoutSeconds, + required_output_fields: packet.requiredOutputFields, + strategy: packet.strategyLevel, + model_id: packet.model, + prompt_directive: packet.strategyDirective, + }; +} + +function formatWorkPacketBlock(workPackets: ReviewTeamWorkPacket[] = []): string { + if (workPackets.length === 0) { + return '- none'; + } + + return [ + '```json', + JSON.stringify(workPackets.map(workPacketToPromptPayload), null, 2), + '```', + ].join('\n'); +} + +function formatPreReviewSummaryBlock(summary: ReviewTeamPreReviewSummary): string { + return [ + 'Pre-generated diff summary:', + '```json', + JSON.stringify(summary, null, 2), + '```', + ].join('\n'); +} + +function sharedContextCacheToPromptPayload(plan: ReviewTeamSharedContextCachePlan) { + return { + source: plan.source, + strategy: plan.strategy, + omitted_entry_count: plan.omittedEntryCount, + entries: plan.entries.map((entry) => ({ + cache_key: entry.cacheKey, + path: entry.path, + workspace_area: entry.workspaceArea, + recommended_tools: entry.recommendedTools, + consumer_packet_ids: entry.consumerPacketIds, + })), + }; +} + +function formatSharedContextCacheBlock(plan: ReviewTeamSharedContextCachePlan): string { + return [ + 'Shared context cache plan:', + '```json', + JSON.stringify(sharedContextCacheToPromptPayload(plan), null, 2), + '```', + ].join('\n'); +} + +function incrementalReviewCacheToPromptPayload(plan: ReviewTeamIncrementalReviewCachePlan) { + return { + source: plan.source, + strategy: plan.strategy, + cache_key: plan.cacheKey, + fingerprint: plan.fingerprint, + file_paths: plan.filePaths, + workspace_areas: plan.workspaceAreas, + target_tags: plan.targetTags, + reviewer_packet_ids: plan.reviewerPacketIds, + ...(plan.lineCount !== undefined ? 
{ line_count: plan.lineCount } : {}), + line_count_source: plan.lineCountSource, + invalidates_on: plan.invalidatesOn, + }; +} + +function formatIncrementalReviewCacheBlock(plan: ReviewTeamIncrementalReviewCachePlan): string { + return [ + 'Incremental review cache plan:', + '```json', + JSON.stringify(incrementalReviewCacheToPromptPayload(plan), null, 2), + '```', + ].join('\n'); +} + +function formatTokenBudgetDecisionKinds( + decisions: ReviewTeamTokenBudgetDecision[] = [], +): string { + return decisions.length > 0 + ? decisions.map((decision) => decision.kind).join(', ') + : 'none'; +} + export function buildReviewTeamPromptBlock( team: ReviewTeam, manifest = buildEffectiveReviewTeamManifest(team), @@ -1093,17 +3039,59 @@ export function buildReviewTeamPromptBlock( }) .join('\n'); const executionPolicy = [ - `- reviewer_timeout_seconds: ${team.executionPolicy.reviewerTimeoutSeconds}`, - `- judge_timeout_seconds: ${team.executionPolicy.judgeTimeoutSeconds}`, - `- reviewer_file_split_threshold: ${team.executionPolicy.reviewerFileSplitThreshold}`, - `- max_same_role_instances: ${team.executionPolicy.maxSameRoleInstances}`, + `- reviewer_timeout_seconds: ${manifest.executionPolicy.reviewerTimeoutSeconds}`, + `- judge_timeout_seconds: ${manifest.executionPolicy.judgeTimeoutSeconds}`, + `- reviewer_file_split_threshold: ${manifest.executionPolicy.reviewerFileSplitThreshold}`, + `- max_same_role_instances: ${manifest.executionPolicy.maxSameRoleInstances}`, + `- max_retries_per_role: ${manifest.executionPolicy.maxRetriesPerRole}`, ].join('\n'); + const concurrencyPolicy = [ + `- max_parallel_instances: ${manifest.concurrencyPolicy.maxParallelInstances}`, + `- stagger_seconds: ${manifest.concurrencyPolicy.staggerSeconds}`, + `- max_queue_wait_seconds: ${manifest.concurrencyPolicy.maxQueueWaitSeconds}`, + `- batch_extras_separately: ${manifest.concurrencyPolicy.batchExtrasSeparately ? 
'yes' : 'no'}`, + ].join('\n'); + const targetLineCount = + manifest.changeStats?.totalLinesChanged !== undefined + ? `${manifest.changeStats.totalLinesChanged}` + : 'unknown'; const manifestBlock = [ 'Run manifest:', `- review_mode: ${manifest.reviewMode}`, `- team_strategy: ${manifest.strategyLevel}`, + `- strategy_authority: ${manifest.strategyDecision.authority}`, + `- final_strategy: ${manifest.strategyDecision.finalStrategy}`, + `- frontend_recommended_strategy: ${manifest.strategyDecision.frontendRecommendation.strategyLevel}`, + `- backend_recommended_strategy: ${manifest.strategyDecision.backendRecommendation.strategyLevel}`, + `- strategy_user_override: ${manifest.strategyDecision.userOverride ?? 'none'}`, + `- strategy_mismatch: ${manifest.strategyDecision.mismatch ? 'yes' : 'no'}`, + `- strategy_mismatch_severity: ${manifest.strategyDecision.mismatchSeverity}`, + `- max_cyclomatic_complexity_delta: ${manifest.strategyDecision.backendRecommendation.factors.maxCyclomaticComplexityDelta}`, + `- max_cyclomatic_complexity_delta_source: ${manifest.strategyDecision.backendRecommendation.factors.maxCyclomaticComplexityDeltaSource}`, + ...(manifest.strategyRecommendation + ? [ + `- recommended_strategy: ${manifest.strategyRecommendation.strategyLevel}`, + `- strategy_recommendation_score: ${manifest.strategyRecommendation.score}`, + `- strategy_recommendation_rationale: ${manifest.strategyRecommendation.rationale}`, + ] + : []), `- workspace_path: ${manifest.workspacePath || 'inherited from current session'}`, `- policy_source: ${manifest.policySource}`, + `- target_source: ${manifest.target.source}`, + `- target_resolution: ${manifest.target.resolution}`, + `- target_tags: ${manifest.target.tags.join(', ') || 'none'}`, + `- target_warnings: ${manifest.target.warnings.map((warning) => warning.code).join(', ') || 'none'}`, + `- target_file_count: ${manifest.changeStats?.fileCount ?? 
manifest.target.files.length}`, + `- target_line_count: ${targetLineCount}`, + `- target_line_count_source: ${manifest.changeStats?.lineCountSource ?? 'unknown'}`, + `- token_budget_mode: ${manifest.tokenBudget.mode}`, + `- estimated_reviewer_calls: ${manifest.tokenBudget.estimatedReviewerCalls}`, + `- max_prompt_bytes_per_reviewer: ${manifest.tokenBudget.maxPromptBytesPerReviewer ?? 'none'}`, + `- estimated_prompt_bytes_per_reviewer: ${manifest.tokenBudget.estimatedPromptBytesPerReviewer ?? 'unknown'}`, + `- prompt_byte_estimate_source: ${manifest.tokenBudget.promptByteEstimateSource ?? 'none'}`, + `- prompt_byte_limit_exceeded: ${manifest.tokenBudget.promptByteLimitExceeded ? 'yes' : 'no'}`, + `- token_budget_decisions: ${formatTokenBudgetDecisionKinds(manifest.tokenBudget.decisions)}`, + `- budget_limited_reviewers: ${manifest.tokenBudget.skippedReviewerIds.join(', ') || 'none'}`, `- core_reviewers: ${formatManifestList(manifest.coreReviewers, 'none')}`, `- quality_gate_reviewer: ${manifest.qualityGateReviewer?.subagentId || 'none'}`, `- enabled_extra_reviewers: ${formatManifestList(manifest.enabledExtraReviewers, 'none')}`, @@ -1114,15 +3102,16 @@ export function buildReviewTeamPromptBlock( ) : [' - none']), ].join('\n'); + const strategyProfiles = team.definition?.strategyProfiles ?? 
REVIEW_STRATEGY_PROFILES; const strategyRules = REVIEW_STRATEGY_LEVELS.map((level) => { - const definition = getReviewStrategyProfile(level); + const definition = strategyProfiles[level]; const roleEntries = Object.entries(definition.roleDirectives) as [ReviewRoleDirectiveKey, string][]; const roleLines = roleEntries.map( ([role, directive]) => ` - ${role}: ${directive}`, ); return [ `- ${level}: ${definition.summary}`, - ` - ${formatStrategyImpact(level)}`, + ` - ${formatStrategyImpact(level, strategyProfiles)}`, ` - Default model slot: ${definition.defaultModelSlot}`, ` - Prompt directive (fallback): ${definition.promptDirective}`, ` - Role-specific directives:`, @@ -1135,25 +3124,57 @@ export function buildReviewTeamPromptBlock( return [ manifestBlock, + formatPreReviewSummaryBlock(manifest.preReviewSummary), + formatSharedContextCacheBlock(manifest.sharedContextCache), + formatIncrementalReviewCacheBlock(manifest.incrementalReviewCache), + 'Review work packets:', + formatWorkPacketBlock(manifest.workPackets), + 'Work packet rules:', + '- Each reviewer Task prompt must include the matching work packet verbatim.', + '- Include the packet_id in each Task description, for example "Security review [packet reviewer:ReviewSecurity:group-1-of-3]".', + '- Each reviewer and judge response must echo packet_id and set status to completed, partial_timeout, timed_out, cancelled_by_user, failed, or skipped.', + '- If the reviewer reports packet_id itself, mark reviewers[].packet_status_source as reported in the final submit_code_review payload.', + '- If the reviewer omits packet_id but the Task was launched from a packet, infer the packet_id from the Task description or work packet and mark packet_status_source as inferred.', + '- If packet_id cannot be reported or inferred, mark packet_status_source as missing and explain the confidence impact in coverage_notes.', + '- If a reviewer response is missing packet_id or status, the judge must treat that reviewer output as lower 
confidence instead of discarding the whole review.', + '- Use the pre-generated diff summary for initial orientation and token discipline, but verify claims against assigned files or diffs before reporting findings.', + '- When prompt_byte_limit_exceeded is yes, use the pre-generated diff summary before detailed reads. Do not remove files from assigned_scope or hide unreviewed files; if a file cannot be covered, report it in coverage_notes and reliability_signals.', + '- Use shared_context_cache entries to reuse read-only GetFileDiff/Read context by cache_key across reviewer packets. Do not duplicate full-file reads when a reusable cached diff or file summary already covers the same path.', + '- Use incremental_review_cache only when the target fingerprint matches a prior run; preserve completed reviewer outputs by packet_id and rerun only missing, failed, timed-out, or stale packets. If any invalidates_on condition changed, ignore the cache and explain the fresh review boundary.', + '- The assigned_scope is the default scope for that packet; only widen it when a critical cross-file dependency requires it and note the reason in coverage_notes.', 'Configured code review team:', members || '- No team members available.', 'Execution policy:', executionPolicy, + 'Concurrency policy:', + concurrencyPolicy, 'Team execution rules:', - '- Always run the four locked core reviewer roles first: ReviewBusinessLogic, ReviewPerformance, ReviewSecurity, and ReviewArchitecture.', + '- Run only reviewers listed in core_reviewers and enabled_extra_reviewers.', + '- Do not launch skipped_reviewers.', + '- If a skipped reviewer has reason not_applicable, mention it in coverage notes without treating it as reduced confidence.', + '- If a skipped reviewer has reason budget_limited, mention the budget mode and the coverage tradeoff.', + '- If a skipped reviewer has reason invalid_tooling, report it as a configuration issue and do not reduce confidence in the reviewers that did run.', + 
'- If target_resolution is unknown, conditional reviewers may be activated conservatively; report that as coverage context.', + `- Run the active core reviewer roles first: ${formatManifestList(manifest.coreReviewers, 'none')}.`, + '- Launch reviewer Tasks by launch_batch. Do not launch a later reviewer batch until every reviewer Task in the earlier batch has completed, failed, timed out, or returned partial_timeout.', + '- Never launch more reviewer Tasks in one batch than max_parallel_instances. If stagger_seconds is greater than 0, wait that many seconds before starting the next launch_batch.', '- Run ReviewJudge only after the reviewer batch finishes, as the quality-gate pass.', - '- If the Frontend Reviewer is enabled, run it in parallel with the locked reviewers whenever the change contains frontend files (src/web-ui/, .tsx, .scss, .css, locales/).', '- If other extra reviewers are configured and enabled, run them in parallel with the locked reviewers whenever possible.', '- When a configured member entry provides model_id, pass model_id with that value to the matching Task call.', '- If reviewer_timeout_seconds is greater than 0, pass timeout_seconds with that value to every reviewer Task call.', '- If judge_timeout_seconds is greater than 0, pass timeout_seconds with that value to the ReviewJudge Task call.', - '- If reviewer_file_split_threshold is greater than 0 and the target file count exceeds it, split files across multiple same-role reviewer instances (up to max_same_role_instances per role). Launch all split instances in the same parallel message.', - '- When file splitting is active, each same-role instance must only review its assigned file group. Label instances in the Task description (e.g. 
"Security review [group 1/3]").', + '- If a reviewer Task returns status partial_timeout, treat its output as partial evidence: preserve it in reviewers[].partial_output, mark the reviewer status partial_timeout, and mention the confidence impact in coverage_notes.', + '- If a reviewer fails or times out without useful partial output, retry that same reviewer at most max_retries_per_role times: reduce its scope, downgrade strategy by one level when possible, use a shorter timeout, and set retry to true on the retry Task call.', + '- In the final submit_code_review payload, populate reliability_signals for context_pressure, compression_preserved, partial_reviewer, and user_decision when those conditions apply. Use severity info/warning/action, count when useful, and source runtime/manifest/report/inferred.', + '- If reviewer_file_split_threshold is greater than 0 and the target file count exceeds it, split files across multiple same-role reviewer instances only up to the concurrency-capped max_same_role_instances for this run.', + '- Prefer module/workspace-area coherent file groups when splitting reviewer work; avoid mixing unrelated workspace areas in the same packet when the group budget allows it.', + '- When file splitting is active, each same-role instance must only review its assigned file group. Label instances in the Task description with both group and packet_id (e.g. "Security review [group 1/3] [packet reviewer:ReviewSecurity:group-1-of-3]").', '- Do not run ReviewFixer during the review pass.', '- Wait for explicit user approval before starting any remediation.', '- The Review Quality Inspector acts as a third-party arbiter: it primarily examines reviewer reports for logical consistency and evidence quality, and only uses code inspection tools for targeted spot-checks when a specific claim needs verification.', 'Review strategy rules:', - `- Team strategy: ${team.strategyLevel}. 
${formatStrategyImpact(team.strategyLevel)}`, + `- Team strategy: ${manifest.strategyLevel}. ${formatStrategyImpact(manifest.strategyLevel, strategyProfiles)}`, + '- Risk recommendation is advisory; follow team_strategy, member strategy fields, and work-packet strategy for this run unless the user explicitly changes strategy.', commonStrategyRules, 'Review strategy profiles:', strategyRules, diff --git a/src/web-ui/src/shared/types/session-history.ts b/src/web-ui/src/shared/types/session-history.ts index 97d2579b0..fa3793764 100644 --- a/src/web-ui/src/shared/types/session-history.ts +++ b/src/web-ui/src/shared/types/session-history.ts @@ -4,6 +4,8 @@ * Used by session lists and persistence metadata in the frontend. */ +import type { ReviewTeamRunManifest } from '@/shared/services/reviewTeamService'; + export type SessionKind = 'normal' | 'btw' | 'review' | 'deep_review'; export type PersistedSessionKind = 'standard' | 'subagent'; export type SessionTitleSource = 'text' | 'i18n'; @@ -63,6 +65,11 @@ export interface SessionMetadata { * Allows restoring the review action bar across app restarts. */ reviewActionState?: ReviewActionPersistedState; + /** + * The per-run Deep Review reviewer manifest used to launch this session. + * Continuation and later backend gates use this as the source of truth. 
+ */ + deepReviewRunManifest?: ReviewTeamRunManifest; } export interface ReviewActionPersistedState { From c1ca5b65f7adc02fe0d52982be5c261fedc82836 Mon Sep 17 00:00:00 2001 From: limit_yan Date: Sat, 9 May 2026 16:31:57 +0800 Subject: [PATCH 3/6] docs(deep-review): align phase two review plans --- agent-runtime-budget-governance-design.md | 1957 +++++++++++++++++++++ context-reliability-architecture.md | 1143 ++++++++++++ docs/deep-review-design.md | 1181 +++++++++++++ docs/deep-review-phase2-addendum.md | 582 ++++++ docs/deep-review-phase2-plan.md | 338 ++++ 5 files changed, 5201 insertions(+) create mode 100644 agent-runtime-budget-governance-design.md create mode 100644 context-reliability-architecture.md create mode 100644 docs/deep-review-design.md create mode 100644 docs/deep-review-phase2-addendum.md create mode 100644 docs/deep-review-phase2-plan.md diff --git a/agent-runtime-budget-governance-design.md b/agent-runtime-budget-governance-design.md new file mode 100644 index 000000000..43db92077 --- /dev/null +++ b/agent-runtime-budget-governance-design.md @@ -0,0 +1,1957 @@ +# 设计文档:LLM 输出截断检测、恢复与预算治理 + +> 日期:2026-04-27 +> 状态:Draft +> 范围:所有使用 LLM 的 Agent 场景,不限于深度审核 +> 目标:消除静默截断,建立结构化截断信号,并逐步引入请求预算治理 + +--- + +## 背景与问题 + +### 事件 + +2026-04-27 的会话中,模型(glm-5.1)在执行工具调用时突然停止输出。日志显示: + +- 输入 token:约 100,068 +- 输出 token:仅 64-261 +- 无 `finish_reason` +- 无 ERROR/WARN 日志 +- 应用无任何用户提示 + +用户看到的是“模型突然停止”,系统没有说明是网络问题、模型截断、工具调用失败,还是任务完成。 + +### 初步根因 + +长上下文挤占了输出 token 预算,模型没有足够空间完成工具调用参数 JSON。当前 `stream_processor` 在 `TimedStreamItem::End` 分支直接把流结束视为正常结束,未检查: + +- provider 是否报告 `length` / `max_tokens` 类 finish reason; +- 是否存在 pending tool call; +- pending tool call 的 JSON 是否完整; +- 是否需要向上游标记 partial recovery; +- 是否需要向用户显示明确提示。 + +这类问题不是单纯网络错误,同请求自动重试通常无法解决,甚至会放大资源消耗。 + +--- + +## 设计目标 + +1. **不再静默失败**:任何可识别的输出截断都必须进入结构化恢复路径,并向用户呈现原因。 +2. **机器可判定、用户可理解**:系统内部使用结构化 kind/status;UI 显示本地化、人类可读提示。 +3. **安全优先**:半截工具调用不得自动执行;语法修复仅用于完成事件链路和诊断。 +4. 
**避免无意义重试**:区分网络中断、watchdog timeout、provider max tokens、工具参数截断等情况。 +5. **逐步引入预算治理**:先止血,再从请求前预算、任务拆分、输出落盘等方向降低截断发生率。 +6. **保持平台边界**:产品逻辑位于 core / ai-adapters;前端只通过事件和 adapter 消费结构化状态;UI 组件不直接依赖 Tauri。 +7. **不得新增用户感知中止**:新增检测、预检、压缩、拦截只能减少原本会静默失败、突然断掉或不可恢复的场景;若某项治理动作会让原本可正常完成的对话变成用户可感知阻断,则该动作默认无效,必须改为后台恢复、轻提示或不实现。 + +### 产品体验约束 + +普通对话和 Deep Review 共享同一条体验原则:**把不可控断裂变成可恢复连续体验,而不是用更早的阻断换取技术上更安全的失败**。 + +- 预算预检不能因为低置信估算直接阻断请求;优先触发本地上下文压缩、摘要化、输出落盘或任务内部拆分。 +- 压缩动作如果耗时短,应作为后台治理,不打断用户;如果可能长耗时,必须进入可见的进度状态,避免用户看到长时间无输出。 +- 错误呈现应表达“模型或外部服务出现预期外中断,BitFun 已保留当前结果并尝试恢复/引导下一步”,避免把部分成功渲染成 fatal error。 +- 工具拦截应最小化:只读工具不因同批其他工具截断而关联拦截;写操作优先按依赖关系和副作用级别做最小拦截。 +- 任何新增动作的验收标准都要包含用户感知指标:中止次数不增加、可恢复提示更明确、有效输出不丢失。 + +--- + +## 非目标 + +- 不在 Phase 1 止血阶段重写 Deep Review 的完整调度系统;runtime-bounded scheduling 作为 Phase 2 后续治理能力分阶段引入。 +- 不自动“续写”工具调用参数。 +- 不把所有 provider 行为强行统一成同一个文本错误消息。 +- 不因为截断检测而执行语义不可靠的工具调用。 +- 不用字符串包含关系作为长期架构契约。 + +--- + +## 与 Context Reliability Architecture 的关系 + +根目录 `context-reliability-architecture.md` 是 **Context Reliability Architecture**,负责回答“上下文中哪些事实可信、哪些事实必须跨压缩保留、长任务如何可审计和可恢复”。本文是 **runtime 截断、预算、调度与大输出治理方案**,负责回答“模型输出或工具/子任务结果即将超出预算、已经截断、或被 gateway 限流时,runtime 如何检测、排队、恢复、落盘和呈现”。 + +因此两篇文档不合并为一篇总方案。合并会让基础上下文架构和运行时控制面混在一起,反而降低可维护性。正确关系是:`context-reliability-architecture.md` 提供信任、证据、压缩契约和 context profile;本文消费这些能力,并把截断恢复、scheduler、artifact、spill、large write 等运行时事件回写为可审计事实。 + +### 与 `deep-review-design.md` 已实现基线的关系 + +本轮对照按 `deep-review-design.md` 已实现处理:Deep Review 已拥有 Strategy Engine、Architecture / Frontend reviewer、predictive timeout、dynamic concurrency policy、partial result capture、retry budget、Judge overlap handling、strategy directive / model plumbing 和 continuation / remediation 基础能力。本文不重新实现这些 Deep Review 专项能力。 + +本文只在已实现基线之上补三类增量: + +1. **统一 runtime 调度**:Deep Review 的 `DeepReviewConcurrencyPolicy` / `max_parallel_instances` 作为 reviewer policy 输入和上限;真实执行顺序、gateway permit、queue、retry backoff、parent cancellation cleanup 由 `SubagentScheduler` 和 AI request limiter 负责。 +2. 
**统一事件语义**:Deep Review 已有 `partial_timeout` / retry / timeout 结果需要映射到通用 scheduler event。对外统一为 `completed_with_partial`、`timed_out`、`retry_waiting`、`failed` 等状态,避免前端和 judge 同时消费两套状态机。 +3. **统一证据与预算治理**:Deep Review 已有 partial output 和 launch manifest;本文只补 artifact 落盘、reviewer output budget、gateway-keyed limiter、token/byte/diff line 预算和跨场景观测字段。 + +因此,任何实现若让 `reviewTeamService.ts` / Deep Review orchestrator 自己做一套 batching + retry,同时 runtime scheduler 再做一套 queue + retry,都视为无效实现。Deep Review policy 可以决定“哪些 reviewer、什么 scope、什么超时/重试预算”;runtime 只能有一个最终调度器决定“何时运行、是否重试、如何释放 permit”。 + +### 重复实现禁止与收敛规则 + +新增实现必须先归类到已有 owner。若出现相似状态、相似重试、相似 artifact、相似 manifest、相似 budget policy 或相似 UI 状态,默认采用 adapter / projection / event mapping 收敛到现有 owner,不新增平行模块。只有现有 owner 无法覆盖跨场景需求,并且文档写明迁移和回滚路径时,才允许新增通用层。 + +| 相似能力 / 场景 | 唯一 owner | 允许的收敛动作 | 禁止动作 | +|---|---|---|---| +| Deep Review reviewer 并发、subagent 容量、gateway 压力 | `SubagentScheduler` + AI request limiter | `DeepReviewConcurrencyPolicy` / `max_parallel_instances` 只投影为 scheduler cap 和 policy input | frontend / Deep Review orchestrator / runtime 各自维护 queue 或 permit | +| Deep Review retry、provider overload retry、gateway backoff | scheduler retry classifier | Deep Review retry budget 投影为 classifier 可用预算和上限 | orchestrator 和 scheduler 对同一 reviewer 各自重试 | +| `partial_timeout`、partial recovery、subagent 终态 | scheduler state + `PartialRecoveryKind` 边界 | raw status 按 evidence 映射为 `completed_with_partial` / `timed_out` / `retry_waiting` | UI / Judge / ledger 同时把 raw status 和 normalized state 当主状态 | +| Deep Review launch manifest、Work Packet、reviewer role / scope | Deep Review canonical manifest + Work Packet projection | 从已实现 manifest 生成 Work Packet projection,保留 `model_id` / `prompt_directive` / scope / retry budget | 在 context 或 runtime 中复制一套 reviewer role schema | +| PR URL / 最近提交 / patch / 大 diff 的 review evidence | Deep Review source resolver + runtime artifact storage | 父任务一次生成 source-agnostic `ReviewEvidencePack`,按 reviewer scope 投影为 
artifact slice | 每个 subagent 自行重复拉取或重建同一份完整变更证据 | +| artifact、spill-to-file、大文件写入 manifest | runtime artifact / session storage | Evidence Ledger 记录 artifact ref、hash、status、sensitivity、next action | ledger / context 架构创建第二套文件存储或回灌全文 | +| token / byte / diff line / context budget 阈值 | runtime budget module + model profile | Deep Review strategy、context health、tool metadata 作为估算输入 | 各功能点硬编码自己的预算阈值和阻断规则 | +| context compaction、microcompact、emergency truncation | `ContextMutationKind` + Compaction Contract | runtime 发 mutation event,context 架构记录事实和保留契约 | 把 `ContextCompacted` 放入 `PartialRecoveryKind` 或由 partial recovery 执行输入侧删除 | +| 用户可见 waiting / retrying / partial / failed 状态 | normalized runtime event contract | Deep Review UI、普通对话 UI、诊断面板消费同一事件 | 各 UI 自定义不可互通的状态字符串 | + +| 主题 | `context-reliability-architecture.md` 负责 | 本文负责 | 集成边界 | +|---|---|---|---| +| 信任与优先级 | `ContextTrustLevel`、`MessageSemanticKind`、prompt markup escaping | 不重新定义信任等级 | recovery / scheduler / artifact 事件进入上下文前必须带来源和语义类型 | +| 输入侧上下文变化 | Compaction Contract、Evidence Ledger、Context Health | `ContextMutationKind` 事件、预算预检触发和用户呈现 | 压缩质量与事实保留由上下文架构保障;本文只决定何时触发、如何节流、如何展示 | +| 输出侧截断恢复 | 不作为主语义 | `PartialRecoveryKind`、`ProviderFinishReason`、`ToolArgumentStatus` | 输出截断不得写成 context compaction;context mutation 不进入 partial recovery | +| subagent 任务契约 | Work Packet 定义目标、范围、权限、输入 artifact、输出 schema | `SubagentScheduler` 决定排队、gateway permit、retry、timeout、状态事件 | Work Packet 描述“该做什么”;scheduler 控制“什么时候、以什么容量、失败后如何处理” | +| Deep Review | reviewer/judge 证据契约和 partial evidence 进入 Evidence Ledger | reviewer token/byte/diff line 分片、runtime-bounded scheduling、ReviewJudge 状态输入 | policy 决定 reviewer 与输入范围;runtime 决定实际执行顺序和 gateway 压力控制 | +| 大输出与大文件写入 | Evidence Ledger 记录 artifact 事实、hash、状态和验证结果 | spill-to-file、subagent artifact、Large File Write Protocol 生产 artifact | 上下文中只注入摘要和引用;完整内容保存在 session artifact,不回灌全文 | +| 普通对话体验 | `conversation` profile 保持近期用户意图,少自动 subagent | 预算预检不得因低置信估算阻断;短耗时治理后台化 | 
两篇文档共同约束:新增治理不能让原本可完成的普通对话变得更不流畅 | + +### 交叉方案待确认项 + +以下不是当前文档冲突,而是两篇方案在同一问题上的不同层次。竞品最新公开实现显示:Codex 已将自动 compaction 作为 agent loop 的一部分,Claude Code / Claude Agent SDK 强调 subagent 独立上下文、并行和工具限制,OpenCode 用 `mode: subagent`、hidden agent 和 `permission.task` 控制 subagent 可见性与调用权限。基于这些事实,当前建议如下,最终产品取舍仍需用户确认。 + +| 议题 | 可能分歧 | 竞品参考后的建议 | 需要确认 | +|---|---|---|---| +| 自动 compaction 默认开启时机 | 上下文架构希望提高压缩可靠性;本文要求不能新增用户感知中止 | Phase 1 只记录 `ContextMutationKind`;Phase 2 先 observe-only,再允许短耗时后台 compaction;长耗时或有损 mutation 必须展示进度/诊断 | Phase 2B 是否允许对普通对话默认启用短耗时后台 compaction | +| Work Packet 是否泛化为 TaskTool 通用 schema | Deep Review 已有 launch manifest / strategy directive;上下文架构倾向通用任务契约 | 将已实现 Deep Review manifest 作为 Work Packet 投影的第一版;通用 TaskTool schema 放到 Advanced runtime quality,不进入近期默认范围 | Work Packet 泛化是否作为 Phase 3/4 之后的独立方案 | +| Context Health 是否可见 | 上下文架构需要 health score;本文担心额外 UI 状态影响流畅性 | P0/P1 内部 telemetry;用户只看到 action-oriented 状态,例如“正在整理上下文以继续”或“需要拆分任务”,不显示原始分数 | 是否把 raw health score 暴露给高级诊断面板 | +| subagent 并行承诺 | Deep Review 已有 dynamic concurrency policy,但它不应成为第二套 runtime scheduler | 按 Claude/OpenCode 的权限与隔离方向保留 subagent;Deep Review policy 只给上限和意图,BitFun runtime 做 bounded scheduling | Review Team UI 文案是否明确显示 queued / running / retrying | +| artifact 与 Evidence Ledger 所有权 | 两篇文档都提到保留证据 | 本文负责创建 artifact、manifest、spill;Evidence Ledger 只记录事实、hash、路径、状态和验证结果 | session artifact 的长期保留策略和 ledger 持久化位置 | + +参考来源:Claude Code subagents、Claude Agent SDK subagents、OpenCode agents、OpenAI Codex agent loop / auto compaction。文档落地时以官方文档的当前行为为准;若竞品后续变化,只影响默认策略的保守程度,不改变本文的边界划分。 + +--- + +## 现有能力审计 + +### 已有基础设施 + +| 能力 | 位置 | 当前状态 | 评价 | +|---|---|---|---| +| Deep Review Strategy Engine | `deep-review-design.md`、`deep_review_policy.rs`、`reviewTeamService.ts`、`DeepReviewService.ts` | 按已实现基线处理:risk classification、predictive timeout、dynamic concurrency policy、partial result capture、retry budget、role strategy directive | 作为 Deep Review policy / manifest 输入;不再由本文重复实现 | +| 文件数拆分策略 | 
`src/crates/core/src/agentic/deep_review_policy.rs` | `should_split_files()` + `same_role_instance_count()` 已实现,并可被 Strategy Engine 调整 | 可作为快速路径,但仍需 token/byte/diff line 预算补强 | +| Deep Review 文件拆分 Prompt | `src/crates/core/src/agentic/agents/prompts/deep_review_agent.md` | 已提示按阈值拆分,并包含 reviewer role / strategy directive | prompt 只表达调度意图;真实执行顺序仍由 runtime scheduler 决定 | +| subagent 并行执行 | `task_tool.rs` + `coordinator.execute_subagent()` | 已实现独立 session、上下文隔离和 Deep Review partial result 基础 | 可复用,但应进入统一 `SubagentScheduler` 事件与 permit 路径 | +| subagent 并发控制 | Deep Review dynamic concurrency policy + `SubagentConcurrencyLimiter` | Deep Review 可计算 `max_parallel_instances` / retry budget / timeout policy | 只能作为 policy 上限;不得与 gateway request limiter 形成双调度器 | +| 部分恢复 | `stream_processor.rs`、Deep Review partial result capture | 已能在错误/timeout 后保留部分输出,Deep Review 可得到 partial reviewer evidence | 缺口是跨普通对话、subagent 和 UI 的结构化 kind / state 统一 | +| 工具调用聚合 | `src/crates/ai-adapters/src/tool_call_accumulator.rs` | 能聚合参数并做简单 JSON 修复 | 应在此层增强 JSON 完整性判断 | +| 前端错误呈现 | `src/web-ui/src/shared/ai-errors/aiErrorPresenter.ts` | 已有错误分类框架 | 缺少 `output_truncated` 分类和事件接入 | + +### 关键缺口 + +| 缺口 | 说明 | 影响 | +|---|---|---| +| 截断信号不结构化 | 当前主要依赖 `partial_recovery_reason: Option` | 后端重试、前端展示都容易被文案变化破坏 | +| `TimedStreamItem::End` 过早视为正常 | 未检查 pending tool call 或 provider max tokens | 截断被当成正常完成 | +| 工具调用完整性接口缺失 | `PendingToolCall` 没有 `arguments_closed` 字段 | 不能按原设计直接实现,需要 accumulator 提供语义方法 | +| partial recovery 重试过宽 | `round_executor` 对任意 partial recovery 都可能重试 | token 截断场景会重复失败 | +| 前端事件类型不完整 | `DialogTurnCompletedEvent` 类型未显式声明 recovery 字段 | UI 无法稳定消费截断状态 | +| Deep Review 预算维度仍需统一 | 已有 Strategy Engine 和文件拆分,但 token/byte/diff line 与共享上下文预算需要进入统一 runtime 预算模型 | 大文件、大 diff 仍可能打满上下文;共享上下文重复注入会系统性低估预算 | +| subagent 调度不可观测 | 当前主要看到模型等待或最终失败,缺少 queued / retrying / throttled 状态 | 用户无法区分“模型在思考”“排队等待容量”“因 gateway overload 重试” | +| gateway 并发与 Deep Review 并发策略混在一起 | Deep Review `max_parallel_instances` / 
`ai.subagent_max_concurrency` 控制 reviewer / hidden session 上限,但 provider / vLLM 实际并发可能更低 | 多 reviewer 可能因为瞬时 burst 被拒绝,而不是任务本身失败;若再叠加 runtime retry,可能形成双重试 | +| Deep Review 状态名未进入统一事件契约 | Deep Review 可能已有 `partial_timeout` / retry / timeout 状态,runtime 方案使用 `completed_with_partial` / `retry_waiting` / `timed_out` | Judge、前端和日志可能看到两套语义,导致重复提示或错误降级 | + +--- + +## 竞品与行业经验 + +公开 issue 和社区反馈显示,Claude Code、OpenAI Codex CLI、Cursor、Cline/Roo Code、LangChain/LangGraph 等工具都遇到过相似问题: + +1. **大输出不能只截断**:用户需要知道完整内容在哪里、哪些内容被省略、模型实际看到了什么。 +2. **工具输出和模型输出都需要预算**:terminal output、read_file、大文件、subagent 返回都可能撑爆上下文。 +3. **结构化错误优于文本匹配**:parser、agent runtime、UI 应消费机器可读状态,而不是日志文案。 +4. **spill-to-file 是常见方向**:大输出保存在本地文件,只把摘要和路径放进上下文。 +5. **任务拆分需要按 token/byte/diff line**:只按文件数量拆分不足以控制上下文风险。 +6. **subagent 并行需要 runtime 调度**:prompt 可以请求并行,但真实运行顺序、gateway permit、重试和 timeout 应由 runtime 控制。 + +BitFun 当前方案应吸收这些经验:先修复静默截断,再建立统一输出预算与可恢复链路。 + +--- + +## 核心设计 + +### 1. 引入结构化恢复类型 + +在 core 的 stream/round result 层增加机器可读字段,保留原有人类可读 reason。 + +建议枚举: + +```rust +pub enum PartialRecoveryKind { + StreamInterrupted, + WatchdogTimeout, + OutputTruncated, + ToolArgumentsIncomplete, + ProviderMaxTokens, + ToolOutputBudgetExceeded, + RateLimited, + UnknownPartial, +} +``` + +`StreamResult` 保留: + +```rust +pub partial_recovery_reason: Option, +pub partial_recovery_kind: Option, +``` + +设计原则: + +- `partial_recovery_kind` 给系统判断; +- `partial_recovery_reason` 给日志和诊断; +- 不再用 `reason.contains("truncated")` 作为架构契约; +- 若短期需要兼容旧字段,可继续透传 reason,但新逻辑必须优先使用 kind。 +- `ContextCompacted` 不放入 `PartialRecoveryKind`。上下文压缩、microcompact、emergency truncation 属于输入侧上下文变更,不是模型输出流的 partial recovery,应使用独立的 `ContextBudgetEvent` / `ContextMutationKind` 记录。 + +--- + +### 2. 
在 `ai-adapters` 层暴露工具调用完整性接口 + +原设计中的 `arguments_closed` 字段不存在,不应让 core 层窥探 accumulator 内部结构。 + +在 `PendingToolCalls` 上增加语义方法: + +```rust +impl PendingToolCalls { + pub fn has_pending_payload(&self) -> bool; + pub fn has_incomplete_json_payload(&self) -> bool; + pub fn pending_payload_summary(&self) -> PendingToolCallSummary; +} +``` + +其中: + +- `has_pending_payload()` 判断是否仍有未 finalize 的工具调用; +- `has_incomplete_json_payload()` 基于 `raw_arguments` 的 JSON parse / boundary 检测判断是否可能未闭合; +- `pending_payload_summary()` 用于日志,不暴露敏感参数全文。 + +JSON 修复仍放在 `tool_call_accumulator.rs` 内部,例如增强 `PendingToolCall::parse_arguments()`: + +1. 先尝试原始 JSON parse; +2. 保留现有“删除一个多余右花括号”的修复; +3. 不做“补齐缺失括号/引号后继续执行”的自动修复; +4. 修复后必须再次 `serde_json::from_str` 验证; +5. 修复失败时 finalize 为 `is_error = true`。 + +工具参数必须携带可信状态,而不是只靠 `is_error`: + +```rust +pub enum ToolArgumentStatus { + Complete, // 原始 JSON 完整,可按现有权限策略执行 + RepairedTrusted, // 仅限删除单个多余右花括号等不改变语义的保守修复 + RepairedUntrusted, // 语法补齐等可能改变语义的修复,不得自动执行 + Incomplete, // 明确未闭合或无法解析,不得自动执行 +} +``` + +重要约束: + +- 只有 `Complete` 和少数经白名单证明不改变语义的 `RepairedTrusted` 可进入现有工具执行路径; +- `RepairedUntrusted` / `Incomplete` 必须转为结构化错误事件和诊断,不得自动执行; +- 修复用于完成事件链路、保存诊断、向用户提示; +- core 消费 `FinalizedToolCall.argument_status`、`is_error` 和 recovery kind,不实现 JSON 修复细节。 + +--- + +### 3. 在 `stream_processor` 中检测流结束时的截断 + +当前行为: + +```rust +TimedStreamItem::End => { + debug!("Stream ended normally (no more data)"); + break; +} +``` + +#### 3a. 
provider finish reason 归一化 + +不同 provider 对 finish reason 的命名和语义不一致(`stop` / `end_turn` / `stop_sequence` / `length` / `max_tokens` / `tool_use`)。为避免 core 层面对 provider 差异,在 `ai-adapters` 层增加归一化: + +```rust +pub enum ProviderFinishReason { + Stop, // 模型正常结束 + Length, // 模型达到 max output tokens + ToolUse, // 模型请求执行工具 + ContentFilter, // 内容被安全过滤中断 + Unknown(String), // 无法识别的原始值,保留用于诊断 +} +``` + +每个 ai-adapter 在解析 provider response 时,将原始 finish reason 映射到此枚举。`Unknown` 变体保留原始字符串,不触发截断检测。 + +`StreamContext` 在 stream 结束时读取归一化后的 `ProviderFinishReason`,而非原始字符串。 + +#### 3b. End 分支截断检测 + +```rust +TimedStreamItem::End => { + if ctx.pending_tool_calls.has_incomplete_json_payload() { + ctx.force_finish_pending_tool_calls(); + ctx.partial_recovery_kind = Some(PartialRecoveryKind::ToolArgumentsIncomplete); + ctx.partial_recovery_reason = Some( + "Model output ended while tool arguments were incomplete".to_string() + ); + warn!( + "Stream ended with incomplete tool arguments: session_id={}, round_id={}, summary={:?}", + ctx.session_id, + ctx.round_id, + ctx.pending_tool_calls.pending_payload_summary() + ); + } else if ctx.provider_metadata_indicates_max_tokens() { + ctx.partial_recovery_kind = Some(PartialRecoveryKind::ProviderMaxTokens); + ctx.partial_recovery_reason = Some( + "Model output stopped because the provider reported max token completion".to_string() + ); + warn!( + "Stream ended due to provider max tokens: session_id={}, round_id={}", + ctx.session_id, + ctx.round_id + ); + } else { + debug!("Stream ended normally (no more data)"); + } + break; +} +``` + +检测信号分层: + +| 信号 | 强度 | 处理 | +|---|---|---| +| provider finish reason 是 `length` / `max_tokens` | 强 | `ProviderMaxTokens` | +| stream end 时 pending tool call JSON 不完整 | 强 | `ToolArgumentsIncomplete` | +| stream error 且已有有效输出 | 中 | `StreamInterrupted` | +| watchdog timeout 且已有有效输出 | 中 | `WatchdogTimeout` | +| 无 finish reason 但无 pending tool call | 弱 | 记录 debug,不单独判定截断 | + +不建议仅因“无 finish_reason”判定截断,因为 provider 行为不一致。 + 
+--- + +### 4. 修正 `round_executor` 重试策略 + +当前 `round_executor` 对 partial recovery 的重试过宽:任何 `partial_recovery_reason.is_some()` 都可能重试。 + +目标策略: + +| Recovery kind | 是否自动重试同请求 | 原因 | +|---|---|---| +| `StreamInterrupted` 且无有效输出 | 可以 | 可能是瞬态网络问题 | +| `WatchdogTimeout` 且无有效输出 | 可以 | 可能是瞬态卡顿 | +| `StreamInterrupted` 且已有有效输出 | 谨慎,不默认重试 | 重试可能造成重复输出或重复工具调用 | +| `ToolArgumentsIncomplete` | 不重试 | 同请求大概率再次截断,且工具调用不安全 | +| `ProviderMaxTokens` | 不重试同请求 | 需要缩短上下文、增加输出预算或拆分任务 | +| `ToolOutputBudgetExceeded` | 不重试同请求 | 应落盘/摘要,而不是重复执行 | +| `RateLimited` 且无有效输出 | 交给调度器/限流器 backoff 后重试 | 顶层 round 不做即时同请求重试,但 subagent scheduler 可在 retry budget 内排队重试 | +| `RateLimited` 且已有有效输出 | 不默认重试 | 保留 partial output,避免重复文本或重复工具调用 | +| `UnknownPartial` | 不默认重试 | 保守处理,提示用户 | + +`ContextMutationKind`(microcompact / compression / emergency truncation)不进入上述 retry 表。它属于请求前上下文治理事件,不是模型输出流恢复原因;处理方式应是记录预算变化、展示诊断提示,并在仍然超预算时拆分任务或要求用户缩小范围。 + +伪代码: + +```rust +if is_partial_recovery && attempt_index < max_attempts - 1 { + match result.partial_recovery_kind { + Some(PartialRecoveryKind::StreamInterrupted) | Some(PartialRecoveryKind::WatchdogTimeout) + if !result.has_effective_output => retry_with_backoff(), + Some(PartialRecoveryKind::RateLimited) if !result.has_effective_output && is_scheduler_owned => { + retry_via_scheduler_backoff() + } + Some(PartialRecoveryKind::ToolArgumentsIncomplete) + | Some(PartialRecoveryKind::ProviderMaxTokens) + | Some(PartialRecoveryKind::ToolOutputBudgetExceeded) + | Some(PartialRecoveryKind::RateLimited) => finish_with_partial_recovery(), + _ => finish_with_partial_recovery(), + } +} +``` + +同时保留现有 `is_transient_network_error()`,但它只用于网络错误类,不用于 token 截断类。 + +--- + +### 5. 
事件传播与前端展示 + +后端 `DialogTurnCompleted` 事件增加结构化字段: + +```rust +partial_recovery_kind: Option<PartialRecoveryKind>, +partial_recovery_reason: Option<String>, +``` + +前端对应类型补齐: + +```ts +export type PartialRecoveryKind = + | 'stream_interrupted' + | 'watchdog_timeout' + | 'output_truncated' + | 'tool_arguments_incomplete' + | 'provider_max_tokens' + | 'tool_output_budget_exceeded' + | 'rate_limited' + | 'unknown_partial'; + +export type ContextMutationKind = + | 'microcompact' + | 'model_compression' + | 'structured_fallback_compression' + | 'emergency_truncation'; + +export interface DialogTurnCompletedEvent { + session_id: string; + turn_id: string; + turn_index: number; + files_changed: number; + lines_added: number; + lines_removed: number; + timestamp: number; + partial_recovery_kind?: PartialRecoveryKind; + partial_recovery_reason?: string; +} +``` + +`aiErrorPresenter.ts` 增加分类: + +```ts +type AiErrorCategory = ... | 'output_truncated'; +``` + +UI 提示按 kind 区分: + +| Kind | 用户提示 | +|---|---| +| `tool_arguments_incomplete` | 模型在生成工具调用参数时被截断,该工具未执行。建议拆分任务或减少上下文后重试。 | +| `provider_max_tokens` | 模型输出达到上限。建议缩短请求、压缩上下文或拆分任务。 | +| `output_truncated` | 模型输出可能不完整。建议缩短请求或新建会话继续。 | +| `stream_interrupted` | 模型响应流中断,已保留部分输出。可根据内容决定是否重试。 | +| `watchdog_timeout` | 模型响应长时间无新内容,已停止等待并保留部分输出。 | +| `rate_limited` | 模型输出因限流被中断,请稍后重试。 | +| `consecutive_truncation` | 连续多次截断,建议拆分任务或减少上下文。 | + +展示位置: + +- active session:在消息流末尾显示轻量提示条; +- inactive session:保留现有未读/中断状态,同时进入会话后显示具体提示; +- Deep Review 父任务:若子 reviewer 截断,父任务摘要中列出被截断的 reviewer 角色和文件组。 + +--- + +### 6. 
Deep Review 预算治理 + +`deep-review-design.md` 已实现后,Deep Review 已具备 Strategy Engine:risk classification、predictive timeout、dynamic concurrency policy、partial result capture、retry budget、Architecture / Frontend reviewer、Judge overlap handling 和 strategy directive / model plumbing。本文不再规划这些 Deep Review 专项能力,而是把它们接入统一预算与 runtime 调度。 + +剩余缺口是:Deep Review 的已有策略主要控制 reviewer 角色、scope、timeout、retry 上限和策略强度;它仍需要与统一 token/byte/diff line 预算、session artifact、gateway permit 和 scheduler state 对齐。 + +新增预算输入: + +- file count; +- total bytes; +- diff line count; +- estimated input tokens; +- per-file max bytes; +- model context window; +- reserved output tokens; +- reviewer prompt 固定成本; +- tool schema 成本。 + +目标拆分逻辑: + +```text +if estimated_input_tokens + reserved_output_tokens > safe_context_budget: + split by token/bytes/diff lines first +else if file_count > reviewer_file_split_threshold: + split by file count +else: + keep current grouping +``` + +设计约束: + +- Deep Review 已有 risk classification / strategy level / file splitting 结果保留,作为预算拆分的输入; +- 文件数阈值保留,作为简单场景的快速路径; +- token/byte/diff line 是更高优先级,用于补足 Strategy Engine 对单文件巨型 diff 和共享上下文重复注入的低估; +- 单个超大文件应单独分组,必要时提示用户缩小范围; +- 共享上下文必须计入每个 reviewer group 的预算,因为每个 reviewer 都会实际看到这些内容; +- 共享上下文本身超预算时,不重复注入全文,改为接口签名 / 类型声明 / import 图摘要; +- Deep Review policy 决定需要哪些 reviewer、每个 reviewer 的输入范围和角色约束;runtime scheduler 决定真实执行顺序、排队、重试和 gateway 压力控制; +- Deep Review `max_parallel_instances` / retry budget / predictive timeout 只作为 policy 上限和运行预算输入,不得直接绕过 runtime scheduler; +- prompt / manifest 不再表达“无条件并行启动所有 reviewer”,而应表达“调度所有 required reviewers,runtime 会按 gateway 容量有界执行”; +- 已有 `partial_timeout` / timeout / retry 状态必须映射到统一 scheduler event:有可用 evidence 时为 `completed_with_partial`,无可用 evidence 时为 `timed_out`,等待重试时为 `retry_waiting`; +- judge 聚合时也需要限制 reviewer 返回大小; +- ReviewJudge 输入必须包含每个 reviewer 的 `queued` / `retried` / `timed_out` / `failed` / `completed_with_partial` 状态和 partial evidence。 + +#### 6.1 Review Evidence Pack:source-agnostic 一次取证,多 
reviewer 复用 + +`/DeepReview` 的输入来源不能被固化为本地 Git range。用户可能输入最近 X 个提交、branch/range/pathspec、PR URL、远端 compare 链接、上传的 patch artifact、当前 working tree、选中文件列表或混合上下文。方案的目标不是“优化 Git diff”,而是将任意 review source 先解析为可复用的标准化证据包。 + +当用户要求 Deep Review 大规模变更时,不能让每个 reviewer 在 subagent 中各自重新拉取、查询或重建同一份完整变更证据。重复取证会把 reviewer 的 `run_timeout_seconds` 消耗在 I/O、网络/API、命令输出和解析上,而不是消耗在分析上;在大仓库、远端 PR、Windows 环境或自托管代码平台下,这会直接放大 timeout 和上下文超限概率。 + +新增 source-agnostic `ReviewEvidencePack`,由 Deep Review 父任务在 launch preflight 阶段通过 `ReviewEvidenceSourceResolver` 和对应 `ReviewEvidenceProvider` 一次性生成,并落盘为 session artifact。subagent 默认只消费 pack 的 slice、source metadata 和必要源码读取权限。 + +第一批 source provider: + +| source kind | 输入示例 | 取证方式 | 备注 | +|---|---|---|---| +| `local_git_range` | 最近 X 个提交、本地 branch/range/pathspec | 本地 Git provider 生成 normalized change set | Git 是 provider,不是 pack 抽象本身 | +| `pull_request_url` | GitHub / GitCode / GitLab 等 PR URL | 平台 provider 拉取 PR files、patch、commit metadata、review base/head | 优先用 API / connector;必要时 fallback 到远端 patch | +| `working_tree` | 审核当前未提交改动 | workspace provider 读取 working tree diff 和文件快照 | stale 判断依赖 working tree state hash | +| `patch_artifact` | 用户提供 patch / diff 文件 | patch parser provider 解析变更和文件索引 | 适合离线或无仓库场景 | +| `explicit_files` | 用户选择文件或目录做深度审核 | filesystem provider 生成文件快照和范围索引 | 没有 diff 时应标记为 snapshot review | + +建议 pack 内容: + +```text +pack_id +source_kind +source_locator +source_provider +source_revision +source_fingerprint +generated_at +collector_events +pack_hash +file_index +change_index +rename_map +diff_stat +per_file_change_artifacts +per_file_size_and_diff_lines +cross_file_context_summary +staleness_policy +``` + +执行方式: + +1. 父任务解析 `/DeepReview` 输入,识别 source kind、source locator 和权限需求; +2. 对应 source provider 一次性收集证据,生成 normalized file index、change index、diff/stat、rename map、per-file change artifact 和摘要; +3. Deep Review Strategy Engine 继续决定 reviewer role / scope / strategy level; +4. 
Work Packet projection 为每个 reviewer 写入 `review_evidence_pack_id`、`evidence_slice`、`source_kind`、`allowed_source_paths`、`allowed_source_queries` 和 `forbidden_full_evidence_reconstruction`; +5. reviewer 默认从 artifact slice 读取证据,只在需要确认局部上下文时读取源码、小范围 hunk、单文件 patch、PR 文件详情或特定 symbol; +6. 若 pack stale、缺失或 scope 不足,reviewer 请求父任务刷新或扩展 pack,而不是自行重新拉取或重建完整 source evidence。 + +允许的 fallback: + +- pack 生成失败时,Deep Review 可以退回当前行为,但必须记录 `evidence_pack_failed` 和失败 provider,并将 reviewer timeout 调整为包含取证成本; +- 小规模 source 可以内联 pack summary,不必落盘每个 per-file artifact; +- reviewer 可执行局部只读查询,例如读取某个文件当前内容、查询特定 symbol、请求某个 PR 文件详情、或验证一个小范围 hunk,但不能默认重建完整 source; +- runtime 可加只读 source-result cache 作为保护网,键为 `source_kind + source_locator + source_fingerprint + query args`,但 cache 不是主要设计,主要设计仍是显式 pack; +- 对需要网络或平台 API 的 source provider,失败分类必须区分 auth / quota / not found / network / provider unavailable,避免把 PR URL 失败误判为 reviewer 分析失败。 + +可观测字段: + +- `review_evidence_pack_id` +- `source_kind` +- `source_provider` +- `source_locator_hash` +- `source_fingerprint` +- `pack_generation_ms` +- `source_collection_count` +- `pack_bytes` +- `pack_hash` +- `file_count` +- `diff_line_count` +- `artifact_slice_count` +- `cache_hit` +- `reviewer_full_evidence_reconstruction_count` +- `pack_stale` + +门禁: + +- 本地提交范围、PR URL、working tree、patch artifact fixture 都必须走同一 pack contract; +- 同一 source fingerprint 下的完整证据收集只能由父任务 preflight 执行一次; +- 4-5 个 reviewer 运行时,subagent 不得各自重复拉取或重建全部文件变更; +- reviewer timeout 不应包含父任务 pack generation 时间;pack generation 若长耗时,应以“正在准备审核证据”展示; +- pack stale 时必须阻止继续使用旧证据,或记录 stale 诊断并请求用户确认; +- pack artifact 必须复用 session artifact 权限、敏感内容检测、hash 校验和清理策略。 + +--- + +### 7. 
SubagentScheduler 与 Gateway Request Limiter + +subagent 调度不能只依赖静态并发 cap,也不能由 Deep Review 的 Strategy Engine 单独承担。Deep Review 已有 dynamic concurrency policy,可以决定 reviewer policy 上限;但本地或自托管 vLLM gateway 可能只允许 1-4 个并发 streaming request,真实容量必须由 runtime 在 AI request 边界观察和控制。因此需要将 subagent 派发升级为 core runtime 拥有的、可观测的、自适应队列。 + +目标路径: + +```text +TaskTool + -> ConversationCoordinator + -> SubagentScheduler + -> Hidden subagent session + -> ExecutionEngine + -> AI request limiter + -> Provider adapter / model gateway +``` + +边界: + +- `SubagentScheduler` 控制 hidden session 的生命周期、公平排队、重试状态和事件; +- AI request limiter 控制实际 provider / gateway / model group 的 streaming request permit; +- Deep Review policy 控制 reviewer 角色、文件范围、超时预算和最多同角色实例数; +- Deep Review Strategy Engine 的 `max_parallel_instances`、predictive timeout、retry budget 是 scheduler 输入,不是最终执行器; +- prompt 可以请求并行,runtime 决定安全可执行的真实并发。 + +调度状态: + +```text +accepted +queued +waiting_for_capacity +running +retry_waiting +completed +completed_with_partial +failed +cancelled +timed_out +``` + +关键转移: + +- `accepted -> queued`:Task call 合法并被 policy 接受; +- `queued -> waiting_for_capacity`:任务可运行,但 subagent slot 或 gateway permit 不足; +- `waiting_for_capacity -> running`:scheduler 授予 subagent slot,AI request limiter 授予 gateway permit; +- `running -> completed`:subagent 正常完成; +- `running -> completed_with_partial`:已有有效输出但未完整完成,应保留 evidence; +- `running -> retry_waiting`:无有效输出前遇到 transient gateway overload; +- `retry_waiting -> queued`:backoff 到期后重新排队; +- `queued/running -> cancelled`:父 session 或用户取消; +- `queued/running -> failed`:不可重试错误或 retry budget 耗尽。 + +动态并发: + +```text +configured_max = ai.subagent_max_concurrency 或 review-team policy 上限 +effective_max = runtime-adjusted value, clamped to [1, configured_max] +``` + +控制策略: + +1. 初始 `effective_max = configured_max`; +2. 遇到 gateway concurrency / rate-limit overload 时快速降低; +3. 连续成功窗口后缓慢升高; +4. 永远不低于 1,保证最低 forward progress; +5. 
优先按 gateway key 维护计数,gateway key 可来自 model config 的 `concurrency_key`,否则用 `provider + base_url` 或 normalized request URL。 + +重试分类: + +- 可重试:HTTP 429、gateway concurrency saturation、queue full、server busy、capacity exceeded、无有效输出前 connection reset / stream closed; +- 不可重试:Deep Review policy violation、invalid subagent type、missing workspace、auth/quota/billing/model-not-found/invalid key、用户取消、tool permission denial; +- 条件重试:read-only subagent 有 partial output 时保留 partial evidence,通常不直接重跑全文;write-capable subagent 只有在执行历史证明没有 state-changing tool 后才允许重试。 + +timeout 语义: + +- `queue_timeout_seconds`:等待 scheduler capacity 的最长时间; +- `run_timeout_seconds`:subagent 开始运行后的最长 active execution 时间; +- `stream_idle_timeout_secs`:模型 stream chunk 之间的最大静默时间; +- `parent_deadline`:父任务端到端 deadline。 + +对 Deep Review,已实现的 predictive timeout / `effective_timeout_seconds` 应映射为 `run_timeout_seconds`,也就是 active review time;queue waiting 单独计量,避免本地低并发 gateway 让 reviewer 在尚未开始前就超时。 + +状态兼容: + +- Deep Review 已有 `partial_timeout` / `PartialTimeout` 状态时,若存在有效 reviewer evidence,对外映射为 `completed_with_partial`; +- 若 timeout 前没有足够 evidence,对外映射为 `timed_out`; +- Deep Review retry budget 触发重试等待时,对外映射为 `retry_waiting`; +- 原始 Deep Review 状态可保留在 diagnostics 字段,不作为 UI / Judge 的主判定状态。 + +UI 与日志: + +- UI 区分 model thinking、queue waiting、retry backoff、running、completed_with_partial; +- Deep Review summary 可显示“runtime 使用有界队列继续审核,因为当前 model gateway 限制并发”; +- 日志使用英文,无 emoji,记录 permit acquire/release、effective concurrency changed、retry classified、retry budget exhausted、partial output preserved; +- 关键事件需要包含 `scheduler_state`、`queued_ms`、`run_ms`、`retry_count`、`gateway_key`、`configured_max`、`effective_max`、`retry_classification`。 + +配置形状保持三层拆分: + +```json +{ + "ai": { + "subagent_max_concurrency": 4, + "gateway_concurrency": { + "default": { + "max_concurrent_requests": 4, + "adaptive": true, + "min_concurrent_requests": 1 + } + }, + "review_teams": { + "default": { + "max_parallel_reviewers": 3, + 
"reviewer_queue_timeout_seconds": 300, + "reviewer_retry_budget": 1 + } + } + } +} +``` + +精确字段名可在实现阶段调整,但必须保留三层语义:global subagent execution capacity、gateway/model request capacity、Deep Review reviewer policy。 + +禁止重复实现: + +- `reviewTeamService.ts` 可以生成 reviewer manifest、strategy directive、model_id、scope 和 policy 上限,但不能另建与 `SubagentScheduler` 并行的 runtime queue; +- Deep Review orchestrator 可以声明 retry budget 和降级策略,但不能绕过 scheduler retry classifier 自主重复发起同一批 reviewer; +- provider rate-limit / gateway overload 只能在 AI request limiter / provider diagnostics 层归一化,不能由前端仅凭 cached rate-limit 状态决定最终执行并发; +- Judge 和 UI 必须消费统一 scheduler state,Deep Review 原始状态只作为诊断补充。 + +--- + +### 8. 统一输出预算与 spill-to-file(后续阶段) + +为了对齐行业实践,后续应建立统一输出预算策略,覆盖: + +- 模型输出; +- 工具输出; +- terminal stdout/stderr; +- read file 输出; +- subagent 返回; +- Deep Review reviewer 报告; +- judge 聚合输入。 + +建议策略: + +1. 每类输出有 `max_inline_bytes` / `max_inline_tokens`; +2. 超限内容落盘到 session 目录; +3. 上下文中只注入摘要、统计和文件引用; +4. UI 提供“打开完整内容”; +5. 日志记录 spill 文件路径、原始大小、摘要大小; +6. 对敏感内容保持本地存储,不上传额外远端。 + +通用 spill-to-file 不是 Phase 1 必须完成,但 **subagent / Deep Review reviewer 返回的落盘引用是 Phase 1 必须完成的专项能力**。原因是当前 hidden subagent 成功后会清理子 session,若只做硬截断而不保留完整输出引用,reviewer 证据会永久丢失。 + +--- + +### 9. 
大文件写入协议(Large File Write Protocol) + +竞品和社区问题显示,Claude、OpenCode 等工具在多轮长对话后,如果让模型通过普通 tool call JSON 一次性写入约 1k 行以上文件内容,容易出现输出截断、上下文超限或工具参数不完整。根因不是单纯上下文压缩不足,而是把“大内容传输”放进了“对话上下文 / 工具参数通道”。 + +因此,大文件写入应作为统一预算治理的上游预防机制,而不是截断后的恢复手段。目标是:**模型负责规划和校验,文件内容通过 session-backed writer 分块落盘,上下文只保留 manifest、摘要、hash 和引用**。 + +建议协议: + +```text +start_file_write(path, mode, expected_sections, intent) +append_file_chunk(write_id, section_id, sequence, content_chunk, chunk_hash) +finish_file_write(write_id, expected_hash, validation_plan) +abort_file_write(write_id, reason) +``` + +上下文中只注入轻量结果: + +```text +file: src/foo.ts +status: written +sections: imports, types, service, tests +bytes: 48231 +hash: abc123 +preview: first/last lines or structural summary +artifact_ref: .bitfun/sessions/{session_id}/artifacts/writes/{write_id} +``` + +设计约束: + +1. **优先 patch/hunk,不重写全文**:修改既有大文件时,默认生成 edit plan 和小范围 patch;只有新建大文件或确实需要全量重生成时进入大文件写入协议。 +2. **事务化写入**:所有 chunk 先写临时文件,`finish_file_write` 校验 hash、section manifest 和基础格式后再原子替换目标文件。 +3. **上下文不回灌全文**:tool result 不返回完整文件内容,只返回进度、摘要、hash、行数、artifact 引用和必要预览。 +4. **按需读取**:后续修改通过 range/symbol 读取相关片段,不自动把完整文件重新塞回对话。 +5. **可恢复**:写入中断时保留 write manifest,下一轮可继续 append、重新校验或 abort。 +6. **产品进度可见**:大文件写入表现为“正在创建文件 / 已写入 N 个 section / 正在校验 / 已完成”,而不是模型长时间无输出。 +7. **平台边界**:协议和事务逻辑属于 core / tool layer;桌面或前端只通过 adapter 展示进度和结果,不直接操作文件。 + +触发条件建议: + +- 单次 `write_file` / `replace_file` content 预计超过 `large_write_inline_threshold`; +- 模型计划新建或重写超过约 800-1000 行文件; +- 工具参数 JSON 中 `content` 字段预计会接近输出 token 预算; +- 多轮长对话后上下文已接近预算,且下一步是大文件生成。 + +若协议尚未实现,不应鼓励模型通过普通工具参数生成大文件全文;应提示模型先输出文件结构计划或拆成小范围 patch。 + +--- + +## 分阶段实施计划 + +### 落地原则与阶段门禁 + +本方案不能作为一次性大改上线。每个阶段必须满足以下可维护、可测试、可回滚要求后才能进入下一阶段: + +1. **先观测,后干预**:凡是可能改变用户路径的能力,先以 observe-only 模式记录结构化事件和指标,再开启真实拦截、压缩或落盘。 +2. **feature flag 分层**:按能力独立开关,而不是一个总开关。例如 `partial_recovery_kind_enabled`、`tool_argument_status_enforced`、`budget_precheck_observe_only`、`spill_to_file_enabled`、`large_file_write_protocol_enabled`。 +3. 
**事件契约稳定**:后端新增字段使用 optional / enum unknown fallback,前端必须兼容旧事件;系统逻辑不得依赖本地化文案。 +4. **测试夹具先行**:每个阶段先补可复现 fixture,再实现行为。fixture 覆盖 stream chunks、provider finish reason、半截 tool JSON、大输出、预算估算、大文件 chunk。 +5. **观测字段固定**:所有阶段至少记录 `session_id`、`turn_id`、`round_id`、`model_id`、`recovery_kind` / `budget_state` / `tool_argument_status`、估算 token、实际大小、动作耗时和是否用户可见。 +6. **默认可降级**:任何阶段发现误判、过度提示、频繁压缩或写入异常时,应能降级为记录诊断 + 保留旧行为,而不是影响主路径。 +7. **用户感知指标必须不倒退**:新增动作上线后,普通对话的可感知中止数、不透明等待次数、不必要确认次数不得增加。 + +### 可落地阶段总览 + +| 阶段 | 目标 | 默认上线形态 | 可维护性要求 | 可测试性要求 | 进入下一阶段门槛 | +|---|---|---|---|---|---| +| Phase 0:基线与夹具 | 不改行为,先固化当前失败模式 | observe-only / test-only | 建立统一 fixture 目录和事件快照格式;文档化 `ai.subagent_max_concurrency` | 可复现半截 JSON、max_tokens 文本截断、stream error、subagent 大返回、gateway burst failure、1k+ 行写文件 | fixture 能稳定失败于旧逻辑;Review Team manifest 不再承诺无条件并行 | +| Phase 1A:结构化截断与调度信号 | 引入 `PartialRecoveryKind`、`ProviderFinishReason`、`has_effective_output`、scheduler state event | 默认仅记录 kind/state,不改变执行策略 | enum unknown fallback;reason 只用于诊断;scheduler state 事件 optional | adapter finish reason 映射、stream end/error、scheduler state transition 单测 | 无误判正常 stop;Unknown 不触发截断;queued/running/retry/completed 事件可序列化 | +| Phase 1B:工具参数安全 | 引入 `ToolArgumentStatus` 和截断批次标记 | 默认拒绝 `Incomplete` / `RepairedUntrusted`,read-only 白名单可继续 | tool mutability metadata 有默认值和审计入口 | 半截 JSON、保守修复、read-only/mutating 分级测试 | 不可信参数不会进入 tool pipeline;只读工具不关联拦截 | +| Phase 1C:用户呈现与恢复/调度事件 | 前端消费 recovery/context mutation/scheduler 事件 | 轻提示 + 诊断入口;queue/retry 状态可区分 | i18n key 稳定,UI 不依赖后端英文文案 | event serialization、locale、消息流快照、waiting/retrying 展示测试 | 部分成功不显示 fatal;无有效输出才进入错误态;用户能区分排队与模型思考 | +| Phase 1D:subagent 专项落盘 | 解决 hidden subagent 输出丢失和父 session overflow | subagent 超限时先落盘再注入摘要 | artifact 生命周期和 session 删除绑定 | 超大 subagent result、artifact 清理、父事件上浮测试 | 不存在“只硬截断不落盘”的路径 | +| Phase 2A:预算观测与 gateway limiter | 建立预算估算、模型 profile、gateway-keyed request permits | observe-only + static permit cap,不做自适应降级 | budget module 独立;gateway key 可配置;stream 
结束/错误/取消时释放 permit | tokenizer/启发式估算、actual token 校准、permit acquire/release 测试 | 估算误差可观测;低置信不参与阻断;permit 不泄漏 | +| Phase 2B:本地恢复与 adaptive concurrency | Soft/HardOverBudget 分层、压缩节流、effective concurrency 调整 | Soft 后台恢复;Hard 恢复后仍超限才引导用户;overload 时降低 effective max | 冷却窗口、收益阈值、耗时记录、adaptive policy 独立可调 | 压缩频率、收益阈值、长耗时进度、gateway overload 分类测试 | 不增加普通对话低置信中止;effective max 不低于 1 且不过度振荡 | +| Phase 2C:Deep Review Strategy Engine 接入 runtime control plane | 将已实现 Strategy Engine 的 timeout/concurrency/retry/partial 状态接入 token/byte/diff line 预算、source-agnostic Review Evidence Pack 和 runtime scheduler | 只影响 Deep Review 调度、状态映射、取证复用和 reviewer 状态汇总 | Deep Review policy 只给上限;runtime scheduler 是唯一执行调度器;完整 source evidence 由父任务 preflight 通过 provider 一次生成 | 大 diff / PR URL / patch fixture、共享上下文计预算、Review Evidence Pack 去重、gateway concurrency=2 排队完成测试、状态映射快照 | 大文件/大 diff 不再只按文件数拆分;4-5 reviewers 不因 burst launch 失败;subagent 不重复重建完整 evidence;无双调度/双重试 | +| Phase 3:统一输出预算 | 工具输出、terminal、read_file、subagent 泛化 spill | 超限才 spill,普通输出直显 | output budget policy 集中配置,summary formatter 可扩展 | spill 权限、LRU、敏感检测、摘要 fixture | 大输出不回灌全文,敏感内容不进摘要 | +| Phase 4:大文件写入协议 | 避免 1k+ 行文件走普通 tool JSON | 大文件/高风险上下文触发,小文件仍直接写 | writer 状态机、manifest、事务提交独立模块 | chunk sequence/hash、abort/continue、原子提交、range 后续修改测试 | 无事务/无校验路径不能写目标文件 | + +### 分阶段维护与测试资产 + +| 资产 | 所属阶段 | 维护方式 | 测试方式 | +|---|---|---|---| +| Stream fixture | Phase 0-1 | 保存 provider 原始 chunk、finish reason、error event 的最小样本 | Rust 单测 + adapter 映射测试 | +| Tool argument fixture | Phase 0-1B | 覆盖完整 JSON、半截 JSON、尾部多余右花括号、语义不可信补齐 | `tool_call_accumulator` 单测 + pipeline 拒绝测试 | +| Event snapshot | Phase 1A-1C | 对 `DialogTurnCompletedEvent` / context mutation event 做稳定快照 | 后端序列化测试 + 前端 event handler 测试 | +| Scheduler fixture | Phase 1A / Phase 2 | 覆盖 accepted、queued、waiting_for_capacity、running、retry_waiting、completed_with_partial、failed、cancelled、timed_out | state transition、gateway permit、retry classifier 单测 | +| Budget fixture | Phase 2 | 覆盖 CJK、代码、长日志、大 diff、不同 model 
profile | 估算误差测试 + observe-only 指标回放 | +| Deep Review compatibility fixture | Phase 2C | 覆盖已实现 Strategy Engine 的 `max_parallel_instances`、predictive timeout、partial timeout、retry budget 与 scheduler state 映射 | 无双调度、无双重试;`partial_timeout` 正确映射为 `completed_with_partial` 或 `timed_out` | +| Review Evidence Pack fixture | Phase 2C | 覆盖 local git range、PR URL、working tree、patch artifact、rename、pathspec、pack stale、4-5 reviewers 共享取证 | 父任务完整 source evidence 收集只执行一次;reviewer 消费 artifact slice;`reviewer_full_evidence_reconstruction_count = 0` | +| Artifact fixture | Phase 1D / Phase 3 | 覆盖 subagent result、terminal output、read_file output、敏感内容 | 落盘权限、LRU、REDACTED 摘要测试 | +| Large write fixture | Phase 4 | 覆盖 1k+ 行新文件、既有大文件小改、chunk 缺失/重复/乱序 | writer 状态机、hash 校验、原子提交、range 修改测试 | + +阶段推进时,先补资产,再接行为。没有对应 fixture 的行为不进入默认路径。 + +### 运行期可观测与回滚 + +| 能力 | 必备指标 / 日志字段 | 告警或回滚信号 | 回滚方式 | +|---|---|---|---| +| 结构化截断信号 | `recovery_kind`、provider raw finish reason、`has_effective_output`、是否用户可见 | 正常 stop 被标记为截断;Unknown finish reason 激增 | 关闭 enforcement,仅保留 reason 诊断 | +| 工具参数安全 | `tool_argument_status`、tool mutability、是否执行、拒绝原因 | read-only tool 被误拦截;mutating tool 拒绝率异常升高 | 降级为只拒绝 `Incomplete`,保留 mutability 诊断 | +| 前端恢复提示 | recovery kind、展示级别、用户是否继续、是否打开诊断 | 截断提示曝光明显超过实际 recovery 数;用户继续率下降 | 降级为诊断区展示,不打断消息流 | +| subagent scheduler | scheduler state、queued_ms、run_ms、retry_count、parent_session_id、cancelled_by_parent | queued/running 状态不闭合;父 session 取消后仍有 queued task | 关闭 scheduler enforcement,回退到静态 `SubagentConcurrencyLimiter` | +| gateway request limiter | gateway_key、configured_max、effective_max、permit acquired/released、overload classification | permit 泄漏;effective_max 振荡;fast cloud provider 被错误降速 | 关闭 adaptive concurrency,保留 configured static cap | +| subagent retry | retry_classification、retry_budget、effective_output_seen、state_changing_tool_seen | auth/quota/model-not-found 被重试;write-capable subagent 变更后被重试 | 关闭 retry,仅保留 queue 和 partial evidence | +| Review Evidence Pack | 
`review_evidence_pack_id`、`source_kind`、`source_provider`、`source_locator_hash`、`source_fingerprint`、`pack_generation_ms`、`source_collection_count`、`pack_hash`、`file_count`、`diff_line_count`、`artifact_slice_count`、`reviewer_full_evidence_reconstruction_count`、`pack_stale` | reviewer 重复重建完整 evidence;pack stale 仍被使用;pack 生成耗时长但无进度 | 禁用 pack enforcement,退回当前行为但记录重复取证诊断 | +| subagent artifact | result bytes、inline bytes、artifact path hash、清理状态 | artifact 创建失败;父 session 注入摘要失败 | 回退为中止 subagent 返回注入,保留子 session 不清理 | +| 预算预检 | estimated tokens、actual tokens、confidence、budget state、治理动作耗时 | SoftOverBudget 导致用户可见等待增加;压缩频率过高 | 切回 observe-only,不阻断不压缩 | +| context mutation | tokens before/after、summary source、耗时、有损标记 | 长耗时 mutation 无进度;收益低于阈值仍频繁触发 | 关闭自动 mutation,仅保留手动/诊断 | +| spill-to-file | original bytes、inline bytes、summary kind、sensitive flag、cleanup result | spill 失败、磁盘占用超限、敏感摘要泄漏 | 关闭泛化 spill,仅保留 Phase 1D subagent 专项 artifact | +| 大文件写入协议 | write_id、section、sequence、chunk hash、final hash、commit status | open write session 泄漏;hash mismatch;abort/continue 失败 | 禁用协议入口,回退为小文件直接写入 + 大文件 patch/hunk 提示 | + +所有回滚都必须保持“有效输出不丢失”:关闭新策略时,宁可回到旧行为或 observe-only,也不能删除 artifact、丢弃 partial result 或把未完成写入显示为成功。 + +### Phase 1:止血修复 + +目标:消除静默截断,避免无意义重试。 + +| 内容 | 主要文件 | 验证 | +|---|---|---| +| 增加 `PartialRecoveryKind` + `ProviderFinishReason` | core execution types / event types / ai-adapters stream types | Rust 单元测试 | +| `PendingToolCalls` 暴露完整性方法 + `ToolArgumentStatus` | `tool_call_accumulator.rs` | JSON 完整/不完整/保守修复测试 | +| `TimedStreamItem::End` 检测 pending tool call + 纯文本截断 | `stream_processor.rs` | 模拟 stream end + 半截 JSON / 纯文本截断 | +| `TimedStreamItem::Error` 分支截断 kind 判断 | `stream_processor.rs` | 模拟 stream error + watchdog/timeout/rate-limit | +| 修正 partial recovery 重试策略 + `has_effective_output` 定义 | `round_executor.rs` | 截断不重试、网络类仍可重试、重试耗尽行为 | +| 连续截断计数器 `TruncationGuard` | `round_executor.rs` 或 session 层 | 连续截断超阈值强制终止 | +| Task tool 返回大小检查 + subagent 输出落盘引用 + recovery kind 上浮 | 
`task_tool.rs` / coordinator / session storage | subagent 返回超限时父 session 收到摘要和本地完整输出引用 | +| ai-adapter finish reason 归一化 | 各 adapter stream handler | 各 provider finish reason 映射测试 | +| Context 预算事件建模(不作为 partial recovery) | `execution_engine.rs` / event 定义 | microcompact / compression / emergency truncation 事件不会被误判为模型输出截断 | +| subagent scheduler state event | coordinator / scheduler event 定义 | accepted / queued / running / retry / completed / failed / cancelled 序列化测试 | +| 事件透传 recovery kind | `execution_engine.rs` / event 定义 | 事件序列化测试 | +| 前端展示截断与调度提示 | `snapshot.ts`、`EventHandlerModule.ts`、`aiErrorPresenter.ts`、相关消息组件和 locales | 前端单测 + waiting / retrying / completed_with_partial 手动验证 | + +### Phase 2:预算预检与 Deep Review 拆分 + +目标:降低普通对话和大审核任务的截断率,同时避免因为低置信预算估算新增用户感知阻断。 + +| 内容 | 主要文件 | 验证 | +|---|---|---| +| 请求前预算预检分层(Near / Soft / Hard) | `execution_engine.rs` 或相邻 budget module | 低置信不阻断、高置信恢复后仍超限才阻断 | +| 本地上下文恢复优先 | context compression / microcompact 路径 | SoftOverBudget 后台恢复、HardOverBudget 恢复后重估 | +| 压缩冷却窗口和收益阈值 | context budget policy | 连续压缩不会频繁触发 | +| 长耗时 context mutation 进度事件 | event 定义 / frontend handler | 长耗时压缩有可见进度,短耗时不打断 | +| gateway-keyed AI request limiter | AI client boundary / provider adapter | permit acquire/release 覆盖 stream finish/error/cancel | +| provider overload retry classifier | provider diagnostics normalization | 429 / capacity / queue full 可重试,auth/quota/model-not-found 不重试 | +| adaptive effective concurrency | scheduler / gateway concurrency policy | overload 降低、成功窗口升高、最低保持 1、不振荡 | +| 估算文件组 token/byte/diff line | `deep_review_policy.rs` 或相邻预算模块 | 单元测试 | +| 分片策略加入预算维度 | Deep Review policy / Strategy Engine 输入 | 大 diff fixture、共享上下文计预算测试 | +| Review Evidence Pack preflight | Deep Review source resolver / provider / session artifact / Work Packet projection | 任意 source fingerprint 只完整收集一次;reviewer 只读取 slice artifact;pack stale 检测 | +| Deep Review Strategy Engine 接入 runtime scheduler | `reviewTeamService.ts` / `DeepReviewService.ts` / 
coordinator / scheduler | `max_parallel_instances` 只作为上限;无双调度;gateway concurrency=2 时 4-5 reviewers 排队完成 | +| reviewer queue/run/idle timeout 映射 | Deep Review policy / settings / scheduler | predictive timeout 映射为 `run_timeout_seconds`;queue waiting 不消耗 active review timeout | +| reviewer retry budget 接入 retry classifier | Deep Review policy / scheduler | 无双重试;auth/quota/model-not-found 不重试;gateway overload 可进入 `retry_waiting` | +| reviewer 返回大小限制 | subagent / review result 聚合路径 | 超大报告测试 | +| 子 reviewer 截断和调度状态上浮到父任务 | Deep Review service / UI | `partial_timeout` -> `completed_with_partial` / `timed_out` 映射;queued / retried / failed 展示测试 | + +### Phase 3:统一输出预算 + +目标:覆盖工具输出、terminal 输出、subagent 返回等更广泛场景。 + +| 内容 | 主要文件 | 验证 | +|---|---|---| +| 定义 output budget policy | core service / tool layer | 单元测试 | +| 大输出 spill-to-file | tool implementations / session storage | 文件落盘测试 | +| 上下文注入摘要而非全文 | tool result formatter / agent context builder | 集成测试 | +| UI 打开完整输出 | web-ui adapter + message components | 前端测试 | + +### Phase 4:大文件写入协议 + +目标:避免约 1k 行以上的大文件内容通过普通对话文本或单次 tool call JSON 传输,从源头降低长对话后的上下文超限和工具参数截断。 + +| 内容 | 主要文件 | 验证 | +|---|---|---| +| 定义 large file write policy 和触发阈值 | core tool policy / agent context builder | 阈值和模型预算单元测试 | +| 增加 session-backed writer manifest | tool layer / session storage | start / append / finish / abort 单元测试 | +| chunk 写入临时文件并原子提交 | filesystem service / tool implementation | hash mismatch、顺序错乱、abort 恢复测试 | +| tool result 只返回摘要、hash、artifact ref | tool result formatter / event types | 大文件写入不回灌全文的集成测试 | +| 大文件优先 patch/hunk 修改 | file edit tool / patch tool routing | 既有大文件小改动不触发全文重写 | +| 前端展示大文件写入进度 | web-ui adapter + message components | 创建、写入 section、校验、完成状态测试 | + +--- + +## 风险与缓解 + +### 原有风险(设计内已识别) + +| # | 风险 | 可能性 | 影响 | 解决办法 | 是否可根治 | 阶段 | +|---|---|---|---|---|---|---| +| R0-1 | 把正常 stream end 误判为截断 | 中 | 中 | 只把强信号判为截断;无 finish reason 单独不判截断 | 是,强信号策略可完全避免误判 | Phase 1 | +| R0-2 | provider finish reason 不统一 | 高 | 中 | 每个 ai-adapter 负责将 
provider 原始 finish reason 归一化为 `Option<ProviderFinishReason>` 枚举(`Stop` / `Length` / `ToolUse` / `ContentFilter` / `Unknown`)。无法识别的 finish reason 归为 `Unknown`,不触发截断检测 | 否,新 provider 或 provider 行为变更可能引入新的 `Unknown`,但归一化层确保不会误判 | Phase 1 | +| R0-3 | JSON 修复生成语义错误参数 | 中 | 高 | 引入 `ToolArgumentStatus`;只有原始完整 JSON 和白名单内不改变语义的保守修复可执行;语法补齐类修复不实现自动执行 | 是,但前提是执行路径强制检查 status;否则该修改无效 | Phase 1 | +| R0-4 | 字符串 reason 被误用为逻辑判断 | 高 | 中 | 引入 `PartialRecoveryKind`,系统逻辑只消费 kind;reason 仅用于日志和诊断 | 是,kind 枚举可完全替代字符串判断 | Phase 1 | +| R0-5 | 前端提示过多打扰用户 | 中 | 低 | 按严重度展示:普通 partial recovery 轻提示;短耗时后台治理只进诊断;长耗时治理显示进度;fatal error 仅用于无有效输出且不可恢复 | 是,分层展示可避免把恢复过程渲染成额外打扰 | Phase 1 | +| R0-6 | Deep Review 拆分过细导致成本上升 | 中 | 中 | 并发限制(`SubagentConcurrencyLimiter`)、最大同角色实例数、预算阈值共同控制 | 是,三项约束联合可限制成本上限 | Phase 2 | +| R0-7 | spill-to-file 暴露敏感内容路径 | 低 | 中 | 见 R12 完整方案(文件权限 + 敏感检测 + REDACTED + UI 警告) | 是,三层防护可解决 | Phase 3 | + +### 竞品对比发现的新风险与解决办法 + +#### R1:模型自主重试截断工具导致 doom loop(高风险) + +**来源**:Opencode #18108 — `finishReason: length` + `repairToolCall` 修复后模型重新提交,无限循环。 + +**场景**:round_executor 不重试,但下一轮模型看到截断工具的 `is_error` 结果后,自主决定"换个方式再试",再次触发同类工具调用,再次截断。 + +**解决办法**: + +在 `round_executor` 或 `session` 层增加 **连续截断计数器**: + +```rust +struct TruncationGuard { + consecutive_truncation_count: u32, + max_consecutive_truncations: u32, // 默认 3 +} +``` + +每次 round 结束时: + +- 若 `partial_recovery_kind` 是截断类(`ToolArgumentsIncomplete` / `ProviderMaxTokens`),计数器 +1; +- 若 round 正常完成,计数器归零; +- 若计数器 >= `max_consecutive_truncations`,强制终止当前 turn,向用户提示"连续多次截断,建议拆分任务或缩短上下文"。 + +此计数器是 **session 级别** 的,跨 turn 生效,防止模型在多个 turn 中反复尝试同一截断路径。 + +**阶段**:Phase 1(与 A5 同步实施)。 + +--- + +#### R2:subagent 截断导致父 session context overflow(高风险) + +**来源**:Claude Code #23463 — subagent results silently overflow context, causing unrecoverable session crash。 + +**场景**:Task tool 启动的 subagent 截断后返回了部分结果,部分结果仍然很大,注入父 session context 后导致父 session 也 overflow。 + +**解决办法**: + +在 Task tool 返回结果给父 session 之前,增加 **返回大小检查 + 本地落盘引用**: + +```rust +const MAX_SUBAGENT_RETURN_BYTES: 
usize = 32 * 1024; // 32 KiB + +if result_bytes > MAX_SUBAGENT_RETURN_BYTES { + // 完整结果先落盘到父 session 可访问的 artifact/spill 目录 + let artifact = persist_subagent_output(parent_session_id, subagent_session_id, result); + // 父 session 只注入摘要、统计和本地 artifact 引用 + result = summarize_with_artifact_ref(artifact, MAX_SUBAGENT_RETURN_BYTES); + result.truncation_kind = Some(TruncationKind::SubagentOutputBudgetExceeded); +} +``` + +Phase 1 不允许只做硬截断。原因是当前 hidden subagent 成功后会删除子 session;如果不先落盘,完整 reviewer 输出会永久丢失。通用 spill-to-file 可留到 Phase 3,但 subagent / Deep Review reviewer 返回必须在 Phase 1 先具备专项落盘能力。 + +同时在 subagent 的 turn completed 事件中,将 `partial_recovery_kind` 上浮到父 session 的事件流: + +```rust +// 在 task_tool 的结果聚合逻辑中 +if subagent_result.partial_recovery_kind.is_some() { + parent_event.subagent_truncation = Some(SubagentTruncationInfo { + subagent_session_id, + role: subagent_role, + recovery_kind: subagent_result.partial_recovery_kind, + }); +} +``` + +**阶段**:Phase 1 做大小检查、父 session artifact 落盘、摘要注入和 recovery kind 上浮;Phase 3 再泛化为统一输出预算。 + +--- + +#### R3:多 tool call 并行,部分截断(中风险) + +**来源**:Claude Code #19143 — streaming tool-use arguments truncated by premature stop。 + +**场景**:模型同时输出 3 个 tool call,第 3 个被截断,前 2 个已完整。若直接执行前 2 个完整 tool call,可能造成半套副作用;若全部标记失败,又会丢失已经完整的只读观察结果。 + +**解决办法**: + +在 `force_finish_pending_tool_calls()` 中区分完整性,但 **不要默认执行截断批次中的完整 mutating tool call**: + +```rust +fn force_finish_pending_tool_calls(&mut self) { + for tc in &mut self.pending_tool_calls { + if tc.is_json_complete() { + // JSON 完整,但本批次已发生截断,只标记为 CompleteWithinTruncatedBatch + tc.finalize_complete_within_truncated_batch(); + } else { + // JSON 不完整,标记为 Incomplete,不执行 + tc.finalize_as_incomplete(); + } + } +} +``` + +执行策略: + +1. 截断批次内的 `Incomplete` / `RepairedUntrusted` 一律不执行; +2. 截断批次内的完整 read-only tool 不因同批其他工具截断而关联拦截,可按白名单继续执行; +3. 
mutating tool 不再简单“一刀切”拦截,而是按工具元数据做最小拦截: + - `read_only`:继续执行; + - `idempotent_local_mutation`:参数完整、无依赖未完成工具、可记录 operation id 防重复时允许执行; + - `destructive_or_external_side_effect`:删除、提交、推送、外部网络写入、任意 shell 写操作等默认延后; + - `unknown_mutability`:按 mutating 高风险处理,不自动执行。 +4. 被延后的 mutating tool 不应直接把用户卡在错误态;系统应把完整参数、截断批次信息和延后原因返回给模型,优先让下一轮自动重新规划或补发缺失动作; +5. 只有在无法自动恢复,且继续执行可能产生不可逆副作用时,才向用户显示需要确认或缩小任务的提示。 + +**阶段**:Phase 1。 + +--- + +#### R4:stream error 携带截断信号(中风险) + +**来源**:Opencode #12233 — `StreamIdleTimeoutError` 导致 infinite retry loop。 + +**场景**:某些 provider 在输出截断时不发送正常 End,而是发送 error event(如 `StreamIdleTimeoutError`、`ContentFilterError`)。当前设计只在 `TimedStreamItem::End` 分支检测,会遗漏这类截断。 + +**解决办法**: + +在 `TimedStreamItem::Error` 分支也增加截断检测: + +```rust +TimedStreamItem::Error(err) => { + // 现有逻辑:标记 partial_recovery_reason + // 新增:如果已有有效输出且 error 类型可恢复,设置对应的 recovery kind + if ctx.has_effective_output() { + if is_stream_idle_timeout(&err) { + ctx.partial_recovery_kind = Some(PartialRecoveryKind::WatchdogTimeout); + } else if is_stream_interrupted(&err) { + ctx.partial_recovery_kind = Some(PartialRecoveryKind::StreamInterrupted); + } else if is_rate_limit_error(&err) { + ctx.partial_recovery_kind = Some(PartialRecoveryKind::RateLimited); + } + } + // ... 
现有 error 处理逻辑 +} +``` + +同时在 `PartialRecoveryKind` 枚举中补充 `RateLimited` variant。 + +**阶段**:Phase 1。 + +--- + +#### R5:纯文本回答被截断,无 tool call(中风险) + +**来源**:所有竞品均未良好处理。 + +**场景**:模型输出纯文本回答(无 tool call),被 max_tokens 截断。当前设计只检测 pending tool call,不检测纯文本截断。 + +**解决办法**: + +在 `TimedStreamItem::End` 分支增加纯文本截断检测: + +```rust +// 在 ToolArgumentsIncomplete 和 ProviderMaxTokens 检测之后 +else if ctx.provider_metadata_indicates_max_tokens() && ctx.has_text_output() { + ctx.partial_recovery_kind = Some(PartialRecoveryKind::OutputTruncated); + ctx.partial_recovery_reason = Some( + "Model text output was truncated by provider max tokens".to_string() + ); +} +``` + +注意:`ProviderMaxTokens` 和 `OutputTruncated` 的区分——前者是 provider 明确报告的,后者是推断的。优先使用 provider 信号。 + +对纯文本截断,用户提示为:"模型输出可能不完整。建议缩短请求或新建会话继续。" + +**阶段**:Phase 1。 + +--- + +#### R6:`has_effective_output` 定义模糊(中风险) + +**来源**:A5 重试策略依赖此判断,但文档未精确定义。 + +**场景**:网络中断时,stream 可能已收到部分 text chunk 但没有完整 tool call。此时 `has_effective_output` 的判断直接影响是否重试。 + +**解决办法**: + +明确定义: + +```rust +fn has_effective_output(&self) -> bool { + // 有非空文本输出 + let has_text = self.assistant_text.as_ref().map_or(false, |t| !t.is_empty()); + // 有至少一个非 error 的 finalized tool call + let has_valid_tool = self.tool_calls.iter().any(|tc| !tc.is_error); + has_text || has_valid_tool +} +``` + +关键语义: + +- 空文本 + 空 tool call = 无有效输出 → 可安全重试(网络类); +- 有文本 + 空 tool call = 有有效输出 → 谨慎重试(重试可能产生重复文本); +- 有 error tool call = 有有效输出 → 不重试(截断类)。 + +**阶段**:Phase 1(与 A5 同步实施)。 + +--- + +#### R7:前后端版本兼容(低风险) + +**来源**:A6 新增事件字段,旧版前端/后端不识别。 + +**场景**:滚动发布时,新版后端发送含 `partial_recovery_kind` 的事件,旧版前端忽略该字段;旧版后端发送不含该字段的事件,新版前端需要处理 `undefined`。 + +**解决办法**: + +- 后端:`partial_recovery_kind` 和 `partial_recovery_reason` 均为 `Option`,序列化时 `None` 不输出字段(serde 默认行为),旧版前端不受影响; +- 前端:所有新字段声明为 optional(`partial_recovery_kind?: ...`),`undefined` 时按"无截断"处理; +- 不需要版本协商,JSON 容忍性已足够。 + +**阶段**:Phase 1(设计约束,无需额外代码)。 + +--- + +#### R8:JSON 完整性检测性能与误判(中风险) + +**来源**:A2 每次调用 `has_incomplete_json_payload()` 做 full 
parse;原方案试图仅用 open brace/bracket count 避免 parse。 + +**场景**:`raw_arguments` 可能很大(几万字符),stream end 时做 full parse 有延迟。但如果只用简单括号计数,又可能被字符串内 `{}`、转义引号、snapshot 覆盖等情况误导,把不完整 JSON 判成完整,从而触发工具执行。 + +**解决办法**: + +Phase 1 不采用“只读 open brace/bracket count、不 re-parse”的方案。该方案会把性能问题转化为执行安全问题,标记为无效。 + +Phase 1 的有效方案: + +1. `has_incomplete_json_payload()` 在 stream end / finish boundary 做一次 `serde_json::from_str`,以 parse 结果作为最终完整性判断; +2. 可增加轻量 boundary hint(brace/bracket、字符串状态)作为快速判定“明显不完整”的早退路径; +3. 任何 hint 判定为完整后,仍必须执行 `serde_json::from_str` 验证; +4. 如需优化为增量检测,必须实现 JSON lexical state machine(跟踪字符串、转义、对象/数组栈和 snapshot reset),并保留 parse 作为最终校验; +5. 检测摘要只记录长度、边界状态、错误类别,不记录完整参数。 + +```rust +struct PendingToolCall { + // ... 现有字段 + json_boundary_hint: JsonBoundaryHint, +} +``` + +**阶段**:Phase 1。 + +--- + +#### R9:token 估算准确性不足(低风险) + +**来源**:A8 预算治理依赖 token 估算,但 char/4 粗估误差可能 2-3x。 + +**场景**:中文/日文等 CJK 字符 1 char ≈ 2-3 tokens,char/4 严重低估;代码中大量缩进/空行可能高估。 + +**解决办法**: + +采用 **分层估算策略**: + +1. **精确模式**:若模型有已知 tokenizer(如 tiktoken for GPT 系列),使用 tokenizer 计算; +2. **启发式模式**:否则使用 `chars * 0.6`(比 char/4 更保守,对 CJK 和代码都更安全); +3. **安全系数**:估算结果乘以 1.2(20% 安全余量),用于预算判断; +4. **置信度分层**:估算来源、模型 profile、新旧 actual token 偏差共同决定 confidence;低置信估算只能触发后台治理或诊断,不能阻断普通对话; +5. **日志记录**:每次估算后记录 estimated vs actual(如果后续有 actual token count),用于校准。 + +Phase 2 初期使用启发式模式 + 安全系数;后续可按模型引入精确 tokenizer。 + +**阶段**:Phase 2。 + +--- + +#### R10:Deep Review 拆分后 reviewer 之间有交叉依赖(中风险) + +**来源**:A8 预算治理按 token/byte 拆分,但文件间可能有语义依赖。 + +**场景**:文件 A 的 review 依赖文件 B 的类型定义,拆分后 reviewer A 看不到文件 B 的信息,review 质量下降。 + +**解决办法**: + +在拆分逻辑中保留 **共享上下文**: + +```rust +struct ReviewerGroup { + primary_files: Vec, + shared_context_files: Vec, // 类型定义、接口、公共模块 +} +``` + +拆分时: + +1. 识别"共享依赖文件"(被多个 primary file import 的文件); +2. 将共享依赖文件加入每个 reviewer group 的 `shared_context_files`; +3. 共享文件必须计入每个 reviewer group 的预算(因为每个 reviewer 都会看到); +4. 
在 reviewer prompt 中明确标注哪些是共享上下文、哪些是主要审核目标。 + +预算公式: + +```text +group_input_tokens = + primary_files_tokens + + shared_context_tokens + + reviewer_prompt_tokens + + tool_schema_tokens + + reserved_output_tokens +``` + +若 `group_input_tokens > safe_context_budget`,不得继续重复注入共享全文。应按优先级降级: + +1. 将共享文件替换为接口签名 / 类型声明 / import 图摘要; +2. 仍超预算时,进一步缩小 primary file group; +3. 仍超预算时,提示用户缩小审核范围,而不是发送高风险请求。 + +**阶段**:Phase 2。 + +--- + +#### R11:spill 文件生命周期与磁盘累积(低风险) + +**来源**:A10 长期运行 session 产生大量 spill 文件。 + +**场景**:用户长时间使用同一 session,多次触发 spill,磁盘占用持续增长。 + +**解决办法**: + +1. spill 文件保存在 `.bitfun/sessions/{session_id}/spill/` 目录; +2. session 删除时一并清理 spill 目录(复用现有 session 清理逻辑); +3. 单个 session 的 spill 目录大小上限:默认 100 MiB,超限后最旧文件被 LRU 淘汰; +4. 全局 spill 目录总大小上限:默认 1 GiB,超限后按 session 最后活跃时间淘汰; +5. 日志记录 spill 文件创建、大小和淘汰事件。 + +**阶段**:Phase 3。 + +--- + +#### R12:spill 文件包含敏感信息(中风险) + +**来源**:Codex #14206 讨论;terminal 输出可能含 API key、环境变量值。 + +**场景**:工具输出包含 `AWS_SECRET_ACCESS_KEY=xxx`,spill 到文件后,文件权限不当可能泄露。 + +**解决办法**: + +1. spill 文件创建时设置权限为仅当前用户可读(Unix: 0600,Windows: 仅当前用户 ACL); +2. 在 spill 前对输出做 **敏感模式检测**(正则匹配常见 secret pattern:API key、token、password、private key); +3. 检测到敏感内容时: + - 仍然 spill(保留完整内容供诊断); + - 在摘要中用 `[REDACTED]` 替换敏感值; + - 日志中标记 `spill_file_contains_sensitive_content = true`; + - UI 打开 spill 文件时显示警告"该输出可能包含敏感信息"。 +4. 不上传 spill 文件到远端(已有约束)。 + +**阶段**:Phase 3。 + +--- + +#### R13:摘要质量不足(低风险) + +**来源**:A10 spill-to-file 的摘要策略。 + +**场景**:"前 N 行 + 统计 + 文件引用"是最简摘要,但某些场景需要语义摘要(如"该命令输出了 500 行编译错误,主要涉及 3 个模块"),生成语义摘要需要额外 LLM 调用。 + +**是否可解决**:可解决,但需要分层。 + +**解决办法**: + +采用 **分层摘要策略**,不用 LLM 生成语义摘要: + +1. **结构化摘要**(Phase 3 默认):前 N 行 + 行数/字节数统计 + 文件引用。对于模型判断"是否需要查看完整内容"已足够。 +2. **启发式摘要**(Phase 3 同步实现):对已知格式做轻量结构化提取,不调用 LLM: + - 编译错误:提取 error/warning 行 + 涉及文件列表 + 错误数统计; + - test 输出:提取 pass/fail/skip 计数 + 失败用例名; + - git diff stat:复用 diff stat 输出(已有); + - 其他:退回结构化摘要。 +3. 
**LLM 语义摘要**(不实施):额外 LLM 调用增加延迟、成本和截断风险(摘要本身也可能被截断),且引入循环依赖(截断治理依赖 LLM 调用,LLM 调用又可能截断)。**不纳入本设计**。 + +**判断**:Phase 3 实现结构化摘要 + 启发式摘要,不实现 LLM 语义摘要。启发式摘要已覆盖最常见的工具输出类型,无需 LLM 调用,不引入额外截断风险。 + +--- + +#### R14:预算随模型切换动态变化(低风险) + +**来源**:A9 统一预算策略。 + +**场景**:用户在 session 中切换模型(Claude 200K → GPT-4o 128K → glm 128K),预算应随之调整。 + +**解决办法**: + +预算策略从 `model_profile` 读取,不硬编码: + +```rust +struct ModelBudgetProfile { + context_window_tokens: u32, + max_output_tokens: u32, + safe_input_ratio: f32, // 默认 0.7,即输入不超过 70% context window +} +``` + +每次模型切换时重新计算预算。`ModelBudgetProfile` 按 model id 从配置加载,未知模型使用保守默认值(128K context, 4K output, 0.6 safe ratio)。 + +**阶段**:Phase 2(Deep Review 预算治理时同步实施)。 + +--- + +#### R15:预算耗尽前无预警(低风险) + +**来源**:A9 统一预算策略。 + +**场景**:等到截断才提示用户,用户已经浪费了一轮 LLM 调用。 + +**解决办法**: + +在请求发送前做 **预算预检**,但预检的产品目标是减少用户感知中止,不是提前制造阻断。估算逻辑不能保证完全准确,因此必须区分置信度和动作级别: + +```rust +fn check_budget_before_request(&self, estimate: BudgetEstimate, model: &ModelId) -> BudgetCheckResult { + let profile = self.get_model_budget(model); + let thresholds = profile.budget_thresholds_for(estimate.confidence); + if estimate.input_tokens > thresholds.hard_over_budget { + BudgetCheckResult::HardOverBudget { estimate, profile } + } else if estimate.input_tokens > thresholds.soft_over_budget { + BudgetCheckResult::SoftOverBudget { estimate, profile } + } else if estimate.input_tokens > thresholds.near_budget { + BudgetCheckResult::NearBudget { estimate, profile } + } else { + BudgetCheckResult::WithinBudget + } +} +``` + +- `HardOverBudget`:先执行本地确定性治理动作(microcompact / compression / 输出摘要化 / spill-to-file / Deep Review 分片);治理后重新估算,仍高置信超限时才不发送同一个大请求,并返回可恢复状态; +- `SoftOverBudget`:不直接阻断。优先在后台执行轻量压缩或摘要化;若压缩耗时可控,用户无需感知;若可能长耗时,进入“正在整理上下文以继续”的进度状态; +- `NearBudget`:记录 warn 日志,不干预执行; +- `WithinBudget`:正常执行。 + +为避免频繁上下文压缩,增加节流与收益判断: + +1. 同一 session 在短时间窗口内只能触发一次重压缩;窗口内再次接近预算时优先复用已有摘要或 spill 引用; +2. 压缩前估算可回收 token,若预计收益低于阈值(例如 < 10% context window),不触发重压缩; +3. 压缩后记录 tokens before/after、耗时和摘要来源,用于后续校准; +4. 
如果连续压缩仍不能降低预算风险,则说明任务本身需要拆分,此时进入自动拆分或用户引导,而不是反复压缩。 + +准确性约束: + +- token 估算只能作为治理触发信号,不能作为低置信阻断依据; +- 阻断必须满足“高置信超限 + 本地恢复动作已尝试 + 仍无法安全发送”三个条件; +- `NearBudget` / `SoftOverBudget` / `HardOverBudget` 阈值必须来自 model profile、估算来源和历史误差校准;文档中的百分比只能作为示意,不得实现为全局固定魔数; +- 如果无法证明新预检能减少用户感知中止或异常数量,则该预检只能记录诊断,不进入默认执行路径。 + +注意:软提示只能作为辅助信息,不能作为 OverBudget 的主要解决措施。单纯在 prompt 中注入警告会进一步挤占上下文,而且弱模型可能无法自我拆分,因此该做法不能单独进入实现。 + +**阶段**:Phase 2。 + +--- + +#### R16:重试次数耗尽后截断类 partial recovery 的行为未定义(中风险) + +**来源**:A5 重试策略。 + +**场景**:`attempt_index == max_attempts - 1` 时,截断类 partial recovery 的处理未明确:是静默完成?还是标记为 error? + +**解决办法**: + +明确定义:重试次数耗尽时,截断类 partial recovery 的行为与"不重试"一致——**finish with partial recovery**: + +- round 正常结束(不是 error); +- `partial_recovery_kind` 和 `partial_recovery_reason` 被保留在 round result 中; +- 事件正常传播到前端; +- 前端显示截断提示。 + +不标记为 error 的原因:round 确实产生了部分有效输出(文本、已完成的 tool call),标记为 error 会丢失这些有效内容。 + +**阶段**:Phase 1。 + +--- + +#### R17:context compaction 截断历史导致 agent 丢失关键上下文(中风险) + +**来源**:Opencode #8089, #18037。 + +**场景**:auto-compaction 删除历史消息以腾出空间,但可能删除了 agent 当前任务依赖的关键上下文(如用户指令、文件结构、之前工具调用的结果)。 + +**是否可解决**:当前 BitFun 已有 context compression、microcompact 和 emergency truncation 这类输入侧上下文治理能力,因此该风险在当前产品中成立。它不应作为模型输出截断处理,但必须被纳入上下文预算事件和信任元数据治理。 + +**解决办法**: + +1. `ContextCompacted` 不放入 `PartialRecoveryKind`,避免把输入侧上下文变更误判为输出流恢复; +2. 增加独立事件 / 状态: + +```rust +pub enum ContextMutationKind { + Microcompact, + ModelCompression, + StructuredFallbackCompression, + EmergencyTruncation, +} +``` + +3. 每次上下文变更记录:`kind`、tokens before/after、被清理的 turn 范围、是否存在模型摘要、summary source、是否触发 emergency truncation; +4. emergency truncation 必须进入用户可见提示或诊断区域,因为它是有损删除,不应只写日志; +5. 用户指令、系统 prompt、当前任务描述、压缩边界摘要需要明确保护策略;如果某种上下文治理无法满足这些保护条件,则该治理模式不得发布或默认开启; +6. 
按耗时区分展示策略: + - 短耗时压缩:作为后台治理,仅在诊断区记录,不打断消息流; + - 长耗时压缩:进入可见进度状态,例如“正在整理上下文以继续”,避免用户看到长时间无输出; + - 有损 emergency truncation:必须显示轻量提示,说明 BitFun 已尽量保留关键上下文,并提供查看诊断的入口。 + +**判断**:当前不实现 `PartialRecoveryKind::ContextCompacted`;改为实现输入侧 `ContextMutationKind` 事件。若无法给 emergency truncation 提供用户可见提示和诊断证据,则不要扩大自动上下文删除能力。 + +--- + +#### R18:i18n key 未定义,英文文案缺失(低风险) + +**来源**:A7 前端提示。 + +**场景**:文档列出了中文提示文案,但没有定义 i18n key 和英文对应。 + +**解决办法**: + +定义 i18n key 结构: + +```text +ai_error.output_truncated.tool_arguments_incomplete +ai_error.output_truncated.provider_max_tokens +ai_error.output_truncated.stream_interrupted +ai_error.output_truncated.watchdog_timeout +ai_error.output_truncated.rate_limited +ai_error.output_truncated.consecutive_truncation +ai_error.context.preparing_to_continue +ai_error.context.reorganized +ai_error.context.emergency_truncated +``` + +英文文案: + +| Key | English | +|---|---| +| `tool_arguments_incomplete` | The model stopped while preparing an action. BitFun kept the safe parts and will avoid running incomplete actions. | +| `provider_max_tokens` | The model reached its output limit. BitFun preserved the current answer and can continue with a smaller next step. | +| `stream_interrupted` | The model response was interrupted. Partial output has been preserved. | +| `watchdog_timeout` | The model stopped responding for a while. Partial output has been preserved. | +| `rate_limited` | The provider paused this response due to rate limits. BitFun preserved the current state so you can continue shortly. | +| `consecutive_truncation` | This task is repeatedly exceeding the model output limit. BitFun preserved the current result and needs a smaller next step. | +| `preparing_to_continue` | BitFun is organizing the context so the model can continue. | +| `reorganized` | BitFun reorganized earlier context to keep this conversation within the model limit. | +| `emergency_truncated` | BitFun had to trim older context to keep the task running. 
Key instructions were preserved where possible. | + +中文文案应遵循同一语气:明确问题来自模型输出限制、provider 中断或上下文预算,而不是用户操作错误;同时说明 BitFun 已保留有效结果、正在恢复或会引导用户更好地继续。避免使用“失败”“错误”“无法处理”作为主文案,除非确实没有可恢复内容。 + +**阶段**:Phase 1。 + +--- + +#### R19:大文件 chunk 顺序错乱或内容缺失(中风险) + +**来源**:Large File Write Protocol。 + +**场景**:模型或工具层分多次 append 大文件内容时,chunk sequence 错乱、重复、遗漏,最终文件看似写入成功但内容不完整。 + +**解决办法**: + +1. 每个 write session 维护 manifest:`write_id`、`expected_sections`、`next_sequence`、chunk hash、累计 bytes; +2. `append_file_chunk` 必须校验 sequence 单调递增,重复 sequence 只能在 hash 完全一致时幂等接受; +3. `finish_file_write` 校验 expected hash、section 完整性和基础格式; +4. hash 或 section 不匹配时不提交目标文件,只保留临时 artifact 和恢复建议。 + +**阶段**:Phase 4。 + +--- + +#### R20:半成品文件被误认为成功(高风险) + +**来源**:Large File Write Protocol。 + +**场景**:写入中断后临时文件已经存在,如果系统或模型把它当作目标文件完成状态,后续构建/测试会基于半成品继续。 + +**解决办法**: + +1. 所有大文件写入先进入 session temp path,不直接覆盖目标文件; +2. 只有 `finish_file_write` 校验成功后才原子替换目标文件; +3. turn 结束时若存在 open write session,必须生成 `FileWriteIncomplete` 事件; +4. UI 展示“写入未完成,可继续/放弃”,而不是显示成功。 + +**阶段**:Phase 4。 + +--- + +#### R21:后续修改读不到完整上下文(中风险) + +**来源**:Large File Write Protocol。 + +**场景**:上下文只保存 manifest 和摘要,模型后续想修改刚生成的大文件时,如果不读取相关 range/symbol,可能凭摘要误改。 + +**解决办法**: + +1. manifest 中记录 section line ranges、symbol index 或结构化摘要; +2. 后续修改大文件时,agent context builder 注入 manifest,并提示模型按需读取 range/symbol; +3. patch tool 在修改前可自动读取目标 range,避免模型凭记忆修改; +4. 对跨 section 修改,要求先读取相关 sections 再生成 patch。 + +**阶段**:Phase 4。 + +--- + +#### R22:大文件写入 artifact 泄漏或磁盘累积(中风险) + +**来源**:Large File Write Protocol 与 spill-to-file 共用 session artifact。 + +**场景**:大文件临时 chunk、manifest、失败 artifact 长期留在 `.bitfun/sessions` 下,可能占用磁盘或包含敏感内容。 + +**解决办法**: + +1. large write artifact 复用 R11/R12 的权限、敏感检测、session 删除清理、LRU 淘汰策略; +2. abort 或 finish 后清理不再需要的 chunk,只保留必要 manifest 和最终 artifact 引用; +3. 日志只记录路径、大小、hash 和敏感标记,不记录完整内容; +4. 
UI 打开 artifact 时沿用敏感内容警告。 + +**阶段**:Phase 4。 + +--- + +#### R23:gateway overload 误分类导致错误重试或错误降速(中风险) + +**来源**:Agent Runtime Subagent Scheduling Plan。 + +**场景**:provider 返回 auth、quota、billing、model-not-found 等不可重试错误,但 runtime 将其误判为 capacity / overload,导致无意义排队、重试或降低 effective concurrency。 + +**解决办法**: + +1. provider adapter 保留 raw diagnostics,并先分类不可重试错误; +2. 只有 HTTP 429、capacity exceeded、queue full、server busy、concurrency saturation、无有效输出前 transient network failure 进入 retryable; +3. `Unknown` provider error 不默认重试,只进入诊断; +4. retry classifier 有 fixture 覆盖 vLLM overload、OpenAI-compatible 429、auth/quota/model-not-found。 + +**阶段**:Phase 2。 + +--- + +#### R24:adaptive concurrency 振荡(中风险) + +**来源**:Agent Runtime Subagent Scheduling Plan。 + +**场景**:provider 错误噪声较大,runtime 一会儿降低 effective max、一会儿升高,导致 reviewer 执行节奏不稳定。 + +**解决办法**: + +1. 降低 effective max 要快,升高必须慢,并基于连续成功窗口; +2. effective max 永远不低于 1; +3. 每个 gateway key 独立维护计数,避免一个慢 gateway 影响其他 provider; +4. 若振荡超过阈值,关闭 adaptive,仅使用 configured static cap。 + +**阶段**:Phase 2。 + +--- + +#### R25:queue timeout 掩盖真实 provider 故障(中风险) + +**来源**:Agent Runtime Subagent Scheduling Plan。 + +**场景**:所有 reviewer 都在排队或 retry_waiting,但真实原因是 auth/quota/model 配置错误。若 UI 只显示“等待容量”,用户会误以为稍后会恢复。 + +**解决办法**: + +1. queue timeout 只用于等待 scheduler capacity,不用于吞掉 provider 非重试错误; +2. auth/quota/model-not-found/invalid key 必须立即进入设置或诊断提示; +3. retry budget 耗尽后显示最终 provider diagnostics 和已保留 partial evidence; +4. Deep Review summary 区分 queued execution、gateway overload、provider configuration error。 + +**阶段**:Phase 2。 + +--- + +#### R26:write-capable subagent 重试产生副作用(高风险) + +**来源**:Agent Runtime Subagent Scheduling Plan。 + +**场景**:写型 subagent 在执行过文件修改、shell 写操作、提交或外部写入后失败。如果 runtime 自动重试,可能造成重复修改或不可逆副作用。 + +**解决办法**: + +1. subagent retry policy 必须读取 tool history 的 mutability 结果; +2. write-capable subagent 只有在确认没有 state-changing tool 执行时才允许自动重试; +3. read-only reviewer 可使用更宽松 retry budget,但 partial output 仍应优先保留而不是盲目重跑; +4. 
无法证明安全时,不重试,返回 `completed_with_partial` 或 failed with evidence。 + +**阶段**:Phase 2。 + +--- + +#### R27:父 session 取消后 queued subagent 未清理(中风险) + +**来源**:Agent Runtime Subagent Scheduling Plan。 + +**场景**:用户取消父任务或 session 关闭后,queued / retry_waiting subagent 仍在后台等待 permit,之后意外运行并污染事件流或资源。 + +**解决办法**: + +1. scheduler state 必须绑定 parent session / parent turn cancellation token; +2. parent cancelled 后,所有 queued / waiting_for_capacity / retry_waiting task 进入 `cancelled`; +3. running subagent 尽力取消 stream,并释放 gateway permit; +4. 取消路径必须有 permit release 和 artifact cleanup 测试。 + +**阶段**:Phase 1 / Phase 2。 + +--- + +#### R28:Deep Review Strategy Engine 与 runtime scheduler 双调度(高风险) + +**来源**:`deep-review-design.md` 已实现后的交叉对照。 + +**场景**:Deep Review 的 `DeepReviewConcurrencyPolicy` / frontend batching 已经控制 reviewer 分批启动,runtime `SubagentScheduler` 又再次排队和限流。两套调度叠加后,用户可能看到重复等待、队列状态不一致,甚至某些 reviewer 永远无法被调度。 + +**解决办法**: + +1. Deep Review policy 只输出 reviewer 列表、scope、`max_parallel_instances`、timeout / retry budget 等约束; +2. `SubagentScheduler` 是唯一实际执行调度器,负责 queue、permit、retry_waiting、cancel cleanup; +3. `reviewTeamService.ts` / `DeepReviewService.ts` 不直接根据 cached rate-limit 状态决定最终执行并发,只能把 provider/model hints 写入 manifest; +4. 增加 Deep Review compatibility fixture,验证同一 reviewer 只进入一个 scheduler state machine。 + +**阶段**:Phase 2C。 + +--- + +#### R29:Deep Review retry budget 与 scheduler retry classifier 双重试(高风险) + +**来源**:`deep-review-design.md` 已实现后的交叉对照。 + +**场景**:Deep Review orchestrator 根据每角色 retry budget 重发 reviewer,scheduler 又根据 provider overload 做自动 retry,导致同一 reviewer 被重复执行,成本和时间放大,甚至重复产生 conflicting evidence。 + +**解决办法**: + +1. Deep Review retry budget 只定义“最多允许 retry 的策略上限”; +2. 是否 retry 必须经过 scheduler retry classifier; +3. retry event 必须带 `retry_owner = scheduler`,Deep Review orchestrator 只能请求 retry,不能绕过 scheduler 自行重发; +4. 
对已有 partial evidence 的 read-only reviewer,默认进入 `completed_with_partial`,除非 classifier 判断 retry 更有收益。 + +**阶段**:Phase 2C。 + +--- + +#### R30:Deep Review partial timeout 与通用 partial recovery 状态冲突(中风险) + +**来源**:`deep-review-design.md` 已实现后的状态映射。 + +**场景**:Deep Review 已有 `partial_timeout` / `PartialTimeout`,而 runtime 方案使用 `completed_with_partial` / `timed_out`。如果 UI、Judge、日志同时消费两套状态,可能把有用 partial evidence 显示成失败,或把无 evidence timeout 显示成部分成功。 + +**解决办法**: + +1. 统一对外 scheduler state:有有效 evidence 的 `partial_timeout` 映射为 `completed_with_partial`; +2. 无有效 evidence 的 timeout 映射为 `timed_out`; +3. Deep Review 原始状态保留在 diagnostics / raw_status,不作为主判断字段; +4. ReviewJudge 输入只消费统一状态和 partial evidence,不解析原始错误文案。 + +**阶段**:Phase 1 / Phase 2C。 + +--- + +#### R31:Deep Review 大 source 重复取证导致 reviewer 超时(中风险) + +**来源**:最近 X 个提交、PR URL、working tree、patch artifact、超大 diff Deep Review 的实际体验问题。 + +**场景**:每个 reviewer 在 subagent 内自行重新拉取或重建完整变更证据,例如本地提交范围、远端 PR 文件列表、patch 内容、文件快照和统计信息。4-5 个 reviewer 会重复读取同一批变更,导致大量时间消耗在命令执行、网络/API、输出传输和解析上,而不是实际分析;大仓库、远端 PR、Windows 环境或自托管代码平台下尤其容易触发 timeout。 + +**解决办法**: + +1. Deep Review 父任务在 launch preflight 阶段通过 source resolver 和 provider 一次生成 source-agnostic `ReviewEvidencePack`,并落盘到 session artifact; +2. Work Packet projection 只给 reviewer 对应的 `evidence_slice`、artifact ref、允许读取的源码路径和必要局部查询权限; +3. reviewer 默认禁止重复重建完整 source evidence;需要更多上下文时请求父任务扩展 pack,或执行小范围只读查询; +4. pack generation 时间单独记录和展示,不计入 reviewer active run timeout; +5. 
pack 绑定 `source_kind` / `source_locator_hash` / `source_fingerprint` / `pack_hash`,stale 时必须刷新或让用户确认。 + +**阶段**:Phase 2C。 + +--- + +### 方案变动的周边影响评估 + +本轮风险复核后,方案从“检测截断并提示”收紧为“检测截断、隔离不可信工具调用、保留完整证据、再提示”。主要影响如下: + +| 变动 | 影响范围 | 正面影响 | 用户负面感知风险与约束 | +|---|---|---|---| +| `ContextCompacted` 从 `PartialRecoveryKind` 移除,改为 `ContextMutationKind` | core execution events、transport、前端诊断展示 | 避免把输入侧上下文压缩误当作模型输出中断;能单独审计 microcompact / compression / emergency truncation | 短耗时压缩不打断消息流;长耗时压缩必须显示进度;只有有损 emergency truncation 显示轻量提示 | +| `ToolArgumentStatus` 替代单纯 `is_error` 判断 | ai-adapters accumulator、core tool call model、tool pipeline | 能明确区分完整 JSON、保守修复、不可信修复和不完整参数,阻止补齐式修复产生副作用 | 不应把部分成功渲染成 fatal error;提示语义是“已保留安全部分并继续恢复” | +| 截断批次工具最小化拦截 | stream_processor、round_executor、tool pipeline、前端 tool card | 防止半批工具执行造成不可逆副作用,同时保留只读观察结果 | read-only tool 不关联拦截;低风险本地幂等写操作满足条件可继续;高风险写操作延后并优先自动重新规划 | +| subagent / reviewer 输出 Phase 1 即落盘 | task_tool、coordinator、session storage、Deep Review UI | 避免 hidden subagent 清理后丢失完整 reviewer 证据;父 session 只吃摘要和引用,降低 overflow 风险 | 普通对话中 artifact 只在输出确实超大时使用;阈值不能过低,避免用户觉得内容被过早折叠 | +| `SubagentScheduler` + Gateway Request Limiter | coordinator、AI client boundary、provider adapter、Deep Review UI | 本地/弱并发 gateway 下 reviewer 会排队完成而不是 burst failure;用户能看到 waiting/retrying/running | Deep Review 可能更慢;UI 必须解释为受控排队执行,而不是系统卡住或审核变差;已实现 Strategy Engine 的并发策略只能作为上限 | +| Deep Review 已实现状态映射到统一 scheduler event | DeepReviewService、ReviewJudge 输入、前端 review UI、日志 | 避免 `partial_timeout` / retry / timeout 与通用状态重复解释;Judge 只处理一套状态 | 若映射错误,会把可用 partial evidence 渲染成失败,或把无 evidence timeout 渲染成部分成功 | +| Review Evidence Pack | Deep Review source resolver、source provider、session artifact、Work Packet、scheduler timeout | 避免每个 reviewer 重复重建完整 source evidence,让 timeout 留给分析而不是取证 | 大 source 任务启动前会多一个“准备审核证据”阶段;长耗时必须显示进度,pack stale 必须刷新或确认 | +| OverBudget 从软提示改为本地恢复优先 | execution_engine、Deep Review 分片、前端恢复提示 | 不把“已经超预算”的问题继续交给弱模型自救,降低重复截断概率 | 低置信超限不得阻断;先压缩/摘要/spill 并重估;只有高置信超限且恢复失败才引导用户缩小范围 | 
+| 共享上下文计入每个 reviewer 预算 | deep_review_policy、reviewTeamService、DeepReviewService | 避免系统性低估 reviewer 输入 token | 拆分后的 reviewer 数量或摘要需求可能增加,需要和并发/成本上限一起调参 | +| 大文件写入协议 | tool layer、session storage、filesystem service、前端进度展示 | 避免 1k+ 行文件内容通过普通 tool call JSON 撑爆上下文;写入可恢复、可校验、可原子提交 | 用户会看到文件生成进度而不是长时间流式文本;必须避免进度过碎或把小文件也强行协议化 | + +这些变动不会改变仓库的平台边界:判断和治理仍在 core / ai-adapters / transport 层完成,前端只消费结构化事件和本地化文案。 + +--- + +### 普通对话的用户负面感知复核 + +以下复核覆盖 Deep Review 以外的正常 Flow Chat 场景。原则上,新增机制只能把原本会突然断掉、静默失败或不可恢复的场景变成可恢复体验;不能让原本可完成的请求更容易被用户感知为中止。 + +| 场景 | 潜在负面感知 | 方案约束 | +|---|---|---| +| 预算预检 | 用户刚发送消息就被要求缩小范围,感觉比旧逻辑更早失败 | 只有 `HardOverBudget` 且本地恢复失败才阻断;`SoftOverBudget` 走后台压缩/摘要,不直接打断 | +| 频繁压缩 | 对话中反复出现“整理上下文”,感觉系统卡顿 | 加压缩冷却窗口、收益阈值和复用摘要;连续压缩无效时转为拆分任务,不反复压缩 | +| token 估算误差 | 本来可能成功的请求被误判为超预算 | 估算只作为治理触发信号,不作为低置信阻断依据;记录估算与实际差异用于校准 | +| 工具批次截断 | 只读观察结果被误拦截,用户看到不必要失败 | read-only tool 不关联拦截,完整只读结果继续返回给模型和用户 | +| 写操作延后 | 用户觉得“模型明明已经决定改文件,却没有动” | 使用工具 mutability 元数据做最小拦截;低风险本地幂等写操作可继续,高风险写操作延后后优先自动重新规划 | +| 错误文案 | 截断提示像 fatal error,削弱信任 | 文案强调模型/provider/预算限制、有效结果已保留、BitFun 正在恢复或引导继续 | +| 后台长耗时治理 | 用户看到长时间无输出,以为卡死 | 短耗时后台静默;长耗时显示进度状态;有损删除提供诊断入口 | +| subagent 排队执行 | 本地模型下 Deep Review 变慢,用户误以为 reviewer 没启动 | UI 明确区分 queued / waiting_for_capacity / running / retry_waiting,并显示 configured/effective concurrency | +| artifact / spill | 普通输出被过早折叠,阅读成本上升 | 只对真正超大输出启用;普通规模仍直接展示;摘要和引用必须同时存在 | +| 大文件写入协议 | 用户期待模型直接吐完整文件,但看到分段写入/校验状态 | 只在大文件或高风险上下文触发;进度状态要聚合为 section 级别,完成后提供文件路径、摘要和验证结果 | + +若某项机制在普通对话中增加了用户感知中止、等待不透明、或不必要确认次数,则该机制不能进入默认路径,只能作为诊断或显式高级选项。 + +--- + +### 高风险项实施硬门槛 + +以下项目只有满足硬门槛才进入实现;否则必须从实施计划中移除,并在 UI / 日志中保留现有保守行为。 + +| 项 | 必须满足的解决措施 | 不满足时的处理 | +|---|---|---| +| R0-3 JSON 修复生成语义错误参数 | `ToolArgumentStatus` 贯穿 accumulator、core tool call model、tool pipeline;pipeline 在执行前强制拒绝 `RepairedUntrusted` / `Incomplete` | 不实现补齐式 JSON 修复;只保留现有删除单个多余右花括号的保守修复,并保持 parse 失败为 `is_error=true` | +| R1 模型自主重试 doom loop | session / turn 层有连续截断计数器;超过阈值强制结束 turn,并给用户明确恢复建议 | 不实现模型自主“继续尝试”提示;截断后只完成 partial 
recovery,不鼓励模型继续同路径 | +| R2 subagent 返回 overflow / 证据丢失 | 完整 subagent 输出先落盘到父 session artifact;父 session 只注入摘要、统计、引用;session 删除时能清理 artifact | 不实现“只硬截断 subagent 返回”的 Phase 1 方案,避免 hidden subagent 清理后永久丢失 reviewer 证据 | +| R3 多 tool call 部分截断 | tool pipeline 能识别截断批次和 tool mutability;read-only tool 不关联拦截;mutating tool 按幂等/破坏性/外部副作用分级 | 不实现“一刀切拦截完整工具”或“完整者全部正常执行”的策略;缺少 mutability 元数据时按高风险保守处理 | +| R15 预算预检导致提前阻断 | 预检具备 `HardOverBudget` / `SoftOverBudget` 分层、本地恢复动作、压缩节流、估算校准;低置信不阻断 | 不把预算预检接入默认阻断路径,只记录诊断或后台轻量治理 | +| R17 emergency truncation / context mutation | context mutation 事件记录 tokens before/after、删除范围、summary source、耗时,并按短耗时/长耗时/有损删除决定展示 | 不扩大自动上下文删除能力;`ContextCompacted` 不进入 partial recovery;长耗时压缩不能静默造成长时间无输出 | +| R19-R22 大文件写入协议 | 具备 manifest、chunk sequence/hash 校验、temp file、finish 原子提交、abort/continue 恢复和 artifact 清理 | 不把大文件写入协议接入默认写文件路径;仍使用小文件直接写入和 patch/hunk 修改 | +| R23-R31 subagent / Deep Review 调度与取证复用 | 具备 retry classifier、gateway-keyed permit、adaptive 回滚、write-capable retry 安全检查、parent cancellation cleanup、Deep Review Strategy Engine 兼容映射、Review Evidence Pack 去重 | 不启用自适应并发和自动重试;不启用 pack enforcement;Deep Review 只保留已实现 policy,上层仅记录 queue / status / 重复取证诊断 | +| 重复实现与 owner 漂移 | 新增模块前必须完成“唯一 owner”检查:相似状态、重试、artifact、manifest、budget policy、UI 状态只能有一个行为 owner;其它层只能 projection / adapter / event mapping | 不新增平行模块;若现有 owner 不足,只能先提交迁移方案和回滚路径,不进入默认实现 | + +--- + +### 风险处置汇总 + +| # | 风险 | 等级 | 处置 | 阶段 | +|---|---|---|---|---| +| R1 | 模型自主重试 doom loop | 高 | 连续截断计数器,超阈值强制终止 turn | Phase 1 | +| R2 | subagent 截断致父 session overflow / 证据丢失 | 高 | Phase 1 必须先落盘完整 subagent 输出,父 session 只注入摘要和引用;否则不实现硬截断 | Phase 1 | +| R3 | 多 tool call 部分截断 | 中 | 完整性仍记录;read-only tool 不关联拦截;mutating tool 按幂等/破坏性/外部副作用分级做最小拦截 | Phase 1 | +| R4 | stream error 携带截断信号 | 中 | Error 分支增加截断 kind 判断,补充 RateLimited variant | Phase 1 | +| R5 | 纯文本截断无 tool call | 中 | End 分支增加纯文本 + max_tokens 检测 | Phase 1 | +| R6 | has_effective_output 定义模糊 | 中 | 明确定义:有非空文本或有非 error tool call | Phase 1 | +| R7 | 前后端版本兼容 | 低 
| Option + optional 字段,JSON 容忍 | Phase 1 | +| R8 | JSON 完整性检测性能与误判 | 中 | 不采用“只看括号计数不 re-parse”;parse 是最终完整性校验 | Phase 1 | +| R9 | token 估算准确性 | 低 | 启发式 chars*0.6 + 1.2x 安全系数 + 日志校准 | Phase 2 | +| R10 | reviewer 交叉依赖 | 中 | 保留共享上下文,但必须计入每个 reviewer 预算;超预算改为接口/类型摘要 | Phase 2 | +| R11 | spill 文件磁盘累积 | 低 | session 清理 + 单 session/全局大小上限 + LRU 淘汰 | Phase 3 | +| R12 | spill 文件含敏感信息 | 中 | 文件权限 + 敏感模式检测 + 摘要 REDACTED + UI 警告 | Phase 3 | +| R13 | 摘要质量不足 | 低 | 结构化摘要 + 启发式摘要(编译错误/test输出等已知格式),不使用 LLM 语义摘要 | Phase 3 | +| R14 | 预算随模型切换变化 | 低 | ModelBudgetProfile 从配置加载,切换时重算 | Phase 2 | +| R15 | 预算耗尽前无预警 | 低 | 请求前预算预检;SoftOverBudget 后台恢复优先;HardOverBudget 也必须先压缩/摘要/spill 并重估,仍超限才引导用户缩小范围 | Phase 2 | +| R16 | 重试耗尽后行为未定义 | 中 | finish with partial recovery,保留有效输出 | Phase 1 | +| R17 | context compaction / emergency truncation 截断历史 | 中 | 当前成立;使用独立 `ContextMutationKind` 事件和用户可见诊断,不放入 `PartialRecoveryKind` | Phase 1 | +| R18 | i18n key 和英文文案缺失 | 低 | 定义 key 结构 + 中英文文案 | Phase 1 | +| R19 | 大文件 chunk 顺序错乱或内容缺失 | 中 | write manifest + sequence/hash 校验 + finish 完整性校验 | Phase 4 | +| R20 | 半成品文件被误认为成功 | 高 | temp file 写入 + finish 原子提交 + open write session 事件 | Phase 4 | +| R21 | 后续修改读不到完整上下文 | 中 | manifest 记录 section/range,后续修改按需读取 range/symbol | Phase 4 | +| R22 | 大文件 artifact 泄漏或磁盘累积 | 中 | 复用 spill 权限/敏感检测/清理/LRU,finish/abort 后清理 chunk | Phase 4 | +| R23 | gateway overload 误分类 | 中 | 不可重试错误优先分类;Unknown 不默认重试;retry classifier fixture | Phase 2 | +| R24 | adaptive concurrency 振荡 | 中 | 快降慢升、按 gateway key 独立计数、振荡时回退静态 cap | Phase 2 | +| R25 | queue timeout 掩盖真实 provider 故障 | 中 | queue timeout 与 provider error 分离;auth/quota/model 错误立即提示诊断 | Phase 2 | +| R26 | write-capable subagent 重试产生副作用 | 高 | 读取 tool mutability history;无法证明无状态变更则不自动重试 | Phase 2 | +| R27 | 父 session 取消后 queued subagent 未清理 | 中 | parent cancellation token 绑定 scheduler state;取消时释放 permit 和清理 artifact | Phase 1 / Phase 2 | +| R28 | Deep Review Strategy Engine 与 runtime scheduler 双调度 | 高 | Deep Review policy 只输出上限和约束;`SubagentScheduler` 
是唯一执行调度器 | Phase 2C | +| R29 | Deep Review retry budget 与 scheduler retry classifier 双重试 | 高 | retry budget 只定义上限;实际 retry 必须由 scheduler classifier 判定并记录 owner | Phase 2C | +| R30 | `partial_timeout` 与通用 partial recovery 状态冲突 | 中 | `partial_timeout` 按 evidence 映射为 `completed_with_partial` 或 `timed_out`,raw status 仅诊断 | Phase 1 / Phase 2C | +| R31 | Deep Review 大 source 重复取证导致 reviewer 超时 | 中 | 父任务 preflight 一次生成 source-agnostic `ReviewEvidencePack`,reviewer 只消费 artifact slice;重复完整 evidence 重建进入诊断/门禁 | Phase 2C | + +--- + +## 验收标准 + +### Phase 0 验收 + +1. 已建立 stream fixture,覆盖正常 stop、provider max_tokens、无 finish reason、stream error、watchdog timeout。 +2. 已建立 tool argument fixture,覆盖完整 JSON、半截 JSON、保守修复、不可置信补齐。 +3. 已建立 budget fixture,覆盖 CJK、代码、大 diff、长日志和不同 model profile。 +4. 已建立 artifact / large write fixture,覆盖超大 subagent result、敏感输出、1k+ 行新文件和 chunk 异常。 +5. 已建立 scheduler fixture,覆盖 gateway concurrency=2、4-5 reviewers burst、retryable overload、non-retryable auth/quota/model errors。 +6. 已建立 Deep Review compatibility fixture,覆盖已实现 Strategy Engine 的 `max_parallel_instances`、predictive timeout、`partial_timeout`、retry budget 与 runtime scheduler 的映射关系。 +7. 已建立 Review Evidence Pack fixture,覆盖 local git range、PR URL、working tree、patch artifact、rename、pathspec、pack stale 和多 reviewer 共享取证。 +8. `ai.subagent_max_concurrency` 和 Deep Review `max_parallel_instances` 在设置和 manifest 中被描述为 runtime bounded scheduling 的上限,而不是无条件并行承诺。 +9. 旧逻辑在关键 fixture 上能稳定复现静默截断、父 session overflow、gateway burst failure、双调度风险、重复全量 diff 或大文件写入失败,作为后续回归基线。 +10. 所有后续阶段新增行为都有独立 feature flag 和 observe-only 开关。 + +### Phase 1 验收 + +1. stream end 时如果存在未完成工具参数,系统标记结构化恢复类型,而不是静默完成。 +2. 工具参数截断不会触发同请求自动重试。 +3. 网络类无有效输出错误仍保持可重试行为。 +4. 半截工具调用不会自动执行。 +5. 前端 active session 消息流末尾显示明确截断提示。 +6. 前端事件类型显式包含 recovery kind / reason。 +7. 日志使用英文,无 emoji,包含 session_id、round_id 和不泄露全文参数的 summary。 +8. **连续截断超过 3 次时强制终止 turn 并提示用户**(R1)。 +9. **Task tool 返回前检查结果大小;超限时完整输出先落盘到父 session artifact,父 session 只注入摘要和引用**(R2)。 +10. 
**多 tool call 并行时,不完整者标记为 `Incomplete`;read-only tool 不关联拦截;mutating tool 依赖 mutability 元数据做最小拦截**(R3)。 +11. **stream error 分支能识别 WatchdogTimeout / StreamInterrupted / RateLimited**(R4)。 +12. **纯文本截断能被检测并提示**(R5)。 +13. **`has_effective_output` 有明确定义和单测**(R6)。 +14. **重试耗尽时 finish with partial recovery,保留有效输出**(R16)。 +15. **i18n key 和中英文文案已定义**(R18)。 +16. **每个 ai-adapter 将 provider 原始 finish reason 归一化为 `ProviderFinishReason` 枚举**(R0-2)。 +17. **`ProviderFinishReason::Unknown` 不触发截断检测,保留原始值用于诊断**(R0-2)。 +18. **`PendingToolCalls` 使用 parse 作为 JSON 完整性最终校验;简单括号计数只能作为 early hint,不能单独决定可执行性**(R8)。 +19. **subagent 截断的 recovery kind 上浮到父 session 事件流**(R2)。 +20. **microcompact / compression / emergency truncation 通过独立 context mutation 事件记录,不进入 `PartialRecoveryKind`**(R17)。 +21. **subagent scheduler state event 能表达 accepted / queued / waiting_for_capacity / running / retry_waiting / completed / completed_with_partial / failed / cancelled / timed_out**。 +22. **父 session 取消会取消 queued / waiting_for_capacity / retry_waiting subagent,并释放 running subagent 的 gateway permit**(R27)。 + +### Phase 2 验收 + +1. Deep Review 分片策略支持 token/byte/diff line 预算。 +2. 单个超大文件或超大 diff 会被识别并隔离处理。 +3. 子 reviewer 截断能在父任务中可见。 +4. **token 估算使用启发式 + 安全系数,日志记录估算值**(R9)。 +5. **拆分时保留共享上下文文件,但共享上下文计入每个 reviewer group 预算;超预算使用接口/类型摘要**(R10)。 +6. **模型切换时预算自动重算**(R14)。 +7. **请求前预算预检;SoftOverBudget 不阻断;HardOverBudget 先执行压缩/摘要/spill 并重估,仍超预算才不发送同一大请求**(R15)。 +8. **预算治理记录压缩耗时、tokens before/after、估算与实际差异;连续压缩必须有冷却窗口和收益阈值**(R15 / R17)。 +9. **普通对话的默认路径不得因低置信预算估算新增用户可感知中止**(产品体验约束)。 +10. **gateway-keyed AI request limiter 在 stream finish / error / cancel 时都释放 permit**。 +11. **retry classifier 能区分 retryable overload 和 non-retryable auth/quota/model-not-found**(R23 / R25)。 +12. **adaptive effective concurrency 遇 overload 快速降低,成功窗口后缓慢升高,且不低于 1**(R24)。 +13. **write-capable subagent 只有在 tool history 证明无 state-changing tool 后才可自动重试**(R26)。 +14. 
**Deep Review 在 gateway concurrency=2 时可通过排队完成 4-5 reviewers,而不是 burst failure**。 +15. **ReviewJudge 输入包含 queued / retried / timed_out / failed / completed_with_partial 状态和 partial evidence**。 +16. **Deep Review 已实现 `partial_timeout` 状态按 evidence 映射为 `completed_with_partial` 或 `timed_out`,UI / Judge 不直接解析 raw status**(R30)。 +17. **Deep Review retry budget 不会绕过 scheduler retry classifier;同一 reviewer 不会被 orchestrator 和 scheduler 双重 retry**(R29)。 +18. **Deep Review dynamic concurrency policy 不会绕过 `SubagentScheduler`;同一 reviewer 不会同时进入两套 queue**(R28)。 +19. **大 source Deep Review 由父任务生成 source-agnostic `ReviewEvidencePack`;4-5 个 reviewer 不重复重建完整 source evidence;pack stale 会刷新或提示确认**(R31)。 + +### Phase 3 验收 + +1. 大工具输出不会直接塞爆上下文。 +2. 超限输出可落盘,UI 能打开完整内容。 +3. 上下文中注入的是摘要和引用,而不是不可控全文。 +4. **spill 目录有大小上限和 LRU 淘汰**(R11)。 +5. **spill 文件有正确权限,敏感内容在摘要中 REDACTED**(R12)。 +6. **启发式摘要能提取编译错误文件列表、test pass/fail 计数、diff stat**(R13)。 + +### Phase 4 验收 + +1. 模型尝试写入超过阈值的大文件全文时,不通过普通 tool call JSON 直接传输完整内容。 +2. `start_file_write` / `append_file_chunk` / `finish_file_write` / `abort_file_write` 有完整状态机和单元测试。 +3. chunk sequence 错乱、重复、hash mismatch 时不会提交目标文件。 +4. 未完成 write session 在 turn 结束时产生 `FileWriteIncomplete` 事件,UI 不显示成功。 +5. `finish_file_write` 成功后才原子替换目标文件,并记录最终 hash、bytes、section manifest。 +6. 上下文只注入 manifest、摘要、hash、artifact ref 和必要 preview,不注入完整文件内容。 +7. 后续修改大文件时优先 patch/hunk,并能按 range/symbol 读取相关内容。 +8. 大文件写入进度以 section 级别展示,避免长时间无输出,也避免 chunk 级别噪音。 +9. large write artifact 复用 spill 的权限、敏感检测、session 清理和 LRU 策略。 + +### 跨阶段门禁 + +1. 每个阶段必须提供最小回滚开关,回滚后不删除用户已有输出、artifact 或写入 manifest。 +2. 每个阶段必须有一组 deterministic fixture,不能只依赖真实 provider 手动复现。 +3. 每个阶段必须记录可观测字段,能区分“检测到问题”“执行了治理动作”“用户可见提示”三类事件。 +4. 每个阶段必须证明普通对话默认路径没有因为低置信估算、过度压缩、过重提示或不必要确认而增加用户感知中止。 +5. 
任一阶段如果达不到门禁,只能停留在 observe-only 或诊断能力,不进入默认行为。 + +--- + +## 建议验证命令 + +根据仓库规则,相关改动最少应覆盖: + +- 前端改动:`pnpm run lint:web && pnpm run type-check:web && pnpm --dir src/web-ui run test:run` +- core / ai-adapters 改动:`cargo check --workspace && cargo test --workspace` +- Deep Review 行为改动:`cargo test -p bitfun-core deep_review -- --nocapture` +- subagent scheduler / gateway limiter 改动:scheduler state transition、permit acquire/release、retry classifier、gateway concurrency=2 Deep Review fixture + +执行前可根据改动范围选择最小验证集。若本设计仅修改文档,不需要运行上述命令。 + +--- + +## 最终合理性与先进性复核 + +本轮复核的标准不是“能不能做”,而是:在 BitFun 当前架构、已有能力、用户体验约束和竞品方向下,是否是当前阶段的最佳路径。结论如下: + +| 方案项 | 判定 | 为什么是当前最佳路径 | 被否定的次优路径 | +|---|---|---|---| +| `PartialRecoveryKind` + `ProviderFinishReason` | 保留 | 结构化状态是跨后端重试、前端提示、日志诊断的最小稳定契约;provider 差异留在 adapter 层,符合平台边界 | 继续依赖 `reason.contains(...)` 或 provider 原始字符串 | +| `ToolArgumentStatus` | 保留 | 工具参数完整性是执行安全边界,必须显式区分完整、保守修复、不可信修复和不完整;只靠 `is_error` 无法表达可信度 | 自动补齐 JSON 后执行,或把所有修复都当成 error 丢弃 | +| 截断批次的最小化工具拦截 | 保留 | read-only 工具不关联拦截可以保持流畅;mutating 工具按幂等/破坏性/未知分级,兼顾安全和体验 | 截断批次全部失败,或完整工具全部执行 | +| `RateLimited` 分层处理 | 修正后保留 | 顶层 round 不即时重复请求,避免重复输出;scheduler-owned subagent 在无有效输出时可 backoff 重试,符合 gateway overload 场景 | 一概不重试,或一概自动重试 | +| `ContextMutationKind` 独立建模 | 保留 | context compression / microcompact / emergency truncation 是输入侧治理,不是输出 partial recovery;独立事件能解释“为什么模型记忆变化” | 把 `ContextCompacted` 塞进 `PartialRecoveryKind` | +| 与 `context-reliability-architecture.md` 保持双文档边界 | 保留 | Context Reliability Architecture 是信任、证据、压缩契约和 context profile 的基础设施;本文是截断、预算、scheduler、artifact 和大文件写入的运行时控制面,双文档边界比全文合并更可维护 | 强行合并成一篇总纲,或在两篇文档中重复定义同一类事件 | +| 与 `deep-review-design.md` 已实现基线保持增量关系 | 保留 | Deep Review Strategy Engine 已解决角色、prompt、timeout、partial、retry、concurrency policy 等专项问题;本文只补 runtime scheduler、gateway limiter、统一事件、artifact 和预算,避免重复建设 | 在本文重新实现 Deep Review batching、retry 或 partial capture | +| Deep Review token/byte/diff line 预算 | 保留 | 文件数只能控制 reviewer 数量,不能控制上下文;预算拆分是减少大 diff 截断的必要条件 | 继续只按文件数拆分 | 
+| Review Evidence Pack | 新增保留 | PR URL、最近提交、working tree、patch artifact 或超大 diff 的取证是所有 reviewer 的共享输入,应由父任务通过 source provider 一次生成并落盘复用;这能把 reviewer timeout 留给分析,而不是重复拉取/解析证据 | 每个 reviewer 自行重复重建完整 source evidence,或只靠 prompt 要求 reviewer 少跑命令 | +| `SubagentScheduler` + gateway limiter | 保留 | 这是从 Deep Review policy 并发上限升级到 runtime-bounded scheduling 的关键;能让低并发 vLLM gateway 排队成功而不是 burst failure | 只调低 `ai.subagent_max_concurrency`,只靠 Deep Review batching,或让 prompt 自己少并发 | +| adaptive effective concurrency | 保留但必须可回滚 | 本地和云 gateway 能力差异大,静态配置不能覆盖;快降慢升、按 gateway key 隔离、最低 1 是当前最佳控制面 | 全局固定并发,或无边界自适应 | +| subagent retry classifier | 保留 | read-only reviewer、write-capable worker 和 Deep Review retry budget 风险不同,必须按错误类别、有效输出、tool mutability 决定是否执行 retry | 所有 subagent 统一重试策略,或 Deep Review orchestrator 与 scheduler 双重 retry | +| Deep Review partial 状态映射 | 保留 | 已实现 `partial_timeout` 是 Deep Review 内部状态;统一映射为 `completed_with_partial` / `timed_out` 才能让 UI、Judge、ledger 共用一套契约 | 前端/Judge 同时解析 raw `partial_timeout` 和通用 scheduler state | +| subagent / reviewer 输出专项落盘 | 保留 | hidden subagent 成功后会清理子 session,Phase 1 若不先落盘就会永久丢 evidence;这是比通用 spill 更早的必要能力 | 只硬截断返回,或等 Phase 3 通用 spill 后再处理 | +| `OverBudget` 本地恢复优先 | 修正后保留 | 低置信估算只触发后台治理;高置信超限也先压缩/摘要/spill 并重估,符合“不新增用户感知中止”原则 | 固定阈值直接阻断,或只在 prompt 里软提醒模型自救 | +| spill-to-file + 结构化/启发式摘要 | 保留 | 大输出不应回灌全文;启发式摘要覆盖 test/log/diff 等高频场景且不引入额外 LLM 调用 | LLM 语义摘要,或只截断不留完整引用 | +| Large File Write Protocol | 保留 | 1k+ 行文件写入不应走普通对话文本或单个巨大 tool JSON;事务化 chunk + manifest + 原子提交是源头预防 | 继续让模型一次性生成完整文件内容,或只靠上下文压缩兜底 | +| 分阶段 observe-only / feature flag / fixture 体系 | 保留 | 方案跨 core、adapter、frontend、storage,必须先观测再干预;没有 fixture 和回滚会让“治理能力”本身变成新风险 | 一次性大改上线,或只靠手工复现 | + +先进性判断: + +1. 与 Codex 类产品的方向一致:并行任务需要隔离执行、实时进度和可验证证据,而不是只返回最终答案。 +2. 与 Claude Code / OpenCode 的方向一致:subagent 应有独立上下文、工具权限、模式/permission profile,并对 read-only 与 write-capable 工作采用不同安全策略。 +3. 相比“简单可行”的上下文压缩或降低并发,本方案更接近 runtime control plane:结构化状态、队列、gateway permit、artifact、预算、事务写入、UI 可观测性彼此闭环。 +4. 
当前不把 persisted scheduler state、LLM 语义摘要、全自动写型 subagent 重试放进近期默认范围,是合理克制;这些方向有价值,但不是当前风险/收益最优点。 + +最终修正结论:原方案整体合理且具备先进性;本轮仅将 `RateLimited` 和预算阈值从简单规则修正为分层策略,避免它们退化成“可行但不最佳”的实现。 + +--- + +## 最终判断 + +本设计运行在 `context-reliability-architecture.md` 定义的 Context Reliability Architecture 之上。上游架构提供信任边界、Evidence Ledger、Compaction Contract、Context Health 和 Work Packet;本文将原先的“截断检测与提示”扩展为六层运行时治理: + +1. **止血层**:检测工具参数截断,阻止静默成功和无意义重试; +2. **隔离层**:用 `ToolArgumentStatus`、tool mutability 元数据和最小化拦截策略阻止不可信工具调用自动执行; +3. **证据层**:subagent / reviewer 输出先落盘,再向父 session 注入摘要和引用; +4. **调度层**:用 `SubagentScheduler` 和 gateway request limiter 将 subagent / reviewer 从 burst launch 改为 runtime-bounded scheduling; +5. **治理层**:用预算估算、本地上下文恢复、分片、context mutation 事件和 spill-to-file 降低截断发生率; +6. **预防层**:用大文件写入协议避免 1k+ 行文件内容进入普通对话文本或单次 tool call JSON。 + +风险处置结论: + +- **可根治或可硬约束规避的**(R0-1, R0-3, R0-4, R0-5, R0-6, R0-7, R1-R16, R18-R27):通过结构化类型、执行前拒绝、计数器、大小检查、artifact 落盘、runtime queue、gateway permit、事务化写入等机制解决; +- **可缓解但不可根治的**(R0-2):provider 行为不一致是外部因素,通过归一化层 + `Unknown` 兜底确保不误判; +- **需单独建模的**(R17):上下文压缩 / 裁剪不是输出流 partial recovery,必须通过独立 context mutation 事件和诊断展示治理。 + +产品结论:方案不接受“用更多用户可感知中止换取更早失败”。新增动作必须减少静默失败、突然断掉、不可恢复和不透明等待;如果在普通对话中增加低置信阻断、反复压缩、过重错误提示或不必要确认,则该动作不能进入默认路径。 + +以下修改若无法满足硬门槛,则不得实现:补齐式 JSON 修复后自动执行、截断批次内 mutating tool 无分级自动执行或一刀切拦截、低置信 OverBudget 直接阻断、只硬截断 subagent 返回而不落盘、无 retry classifier 的 subagent 自动重试、write-capable subagent 状态变更后自动重试、无事务/无校验的大文件 chunk 写入、把 `ContextCompacted` 放进 `PartialRecoveryKind`、为已有 owner 可覆盖的相似能力新增平行实现、允许多个 Deep Review reviewer 对同一 source fingerprint 重复重建完整 evidence。这些约束优先级高于阶段计划。 diff --git a/context-reliability-architecture.md b/context-reliability-architecture.md new file mode 100644 index 000000000..e6dd1f907 --- /dev/null +++ b/context-reliability-architecture.md @@ -0,0 +1,1143 @@ +# BitFun Context Reliability Architecture + +> Updated proposal: 2026-04-27 +> +> This document replaces the earlier "Context Distraction 与 Context Poisoning 优化方案" draft. 
The new framing is not "make compression smarter" but "make long-running local agent work reliable, auditable, and recoverable across compression, pruning, weak models, and subagent handoffs." + +## 1. Executive Summary + +BitFun already has a useful foundation: + +- L0 Microcompact clears old compactable tool results. +- L1 ContextCompressor performs model summary plus structured fallback compression. +- L2 emergency truncation prevents provider context overflow. +- `MessageSemanticKind` distinguishes actual user input, internal reminders, compression boundaries, and compression summaries. +- `TaskTool` supports `timeout_seconds`, and Deep Review has reviewer/judge policy, file splitting, and read-only reviewer enforcement. +- `deep-review-design.md` is treated as an implemented baseline: Deep Review has Strategy Engine behavior, Architecture and Frontend reviewers, predictive timeout, dynamic concurrency policy, partial result capture, retry budget, strategy directive / model plumbing, and Judge overlap handling. + +The remaining risk is not just token pressure. The harder product problem is context reliability: + +1. The agent may forget the original task, scope, user constraints, touched files, or failed checks after compression. +2. User text or tool output can look like higher-priority instructions. +3. Tool results can be pruned correctly while the model still loses the operational fact that a command failed or a file changed. +4. Subagents can isolate context but still return outputs that are hard for the parent agent to merge safely. +5. Weak models amplify all of the above: they summarize worse, follow pointers less reliably, and recover less gracefully from partial state. + +The revised proposal is to build a **Context Reliability Architecture** around four product promises: + +- **Trusted:** every context item has a clear source and priority. +- **Auditable:** important tool facts survive pruning and compression. 
+- **Recoverable:** risky edits and long workflows have resumable state and rollback boundaries. +- **Adaptive:** stronger and weaker models receive different levels of structure, automation, and user confirmation. + +### 1.1 Relationship to the Runtime Budget Plan + +This document should remain independent from `agent-runtime-budget-governance-design.md`. + +The boundary is: + +- **This document is the context reliability foundation.** It owns context trust, Evidence Ledger, Compaction Contract, snapshot/recovery, Work Packet shape/projection boundaries, adaptive context profiles, weak-model policy, and Context Health. It does not own Deep Review-specific reviewer roles, prompt directives, or retry behavior. +- **The runtime budget plan is the control plane for failure prevention and recovery.** It owns `PartialRecoveryKind`, `ProviderFinishReason`, `ToolArgumentStatus`, `ContextMutationKind` events, `SubagentScheduler`, gateway request limiting, output spill, and the Large File Write Protocol. +- **Integration is event-based, not by merging responsibilities.** Runtime events from truncation recovery, scheduler state transitions, artifact creation, spill, and large file writes should become ledger facts when the ledger exists. The context architecture should then use those facts for compaction, health scoring, and user-facing recovery summaries. + +This split avoids a misleading mega-design where compression, scheduler capacity, provider retry, artifact storage, and trust boundaries appear to be one module. They are related, but they should remain separately testable and separately releasable. + +### 1.2 Relationship to Implemented Deep Review + +`deep-review-design.md` is the Deep Review product and strategy baseline. This document should not re-plan reviewer roles, prompt ownership, predictive timeouts, partial result capture, retry budget, or dynamic reviewer concurrency as if they were missing. 
+ +Instead, this architecture consumes the facts produced by the implemented Deep Review flow: + +- launch manifest / strategy directive / reviewer role / scope +- `model_id` and reviewer-specific prompt directive +- predictive active timeout and retry budget +- partial reviewer evidence and timeout status +- reviewer queue / retry / completion state once runtime scheduler events exist +- reviewer report artifacts and final judge output + +The context architecture turns those facts into trust metadata, Evidence Ledger entries, Compaction Contract fields, and Context Health signals. It does not decide actual reviewer execution order, gateway capacity, or retry ownership. + +### 1.3 Convergence and No-Duplicate Rule + +Similar mechanisms must converge into the existing owner. This architecture should prefer projections over new modules: if Deep Review, runtime budget governance, and context management describe the same status, retry, artifact, budget, or task contract, only one layer owns behavior and the other layers consume typed facts. 
+ +| Similar area | Source of truth | Context architecture action | Must not do | +| --- | --- | --- | --- | +| Deep Review launch manifest / Work Packet | implemented Deep Review canonical manifest | project manifest facts into Work Packet and Compaction Contract fields | duplicate reviewer roles, model ids, prompt directives, or retry budgets | +| Review Evidence Pack / large source snapshot | Deep Review source resolver + runtime artifact storage | record pack refs, hashes, source kind, source fingerprint, slice ids, and stale status in ledger/contract | let each reviewer reconstruct the same complete source evidence independently | +| scheduler states and subagent capacity | runtime `SubagentScheduler` + gateway limiter | record state projections in Evidence Ledger and Context Health | implement queueing, permits, retry backoff, or effective concurrency in context code | +| raw `partial_timeout` and partial recovery state | runtime normalized scheduler state | keep raw status as diagnostics and expose `completed_with_partial` / `timed_out` / `retry_waiting` as model-visible facts | let UI, Judge, or ledger treat raw Deep Review strings as the primary state | +| artifact refs, spill files, large-write manifests | runtime artifact / session storage | preserve refs, hashes, sensitivity flags, status, and next action in ledger/contract | create a parallel context artifact store or inject full large outputs back into context | +| context compaction and budget mutation | `ContextMutationKind` event + Compaction Contract | summarize mutation facts and preserve user intent / trusted evidence | infer mutations from prompt text or encode context mutation as partial output recovery | +| budget thresholds and model profile limits | runtime budget module and model/provider profile | consume budget outcomes as health signals and compaction inputs | hard-code separate budget thresholds inside context profiles | +| evidence facts | trusted tool/runtime events | project facts 
into ledger summaries and compaction contracts | promote model-generated summaries to authoritative facts | + +## 2. Competitive Reference: What Matters + +The relevant comparables are general coding-agent runtimes: + +### 2.1 Codex + +Codex emphasizes independent task execution, AGENTS.md as scoped project instructions, and verifiable evidence through terminal logs and test outputs. OpenAI's own guidance also warns against turning AGENTS.md into a giant manual: a short AGENTS.md should act as a map to deeper repository docs, not as the full knowledge base. + +The Codex agent loop also treats context window management as a runtime responsibility and describes automatic compaction once a token threshold is exceeded. + +Useful lesson for BitFun: + +- Keep always-injected instructions short. +- Treat repository docs as the system of record. +- Preserve evidence, not just prose summaries. +- Run longer tasks as independently reviewable units. +- Let automatic compaction start as an observable runtime behavior, then gate default enforcement on measured reliability. + +### 2.2 Claude Code + +Claude Code uses CLAUDE.md, auto memory, subagents, and compaction. Its public subagent guidance emphasizes independent context windows, concise final summaries back to the parent, tool restrictions, and foreground/background execution. The Agent SDK guidance similarly frames subagents as separate instances for context isolation, parallelization, specialized instructions, and tool restrictions. + +Useful lesson for BitFun: + +- Compaction needs a contract: it must preserve operational facts, not only conversation meaning. +- Subagents are most useful when they isolate focused work and return concise, structured results. +- Memory is context, not enforcement; hard safety and priority rules should be represented programmatically where possible. 
+- Subagent contracts should describe scope and permissions, while runtime scheduling should own concurrency, queueing, retry, and timeout semantics. + +### 2.3 OpenCode + +OpenCode exposes configurable compaction (`auto`, `prune`, `reserved`), snapshots, agent permissions, hidden subagents, and task permissions. Its agent model distinguishes primary agents from subagents and can hide internal subagents while still allowing programmatic invocation through the Task tool when permissions allow. + +Useful lesson for BitFun: + +- Context pruning should be configurable and observable. +- Snapshot/recovery is part of user trust, especially when agents edit files. +- Per-agent permissions and subagent invocation permissions reduce orchestration risk. +- Hidden/internal subagents should be programmatically invokable without cluttering the user-facing agent list. +- Permission policy and runtime capacity policy are separate controls; BitFun should not encode gateway pressure as prompt instructions. + +## 3. Product Positioning + +The old positioning was close to: + +> "BitFun has context compression to support long conversations." + +The revised positioning should be: + +> "BitFun is a local Agent Runtime for long-running, auditable, multi-agent work. It preserves task intent, execution evidence, and recovery boundaries even when context is compressed, pruned, or delegated." + +This matters because compression is table stakes. The product differentiation is whether BitFun can safely run real engineering workflows over time: + +- Deep Review can coordinate several reviewers without losing evidence. +- Agentic Mode can edit, test, retry, and still know exactly what changed. +- Computer Use can summarize screen/action history without dumping every screenshot into context. +- Cowork and Plan modes can remain conversational without inheriting overly aggressive long-task pruning. + +## 4. Design Principles + +1. 
**Programmatic facts beat model summaries.** + If a tool can record "file X was edited" or "test Y failed with exit code 1", do not ask the model to remember it. + +2. **Summaries are hints, not authority.** + Model-generated compression summaries should never override trusted instructions, tool facts, or user-visible state. + +3. **Weak models need more structure, not more autonomy.** + Weak models should receive simpler schemas, smaller scopes, stronger early-stop rules, and more user confirmation. + +4. **Subagents need contracts.** + Isolation helps only if the parent gives a precise work packet and receives a structured result. + +5. **Context policy should follow task shape.** + Long execution tasks and multi-turn conversations fail differently. They should not use exactly the same context strategy. + +## 5. Key Measures + +### 5.1 Repository System of Record + +#### Proposal + +Keep always-injected project guidance short. Use top-level and nearest AGENTS files as routing maps, with deeper architecture, verification, and module guidance stored in docs and loaded when relevant. + +For BitFun: + +- Keep `AGENTS.md` as a short architecture and verification index. +- Use nearest `AGENTS.md` / `AGENTS-CN.md` for touched directories. +- Treat detailed docs such as Deep Review strategy docs as on-demand references. +- Add mechanical checks later for stale pointers and missing linked docs. + +#### Benefits + +- Reduces initial context pressure. +- Makes project knowledge easier to maintain and review. +- Avoids burying task-specific context under a large instruction blob. +- Aligns with Codex-style AGENTS.md scoping and "map, not manual" guidance. + +#### Negative Impacts + +- More pointers means more chances for the agent to miss a required doc. +- Weak models may not follow "read this if relevant" instructions reliably. +- Users may think rules disappeared if they are no longer always visible. 
+ +#### Weak-Model Experience + +Weak models need programmatic doc injection for high-risk scopes. For example: + +- Editing `src/web-ui` should automatically include the web-ui agent doc summary. +- Deep Review should automatically include the review-team execution contract. +- Desktop/Tauri API changes should include structured command conventions. + +#### Risk Mitigation + +- Add a "required context resolver" rather than relying purely on the model. +- Keep critical constraints duplicated as concise capsule fields. +- Log which instruction sources were included in each run. + +#### Product Impact + +This shifts BitFun toward a repository-aware runtime. It also reduces the product risk that users solve context problems by writing ever-larger instruction files. + +### 5.2 Context Trust Boundary + +#### Proposal + +Assign a trust level to each model-visible context item: + +| Trust level | Examples | Can override lower levels? | +| --- | --- | --- | +| `system` | Built-in system/developer policy | Yes | +| `workspace_instruction` | AGENTS.md, memory files, AI rules | Yes, within scoped priority | +| `user_input` | Raw user prompt and attached context | No | +| `tool_observation` | Read/Bash/Git/WebFetch results | No | +| `model_summary` | Compression summary | No | +| `external_artifact` | Imported docs, screenshots, webpages | No | + +User text and tool output can mention system-looking tags, but those tags must never upgrade their authority. + +#### Benefits + +- Reduces prompt injection through user input, code comments, docs, and tool output. +- Gives compression logic a safe ordering rule. +- Allows UI warnings without blocking normal legitimate discussion. +- Helps weak models by making priority explicit and mechanical. + +#### Negative Impacts + +- False positives can annoy users, especially when discussing prompt markup literally. +- Overly aggressive blocking can break debugging and prompt-authoring workflows. 
+- Additional metadata increases implementation complexity. + +#### Weak-Model Experience + +Weak models are more vulnerable to instruction-looking text in tool results. Trust metadata benefits weak models more than strong models because it reduces the need to infer source authority. + +#### Risk Mitigation + +- Start with escaping, tagging, and warnings instead of hard blocking. +- Block only clear privilege-escalation patterns. +- Keep raw user text accessible for display while sending escaped/annotated text to the model. +- Include tests where a user legitimately discusses `<system>`-style tags and where a malicious tool result contains fake instructions. + +#### Product Impact + +This is a safety foundation for BitFun as a local runtime. It supports an enterprise-friendly story: local execution with explicit context provenance and priority. + +### 5.3 Evidence Ledger + +#### Proposal + +Record compact, structured facts for important tool events. The ledger survives pruning and feeds the compression contract. 
+ +Minimum fields: + +```text +event_id +turn_id +tool_name +target_kind +target +status +exit_code_or_error_kind +touched_files +artifact_path +summary +created_at +``` + +Initial event categories: + +- file read/search results +- file write/edit/delete operations +- shell commands and exit codes +- git status/diff/test commands +- context compression events +- subagent start/completion/timeout/cancel +- model output recovery events such as `PartialRecoveryKind` +- input-side context mutation events such as microcompact, model compression, fallback compression, and emergency truncation +- subagent scheduler state transitions such as queued, waiting for capacity, running, retry waiting, completed with partial, failed, and cancelled +- implemented Deep Review Strategy Engine facts such as reviewer role, scope, strategy level, `model_id`, prompt directive, predictive timeout, retry budget, raw `partial_timeout`, normalized scheduler state, and judge decision +- Review Evidence Pack events such as pack id, source kind, source provider, source locator hash, source fingerprint, source collection count, file count, diff line count, pack hash, artifact slice ids, cache hit, and stale status +- session artifact events for subagent results, spill files, and large file write manifests + +#### Benefits + +- Tool outputs can be pruned without losing the fact that an action happened. +- The agent can preserve modified files and test commands across compression. +- Final responses and review reports can cite operational evidence. +- Repeated work decreases because the agent can see recent failed commands and touched files. + +#### Negative Impacts + +- Ledger can become a new source of noise if every minor action is included. +- Incorrect ledger entries are dangerous because they look authoritative. +- More storage and retention policy work is required. + +#### Weak-Model Experience + +This is one of the highest-value measures for weak models. 
Weak models often forget that a file was already changed or that a command already failed. A short ledger prevents loops and repeated exploration. + +#### Risk Mitigation + +- Generate ledger facts from tool layer code, not model prose. +- Keep model-visible ledger to a small recent slice: + - recently touched files + - latest verification commands + - latest blocking failures + - active subagent statuses +- Keep full ledger in session storage for UI/audit, but only inject a summary into model context. +- Treat runtime event producers as untrusted unless they come from core/tool-layer code. Model prose may explain an event, but it must not create authoritative ledger facts by itself. +- Normalize Deep Review raw statuses before projection: `partial_timeout` with evidence should appear as `completed_with_partial`, while timeout without useful evidence should appear as `timed_out`. Keep raw status only as diagnostics. + +#### Product Impact + +This is the core of BitFun's "auditable local agent" positioning. It also turns compression from lossy summarization into a state-aware runtime behavior. + +### 5.4 Snapshot and Recovery + +#### Proposal + +Introduce recovery boundaries for high-risk operations. + +Two levels: + +1. **Light checkpoint** + - Track dirty status, touched files, diff hash, current branch, and latest ledger event. + - Cheap and always available. + +2. **Strong checkpoint** + - Capture enough state to rollback or create a user-visible recovery point before high-risk edits. + - Used for auto-fix, batch edits, generated rewrites, and long-running Computer Use flows. + +#### Benefits + +- Users can trust longer autonomous tasks. +- Failed or weak-model edits become less scary. +- The system can explain what happened before an error. +- Deep Review remediation and Agentic Mode can safely attempt bounded changes. + +#### Negative Impacts + +- Strong snapshots can be slow or disk-heavy in large repositories. 
+- Snapshots can create a false sense of safety for external side effects. +- Rollback semantics can be complex when user edits happen concurrently. + +#### Weak-Model Experience + +Weak models should trigger stronger recovery boundaries earlier. They should also have stricter edit-size thresholds and more user confirmations before broad changes. + +#### Risk Mitigation + +- Start with light checkpoints only. +- Use strong checkpoints only when the repo is clean enough or when the user explicitly approves. +- Clearly label what is recoverable: local files and git state, not external APIs, databases, or remote side effects. +- Never rollback user edits without explicit approval. + +#### Product Impact + +Snapshot/recovery moves BitFun toward a controlled execution environment rather than a plain chat assistant that happens to edit files. + +### 5.5 Compaction Contract + +#### Proposal + +Replace free-form compression expectations with a fixed contract. Every model-generated or fallback compression summary must preserve these fields when available: + +```text +current_goal +active_scope +hard_constraints +decisions +touched_files +verification_commands +blocking_failures +open_questions +subagent_statuses +budget_state +artifact_refs +deep_review_manifest_summary +review_evidence_pack_summary +next_step +``` + +Facts such as touched files and verification commands should be populated from the Evidence Ledger whenever possible. +`budget_state` and `artifact_refs` are optional fields populated from the runtime budget plan when a request triggered budget governance, scheduler queueing, spill-to-file, or large file write artifacts. They should stay compact: include state, path/hash/reference, and next action, not raw large content. +`deep_review_manifest_summary` is optional and should be populated only when the active task is a Deep Review. 
It preserves reviewer roles, scope, strategy level, model slot/model id, timeout/retry budget, and normalized reviewer status without injecting full reviewer reports. +`review_evidence_pack_summary` is optional and should be populated for large source reviews such as PR URLs, local ranges, working tree diffs, patch artifacts, or explicit file snapshots. It preserves pack id, source kind, source provider, source fingerprint, pack hash, slice ids, stale status, and source collection count without injecting raw patch content. + +#### Benefits + +- Reduces context drift after compression. +- Makes compression quality easier to test. +- Preserves exactly the facts that coding agents most often lose: files, commands, failures, and next step. +- Aligns with Claude-style guidance to preserve modified files and test commands. + +#### Negative Impacts + +- The contract itself consumes tokens. +- If the model fills unknown fields by guessing, the contract can become misleading. +- Too many fields can make weak models produce lower-quality summaries. + +#### Weak-Model Experience + +Weak models should receive a shorter contract. Suggested weak-model contract: + +```text +Goal: +Scope: +Touched files: +Last tests: +Blocking issue: +Next step: +``` + +Do not ask weak models to produce nuanced decision histories unless the task requires it. + +#### Risk Mitigation + +- Populate factual fields programmatically. +- Allow empty fields rather than invented content. +- Validate contract length and truncate low-priority fields first. +- Add tests for compression summaries preserving touched files and test commands. +- Do not inject full spill files, large write chunks, or full reviewer artifacts into the compaction summary. Inject only summaries and references. +- Preserve Deep Review manifest facts from the implemented Strategy Engine, but never duplicate the full prompt block or full reviewer output. 
+- Preserve Review Evidence Pack refs and stale status, but never duplicate complete source evidence or per-file patch content in the compaction contract. + +#### Product Impact + +This is the most direct way to make long BitFun sessions feel stable after compaction. + +### 5.6 Subagent Work Packet + +#### Proposal + +Make subagent dispatch structured. Because `deep-review-design.md` is treated as implemented, the first Work Packet should be a compatibility projection of the existing Deep Review launch manifest, not a second schema built from scratch. A Work Packet defines what a subagent may do and what it must return. + +Minimum packet: + +```text +packet_id +parent_session_id +goal +scope +allowed_tools +forbidden_actions +input_artifacts +review_evidence_pack_id +evidence_slice +timeout_seconds +queue_timeout_seconds +run_timeout_seconds +stream_idle_timeout_seconds +output_budget +output_schema +expected_parent_state_patch +``` + +Minimum result: + +```text +packet_id +status +summary +findings_or_changes +evidence +touched_files +verification +artifact_refs +open_risks +``` + +#### Benefits + +- Parent context stays smaller. +- Subagents receive clearer scope and permissions. +- Judge/orchestrator logic can merge results more reliably. +- Deep Review can better handle partial, timed-out, or cancelled reviewers. + +#### Negative Impacts + +- Work packets can reduce flexibility for open-ended research. +- Schema validation and fallback handling add code paths. +- Poorly designed packets can hide important context from subagents. + +#### Weak-Model Experience + +Weak models benefit from clear packets but struggle with large schemas. Use minimal schemas for weak models and require short outputs. + +#### Risk Mitigation + +- Start by projecting the implemented Deep Review manifest into this shape. +- Keep initial schemas small. +- Allow `partial` status. 
+- Parent agent must validate required fields and ask for repair only once before falling back to a plain summary. +- Do not make Work Packet responsible for actual execution order or gateway pressure. The runtime budget plan's `SubagentScheduler` and gateway request limiter own queueing, permits, retry, and effective concurrency. +- Do not duplicate `reviewTeamService.ts` / backend role definitions. The Work Packet projection must consume the canonical Deep Review manifest and preserve `model_id`, `prompt_directive`, reviewer role, scope, timeout, and retry policy. +- Do not let each reviewer reconstruct the same complete source evidence. When a Review Evidence Pack exists, the Work Packet should pass artifact slice refs and stale policy, not prompt the reviewer to refetch PR files, rerun full local ranges, or rebuild the whole patch again. + +#### Product Impact + +This turns BitFun's multi-agent story from prompt-level parallelism into runtime-level orchestration. It is especially important for Code Review Team and future Team Mode workflows. + +### 5.7 Adaptive Context Policy + +#### Proposal + +Use two first-class context profiles: + +| Profile | Modes | Default policy | +| --- | --- | --- | +| `long_task` | Agentic, Deep Review, Deep Research, Computer Use, Team Mode | active ledger, aggressive tool pruning, compaction contract, subagent isolation | +| `conversation` | Cowork, Plan, general Q&A | preserve recent user intent, conservative pruning, fewer automatic subagents | + +Later, these profiles can branch by model capability. + +#### Benefits + +- Matches context strategy to failure mode. +- Keeps conversation modes from feeling over-managed. +- Gives long-running modes enough structure to survive compression. + +#### Negative Impacts + +- Users may not understand why modes behave differently. +- Switching profiles mid-session can be confusing. +- Profile-specific bugs can appear. 
+ +#### Weak-Model Experience + +Weak models should default to lower autonomy: + +- smaller scope +- more explicit confirmations +- fewer automatic subagents +- stricter loop detection +- simpler compression contract + +#### Risk Mitigation + +- Derive default profile from agent type. +- Make profile visible but not noisy in UI. +- Allow per-session override. +- Avoid automatic profile switching until telemetry proves it is safe. + +#### Product Impact + +This makes BitFun's agent modes real runtime modes, not just prompt skins. + +### 5.8 Context Health Score + +#### Proposal + +Track internal health signals: + +- token usage ratio +- compacted turn count +- pruned tool output count +- ledger freshness +- repeated tool signature count +- consecutive failed commands +- subagent timeout/cancel count +- scheduler queued/retry count +- partial recovery count by kind +- Deep Review reviewer status counts, including normalized `completed_with_partial`, `timed_out`, `retry_waiting`, and raw diagnostic status when present +- Deep Review retry budget used vs allowed +- artifact/spill pressure +- large file write incomplete count +- unresolved open questions +- compression circuit breaker state + +Use this score internally first. Only show simple user-facing states: + +- Healthy +- Context pressure rising +- Recommend compacting +- Recommend splitting work +- Needs user decision + +#### Benefits + +- Prevents compaction loops and repeated failed tool calls. +- Gives the runtime a reason to ask the user before continuing. +- Helps tune weak-model behavior over time. + +#### Negative Impacts + +- A visible score can create user anxiety. +- Bad thresholds can interrupt good workflows. +- Health calculation can become a bag of arbitrary heuristics. + +#### Weak-Model Experience + +Weak models should hit early-stop and user-decision thresholds sooner. Strong models can continue longer before escalation. + +#### Risk Mitigation + +- Keep health score internal in P0/P1. 
+- Log telemetry locally before exposing UI. +- Start with a small set of high-signal metrics: + - repeated identical tool calls + - consecutive failed commands + - subagent timeout count + - compression failures +- Use action-oriented UI states instead of raw scores until product telemetry proves users benefit from more detail. + +#### Product Impact + +Health score supports a future "why did BitFun pause?" explanation and can become a key differentiator for long-task reliability. + +### 5.9 Runtime Budget Integration Boundary + +The runtime budget plan adds several producers of structured facts: + +- output truncation and partial recovery +- context mutation caused by budget governance +- scheduler queue/retry/run/completion state +- gateway overload classification and effective concurrency +- subagent/reviewer artifacts +- Deep Review Strategy Engine facts from the implemented launch manifest +- Review Evidence Pack facts from Deep Review source resolver / provider preflight +- spill-to-file artifacts +- large file write manifests and incomplete-write recovery + +This architecture should consume those facts through Evidence Ledger and Context Health, but it should not duplicate their execution policies. 
+ +| Runtime area | This document consumes | Runtime budget plan owns | +| --- | --- | --- | +| `PartialRecoveryKind` | ledger facts and recovery summaries | classification, retry behavior, UI recovery event | +| `ContextMutationKind` | compaction history, health scoring, contract fields | budget trigger, mutation event emission, user progress state | +| `SubagentScheduler` | active subagent statuses for compaction and health | queue state machine, gateway permits, retry classifier, cancellation cleanup | +| Deep Review Strategy Engine | reviewer role/scope/model/strategy facts, normalized partial state, retry budget summary | role selection, strategy directives, predictive timeout, Deep Review-specific policy | +| Review Evidence Pack | pack refs, hash, source kind/provider/fingerprint, slice ids, stale status, source collection count | source resolution, provider evidence collection, normalized change snapshot creation, artifact storage, staleness policy | +| spill/artifact | artifact references, hash, sensitivity flag, retention hints | file creation, permissions, cleanup, inline summary policy | +| Large File Write Protocol | manifest facts and incomplete-write recovery state | chunking, hash validation, temp file, atomic commit, abort/continue | + +Default dependency direction: + +1. Phase 1 truncation recovery can ship before Evidence Ledger is complete, as long as it emits structured events. +2. Context P0/P1 should improve Phase 2+ budget recovery quality, especially compaction and health scoring. +3. Phase 3/4 artifact and large-write work should record ledger facts once the ledger exists, but should not block on a globally persisted ledger format. + +If the documents appear to disagree, prefer this rule: **Deep Review defines review policy and roles; context architecture defines what the model may know and trust; runtime budget governance defines what the system may execute, retry, queue, spill, or block.** + +## 6. 
Cross-Measure Risks + +| Combination | Risk | Mitigation | +| --- | --- | --- | +| Evidence Ledger + Compaction Contract | Contract can become too long if it includes too many ledger facts. | Inject only latest touched files, latest tests, latest failures, and active subagent statuses. | +| Snapshot + Weak-Model Auto-fix | Weak model may attempt broad edits because rollback exists. | Add edit-size thresholds, approval gates, and scope caps. | +| Subagent Work Packet + Adaptive Policy | Over-delegation can fragment reasoning. | Default automatic delegation off except Deep Review / Deep Research. | +| Trust Boundary + UX | Too many warnings can feel hostile. | Warn only for high-confidence injection patterns; otherwise silently escape. | +| Repository System of Record + Weak Models | Weak models may not follow document pointers. | Programmatically resolve required docs for touched paths. | +| Context Health Score + Autonomy | Bad thresholds can pause too early or too late. | Start telemetry-only, then enable advisory mode, then enforcement. | +| Runtime Budget Events + Evidence Ledger | Duplicated ownership can make the same failure appear as two conflicting facts. | Runtime emits typed events; ledger records facts and projections; model summaries never redefine event meaning. | +| Work Packet + SubagentScheduler | Packet timeouts can conflict with queue/run/idle timeout semantics. | Packet may request timeout budgets; scheduler owns final queue, run, idle, retry, and parent deadline enforcement. | +| Artifact References + Compaction | Summaries can lose the fact that full evidence exists on disk. | Compaction Contract must preserve artifact refs, hash/status, and next action while excluding raw large content. | +| Deep Review Manifest + Work Packet | A second Work Packet schema can drift from implemented reviewer roles, `model_id`, or prompt directives. 
| Generate the Work Packet projection from the implemented Deep Review manifest; do not duplicate role metadata in context code. | +| Review Evidence Pack + Work Packet | Reviewer packets can still prompt subagents to reconstruct complete source evidence, wasting timeout on duplicate evidence gathering. | Pack refs and slice ids must be first-class packet inputs; full source reconstruction is fallback-only and diagnosable. | +| Deep Review Partial Status + Ledger | Raw `partial_timeout` can conflict with normalized scheduler states. | Store raw status as diagnostics; project model-visible status as `completed_with_partial` or `timed_out` based on evidence. | + +## 7. Weak Model Policy + +BitFun should not treat weak models as simply cheaper strong models. They need a different runtime posture. + +### 7.1 Weak Model Defaults + +- Prefer `conversation` profile unless user explicitly starts a long-task mode. +- Use shorter compaction contracts. +- Inject more programmatic facts and fewer prose summaries. +- Require user confirmation for broad edits, destructive commands, and automatic remediation. +- Cap subagent fan-out more aggressively. +- Stop earlier on non-convergence. + +### 7.2 Strong Model Defaults + +- Allow longer autonomous runs. +- Permit richer Work Packet schemas. +- Allow more nuanced compression summaries. +- Use higher thresholds before asking the user to intervene. + +### 7.3 Model Capability Inputs + +Initial model capability classification can be heuristic: + +- context window size +- configured model slot (`fast`, `primary`, `reasoning`) +- known provider/model family +- user-selected "safe mode" preference +- observed loop/failure rate in the current session + +Do not rely only on model name. Runtime behavior should be measured. + +## 8. Revised Priority + +### P0: Safety and Facts + +1. Context Trust Boundary +2. Evidence Ledger minimum viable version +3. 
Prompt markup escaping/warning tests + +Why first: + +- High benefit for both strong and weak models. +- Low dependency on subjective model summarization. +- Makes later compression and pruning safer. + +### P1: Compression Reliability + +1. Compaction Contract +2. Ledger-backed touched-files/test-command preservation +3. Minimal Context Health telemetry +4. Read-only consumption of runtime budget events when available + +Why second: + +- Builds directly on ledger facts. +- Makes existing compression less lossy without changing the whole orchestration model. +- Improves Phase 2 budget recovery quality without blocking Phase 1 truncation stop-the-bleeding work. + +### P2: Recovery and Deep Review Handoff + +1. Light Snapshot checkpoints +2. Deep Review manifest to Work Packet compatibility projection +3. Review Evidence Pack projection for PR URL / local range / working tree / patch artifact reviews +4. Ledger projection for already-preserved partial/timed-out subagent results +5. Ledger projection for scheduler states and subagent artifacts + +Why third: + +- Higher complexity. +- Best validated in Deep Review, where subagent isolation and Strategy Engine behavior already exist. +- Should consume `SubagentScheduler` states from the runtime budget plan, not implement a second scheduler here. + +### P3: Runtime Policy + +1. Adaptive Context Profile +2. Weak-model policy gates +3. Advisory UI for context health +4. Product decision on whether short background compaction can be enabled by default in conversation mode + +Why later: + +- Needs telemetry from P0/P1. +- User-facing behavior must be tuned carefully. + +## 9. Implementation Plan + +### Task 1: Add Context Trust Metadata + +**Goal:** represent trust source without changing behavior first. 
+ +**Likely files:** + +- `src/crates/core/src/agentic/core/message.rs` +- `src/crates/core/src/agentic/core/prompt_markup.rs` +- `src/crates/core/src/agentic/session/session_manager.rs` +- `src/crates/core/src/agentic/session/compression/fallback/builder.rs` + +**Steps:** + +1. Add `ContextTrustLevel` enum with values: + - `System` + - `WorkspaceInstruction` + - `UserInput` + - `ToolObservation` + - `ModelSummary` + - `ExternalArtifact` +2. Add optional trust metadata to message metadata. +3. Set trust level when creating: + - actual user input + - internal reminders + - compression summaries + - tool results +4. Add tests ensuring existing serialization remains backward compatible. +5. Add tests for prompt markup in user input being treated as user text. + +**Verification:** + +- `cargo test -p bitfun-core prompt_markup -- --nocapture` +- `cargo test -p bitfun-core compression -- --nocapture` + +### Task 2: Add Evidence Ledger Core Types + +**Goal:** create a programmatic fact store independent of model summaries. + +**Likely files:** + +- Create `src/crates/core/src/agentic/session/evidence_ledger.rs` +- Modify `src/crates/core/src/agentic/session/mod.rs` +- Modify tool execution path in `src/crates/core/src/agentic/execution/round_executor.rs` +- Modify file/shell/git tool result handling as needed. + +**Steps:** + +1. Define `EvidenceLedgerEvent`. +2. Define `EvidenceLedgerSummary` for model-visible projection. +3. Add append/read APIs scoped by `session_id` and `dialog_turn_id`. +4. Capture command status, touched files, and artifact pointers where available. +5. Keep the first implementation in-memory or session-local; avoid global persistence until format stabilizes. +6. Accept optional runtime budget event facts: + - `PartialRecoveryKind` + - `ContextMutationKind` + - scheduler state + - artifact/spill/large-write manifest reference +7. 
Add unit tests for summarizing: + - touched files + - latest failed commands + - latest verification commands + - active scheduler states + - artifact references without raw content + +**Verification:** + +- `cargo test -p bitfun-core evidence_ledger -- --nocapture` +- `cargo test -p bitfun-core agentic::execution -- --nocapture` + +### Task 3: Integrate Ledger With Microcompact + +**Goal:** allow pruning old tool output while preserving important operational facts. + +**Likely files:** + +- `src/crates/core/src/agentic/session/compression/microcompact.rs` +- `src/crates/core/src/agentic/execution/execution_engine.rs` +- `src/crates/core/src/agentic/session/evidence_ledger.rs` + +**Steps:** + +1. Before clearing compactable tool results, ensure a ledger event exists for that tool result. +2. Add microcompact stats for events preserved. +3. Do not clear recent failed command outputs until the ledger summary includes their error kind. +4. Add tests: + - old successful read result is cleared and ledger keeps target path + - failed command remains or is summarized safely + - TodoWrite and compression summary are not pruned incorrectly + +**Verification:** + +- `cargo test -p bitfun-core microcompact -- --nocapture` + +### Task 4: Implement Compaction Contract + +**Goal:** make compression preserve fixed critical fields. + +**Likely files:** + +- `src/crates/core/src/agentic/session/compression/compressor.rs` +- `src/crates/core/src/agentic/session/compression/fallback/builder.rs` +- `src/crates/core/src/agentic/core/message.rs` +- `src/crates/core/src/agentic/session/evidence_ledger.rs` + +**Steps:** + +1. Add a `CompressionContract` struct. +2. Populate factual fields from Evidence Ledger: + - touched files + - verification commands + - blocking failures + - subagent statuses + - budget state + - artifact references +3. Update model compression prompt to require the contract fields. +4. Update fallback compression builder to emit the same fields without model help. 
+5. Add tests proving touched files and test commands survive compression. +6. Add tests proving spill/artifact references survive without injecting raw large content. +7. Add a weak-model short contract mode behind config or capability detection. + +**Verification:** + +- `cargo test -p bitfun-core compression -- --nocapture` +- targeted manual long-session compression smoke test + +### Task 5: Add Context Health Telemetry + +**Goal:** observe before changing user-facing behavior. + +**Likely files:** + +- `src/crates/core/src/agentic/execution/execution_engine.rs` +- `src/crates/core/src/agentic/session/compression/microcompact.rs` +- `src/crates/core/src/agentic/session/evidence_ledger.rs` +- optional frontend later: `src/web-ui/src/flow_chat/*` + +**Steps:** + +1. Track: + - token usage ratio + - microcompact count + - full compression count + - compression failures + - repeated tool signatures + - consecutive failed commands +2. Emit English-only logs. +3. Add a small internal `ContextHealthSnapshot` type. +4. Keep UI hidden for now. +5. Add tests for repeated tool signature scoring. + +**Verification:** + +- `cargo test -p bitfun-core context_health -- --nocapture` +- `cargo check --workspace` + +### Task 6: Add Light Snapshot Checkpoints + +**Goal:** record recovery boundaries before risky edits without promising full rollback. + +**Likely files:** + +- `src/crates/core/src/agentic/session/evidence_ledger.rs` +- `src/crates/core/src/service/git/*` +- tool implementations for Edit/Write/Delete/Bash/Git + +**Steps:** + +1. Add `CheckpointCreated` ledger event. +2. Capture: + - current branch + - dirty state summary + - touched file list + - diff hash when cheap +3. Create checkpoint before high-risk operations: + - batch edits + - auto-fix + - destructive file operations +4. Do not implement automatic rollback in this phase. +5. Add tests for checkpoint event creation. 
+ +**Verification:** + +- `cargo test -p bitfun-core checkpoint -- --nocapture` +- manual edit flow verifying ledger output + +### Task 7: Project Deep Review Manifest Into Work Packet + +**Goal:** standardize the already-implemented Deep Review launch manifest as a Work Packet projection without changing reviewer dispatch ownership. + +**Likely files:** + +- `src/crates/core/src/agentic/agents/prompts/deep_review_agent.md` +- `src/crates/core/src/agentic/deep_review_policy.rs` +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- `src/web-ui/src/shared/services/reviewTeamService.ts` +- `src/web-ui/src/flow_chat/services/DeepReviewService.ts` + +**Steps:** + +1. Read the implemented Deep Review launch manifest produced by the review team strategy flow. +2. Project it into a Work Packet markdown/JSON block for ledger/compaction/judge consumption. +3. Include: + - packet id + - role + - assigned scope + - allowed tools + - timeout + - queue/run/idle timeout requests + - output budget + - input artifact references + - `model_id` + - `prompt_directive` + - strategy level + - retry budget + - required output fields +4. Update reviewer prompt only if it does not already return packet id and status. +5. Update judge prompt to treat missing packet id/status as lower confidence. +6. Ensure the packet does not promise actual parallel execution; scheduler capacity is owned by the runtime budget plan. +7. Add frontend service tests proving the Work Packet projection matches the implemented manifest. +8. Add Rust tests for Deep Review policy compatibility. 
+ +**Verification:** + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run src/shared/services/reviewTeamService.test.ts` + +### Task 7A: Project Review Evidence Pack Into Work Packets + +**Goal:** preserve shared source evidence across compaction and prevent reviewers from repeatedly reconstructing the same PR, local range, working tree diff, patch artifact, or file snapshot. + +**Likely files:** + +- `src/crates/core/src/agentic/deep_review_policy.rs` +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- `src/crates/core/src/agentic/coordination/coordinator.rs` +- `src/web-ui/src/flow_chat/services/DeepReviewService.ts` + +**Steps:** + +1. Consume the runtime budget plan's source-agnostic `ReviewEvidencePack` artifact emitted during Deep Review launch preflight. +2. Record pack id, source kind, source provider, source locator hash, source fingerprint, source collection count, pack hash, stale status, and slice ids in Evidence Ledger. +3. Project the relevant `evidence_slice` refs into each reviewer Work Packet. +4. Preserve `review_evidence_pack_summary` in the Compaction Contract without injecting raw patch content. +5. Add tests proving that compaction retains pack refs and that reviewer packets do not request complete source reconstruction when a pack exists. + +**Verification:** + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- ledger projection tests for pack refs, hash, and stale status + +### Task 8: Ledger Projection for Partial Subagent Results + +**Goal:** record already-preserved reviewer partial results as trustworthy ledger facts and normalized context state. 
+ +**Likely files:** + +- `src/crates/core/src/agentic/coordination/coordinator.rs` +- `src/crates/core/src/agentic/execution/execution_engine.rs` +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- `src/web-ui/src/flow_chat/components/TaskDetailPanel/TaskDetailPanel.tsx` + +**Steps:** + +1. Inspect current partial recovery data from stream processing and Deep Review result objects. +2. Treat existing `partial_timeout` with evidence as `completed_with_partial` for model-visible projections. +3. Treat timeout without useful evidence as `timed_out`. +4. Record timeout, partial result, raw status, normalized status, reviewer role, scope, and artifact ref in Evidence Ledger. +5. Update Deep Review judge guidance only if it still consumes raw timeout strings instead of normalized status. +6. Map scheduler/runtime status into parent-visible result status: + - `queued` + - `retry_waiting` + - `timed_out` + - `completed_with_partial` + - `failed` +7. Add tests for raw-to-normalized status projection and completed-with-partial paths. + +**Verification:** + +- `cargo test -p bitfun-core coordination -- --nocapture` +- `cargo test -p bitfun-core deep_review -- --nocapture` + +### Task 9: Add Adaptive Context Profiles + +**Goal:** introduce policy without exposing too much UI complexity. + +**Likely files:** + +- `src/crates/core/src/agentic/agents/mod.rs` +- `src/crates/core/src/agentic/execution/execution_engine.rs` +- `src/crates/core/src/agentic/session/session_config.rs` or equivalent config type +- `src/web-ui/src/shared/services/reviewTeamService.ts` only if Deep Review needs explicit policy display + +**Steps:** + +1. Define `ContextProfile`: `LongTask` and `Conversation`. +2. Map default profiles by agent type. +3. Use profile to choose: + - microcompact aggressiveness + - compression contract length + - subagent fan-out caps + - health threshold behavior +4. 
Add weak-model override mode: + - shorter contract + - stricter loop threshold + - lower auto-delegation cap +5. Keep UI advisory only. + +**Verification:** + +- `cargo test -p bitfun-core context_profile -- --nocapture` +- `cargo check --workspace` + +### Task 10: Frontend Observability and UX + +**Goal:** expose reliability state without making users manage internals. + +**Likely files:** + +- `src/web-ui/src/flow_chat/*` +- `src/web-ui/src/component-library/components/FlowChatCards/*` +- `src/web-ui/src/locales/*` + +**Steps:** + +1. Add a compact context status surface only when action is needed. +2. Show simple statuses: + - Context pressure rising + - Compression preserved key facts + - Reviewer timed out with partial result + - User decision needed + - Waiting for model capacity + - Organizing context to continue +3. Avoid showing raw health score initially. +4. Add i18n entries. +5. Add tests for rendering timeout/partial states. +6. Add tests that ordinary conversation mode does not show context health UI for low-confidence or short background recovery. + +**Verification:** + +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` + +## 10. 
Recommended Validation Matrix + +| Scenario | Strong model expectation | Weak model expectation | +| --- | --- | --- | +| Long edit/test loop | compression preserves touched files and tests | short contract, early stop on repeated failure | +| Prompt injection in file content | tool output remains observation only | same behavior, with stronger warning | +| Deep Review timeout | partial reviewer evidence retained | partial evidence retained, lower confidence | +| Large diff review | work packets split reviewers predictably | smaller fan-out, more user confirmation | +| PR / local range / patch Deep Review evidence | one source-agnostic Review Evidence Pack survives compaction and feeds reviewer slices | smaller pack summary, no repeated complete source reconstruction | +| Implemented Deep Review Strategy Engine | manifest facts survive compaction without duplicating reviewer definitions | manifest projected to shorter Work Packet; no duplicate scheduler | +| Conversation mode | recent user intent preserved | conservative compression and fewer automatic actions | +| Runtime budget recovery | recovery/context mutation events become ledger facts | no low-confidence blocking; smaller repair scope | +| Subagent scheduler queue | queued/retry/running states preserved for judge and compaction | lower fan-out; clear user-visible waiting state | +| Large file write | manifest and artifact refs survive compaction | prefer patch/hunk or smaller chunks | + +## 11. Open Questions + +1. Where should the full Evidence Ledger persist long term: session JSON, `.bitfun/sessions/{id}/ledger.jsonl`, or existing event storage? +2. Should strong checkpoints use git-native mechanisms, an internal snapshot store, or both? +3. Which model capability signal is reliable enough for weak-model policy: configured model slot, provider metadata, observed runtime behavior, or a user setting? +4. 
Should Context Health remain action-oriented UI only, or should raw health score be exposed in an advanced diagnostics panel? +5. Should Work Packet become a generic TaskTool schema or stay a projection of the implemented Deep Review manifest until validated? Current recommendation: keep it as a projection until scheduler/artifact behavior is proven. +6. Should short background compaction be enabled by default in conversation mode after telemetry, or remain opt-in until the product has stronger confidence? +7. Should session artifact retention be governed by the Evidence Ledger store, the runtime budget artifact policy, or a shared retention service? + +## 12. Cross-Document Product Decisions Pending Confirmation + +The current recommendation is to keep this document and `agent-runtime-budget-governance-design.md` separate but aligned. When they touch similar problems, use the following default decisions until product confirmation changes them: + +| Decision | Recommendation | Why | +| --- | --- | --- | +| Background compaction in normal conversation | Allow only short, non-blocking, telemetry-backed compaction by default; long or lossy mutation must show progress/diagnostics. | Codex-style automatic compaction is a strong direction, but BitFun's product rule forbids adding low-confidence user-visible interruptions. | +| Generic Work Packet | Project the implemented Deep Review manifest first; do not make it a universal TaskTool schema in early phases. | Claude/OpenCode show subagents are useful with permissions and isolation, but generic delegation increases blast radius before BitFun has enough runtime evidence. | +| Review Evidence Pack | Parent Deep Review preflight resolves the source and generates the pack once; context only records refs/hash/source fingerprint/slices/stale state. | Repeated PR/file/diff collection is shared evidence, not reviewer-specific reasoning. | +| Raw Context Health UI | Keep raw score internal; show only actionable states. 
| Users need recovery guidance, not another number to interpret. | +| Subagent parallelism | Do not promise simultaneous start for every subagent. Use runtime-bounded scheduling and show queue/retry/running states. | Gateway capacity and provider behavior are runtime facts, not prompt-level guarantees. | +| Artifact ownership | Runtime creates artifacts; Evidence Ledger records facts, refs, hashes, status, and sensitivity flags. | This keeps storage mechanics separate from model-visible facts. | + +## 13. Final Recommendation + +Implement this in the order of state authority: + +1. Trust boundaries define what can be believed. +2. Evidence ledger defines what happened. +3. Compaction contract defines what must survive. +4. Snapshot/checkpoint defines what can be recovered. +5. Work packets define what can be delegated. +6. Adaptive policy defines how much autonomy each model and mode should get. + +This order keeps the project from adding automation before the runtime can preserve facts. It also gives weak models a better user experience: fewer vague summaries, more explicit state, smaller scopes, and earlier handoff to the user when the task stops converging. + +The runtime budget plan should be treated as the first major consumer of this architecture, not as a replacement for it. Its recovery, scheduler, spill, and large-write events should feed the ledger and context health once those foundations exist. + +Implemented Deep Review should be treated as the first high-value producer of structured facts. Its manifest, reviewer roles, strategy directives, partial evidence, retry budget, and judge output should feed the ledger and compaction contract through projections, not through duplicated role definitions or a second scheduler. + +Large-source Deep Review should also treat shared review evidence as a first-class artifact. 
A parent-generated source-agnostic Review Evidence Pack is the preferred source of truth for PR URLs, local ranges, working tree diffs, patch artifacts, and explicit file snapshots; context management should preserve its refs and stale status, while reviewers consume slices instead of independently reconstructing the same source evidence. + +The implementation rule is therefore convergence first: before adding a context-side module for task packets, retry state, partial status, artifacts, budget thresholds, or UI health states, verify whether Deep Review or the runtime budget plan already owns the behavior. If an owner exists, context code should add a projection, adapter, or ledger mapping only. If no owner is sufficient, the proposal must name the new owner, migration path, rollback path, and the duplicated behavior it will retire. diff --git a/docs/deep-review-design.md b/docs/deep-review-design.md new file mode 100644 index 000000000..ec60fc57c --- /dev/null +++ b/docs/deep-review-design.md @@ -0,0 +1,1181 @@ +# Deep Review Strategy Engine & Architecture Reviewer Design + +## Overview + +This document proposes a set of improvements to BitFun's deep review system to address timeout, rate-limit, and coverage issues when reviewing large code changes with slow or rate-limited models. + +The proposal has three parts: +1. **Strategy Engine** (Proposal 2): Programmatic strategy selection, predictive timeouts, dynamic concurrency control, and partial result capture. +2. **Architecture Reviewer**: A new core reviewer role focused on structural/architectural concerns, always-on across all strategy levels. +3. **Frontend Reviewer**: A new core reviewer role focused on frontend-specific concerns (React, i18n, accessibility, state management). + +## Implementation Status + +### Status Reconciliation + +This section reflects the current implementation shape after the latest Deep Review strategy-engine commits.
Some Phase 2 work is now present as backend policy helpers, manifest parsing, prompt rules, schema/UI surfaces, or tool-level safety nets, but not all items have reached full deterministic scheduler semantics. In particular, "implemented" below means the code has a usable runtime surface; items that still rely on the DeepReview orchestrator prompt, or that are only renderable when the final report carries a structured signal, are called out separately. + +### Completed + +- Added `ReviewArchitecture` as an always-on core reviewer role. +- Added `ReviewFrontend` as a frontend-focused reviewer role in the review team model. +- Added dedicated prompts for Architecture and Frontend reviewers. +- Updated DeepReview orchestration prompt to dispatch Architecture and Frontend reviewers and to provide role-specific strategy amplification. +- Updated existing reviewer prompts to reduce overlap: + - Business Logic no longer owns UI state or layer-boundary analysis. + - Performance no longer owns React render optimization. + - Security focuses on exploitable trust-boundary risks, while structural boundaries move to Architecture. +- Updated Judge prompt with cross-reviewer overlap handling for Architecture/Business Logic, Architecture/Security, Frontend/Performance, and Frontend/Business Logic overlaps. +- Updated review team service, tests, and UI metadata for the new roles. +- Updated Settings > Review i18n so Architecture and Frontend reviewer names render in the active language. +- Updated the Agents page Code Review Team card to avoid clipped reviewer tags and present a compact role-summary layout. +- Added the initial predictive-timeout loop: the launch manifest records target file and diff line stats, derives effective reviewer/judge timeouts from strategy and target size, and the core Task tool honors that manifest policy when launching DeepReview subagents. 
+- Converted Frontend Reviewer dispatch from always-present execution to conditional execution based on changed frontend or frontend-backend contract files. +- Made backend-provided reviewer definitions the runtime source for frontend team resolution and review-agent visibility, with frontend fallback metadata only as a degraded-mode safety net. +- Added a backend `ChangeRiskFactors` structure and `auto_select_strategy()` helper as a pure policy computation. Runtime manifests now record frontend and backend-compatible recommendations, user override, final strategy, mismatch state, and mismatch severity; backend scoring remains advisory/mismatch-warning metadata and does not override the team or user-selected strategy. +- Added `DeepReviewConcurrencyPolicy` parsing and TaskTool local-cap capacity handling. Reviewer launches now bounded-wait for local reviewer capacity, expire as `CapacitySkipped`, lower a turn-local effective cap after local capacity skips, recover cautiously after successful reviewer observations, and fold capacity skips into final report reliability signals; this is not yet a staggered backend scheduler. +- Added adaptive capacity queue foundation primitives, a backend queue-state event contract, a compact frontend queue notice, local/manual queue-control surface, backend-bound queue pause/continue/cancel/optional-skip commands for local-cap waits, launch-time active-session concurrency warning, and provider transient-capacity observation. Explicit provider rate-limit/concurrency/temporary-overload reviewer failures now surface as `capacity_skipped`, lower the turn-local effective cap, and feed report reliability signals; automatic provider requeue/retry execution, user-facing effective-cap override controls, and staggered backend dispatch are not enabled. +- Added retry budget tracking, reviewer timeout retry guidance, TaskTool structured retry admission, and bounded retry-scope prompt injection. 
The visible retry ceiling uses the effective manifest policy when available; backend-owned automatic redispatch with reduced scope is not implemented. +- Added partial-timeout result preservation through a coordinator grace period when a timed-out subagent returns a usable final message before the grace window closes. +- Added per-session incremental review cache primitives, session metadata storage fields, TaskTool cache-hit read logic, completed-reviewer write-through, packet-key alignment, and hit/miss reliability signals. +- Added a shared review-subagent tooling contract so custom review agents have explicit required tools (`GetFileDiff`, `Read`) and invalid tooling is reported as `invalid_tooling` rather than silently disappearing. +- Added packet metadata fallback in `submit_code_review`, so missing reviewer `packet_id` values can be inferred from the run manifest when possible and marked as lower-confidence metadata when not. +- Added adaptive context-profile and model-capability policy support for long-running review work and weak-model handling. +- Added a reviewer applicability registry so conditional reviewer decisions are data-driven instead of hardcoded in review-team assembly. +- Added heuristic prompt-byte budget metadata: the launch manifest records per-mode prompt byte thresholds, estimated max reviewer prompt bytes, full-scope summary-first decisions, and token-budget warnings without clipping `assigned_scope` files. +- Added a compact pre-review launch summary in the Deep Review consent dialog, covering file count, risk areas, selected strategy, optional-reviewer count, summary-first state, and skipped-reviewer warnings without restoring dense lineup/cost cards. +- Added automated locale completeness and Agents page team-card layout resilience tests for review-team roles. + +### Remaining / Future Work + +The detailed execution order and per-round exit checks are tracked in `docs/deep-review-phase2-addendum.md`. 
Keep this section as the design-level summary, and update the addendum first when implementation status changes. + +- **Change risk runtime authority**: Backend-compatible scoring is now represented as advisory/mismatch-warning manifest metadata, while the final launch strategy remains the configured team strategy or explicit user override. `max_cyclomatic_complexity_delta` is still marked `not_measured`; do not make backend scoring authoritative until a measured complexity signal exists and product explicitly wants auto-selection to override user choice. +- **Backend queue/stagger scheduler**: `DeepReviewConcurrencyPolicy` exists and TaskTool bounded-waits for local reviewer-cap saturation before expiring to `CapacitySkipped`; queue state has a backend event contract and can be controlled from the action bar for local-cap waits. Turn-local effective learning exists for local-cap skips and explicit provider transient-capacity reviewer failures, but backend `staggerSeconds`, batch lifecycle management, automatic provider requeue/retry execution, and user-facing override controls are not implemented. Any future queue extension must keep using visible controls, stay timeout-separated, and must not silently consume the user's normal session concurrency. +- **Automatic retry dispatch**: Retry budget, guidance, structured retry admission, and bounded retry-scope prompt injection exist, but the backend does not automatically redispatch a timed-out reviewer with reduced scope or downgraded strategy. +- **Project-level incremental review cache**: Per-session cache read/write support is implemented and keyed by `packet_id`; cross-session/project-level persistence remains product-decision-required and deferred. Current cached reviewer outputs live only with session metadata and are deleted with that session metadata. 
+- **Shared context cache**: Frontend plan generation, prompt rules, local duplicate `Read`/`GetFileDiff` measurement, and aggregate debug diagnostics exist, but backend result reuse is not programmatically enforced. +- **Token budget enforcement**: File splitting, max-file style limits, heuristic prompt-byte estimates, and full-scope `largeDiffSummaryFirst` decisions are present in manifest policy. Hard prompt-byte clipping and byte-accurate enforcement remain deferred, and any summary-first path must keep unreviewed files visible in coverage notes/reliability signals. +- **Pre-review summary UI**: Compact launch-dialog summary is implemented. A separate dense pre-review report remains deferred unless product later needs it. +- **Work packet batched scheduling**: Frontend work packet data structure and prompt rules are complete; backend `launchBatch` / `staggerSeconds` / `batchExtrasSeparately` scheduling remains prompt-driven except for TaskTool's hard concurrency cap. +- **Conditional reviewer extensibility**: Path-domain classification and reviewer applicability rules now support the current Frontend Reviewer; future conditional reviewer families should extend the registry and add focused tests. +- **Custom reviewer quality boundary**: `GetFileDiff` + `Read` is the minimum valid review-agent tool contract; missing recommended tools such as `Grep`, `Glob`, and `LS` should remain visible as degraded review quality, not invalid configuration. +- **Compression contract integration**: `CompressionContract` structure and `From` conversion is complete; the compressor prompt already injects contract content; no additional implementation needed. +- Extend operational metrics beyond report reliability signals if runtime dashboards or telemetry are added later. +- Define retention/privacy boundaries for cached reviewer outputs, partial outputs, and evidence-ledger artifacts. 
+ +## Current State Analysis + +### Architecture + +BitFun's deep review is a **prompt-driven 5-phase orchestrator** (`DeepReview` agent) coordinating 4 always-on specialist reviewers, an optional frontend-focused reviewer, and 1 sequential judge: + +``` +Phase 1: Scope identification +Phase 2: Parallel dispatch (BusinessLogic, Performance, Security, Architecture + Frontend when applicable + extras) +Phase 3: Quality gate (ReviewJudge validates/merges findings) +Phase 4: Report synthesis (submit_code_review) +Phase 5: Optional remediation (Edit/Write/Bash) +``` + +Key components: +- **Policy layer**: `deep_review_policy.rs` - execution policy, budget tracker, file splitting +- **Task tool enforcement**: `task_tool.rs` - readonly/review/budget/timeout enforcement +- **Coordinator**: `coordinator.rs` - subagent execution with dynamic timeout adjustment +- **Frontend**: `reviewTeamService.ts` + `DeepReviewService.ts` - manifest building, session management + +### Current Strengths + +1. Defense-in-depth policy enforcement (programmatic, not just prompt) +2. Clean separation: orchestrator (write) vs reviewers (read-only) +3. Configurable strategy levels with per-member overrides +4. File splitting for large changes (threshold: 20 files, max 3 instances/role) +5. Continuation/recovery for interrupted reviews +6. 
Budget tracking with TTL-based pruning + +### Current Weaknesses + +| Dimension | Problem | Impact | +|-----------|---------|--------| +| **Orchestration determinism** | 5-phase workflow exists only in prompt text; LLM may skip phases or serialize reviewers | Weak models increase total runtime 3x | +| **Timeout strategy** | Predictive timeout is present, but partial capture only preserves output that arrives during the grace period | Some timed-out work can still be lost | +| **Dynamic concurrency** | TaskTool bounded-waits for local reviewer-cap saturation and learns a turn-local effective cap after local capacity skips or explicit provider transient-capacity reviewer failures, but there is no backend batch/stagger scheduler or automatic provider requeue | Weak orchestrator models can still mis-order batches; broader automatic waiting would be confusing unless backend-bound controls are visible | +| **Error fallback** | Retry budget, guidance, structured retry admission, and bounded retry-scope prompt injection exist, but backend-owned automatic reduced-scope redispatch is not implemented | Retry launch behavior still depends on the orchestrator model | +| **Context management** | Shared context cache is prompt-only with local duplicate Read/GetFileDiff measurement and aggregate debug diagnostics; backend result reuse is not enforced | Reviewers may duplicate IO and token usage until real-run measurements justify an interception/cache plan | +| **Strategy selection** | Frontend recommendation, backend-compatible recommendation, user override, final strategy, mismatch state, and mismatch severity are recorded as launch metadata; runtime launch still follows configured/user-selected strategy | Users may still over- or under-review, but the product now has non-blocking metadata to explain the tradeoff without silently changing token/concurrency cost | + +### Scenario Breakdown + +| Scenario | Files | Lines | Current Runtime Behavior | Remaining Concern | 
+|----------|-------|-------|------------------|---------| +| A: Small change | < 5 | < 200 | 4 always-on reviewers, optional frontend only when applicable | Can still be over-provisioned if the user chooses a deeper strategy | +| B: Medium change | 5-20 | 200-1000 | 4 always-on reviewers with predictive timeout and local-cap backpressure | Logic-heavy reviewers may still return partial output on slow models | +| C: Large change | 20-50 | 1000+ | File split can create multiple reviewer packets plus judge; local reviewer-cap waiting is bounded | Provider/adaptive queueing, backend batch/stagger scheduling, and programmatic shared context reuse remain deferred | +| D: Any + slow model | Any | Any | Predictive timeout, partial capture, and structured retry admission exist | Backend-owned retry redispatch is still prompt-guided/deferred | +| E: Any + rate limit | Any | Any | Local cap pressure is bounded and visible; explicit provider transient-capacity reviewer failures become `capacity_skipped` and lower the turn-local effective cap | Provider-side automatic queueing/retry execution is not implemented | + +## Competitive Landscape + +| Tool | Architecture | Parallelism | Large Change | Adaptive | Budget | +|------|-------------|-------------|--------------|----------|--------| +| **BitFun (current)** | 4 always-on specialists + optional Frontend + judge | Prompt-guided parallelism with local-cap backpressure | File split (threshold 20) + summary-first metadata | Advisory strategy metadata; provider/adaptive queue deferred | Per-session cache + heuristic prompt-byte metadata | +| GitHub Copilot | Single model, single pass | N/A | Skip if > ~3000 lines | None | Service-managed | +| CodeRabbit | Single model, multi-pass | Sequential passes | Chunk + summary-first | Heuristic (PR size) | Implicit (chunking) | +| Amazon CodeGuru | Detector ensemble | Parallel detectors | File-level + incremental | Static (language) | Per-detector | +| Google AutoCommenter | Single model, 
chunked | Sequential chunks | Chunk by file | Confidence threshold | Per-chunk | + +**Key insights from research:** +- No major AI tool has dedicated architecture reviewer as separate agent +- Research (Hong et al., 2024) suggests 3-5 agents with non-overlapping scopes is optimal +- Risk-based automatic strategy selection can reduce review time ~30% (Munaiah et al., 2017) +- Budget-aware dynamic reallocation improves efficiency (Wang et al., 2024) + +## Part 1: Strategy Engine (方案2) + +### 1.1 Change Risk Auto-Classification + +**Goal**: Automatically recommend strategy level based on change characteristics. + +**Implementation**: + +Add a new method to `DeepReviewExecutionPolicy`: + +```rust +/// Risk factors used for automatic strategy selection +pub struct ChangeRiskFactors { + pub file_count: usize, + pub total_lines_changed: usize, + pub files_in_security_paths: usize, // e.g. auth/, crypto/, api/ + pub max_cyclomatic_complexity_delta: usize, + pub cross_crate_changes: usize, // files in different crates +} + +impl DeepReviewExecutionPolicy { + /// Auto-select strategy level based on change risk. + /// Returns recommended level and a human-readable rationale. + pub fn auto_select_strategy(&self, risk: &ChangeRiskFactors) -> (DeepReviewStrategyLevel, String) { + let score = risk.file_count + + risk.total_lines_changed / 100 + + risk.files_in_security_paths * 3 + + risk.cross_crate_changes * 2; + + match score { + 0..=5 => (DeepReviewStrategyLevel::Quick, + format!("Small change ({} files, {} lines). Quick scan sufficient.", + risk.file_count, risk.total_lines_changed)), + 6..=20 => (DeepReviewStrategyLevel::Normal, + format!("Medium change ({} files, {} lines). Standard review recommended.", + risk.file_count, risk.total_lines_changed)), + _ => (DeepReviewStrategyLevel::Deep, + format!("Large/high-risk change ({} files, {} lines, {} security files). 
Deep review recommended.", + risk.file_count, risk.total_lines_changed, risk.files_in_security_paths)), + } + } +} +``` + +**Risk factor computation**: +- `file_count` and `total_lines_changed`: from `GetFileDiff` or `Git diff --stat` +- `files_in_security_paths`: configurable list of path patterns (e.g. `**/auth/**`, `**/crypto/**`) +- `max_cyclomatic_complexity_delta`: computed by a lightweight AST pass or heuristic +- `cross_crate_changes`: count files across different `Cargo.toml` workspaces + +**Frontend integration**: +- `reviewTeamService.ts` computes risk factors before building the manifest +- UI shows recommended strategy with rationale; user can override +- Override is persisted per-project + +### 1.2 Predictive Timeout + +**Goal**: Set per-reviewer timeout based on change size and strategy, not static defaults. + +**Current state**: +```rust +const DEFAULT_REVIEWER_TIMEOUT_SECONDS: u64 = 600; // 10 minutes +const DEFAULT_JUDGE_TIMEOUT_SECONDS: u64 = 600; // 10 minutes +``` + +**Proposed state**: +```rust +/// Base timeout per strategy (seconds) +const BASE_TIMEOUT_QUICK: u64 = 180; +const BASE_TIMEOUT_NORMAL: u64 = 300; +const BASE_TIMEOUT_DEEP: u64 = 600; + +/// Per-file overhead (seconds) +const TIMEOUT_PER_FILE: u64 = 15; +const TIMEOUT_PER_100_LINES: u64 = 30; + +impl DeepReviewExecutionPolicy { + pub fn predictive_timeout( + &self, + role: DeepReviewSubagentRole, + strategy: DeepReviewStrategyLevel, + file_count: usize, + line_count: usize, + ) -> u64 { + let base = match strategy { + DeepReviewStrategyLevel::Quick => BASE_TIMEOUT_QUICK, + DeepReviewStrategyLevel::Normal => BASE_TIMEOUT_NORMAL, + DeepReviewStrategyLevel::Deep => BASE_TIMEOUT_DEEP, + }; + + let file_overhead = file_count as u64 * TIMEOUT_PER_FILE; + let line_overhead = (line_count as u64 / 100) * TIMEOUT_PER_100_LINES; + + let raw = base + file_overhead + line_overhead; + + // Judge needs more time when there are more reviewer reports + let judge_multiplier = match role { + 
DeepReviewSubagentRole::Judge => { + let reviewer_count = CORE_REVIEWER_AGENT_TYPES.len() + self.extra_subagent_ids.len(); + 1 + (reviewer_count as u64 - 1) / 3 // +1 for every 3 reviewers + } + DeepReviewSubagentRole::Reviewer => 1, + }; + + let predicted = raw * judge_multiplier; + predicted.min(MAX_TIMEOUT_SECONDS) + } +} +``` + +**Example predictions**: + +| Change | Strategy | Files | Lines | Reviewer Timeout | Judge Timeout | +|--------|----------|-------|-------|-----------------|---------------| +| 3 files, 150 lines | Quick | 3 | 150 | 180 + 45 + 30 = 255s | 255s | +| 15 files, 800 lines | Normal | 15 | 800 | 300 + 225 + 240 = 765s -> 600s (capped) | 600s | +| 30 files, 2000 lines | Deep | 30 | 2000 | 600 + 450 + 600 = 1650s -> 1200s (capped) | 1200s x 1 = 1200s | +| 30 files, 2000 lines + 2 extras | Deep | 30 | 2000 | 1650s -> 1200s | 1650s x 2 = 2400s -> 1800s (capped) | + +**Integration**: +- Frontend computes `file_count` and `line_count` from diff before building manifest +- Passes them in the prompt block or as config fields +- `effective_timeout_seconds` is updated to call `predictive_timeout` when risk factors are available + +### 1.3 Dynamic Concurrency Control + +**Goal**: Prevent rate limit violations by controlling how many reviewers launch in parallel. + +**Original problem state**: Reviewers were launched by prompt instruction with true parallelism. With core roles, file splitting, optional Frontend, and extras, this can create many parallel LLM calls. + +**Current runtime boundary**: TaskTool now bounded-waits for local reviewer-cap saturation, emits queue state, can expire over-cap reviewer work as `CapacitySkipped`, and converts explicit provider transient-capacity reviewer failures into `capacity_skipped` with turn-local effective-cap learning. It still does not own deterministic backend batch/stagger scheduling or automatic provider requeue/retry execution. 
+ +**Proposed state**: + +Add `DeepReviewConcurrencyPolicy`: + +```rust +pub struct DeepReviewConcurrencyPolicy { + /// Maximum parallel reviewer instances at once + pub max_parallel_instances: usize, + /// Whether to stagger launches (wait N seconds between batches) + pub stagger_seconds: u64, + /// Whether to batch extras separately from core reviewers + pub batch_extras_separately: bool, +} + +impl Default for DeepReviewConcurrencyPolicy { + fn default() -> Self { + Self { + max_parallel_instances: 4, + stagger_seconds: 0, + batch_extras_separately: true, + } + } +} +``` + +**Launch strategy**: + +``` +Batch 1 (immediate): Core 4 reviewers (BL, Perf, Sec, Arch) + - If file splitting: up to 3 instances per role, but total <= max_parallel_instances + - Example: max_parallel_instances=4, 4 core roles -> 1 instance each (no splitting) + - Example: max_parallel_instances=8, 4 core roles -> 2 instances each + +Wait for Batch 1 to complete or timeout + +Batch 2 (if needed): Conditional Frontend reviewer and extra reviewers + - Only if Frontend is applicable or extras are configured, and Batch 1 completed + - Respects max_parallel_instances +``` + +**Rate limit awareness**: + +The frontend can query the current model's rate limit status (from a lightweight endpoint or cached state) and adjust `max_parallel_instances`: + +```typescript +function computeConcurrencyPolicy( + modelSlot: string, + rateLimitStatus: RateLimitStatus | null, +): DeepReviewConcurrencyPolicy { + const baseMax = 4; + if (!rateLimitStatus || rateLimitStatus.remaining > baseMax * 2) { + return { max_parallel_instances: baseMax, stagger_seconds: 0, batch_extras_separately: true }; + } + if (rateLimitStatus.remaining > baseMax) { + return { max_parallel_instances: baseMax, stagger_seconds: 5, batch_extras_separately: true }; + } + // Rate limit is tight: reduce parallelism and add stagger + return { + max_parallel_instances: Math.max(2, rateLimitStatus.remaining), + stagger_seconds: 10, + 
batch_extras_separately: true, + }; +} +``` + +**Integration with file splitting**: + +When `max_parallel_instances` is tight, file splitting should be reduced or disabled: + +```rust +pub fn effective_max_same_role_instances( + &self, + file_count: usize, + concurrency_policy: &DeepReviewConcurrencyPolicy, +) -> usize { + let role_count = CORE_REVIEWER_AGENT_TYPES.len() + self.extra_subagent_ids.len(); + let max_per_role = concurrency_policy.max_parallel_instances / role_count; + max_per_role.max(1).min(self.max_same_role_instances) +} +``` + +**Adaptive capacity queue follow-up**: + +The current runtime intentionally stops at bounded local-cap waiting plus turn-local effective-cap learning for local-cap skips and explicit provider transient-capacity failures. Broader provider/adaptive queueing is a future extension, not part of the current completed boundary. If implemented, it must be owned by the subagent runtime rather than by the DeepReview prompt alone: + +- Treat configured `max_parallel_instances` as a hard maximum and maintain a lower runtime `effective_parallel_instances` when provider or local capacity errors are observed. +- Queue only explicit transient capacity errors: provider rate limit, provider concurrency limit, explicit `Retry-After`, local subagent cap saturation, or temporary overload. Authentication, billing/quota exhaustion, invalid model, policy violation, user cancellation, invalid tooling, and validation errors must fail fast. +- Separate queue time from execution timeout. A reviewer in `QueuedForCapacity` or `PausedByUser` has not started its reviewer `timeout_seconds`; the timeout starts only after the reviewer enters `Running`. +- Surface the queue as a compact user-facing state, not as hidden waiting. Local-cap backend-driven queue notices now support backend-bound pause, continue, cancel, and optional-extra skipping. Cap adjustment and broader provider/adaptive queue controls remain future work. 
+- Preserve normal session responsiveness. Deep Review reviewer queueing must not silently consume all available subagent capacity. For broader provider/adaptive queueing, the UI should recommend pausing Deep Review or lowering strategy, and provide backend-bound controls before promising manual continuation. + +### 1.4 Partial Result Capture + +**Goal**: When a reviewer times out, preserve its last output instead of losing all work. + +**Current state**: Coordinator uses `tokio::time::timeout` which returns `Err(Timeout)` with no partial data. + +**Proposed state**: + +Modify the coordinator's `execute_subagent` to capture the last model response before timeout: + +```rust +// In coordinator.rs, around the timeout wrapping logic: +let result = if let Some(secs) = timeout_seconds.filter(|&s| s > 0) { + let timeout_future = tokio::time::timeout( + Duration::from_secs(secs), + self.run_subagent_loop(...) + ); + match timeout_future.await { + Ok(result) => result, + Err(_) => { + // Timeout fired - try to capture partial results + let partial = self.try_capture_partial_results(&session_id).await; + match partial { + Some(partial_result) => { + // Return partial result with a timeout marker + Ok(SubagentResult { + response: format!("{}", partial_result), + status: SubagentStatus::PartialTimeout, + ..partial_result + }) + } + None => Err(BitFunError::Timeout(timeout_error_message)), + } + } + } +} else { + self.run_subagent_loop(...).await +}; +``` + +**Partial result capture mechanism**: + +The subagent's dialog turns are stored in the session store. `try_capture_partial_results` would: +1. Read the subagent session's dialog turns +2. Find the last assistant message (model output) +3. Extract any findings already written in the expected format +4. Return them as a partial result + +**Frontend handling**: + +`deepReviewContinuation.ts` already detects `timed_out` status. 
It should be updated to handle `PartialTimeout`: + +```typescript +// In collectReviewerProgress: +if (toolResult.status === 'partial_timeout' || + (toolResult.error && /partial timeout/i.test(toolResult.error))) { + status = 'partial_timeout'; +} + +// In buildDeepReviewContinuationPrompt: +// Include partial findings with reduced confidence +``` + +**Judge handling**: + +The judge prompt already instructs it to handle partial results. The `ReviewJudge` should treat `partial_timeout` findings with lower confidence than `completed` findings. + +### 1.5 Retry Budget + +**Goal**: Allow each reviewer role one retry with reduced scope when it times out or fails. + +**Current state**: `DeepReviewTurnBudget` tracks: +- `reviewer_calls`: max calls per turn +- `judge_calls`: max 1 per turn + +**Proposed state**: + +Add retry tracking: + +```rust +#[derive(Debug, Clone)] +struct DeepReviewTurnBudget { + reviewer_calls: usize, + judge_calls: usize, + retries_used: HashMap, // role -> retry count + max_retries_per_role: usize, // default: 1 + updated_at: Instant, +} +``` + +**Retry logic**: + +When a reviewer times out or fails: +1. Orchestrator checks `retries_used[role] < max_retries_per_role` +2. If yes, re-dispatch with: + - Same target but reduced scope (only files not yet reviewed) + - Reduced timeout (original timeout / 2) + - Strategy downgraded one level (deep -> normal, normal -> quick) +3. Include structured `retry_coverage` so TaskTool can confirm the source packet, source status, covered files, and smaller retry scope +4. Increment `retries_used[role]` + +**Current runtime boundary**: TaskTool enforces the structured retry admission gate for retry reviewer Tasks. It rejects missing coverage, non-retryable source status, broad scope, non-lowered timeout, and exhausted retry budget, then prepends the accepted retry scope to the reviewer prompt. It does not infer coverage from free-form partial output and does not launch backend-owned automatic redispatch. 
+ +**Integration with continuation**: + +The continuation system already handles re-running failed reviewers. The retry budget adds a cap to prevent infinite loops. + +### 1.6 Summary: Strategy Engine Changes + +| Component | File | Change | +|-----------|------|--------| +| Risk classification | `deep_review_policy.rs` | Add `ChangeRiskFactors` and `auto_select_strategy` | +| Predictive timeout | `deep_review_policy.rs` | Add `predictive_timeout` method | +| Concurrency policy | `deep_review_policy.rs` | Add `DeepReviewConcurrencyPolicy` and `effective_max_same_role_instances` | +| Budget retry tracking | `deep_review_policy.rs` | Add `retries_used` and `max_retries_per_role` to `DeepReviewTurnBudget` | +| Structured retry admission | `task_tool.rs` | Require bounded `retry_coverage`, reduced retry scope, retryable source status, lower timeout, retry budget, and bounded retry-scope prompt injection before accepting retry reviewer Tasks | +| Partial result capture | `coordinator.rs` | Add `try_capture_partial_results` and `SubagentStatus::PartialTimeout` | +| Task tool enforcement | `task_tool.rs` | Apply concurrency policy and predictive timeout | +| Frontend risk computation | `reviewTeamService.ts` | Compute risk factors from diff | +| Frontend concurrency | `reviewTeamService.ts` | Build batches based on concurrency policy | +| Frontend timeout pass-through | `DeepReviewService.ts` | Pass risk factors to backend | + +## Part 2: Architecture Reviewer + +### 2.1 Rationale + +**Original gap analysis**: The earlier reviewer set covered correctness, performance, and security. The implemented team now adds Architecture as an always-on reviewer, so this section is historical rationale for why the role was added: + +1. **Module coupling / dependency direction violations** - e.g. core crate importing from desktop app +2. **Layer violations** - e.g. service layer bypassing API abstraction +3. **API contract design** - e.g. 
Tauri commands not following `snake_case` + structured request pattern +4. **Abstraction integrity** - e.g. platform-specific details escaping through shared interfaces +5. **Design pattern consistency** - e.g. new features not following established patterns +6. **Structural scalability** - e.g. changes requiring cross-cutting modifications in 5+ crates + +**Research findings**: +- No major AI code review tool has a dedicated architecture reviewer as a separate parallel agent +- Architecture concerns are typically folded into "maintainability" or "code quality" within a single reviewer +- Research (Hong et al., 2024) suggests 3-5 agents with non-overlapping scopes is optimal; 5+1 is still within the efficient range +- Google's code review culture treats "Design" as the most important dimension, but handled by the same reviewer + +**Overlap risk with Business Logic reviewer**: +- BL reviewer: "Does this call chain produce correct results?" +- Architecture reviewer: "Should this call chain exist at all? Does it respect layer boundaries?" 
+- The deep strategy already asks BL to "map full call chains" - this borders on architectural analysis but from a correctness angle + +### 2.2 Scope Definition + +**Architecture Reviewer** (`ReviewArchitecture`): + +**Covers**: +- Module boundary violations (imports that violate layer dependencies) +- API contract design (Tauri commands, tool schemas, transport messages) +- Abstraction integrity (platform-agnostic violations, bypassed interfaces) +- Structural consistency (patterns, registration conventions) +- Dependency direction (circular dependencies, wrong-direction imports) +- Cross-cutting concern impact (changes touching too many layers) + +**Explicitly excludes** (to avoid overlap): +- Business rule correctness - Business Logic reviewer +- Algorithm performance - Performance reviewer +- Security vulnerabilities - Security reviewer +- Code style/formatting - not a review dimension + +### 2.3 Activation Strategy: Always-On Across All Strategy Levels + +**Revised recommendation: Architecture Reviewer should be always-on, not deep-only.** + +Previous analysis recommended deep-only activation to minimize cost. However, further investigation reveals: + +**Why architecture review matters at every strategy level**: + +1. **Quick reviews still need architecture checks**: A 3-file change that adds `import { invoke } from '@tauri-apps/api'` directly in a React component violates the adapter pattern regardless of strategy level. Quick reviews that skip architecture miss exactly the kind of issue that is cheap to find but expensive to fix later. + +2. **Architecture violations are cheap to detect**: Unlike business logic review (which requires reading surrounding context), architecture review primarily uses `LS`, `Glob`, and `Grep` for import analysis. The token cost is significantly lower than other reviewers - estimated at 0.6-0.8x of a typical reviewer's cost. + +3. 
**Layer violations compound**: An architecture violation that slips through a quick review will be harder to catch later. The cost of missing it early is disproportionately high. + +4. **BitFun's explicit architectural rules**: The project has documented rules ("keep product logic platform-agnostic", "do not call Tauri APIs directly from UI components") that should be checked on every review, not just deep ones. + +**Cost mitigation for always-on**: + +| Strategy | Architecture Reviewer Behavior | Estimated Token Cost | +|----------|-------------------------------|---------------------| +| Quick | Only check imports directly changed by the diff | ~0.3x | +| Normal | Check diff imports + one level of dependency direction | ~0.6x | +| Deep | Map full dependency graph for changed modules | ~0.8x | + +At quick strategy, the architecture reviewer's cost is minimal because it only inspects import statements in changed files - no context reading, no call chain tracing. + +**Updated team composition**: + +| Strategy | Reviewers | Total Parallel Calls (no split) | +|----------|-----------|-------------------------------| +| Quick | BL, Perf, Sec, **Arch** | 4 | +| Normal | BL, Perf, Sec, **Arch** | 4 | +| Deep | BL, Perf, Sec, **Arch**, **Frontend** (if frontend files present) | 4-5 | + +### 2.4 Implementation + +**New files**: +- `src/crates/core/src/agentic/agents/prompts/review_architecture_agent.md` + +**Modified files**: + +1. **`deep_review_policy.rs`**: + - Add `REVIEWER_ARCHITECTURE_AGENT_TYPE: &str = "ReviewArchitecture"` + - Update `CORE_REVIEWER_AGENT_TYPES` from `[&str; 3]` to `[&str; 4]` + - Budget test updates + +2. **`review_specialist_agents.rs`**: + - Add `ArchitectureReviewerAgent` using `define_readonly_subagent!` + +3. **`registry.rs`**: + - Add `ReviewArchitecture` to `default_model_id_for_builtin_agent` - `"fast"` + - Add to `is_review_agent_entry` check + +4. 
**`deep_review_agent.md` (orchestrator prompt)**: + - Update "Team Shape" to include Architecture Reviewer as 4th mandatory role + - Add role-specific strategy amplification for Architecture + +5. **`reviewTeamService.ts` (frontend)**: + - Add `'architecture'` to `ReviewTeamCoreRoleKey` + - Add `'ReviewArchitecture'` to `ReviewRoleDirectiveKey` + - Add architecture entry to `DEFAULT_REVIEW_TEAM_CORE_ROLES` + - Add strategy directives for architecture in all three profiles + - Update `buildReviewTeamPromptBlock` + +6. **Localization**: + - Add architecture reviewer strings in en-US and zh-CN + +7. **Tests**: + - `deep_review_policy.rs` tests: Update budget calculation assertions + - `reviewTeamService.test.ts`: Update manifest building tests + +### 2.5 Cost Impact (Always-On) + +| Metric | Current (3+1) | With Architecture (4+1, always-on) | Delta | +|--------|---------------|-----------------------------------|-------| +| Parallel calls (no split) | 3 | 4 | +33% | +| Parallel calls (3 instances/role) | 9 | 12 | +33% | +| Token cost (quick) | 0.4-0.6x | 0.5-0.8x | +~0.2x | +| Token cost (normal) | 1x | 1.2-1.3x | +~0.25x | +| Token cost (deep) | 1.8-2.5x | 2.2-3.0x | +~0.4x | + +**Mitigation**: Architecture reviewer uses `fast` model slot at all strategy levels, and primarily uses `LS`/`Glob`/`Grep` (cheaper than `Read`), so actual cost increase is lower than the raw +33% parallel call increase. 
+ +## Part 3: Frontend Reviewer + +### 3.1 Rationale + +**Why a dedicated Frontend Reviewer is needed**: + +The BitFun frontend is a substantial portion of the codebase (~250+ TSX components, ~300+ TS files, 96 locale files) with domain-specific concerns that no current reviewer can effectively evaluate: + +| Concern | Current Coverage | Gap | +|---------|-----------------|-----| +| **i18n key synchronization** | None | 96 locale files across 3 languages; missing keys in one locale is a common failure | +| **React performance patterns** | Performance reviewer mentions "expensive renders" generically | Cannot identify React-specific anti-patterns: missing memo/useCallback/useMemo, inline functions in JSX, missing virtualization | +| **Accessibility** | None | Only ~40 aria/role attributes across 250+ components - severely under-covered | +| **Zustand state management** | BL reviewer might catch circular deps as "logic issue" | Cannot recognize Zustand-specific patterns: selector granularity, store dependencies, stale closures | +| **Platform boundary (frontend)** | None | ~6 files import `@tauri-apps/api` directly instead of through adapter layer | +| **Event bus contract alignment** | None | Backend events and frontend listeners must stay in sync; contract drift is invisible | +| **CSS/theme consistency** | None | ThemeService, Monaco theme sync, component library usage patterns | + +**Concrete examples that fall through the cracks**: + +1. A developer adds `t('scenes.agents.newFeature')` but only adds the key to `en-US/scenes/agents.json`, forgetting `zh-CN` and `zh-TW`. No current reviewer catches this. + +2. A developer creates a large list component without virtualization, or defines inline object/function references in JSX causing re-renders. The Performance Reviewer mentions "unnecessary re-renders" but lacks React-specific knowledge. + +3. A new modal dialog is added without `aria-labelledby`, focus trap, or keyboard navigation. 
No reviewer has accessibility in its mission. + +4. A Rust backend Tauri command changes its request/response types, but the corresponding TypeScript API client is not updated. No reviewer systematically checks frontend-backend API contract alignment. + +### 3.2 Scope Definition + +**Frontend Reviewer** (`ReviewFrontend`): + +**Covers**: +- i18n completeness and key synchronization across locales +- React performance patterns (memoization, virtualization, effect dependencies) +- Accessibility (ARIA attributes, keyboard navigation, focus management) +- State management patterns (Zustand selector granularity, store dependencies) +- Frontend-backend API contract alignment (Tauri command types, event payloads) +- Platform boundary compliance (no direct `@tauri-apps/api` outside adapter layer) +- CSS/theme consistency (ThemeService usage, component library patterns) + +**Explicitly excludes** (to avoid overlap): +- Business rule correctness - Business Logic reviewer +- Algorithm performance (non-React) - Performance reviewer +- Security vulnerabilities - Security reviewer +- Architecture (backend layer violations) - Architecture reviewer + +### 3.3 Activation Strategy: Conditional on Frontend File Presence + +The Frontend Reviewer should only activate when the change includes frontend files. This avoids wasting resources on pure-backend changes. 
+ +**Detection logic** (in the orchestrator or frontend manifest builder): + +```typescript +function hasFrontendFiles(changedFiles: string[]): boolean { + return changedFiles.some(f => + f.startsWith('src/web-ui/') || + f.startsWith('src/mobile-web/') || + f.endsWith('.tsx') || + f.endsWith('.scss') || + f.endsWith('.css') || + f.includes('/locales/') + ); +} +``` + +**Updated team composition**: + +| Strategy | Backend-only change | Change with frontend files | +|----------|--------------------|---------------------------| +| Quick | BL, Perf, Sec, Arch | BL, Perf, Sec, Arch, **Frontend** | +| Normal | BL, Perf, Sec, Arch | BL, Perf, Sec, Arch, **Frontend** | +| Deep | BL, Perf, Sec, Arch | BL, Perf, Sec, Arch, **Frontend** | + +### 3.4 Implementation + +**New files**: +- `src/crates/core/src/agentic/agents/prompts/review_frontend_agent.md` + +**Modified files**: + +1. **`deep_review_policy.rs`**: + - Add `REVIEWER_FRONTEND_AGENT_TYPE: &str = "ReviewFrontend"` + - Add `REVIEWER_FRONTEND_AGENT_TYPE` to a new `CONDITIONAL_REVIEWER_AGENT_TYPES` array + - Update budget calculation to account for conditional reviewers + +2. **`review_specialist_agents.rs`**: + - Add `FrontendReviewerAgent` using `define_readonly_subagent!` + +3. **`registry.rs`**: + - Add `ReviewFrontend` to `default_model_id_for_builtin_agent` - `"fast"` + - Add to `is_review_agent_entry` check + +4. **`deep_review_agent.md` (orchestrator prompt)**: + - Add Frontend Reviewer as conditional role: "If the change includes frontend files (src/web-ui/, .tsx, .scss, locales/), also launch ReviewFrontend in the same parallel batch." + +5. 
**`reviewTeamService.ts` (frontend)**: + - Add `'frontend'` to `ReviewTeamCoreRoleKey` + - Add `'ReviewFrontend'` to `ReviewRoleDirectiveKey` + - Add frontend entry to `DEFAULT_REVIEW_TEAM_CORE_ROLES` with `conditional: true` flag + - Add strategy directives for frontend in all three profiles + - Detect frontend files in the change and conditionally include Frontend Reviewer in manifest + +6. **Localization**: + - Add frontend reviewer strings in en-US and zh-CN + +7. **Tests**: + - `deep_review_policy.rs` tests: Update budget calculation for conditional reviewers + - `reviewTeamService.test.ts`: Test conditional inclusion logic + +### 3.5 Cost Impact (Conditional) + +| Metric | Backend-only change | Change with frontend files | +|--------|--------------------|---------------------------| +| Parallel calls (no split) | 4 (same as architecture-only) | 5 | +| Token cost (quick) | 0.5-0.8x | 0.6-1.0x | +| Token cost (normal) | 1.2-1.3x | 1.4-1.6x | +| Token cost (deep) | 2.2-3.0x | 2.6-3.5x | + +**Mitigation**: Frontend Reviewer uses `fast` model slot at all strategy levels. At quick strategy, it primarily checks i18n keys and import patterns (cheap). At deep strategy, it does thorough React analysis (more expensive but still bounded). + +## Part 4: Changes to Existing Reviewer Prompts + +Adding Architecture and Frontend reviewers requires adjustments to existing reviewer prompts to eliminate overlap and clarify boundaries. 
+ +### 4.1 Business Logic Reviewer Changes + +**Current mission items to modify**: + +| Current Item | Issue | Change | +|---|---|---| +| "partial updates that can leave data or UI in an inconsistent state" | "UI" overlaps with Frontend Reviewer's state management scope | Change to "partial updates that can leave data in an inconsistent state" - remove "or UI" since Frontend Reviewer covers React state consistency | +| Deep strategy: "map the full call chain for each changed function" | Overlaps with Architecture Reviewer's dependency analysis | Change to "map the full call chain for each changed function to verify business rules and state transitions" - explicitly scope to correctness, not structural analysis | + +**New exclusion note** (add to Review Standards section): + +```markdown +## What you do NOT review + +- Whether a call chain should exist or respects layer boundaries (Architecture Reviewer) +- React component state, i18n, or accessibility issues (Frontend Reviewer) +- Performance of specific algorithms (Performance Reviewer) +- Security vulnerabilities (Security Reviewer) +``` + +### 4.2 Performance Reviewer Changes + +**Current mission items to modify**: + +| Current Item | Issue | Change | +|---|---|---| +| "expensive renders or recomputations" | Overlaps with Frontend Reviewer's React performance scope | Change to "expensive computations on hot paths" - remove "renders" since Frontend Reviewer covers React rendering performance | +| "unnecessary re-renders" (in quick strategy efficiency rules) | Overlaps with Frontend Reviewer | Remove from Performance Reviewer; Frontend Reviewer handles React-specific render optimization | +| "oversized diffs / payloads / serialization" | Partially overlaps with Architecture Reviewer's API contract scope | Keep but clarify: "oversized payloads or serialization on data paths" - focus on runtime cost, not contract design | + +**New exclusion note**: + +```markdown +## What you do NOT review + +- React rendering 
performance or component memoization (Frontend Reviewer) +- Whether a data path respects layer boundaries (Architecture Reviewer) +- Security vulnerabilities (Security Reviewer) +- Business rule correctness (Business Logic Reviewer) +``` + +### 4.3 Security Reviewer Changes + +**Current mission items to modify**: + +| Current Item | Issue | Change | +|---|---|---| +| "trust-boundary violations" | Overlaps with Architecture Reviewer's layer boundary checks | Clarify scope: "trust-boundary violations that create exploitable security risks" - Architecture Reviewer checks structural boundaries; Security Reviewer checks exploitable ones | +| "insecure defaults" | Partially overlaps with Architecture Reviewer's API contract checks | Keep but clarify: "insecure defaults in authentication, authorization, or data handling" - scope to security-relevant defaults | + +**New exclusion note**: + +```markdown +## What you do NOT review + +- Structural layer violations without exploitable security impact (Architecture Reviewer) +- Frontend-specific security concerns like XSS in React components (Frontend Reviewer) +- Business rule correctness (Business Logic Reviewer) +- Algorithm performance (Performance Reviewer) +``` + +### 4.4 Judge Changes + +The Judge prompt needs to be updated to handle 4-5 reviewer reports instead of 3: + +**Current behavior**: Judge validates findings from 3 core reviewers. + +**Required changes**: + +1. **Update efficiency rules**: The judge already has strategy-aware efficiency rules. Update the deep strategy directive to explicitly mention cross-validation between Architecture and Business Logic findings (since these roles have the most potential overlap). + +2. 
**Add overlap detection guidance**: + +```markdown +## Overlap detection + +When multiple reviewers report findings about the same code location: +- If Architecture Reviewer flags a layer violation and Security Reviewer flags a trust-boundary issue at the same location, keep both but note the architectural root cause may address both. +- If Business Logic Reviewer flags a call chain issue and Architecture Reviewer flags the same chain as a dependency violation, the Architecture finding is the root cause; downgrade the BL finding to a symptom. +- If Frontend Reviewer flags a React performance issue and Performance Reviewer flags a general performance issue at the same component, merge into a single finding with both perspectives. +``` + +3. **Update partial timeout handling**: With more reviewers, the probability of partial timeouts increases. The judge should be instructed to handle partial results from 4-5 reviewers gracefully. + +### 4.5 Orchestrator Prompt Changes + +**Current "Team Shape" section**: + +``` +Team shape (mandatory): +- Business Logic Reviewer (ReviewBusinessLogic) +- Performance Reviewer (ReviewPerformance) +- Security Reviewer (ReviewSecurity) +- Review Quality Inspector (ReviewJudge) +``` + +**Updated "Team Shape" section**: + +``` +Team shape (mandatory): +- Business Logic Reviewer (ReviewBusinessLogic) +- Performance Reviewer (ReviewPerformance) +- Security Reviewer (ReviewSecurity) +- Architecture Reviewer (ReviewArchitecture) +- [Conditional] Frontend Reviewer (ReviewFrontend) - include only when the change contains frontend files (src/web-ui/, .tsx, .scss, locales/) +- Review Quality Inspector (ReviewJudge) +``` + +**Updated role-specific strategy amplification** - add entries for Architecture and Frontend: + +``` +- **ReviewArchitecture** + `quick`: "Only check imports directly changed by the diff. Flag violations of documented layer boundaries." 
+- **ReviewArchitecture** + `normal`: "Check the diff's imports plus one level of dependency direction. Verify API contract consistency." +- **ReviewArchitecture** + `deep`: "Map the full dependency graph for changed modules. Check for structural anti-patterns, circular dependencies, and cross-cutting concerns." +- **ReviewFrontend** + `quick`: "Only check i18n key completeness and direct platform boundary violations in changed frontend files." +- **ReviewFrontend** + `normal`: "Check i18n, React performance patterns, and accessibility in changed components. Verify frontend-backend API contract alignment." +- **ReviewFrontend** + `deep`: "Thorough React analysis: effect dependencies, memoization, virtualization. Full accessibility audit. State management pattern review. Cross-layer contract verification." +``` + +### 4.6 Frontend Strategy Directives + +Add to `REVIEW_STRATEGY_PROFILES` in `reviewTeamService.ts`: + +```typescript +// Architecture Reviewer directives +ReviewArchitecture: { + quick: 'Only check imports directly changed by the diff. Flag violations of documented layer boundaries.', + normal: "Check the diff's imports plus one level of dependency direction. Verify API contract consistency.", + deep: 'Map the full dependency graph for changed modules. Check for structural anti-patterns, circular dependencies, and cross-cutting concerns.', +}, + +// Frontend Reviewer directives +ReviewFrontend: { + quick: 'Only check i18n key completeness and direct platform boundary violations in changed frontend files.', + normal: 'Check i18n, React performance patterns, and accessibility in changed components. Verify frontend-backend API contract alignment.', + deep: 'Thorough React analysis: effect dependencies, memoization, virtualization. Full accessibility audit. State management pattern review. 
Cross-layer contract verification.', +} +``` + +### 4.7 Summary of All Prompt Changes + +| File | Change Type | Description | +|------|-------------|-------------| +| `review_business_logic_agent.md` | Modify | Remove "or UI" from partial updates; scope deep strategy to correctness; add exclusion note | +| `review_performance_agent.md` | Modify | Remove "renders" from mission; remove "unnecessary re-renders" from efficiency rules; add exclusion note | +| `review_security_agent.md` | Modify | Clarify "trust-boundary violations" scope to exploitable risks; add exclusion note | +| `review_quality_gate_agent.md` | Modify | Add overlap detection guidance for Architecture/BL and Frontend/Perf; update partial timeout handling | +| `deep_review_agent.md` | Modify | Update team shape; add Architecture and Frontend to mandatory/conditional roles; add strategy amplification entries | +| `review_architecture_agent.md` | **New** | Full prompt for Architecture Reviewer | +| `review_frontend_agent.md` | **New** | Full prompt for Frontend Reviewer | + +## Part 5: Implementation Priority + +### Completed: Reviewer Role Expansion + +1. **Architecture Reviewer (always-on)** - implemented as a core reviewer role with dedicated prompt, backend registration, frontend metadata, and strategy directives. +2. **Frontend Reviewer metadata and prompt** - implemented with dedicated prompt, backend registration, frontend metadata, settings i18n, and UI support. +3. **Existing reviewer prompt adjustments** - implemented to clarify ownership boundaries and reduce cross-role duplication. +4. **Judge overlap handling** - implemented for Architecture/Business Logic, Architecture/Security, Frontend/Performance, and Frontend/Business Logic overlap cases. +5. **UI support** - implemented for Review Team page, Settings > Review member names, Agents overview Code Review Team card, continuation/report/remediation flows, and hidden-agent filtering. +6. 
**Backend-provided reviewer definition** - implemented as a core `default_review_team_definition()` manifest surfaced through the desktop API and consumed by the frontend review team resolver. +7. **Dynamic hidden-agent derivation** - implemented for Agents overview by combining static non-review hidden IDs with backend-provided review-agent hidden IDs. + +### Current Next Phase: Strategy Engine Closure (Highest Priority) + +1. **Queue-aware backend dispatch**: Keep the current bounded local-cap wait, backend-bound local-cap queue controls, explicit provider transient-capacity skip conversion, and turn-local effective-cap learning as narrow TaskTool-owned behavior, not a full scheduler. Add automatic provider/adaptive queueing, staggered backend batches, and user-facing effective-cap override controls only in small verified rounds. The queue design must keep queue time separate from execution timeout, expose honest user controls, and avoid starving normal user session concurrency. +2. **Retry execution semantics**: Keep backend-owned redispatch prompt-guided unless automatic reduced-scope dispatch is explicitly implemented. The retry hint uses the effective manifest policy, and TaskTool now accepts retry Tasks only with structured coverage, reduced scope, retryable source status, lower timeout, available retry budget, and bounded retry-scope prompt injection. +3. **Incremental cache expansion**: Keep the implemented per-session `packet_id` cache path. Cached reviewer outputs have no independent retention period beyond session metadata, and project-level reuse must wait for explicit retention, deletion, invalidation, and user-visibility rules. + +### Current Next Phase: Dynamic Control And Governance (High Priority) + +4. **Runtime strategy authority**: Keep backend `auto_select_strategy()` as advisory/mismatch-warning metadata. 
Only revisit authoritative auto-selection after measured complexity delta exists and product explicitly accepts strategy changes that can alter token/concurrency cost. +5. **Token and context budgets**: Keep heuristic prompt-byte estimates and full-scope summary-first metadata as the current boundary. Add hard clipping or byte-accurate enforcement only after it can preserve explicit coverage for every file. +6. **Operational evidence**: Keep the implemented report reliability surfaces for partial timeouts, retry guidance, cache hits/misses, skipped reviewers, token-budget tradeoffs, and TaskTool cap rejections. Keep shared-context duplicate measurement local and non-reporting; final Deep Review submission may emit aggregate debug counts for local sampling, but real runs must show that programmatic reuse is worth the runtime complexity before adding interception or cache reuse. Add external telemetry only if product diagnostics require it. + +### Superseded Next Phase: Strategy Engine Foundation + +1. **Predictive timeout refinement** - Add real diff line-count stats and keep frontend/core timeout formulas aligned. +2. **Partial result capture** - Prevents total work loss on timeout. +3. **Conditional Frontend dispatch** - Move from metadata-level inclusion to diff-aware launch behavior so frontend review only runs when frontend files are present. + +### Superseded Next Phase: Dynamic Control + +4. **Change risk auto-classification** - Reduces misconfiguration; ~30% time savings. +5. **Dynamic concurrency control** - Prevents rate limit violations, especially important with 4-5 reviewers. +6. **Retry budget** - Improves resilience for transient failures. + +### Implementation Additions (Beyond Original Design) + +The following additions emerged during implementation as natural extensions of the original design. 
+ +#### ContextHealthSnapshot + +**Added in**: `execution_engine.rs` + +A runtime health snapshot used by the compression and context-profile subsystems to detect degraded sessions: + +```rust +struct ContextHealthSnapshot { + token_usage_ratio: f64, // current / context_window + repeated_tool_signature_count: usize, // same tool+args pattern in consecutive turns + consecutive_failed_commands: usize, // back-to-back tool errors +} +``` + +**Purpose**: The Context Profile Policy (Section 1.3) needs runtime signals to decide when to downgrade concurrency or switch to a lighter compression strategy. `ContextHealthSnapshot` provides these signals from observed turn history rather than static configuration. + +**Integration points**: +- `context_profile.rs` uses the snapshot to adjust `LongTask` profile concurrency limits when `repeated_tool_signature_count > 2` or `consecutive_failed_commands > 1`. +- The compression subsystem uses `token_usage_ratio` to decide between model-based and fallback compression. + +#### ModelCapabilityProfile + +**Added in**: `context_profile.rs` + +A lightweight model capability classifier used to adapt review behavior for weaker models: + +```rust +enum ModelCapabilityProfile { + Standard, // full-featured models + Weak, // models with limited reasoning (detected by id heuristic) +} +``` + +**Detection heuristic**: Matches model id against known weak-model suffixes (`haiku`, `mini`, `flash`, etc.). + +**Purpose**: Weak models require different concurrency and context strategies (lower parallel reviewer count, smaller per-reviewer context windows, reduced file-splitting). This is a runtime complement to the user-configured strategy level. + +**Integration points**: +- `context_profile.rs` reduces `max_parallel_reviewers` for `Weak` models. +- `deep_review_policy.rs` can lower predictive timeout multipliers for weak models (future work). 
+ +#### Extended Review Target Path Classification + +**Added in**: `reviewTargetClassifier.ts` + +The original design defined a simple `hasFrontendFiles()` boolean check. The implementation extends this to a multi-domain path classification system with 15+ tag rules: + +| Domain Tag | Path Patterns | Purpose | +|---|---|---| +| `frontend_ui` | `src/web-ui/src/**`, `*.tsx` | Frontend UI components | +| `frontend_style` | `*.scss`, `*.css` (in web-ui) | Frontend styling | +| `frontend_i18n` | `**/locales/**` | Internationalization files | +| `frontend_contract` | `src/apps/desktop/src/api/**` | Frontend-backend API surface | +| `desktop_contract` | `src/apps/desktop/**` | Desktop-specific integration | +| `backend_core` | `src/crates/core/**` | Core Rust logic | +| `api_layer` | `src/crates/api-layer/**` | API abstraction layer | +| `transport` | `src/crates/transport/**` | Transport adapters | + +**Purpose**: Fine-grained classification enables: +1. More accurate `recommendReviewStrategyForTarget()` scoring (e.g., `contractSurfaceChanged` flag). +2. Conditional reviewer activation beyond just Frontend by extending the reviewer applicability registry (e.g., future backend-only optimizations). +3. Pre-review summary with workspace area breakdown. + +**Backward compatibility**: The simple `hasFrontendFiles()` check is derived from the tags: `target.tags.includes('frontend_ui') || target.tags.includes('frontend_style') || target.tags.includes('frontend_i18n')`. + +### Future Role Extensibility Improvements + +7. **Locale completeness checks** - Add tests that fail when a core role is missing translations in `scenes/agents.json` or `settings/review.json`. +8. **Card layout resilience tests** - Add visual or DOM-level tests ensuring role summary cards do not clip content when role count grows. + +### Advanced (Lower Priority) + +13. 
**Shared context cache** - Programmatic reuse remains deferred; current runtime measures duplicate `Read`/`GetFileDiff` calls and emits aggregate local debug diagnostics at report submission. +14. **Incremental review caching** - Per-session packet cache is implemented; project-level follow-up reuse remains product-decision-required. + +## Verification + +| Change Type | Verification Command | +|-------------|---------------------| +| Rust policy changes | `cargo test -p bitfun-core deep_review -- --nocapture` | +| Rust coordinator changes | `cargo test -p bitfun-core coordination -- --nocapture` | +| Frontend service changes | `pnpm run type-check:web && pnpm --dir src/web-ui run test:run` | +| Full integration | `cargo build -p bitfun-desktop` + manual deep review test | + +## Appendix A: Architecture Reviewer Prompt Draft + +```markdown +# Role + +You are an **independent Architecture Reviewer** for BitFun deep reviews. + +{LANGUAGE_PREFERENCE} + +You work in an isolated context. Treat this as a fresh review. Do not assume the main agent or other reviewers are correct. 
+ +## Mission + +Inspect the requested review target and find **structural and architectural issues** such as: + +- module boundary violations (imports that cross layer boundaries) +- API contract design problems (inconsistent patterns, breaking changes) +- abstraction integrity issues (platform-specific details leaking through shared interfaces) +- dependency direction violations (circular dependencies, wrong-direction imports) +- structural consistency (patterns, registration conventions not followed) +- cross-cutting concern impact (changes that require touching too many layers) + +## What you do NOT review + +- Business rule correctness (Business Logic reviewer handles this) +- Algorithm performance (Performance reviewer handles this) +- Security vulnerabilities (Security reviewer handles this) +- React component state, i18n, or accessibility (Frontend Reviewer handles this) +- Code style or formatting + +## Tools + +Use only read-only investigation: + +- `GetFileDiff` +- `Read` +- `Grep` +- `Glob` +- `LS` +- `Git` with read-only operations only + +Never modify files or git state. + +## Review standards + +- Confirm the violation before reporting. Cite the specific architectural rule or convention being violated. +- Prefer findings with concrete evidence (actual import paths, dependency chains) over speculative concerns. +- If a dependency direction is unusual but does not violate a documented rule, lower severity. + +## Efficiency rules + +- Start by understanding the module structure. Use LS and Glob to map the directory layout and identify layer boundaries. +- Focus on imports and cross-module references. Use Grep to trace import patterns rather than reading full files. +- Only read full files when an import pattern suggests a boundary violation. +- When you have confirmed or dismissed an architectural concern, move on. Do not re-examine the same module from different angles. 
+- Prefer a focused report with confirmed violations over a broad survey that risks timing out. +- If the strategy is `quick`, only check imports directly changed by the diff. Flag violations of documented layer boundaries. +- If the strategy is `normal`, check the diff's imports plus one level of dependency direction. Verify API contract consistency. +- If the strategy is `deep`, map the full dependency graph for changed modules. Check for structural anti-patterns, circular dependencies, and cross-cutting concerns. + +## Output format + +Return markdown only, using this exact structure: + +## Reviewer +Architecture Reviewer + +## Verdict +clear | issues_found + +## Findings +- `[severity=] [certainty=] file:line - title` + Architectural rule violated: ... + Why it matters: ... + Suggested fix direction: ... + +If there are no confirmed or likely issues, write exactly: + +- No architectural issues found. + +## Reviewer Summary +2-4 sentences summarizing the structural health of the change. + +If there is nothing meaningful to summarize, write exactly: + +- Nothing to summarize. +``` + +## Appendix B: Frontend Reviewer Prompt Draft + +```markdown +# Role + +You are an **independent Frontend Reviewer** for BitFun deep reviews. + +{LANGUAGE_PREFERENCE} + +You work in an isolated context. Treat this as a fresh review. Do not assume the main agent or other reviewers are correct. 
+ +## Mission + +Inspect the requested review target and find **frontend-specific issues** such as: + +- i18n key synchronization problems (missing keys in one or more locales) +- React performance anti-patterns (missing memoization, unnecessary re-renders, missing virtualization) +- Accessibility violations (missing ARIA attributes, keyboard navigation, focus management) +- State management issues (Zustand selector granularity, store dependency problems, stale closures) +- Frontend-backend API contract drift (Tauri command type mismatches, event payload changes without frontend updates) +- Platform boundary violations in frontend (direct @tauri-apps/api imports outside the adapter layer) +- CSS/theme consistency issues (ThemeService misuse, component library pattern violations) + +## What you do NOT review + +- Business rule correctness (Business Logic reviewer handles this) +- Non-React algorithm performance (Performance reviewer handles this) +- Security vulnerabilities (Security reviewer handles this) +- Backend architectural issues (Architecture reviewer handles this) +- Code style or formatting + +## Tools + +Use only read-only investigation: + +- `GetFileDiff` +- `Read` +- `Grep` +- `Glob` +- `LS` +- `Git` with read-only operations only + +Never modify files or git state. + +## Review standards + +- Confirm the issue before reporting. Show the specific code that has the problem. +- For i18n issues: verify that a key exists in one locale but is missing in another. +- For React performance issues: explain the concrete performance impact, not just the pattern violation. +- For accessibility issues: reference WCAG guidelines where applicable. +- If a pattern is unusual but functional, lower severity. + +## Efficiency rules + +- Start from the diff. Identify changed frontend files (.tsx, .ts, .scss, locale JSON). +- For i18n: use Grep to find all `t('...')` calls in changed files, then check each key across all locale files. 
+- For React performance: check changed components for common anti-patterns (inline functions in JSX, missing keys, missing memo). +- For accessibility: check changed components for ARIA attributes, keyboard handlers, and focus management. +- For API contracts: compare changed Tauri command types with corresponding TypeScript API clients. +- When you have confirmed or dismissed a frontend concern, move on. Do not re-examine the same component from different angles. +- Prefer a focused report with confirmed issues over a broad survey that risks timing out. +- If the strategy is `quick`, only check i18n key completeness and direct platform boundary violations in changed frontend files. +- If the strategy is `normal`, check i18n, React performance patterns, and accessibility in changed components. Verify frontend-backend API contract alignment. +- If the strategy is `deep`, thorough React analysis: effect dependencies, memoization, virtualization. Full accessibility audit. State management pattern review. Cross-layer contract verification. + +## Output format + +Return markdown only, using this exact structure: + +## Reviewer +Frontend Reviewer + +## Verdict +clear | issues_found + +## Findings +- `[severity=] [certainty=] file:line - title` + Why it matters: ... + Suggested fix: ... + +If there are no confirmed or likely issues, write exactly: + +- No frontend issues found. + +## Reviewer Summary +2-4 sentences summarizing the frontend health of the change. + +If there is nothing meaningful to summarize, write exactly: + +- Nothing to summarize. +``` diff --git a/docs/deep-review-phase2-addendum.md b/docs/deep-review-phase2-addendum.md new file mode 100644 index 000000000..b890d7850 --- /dev/null +++ b/docs/deep-review-phase2-addendum.md @@ -0,0 +1,582 @@ +# Deep Review Phase 2 Addendum Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. 
Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Close the semantic gaps found after comparing the latest Deep Review implementation with `docs/deep-review-design.md` and `docs/deep-review-phase2-plan.md`. + +**Architecture:** Keep the existing prompt-driven DeepReview orchestrator, but move the riskiest guarantees into deterministic runtime boundaries where the current documents overstate completion. The addendum does not expand Deep Review into a new scheduler architecture; it tightens cache correctness, launch backpressure, retry semantics, observability, and documentation truthfulness. + +**Tech Stack:** Rust core (`bitfun-core`), TypeScript React frontend (`src/web-ui`), Vitest, Cargo tests. + +--- + +## Source Documents + +- `docs/deep-review-design.md`: original strategy-engine and reviewer-role design. +- `docs/deep-review-phase2-plan.md`: Phase 2 implementation plan and status table. +- This addendum only covers gaps discovered after the latest implementation. It does not replace either source document. 
+ +## Current Truth Model + +| Area | Current code truth | Addendum stance | +|---|---|---| +| Conditional Frontend Reviewer | Diff/path classification and a reviewer applicability registry drive conditional manifest behavior | Keep as implemented for `ReviewFrontend`; add future conditional reviewers by extending the registry instead of scattering tag checks | +| Custom review subagents | Required tooling is centralized and invalid agents are skipped with `invalid_tooling`; recommended tools are tracked separately as degraded-readiness guidance | Keep as implemented; document `GetFileDiff` + `Read` as the minimum valid set, not the full-quality review toolset | +| Strategy authority | Launch manifests record the frontend recommendation, backend-compatible policy recommendation, user override, final strategy, mismatch state, and mismatch severity | Keep backend scoring advisory/mismatch-warning only; final strategy remains the configured or user-selected value, and `max_cyclomatic_complexity_delta` is explicitly `not_measured` | +| Dynamic concurrency | TaskTool enforces reviewer capacity, bounded-waits for local reviewer cap saturation, converts expired waits and explicit provider transient-capacity reviewer failures to `CapacitySkipped`, supports backend-bound pause/continue/cancel/optional-skip controls for local-cap waits, and maintains turn-local effective concurrency learning for local capacity skips, provider transient-capacity failures, and successful reviewer observations; capacity-error classification, queue-state events, a compact queue notice, local/manual queue controls, and active-session concurrency warning exist | Keep automatic provider/adaptive queueing, staggered backend dispatch, and user-facing override controls deferred; any future queue extension must reuse visible controls, stay timeout-separated, and remain isolated from normal user session concurrency | +| Retry | Budget tracking, retry guidance, and TaskTool structured retry admission 
exist. A retry reviewer Task must include `retry_coverage`, use `retry: true`, target only `partial_timeout` or explicit transient `capacity_skipped` sources, reduce scope, lower timeout, and receives a bounded retry-scope prompt block. Backend-owned automatic redispatch remains deferred | Keep backend redispatch deferred; accept only bounded structured retry calls and fall back to prompt/user decision when structured coverage is missing | +| Partial timeout | Grace-period final-message capture exists | Keep limitation documented; do not claim arbitrary stream-fragment capture | +| Incremental cache | Per-session cache struct, metadata field, `packet_id` read path, completed-reviewer write-through, and hit/miss report signals exist | Keep project-level cache product-decision-required and deferred | +| Token budget | File-split/max-file style guardrails, heuristic per-reviewer prompt-byte estimates, summary-first full-scope decisions, and token-budget warnings exist in the launch manifest | Keep hard prompt clipping and generated byte-accurate prompt enforcement deferred; summary-first must never remove `assigned_scope` files silently | +| Shared context cache | Prompt-only reuse plus local duplicate Read/GetFileDiff measurement | Keep interception/cache reuse deferred unless measured duplicate cost becomes a bottleneck | +| Observability | User/report-facing reliability summaries exist for cache, skipped-reviewer, token-budget, partial-timeout, retry, and runtime cap rejections | Keep external telemetry deferred unless diagnostics require it | +| First-run consent dialog | Dialog is compact, localized, preserves strategy choice, and shows skipped-reviewer warnings only when present | Keep as implemented; future changes must preserve low-density copy, theme stability, and locale coverage | + +## Status Wording Guide + +Use these labels consistently when comparing code and documents: + +| Status wording | Meaning | Current examples | +|---|---|---| +| 
**Runtime-complete** | Deterministic behavior exists in code and is covered by focused tests. | Per-session packet-id cache read/write, invalid custom reviewer reporting, compact localized first-run dialog, `concurrency_limited` cap-to-report propagation, reviewer applicability registry, advisory strategy-decision metadata, prompt-byte budget estimate metadata, capacity-error classifier, queue-state timer separation primitives, backend queue-state event contract, compact queue notice, backend-bound local-cap queue controls, active-session concurrency warning, bounded local reviewer-cap queue expiry, turn-local effective concurrency learning for local capacity skips and explicit provider transient-capacity failures, structured retry admission gate, shared-context duplicate tool-use measurement | +| **Safety net** | Runtime blocks or protects an unsafe action, but does not provide the smoother product behavior described by the original design. | Configured concurrency hard ceiling and judge launch validation | +| **Prompt-guided** | The manifest or prompt tells the orchestrator what to do; a weak model can still miss or mis-sequence the step. | Backend-owned retry dispatch, work-packet batch ordering, shared context reuse | +| **Schema/UI-ready** | The report schema and UI can represent a signal when present, but runtime does not guarantee automatic emission. | Use only for future schema-first report additions; no current high-priority gap remains in this category | +| **Scoped follow-up** | Product and runtime boundaries are specified, but implementation must land in small verified rounds. | Provider/adaptive capacity queueing, provider-side effective concurrency learning, user-facing override controls | +| **Product-decision-required** | Runtime work must wait for an explicit product/privacy decision, not just engineering capacity. 
| Project-level cache and cross-session reviewer-output persistence | +| **Deferred** | Intentionally outside the current implementation boundary. | Hard prompt-byte clipping/enforcement, programmatic shared Read/GetFileDiff cache, backend-owned DAG scheduler | + +## Current Deviations And Risks + +| Deviation / risk | Current boundary | Why it matters | Follow-up trigger | +|---|---|---|---| +| Minimum review tooling can be mistaken for quality tooling | `GetFileDiff` + `Read` is enough to run; missing `Grep`/`Glob`/`LS` is only degraded | A custom reviewer can pass validation but produce shallow review output | Keep UI/report wording explicit: invalid means cannot run; degraded means can run with lower investigative depth | +| Queue behavior can confuse users if it is automatic-only | Local reviewer cap saturation now waits within `max_queue_wait_seconds`; backend event-driven local-cap waits show a visible notice and support pause, continue, cancel, and optional-extra skip controls | Queueing improves completion but can look like a stuck review if the user cannot see or change it | Keep event-driven waits compact, localized, backend-controlled, and scoped to local-cap waits until provider/adaptive queueing has separate runtime semantics | +| Deep Review can compete with normal session concurrency | Local cap waiting does not start reviewer execution until capacity is acquired, launch surfaces warn when the target session already has high active Task/subagent activity, and explicit provider transient-capacity reviewer failures lower the turn-local effective cap; automatic provider/adaptive queueing is not implemented | A large review should not silently consume capacity needed by the user's active session work | Future scheduler must reserve or deprioritize Deep Review capacity and reuse the warning plus backend-bound queue controls | +| Strategy advice can feel like hidden override pressure | Strategy mismatch is recorded in manifest/prompt metadata, but the 
final launch strategy remains the team default or explicit user override | If the advisory policy silently changes reviewer roster or strategy, users may lose control of token/concurrency tradeoffs | Keep backend-compatible scoring non-blocking, avoid roster expansion from mismatch metadata, and only surface concise report/launch notes when useful | +| Token budget can under-review silently | Prompt-byte pressure now enables a `summary_first_full_scope` decision, but it keeps every `assigned_scope` file visible | If budget logic clips files without reporting, the judge may overtrust incomplete coverage | Keep summary-first as orientation only; any uncovered file must be reported in coverage notes/reliability signals | +| Capacity-error classification can be too broad | Current code classifies capacity causes, uses the local-cap path for bounded waiting, and converts explicit provider transient-capacity reviewer failures to `capacity_skipped`; provider capacity errors are not automatically requeued yet | Misclassifying auth, quota, invalid model, cancellation, or tooling errors as queueable can cause endless waiting | Queue/skip only explicit transient/capacity failures such as rate limit, provider concurrency limit, temporary overload, or local cap saturation | +| Per-session cache retention semantics are still product-light | Cache stays session-scoped; project-level persistence is deferred | Reviewer outputs can contain sensitive code findings, and future persistence needs deletion/retention rules | Define privacy, invalidation, and deletion semantics before cross-session/project cache | + +## Follow-up Work Decision + +| Decision | Status | Rationale | +|---|---|---| +| Cap-to-report reliability propagation | **Closed, runtime-complete** | TaskTool records `deep_review_concurrency_cap_reached` on the turn, and `submit_code_review` folds the count into a `concurrency_limited` runtime reliability signal. 
| +| Retry guidance config alignment | **Closed, runtime-complete for guidance text** | Retry remains prompt-guided by design, but the visible retry ceiling now uses the effective manifest policy when available. | +| Reviewer applicability registry | **Closed, runtime-complete for manifest construction** | Current conditional reviewer behavior now goes through `reviewTargetClassifier.ts` registry rules; future conditional reviewers should extend that registry. | +| Adaptive capacity queue | **Partial runtime-complete for local cap and provider transient-capacity observation; scoped follow-up for automatic provider/adaptive behavior** | Capacity-error classification, queue-state timer primitives, backend queue-state event contract, compact queue notice, active-session warning, bounded local reviewer-cap waiting, backend-bound local-cap queue controls, explicit provider transient-capacity skip conversion, and turn-local effective cap learning after local capacity skips or provider transient-capacity failures are complete. Automatic provider requeue/retry execution, staggered backend dispatch, and user-facing override controls remain follow-up work. | +| Runtime strategy authority and complexity signal | **Closed, runtime-complete for advisory metadata; complexity measurement deferred** | Launch manifests now record frontend and backend-compatible recommendations, user override, final strategy, mismatch state, and severity. Backend scoring is mismatch-warning metadata only and does not override user/team strategy. `max_cyclomatic_complexity_delta` remains explicitly `not_measured` until a measurable signal exists. 
| +| Token budget byte enforcement | **Closed, runtime-complete for heuristic estimate and full-scope summary-first metadata; hard clipping deferred** | Launch manifests now estimate max reviewer prompt bytes, record configured per-mode byte thresholds, enable `summary_first_full_scope` only when the estimate exceeds the threshold, and keep file-splitting guardrails separate. Warnings/decisions feed existing context-pressure reliability behavior. Byte-accurate prompt clipping and mandatory summarization remain deferred. | +| Automatic retry redispatch | **Partial runtime-complete for structured retry admission; scoped follow-up for backend-owned redispatch** | TaskTool now rejects retry calls without structured coverage, non-retryable source status, broad scope, non-lowered timeout, or exhausted retry budget, and prepends the accepted retry scope to the reviewer prompt. The orchestrator still has to issue the retry Task; backend-owned redispatch remains deferred. | +| Pre-review summary UI | **Closed, runtime-complete for compact consent summary; richer preview deferred** | The launch dialog now shows concise file count, risk areas, reviewer-call count, optional-reviewer count, selected strategy, summary-first marker, and skipped-reviewer warnings without restoring dense cost/time cards or full reviewer lineups. | +| Programmatic shared context cache | **Deferred pending measured need** | Prompt rules, local duplicate Read/GetFileDiff measurement, and aggregate debug diagnostics exist. Tool-result interception remains a separate deep runtime change. | +| Project-level cache and retention policy | **Product-decision-required / deferred** | Current production boundary is per-session only; cross-session persistence needs explicit retention, deletion, and visibility approval. 
| +| First-run consent dialog and invalid custom reviewer reporting | **No immediate follow-up** | Current code and docs now agree: the dialog is compact/localized, and invalid review agents surface as `invalid_tooling`. | + +## Scope Boundaries + +In scope: + +- Correct documentation claims so they match current code. +- Keep first-run Deep Review consent compact so users see only the key launch reminders and all visible strings remain localized. +- Close incremental review cache correctness for per-session continuation. +- Specify a deterministic backpressure path for DeepReview reviewer launch caps if hard rejection proves too brittle. +- Preserve normal user session responsiveness: Deep Review queueing must not silently consume all available subagent capacity, and high session concurrency should produce a clear pause/continue choice instead of hidden waiting. +- Make retry status explicit: structured retry admission is runtime-enforced, while backend-owned redispatch remains prompt-guided/deferred. +- Add metrics/report surfaces for cap rejection, partial timeout, retry, cache hit/miss, skipped reviewers, and token-budget decisions. + +Out of scope: + +- Project-level cross-session review cache. +- Programmatic shared Read/GetFileDiff cache. +- Hard prompt-byte clipping or byte-accurate prompt enforcement. +- Large diff summary generation as a mandatory pre-review step. +- Replacing the prompt-driven DeepReview orchestrator with a full backend-owned DAG scheduler. +- Implementing automatic adaptive queueing in the current UI/control round. 
+ +## Risk Register + +| Risk | Scenario | Mitigation | +|---|---|---| +| Hard cap rejection causes missed reviewers | Weak orchestrator launches too many reviewers and does not recover from tool error | Cap rejection is now surfaced in report reliability notes; add queue/wait behavior only if coverage loss persists | +| Queue state feels like a hang | Reviewers wait for capacity without clear progress | Show `QueuedForCapacity`, queue position/reason when available, and backend-bound local-cap pause/continue/cancel/optional-skip controls before provider/adaptive queueing | +| Queue time is counted as execution time | A reviewer times out before it actually starts running | Track `queued_at`, `started_at`, `queue_elapsed_ms`, and `run_elapsed_ms`; apply reviewer timeout only after `Running` starts | +| Deep Review starves active user work | A large review consumes all subagent capacity while the user keeps working in the same session | Treat Deep Review reviewer capacity as lower priority or separately reserved; warn when active session concurrency is high and let the user pause Deep Review then continue manually | +| Non-capacity errors are requeued | Auth, billing, invalid model, policy, cancellation, or invalid tooling errors are mistaken for transient capacity | Maintain a narrow capacity-error classifier and fail fast for non-queueable errors | +| Cache returns stale review output | Fingerprint misses strategy, roster, renamed path, or model changes | Use `packet_id` keys and invalidate on target files, reviewer roster, strategy, model, and relevant manifest changes | +| Cache key mismatch | Writer stores by `packet_id`, reader checks `subagent_type` | Resolve packet id from Task description/run manifest before reading cache | +| Retry loops grow token cost | Model retries broad scope repeatedly | Enforce max retries per role, require `retry: true`, structured `retry_coverage`, reduced `retry_scope_files`, retryable source status, lower timeout, and prepend 
bounded retry scope to the reviewer prompt | +| Partial output is overtrusted | Timed-out reviewer emits incomplete analysis | Preserve `partial_timeout` and reliability signals; judge/report treats it as lower confidence | +| Observability adds noise | Reports become too dense | Keep raw counters in metadata and show only summarized user-facing notes | +| Privacy retention is unclear | Cached reviewer output stores sensitive code findings | Keep first closure per-session only; document retention and deletion semantics before project-level cache | + +## File Map + +Documentation: + +- `docs/deep-review-design.md`: source-of-truth design status and remaining work. +- `docs/deep-review-phase2-plan.md`: reconciled Phase 2 implementation status. +- `docs/deep-review-phase2-addendum.md`: this addendum and follow-up implementation plan. + +Rust core: + +- `src/crates/core/src/agentic/deep_review_policy.rs`: policy helpers, concurrency policy, retry budget, incremental cache helpers. +- `src/crates/core/src/agentic/tools/framework.rs`: low-noise shared-context duplicate measurement for reviewer `Read`/`GetFileDiff` calls. +- `src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs`: DeepReview reviewer parent-turn metadata propagation into tool-use context. +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs`: reviewer launch enforcement, cap-rejection tracking, cache hit lookup, retry guidance, structured retry admission, and DeepReview reviewer context tagging. +- `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs`: normalized reviewer packet metadata, cap-to-report reliability folding, and completed-reviewer cache write-through boundary. +- `src/crates/core/src/agentic/coordination/coordinator.rs`: subagent execution, timeout/partial-result handling, possible future queue/wait boundary. 
+- Future adaptive queue boundary: runtime-owned subagent scheduling near `coordinator.rs` / `task_tool.rs`, with Deep Review consuming queue policy rather than owning global session capacity.
+- `src/crates/core/src/service/session/types.rs`: session metadata fields for run manifest and cache data.
+- `src/crates/core/src/agentic/persistence/manager.rs`: session metadata persistence.
+
+Frontend:
+
+- `src/web-ui/src/shared/services/reviewTeamService.ts`: manifest construction, work packets, token budget plan, skipped reviewer reporting.
+- `src/web-ui/src/shared/services/reviewTargetClassifier.ts`: review target classification and reviewer applicability registry.
+- `src/web-ui/src/shared/services/reviewSubagentCapabilities.ts`: custom review subagent tool contract.
+- `src/web-ui/src/flow_chat/utils/codeReviewReport.ts`: report formatting for packet, partial, cache, and reliability metadata.
+- `src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.tsx`: user-facing run strategy override and launch summary.
+
+## Implementation Rounds
+
+### Round 1: Documentation Reconciliation
+
+**Goal:** Make the docs truthful before more code changes.
+
+- [x] Update `docs/deep-review-design.md` so completed/remaining work distinguishes complete runtime behavior from safety-net or prompt-guided behavior.
+- [x] Update `docs/deep-review-phase2-plan.md` so Phase B and Phase C are marked partial where the code has cap/guidance/read-path behavior but not full scheduler/retry/cache semantics.
+- [x] Add this addendum with scope boundaries, risk register, file map, and follow-up rounds.
+
+Verification:
+
+- `rg -n "PARTIAL|Safety net|write-through|queue|redispatch|prompt-guided" docs/deep-review-design.md docs/deep-review-phase2-plan.md docs/deep-review-phase2-addendum.md`
+- Expected: the three docs explicitly describe partial completion and do not claim queued dispatch, automatic retry, or cache write-through as complete.
+ +### Round 2: First-Run Consent Dialog I18n And Density + +**Goal:** Keep the first Deep Review confirmation dialog useful without making it feel like a full report. + +**Current status wording:** Runtime-complete for the compact localized dialog. Future changes should preserve the reduced information density instead of restoring cost/time cards or full reviewer lineups. + +- [x] Replace the two cost/time fact cards and active reviewer chip list with a compact reminder area. +- [x] Keep only these visible decisions and warnings: + - Deep Review can take longer and use more tokens than a standard review. + - The first pass is read-only. + - The selected run strategy can be changed before launch. + - Skipped reviewers are shown only when there are skipped reviewers. +- [x] Keep strategy override behavior unchanged, including project-level persistence when the user confirms. +- [x] Add missing locale keys for skipped reviewer reasons in `en-US`, `zh-CN`, and `zh-TW`. +- [x] Update dialog tests so they assert compact summary behavior and no longer require active reviewer names to be shown. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/flow_chat/components/DeepReviewConsentDialog.test.tsx` +- Expected: the dialog still opens when skipped reviewers exist, skipped reason text is localized through locale keys, and strategy override persistence still works. + +### Round 3: Incremental Cache Correctness Closure + +**Goal:** Make per-session incremental review cache safe enough to claim as implemented. + +- [x] Add tests in `src/crates/core/src/agentic/deep_review_policy.rs` for cache invalidation on fingerprint mismatch and for packet keys that include split-review suffixes such as `reviewer:ReviewSecurity:group-1-of-3`. +- [x] Add TaskTool tests in `src/crates/core/src/agentic/tools/implementations/task_tool.rs` for resolving cache keys from the Task description pattern `[packet reviewer:ReviewSecurity]` and from unique run-manifest work packets. 
+- [x] Change TaskTool cache lookup from `subagent_type` to resolved `packet_id`, with no cache hit when multiple packets match and no packet id is present. +- [x] Add a write-through boundary after `submit_code_review` normalizes reviewer packet metadata. Store only reviewers with `status=completed` and a non-empty `packet_id`. +- [x] Preserve per-session scope only. Do not add project-level storage in this round. + +Verification: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- Expected: cache helper tests pass, ambiguous packet lookup does not produce a false cache hit, and completed reviewer output can be written and read by the same packet id. + +### Round 4: Launch Backpressure And Retry Semantics + +**Goal:** Remove model-dependent ambiguity around concurrency cap and retry behavior without replacing the whole orchestrator. + +- [x] Add a focused DeepReview TaskTool test showing current cap rejection behavior for excess reviewer launches. +- [x] Decide implementation mode: + - Preserved rejection for now and kept the existing structured `code`/`message` tool-error payload. + - Deferred bounded backpressure until real launch failures show hard rejection is too brittle. +- [x] Keep `staggerSeconds` as a documented future scheduler field unless this round implements actual wait spacing. +- [x] For retry, keep backend-owned redispatch prompt-guided. Round 14 adds structured retry admission for model-issued retry Tasks and requires `retry: true`, lower timeout, reduced assigned scope, and structured coverage. +- [x] Align retry guidance wording with the effective manifest policy so non-default `maxRetriesPerRole` values are reflected in the visible hint. + +Verification: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `cargo test -p bitfun-core coordination -- --nocapture` +- Expected: cap behavior is deterministic and tested; retry status is prompt-guided for backend redispatch, while structured retry admission is covered by tests. 
+ +### Round 5: Observability, Report Surfacing, And UI Stability + +**Goal:** Make runtime tradeoffs visible without making the Deep Review report noisy. + +- [x] Add structured counters or report metadata for cache hit/miss, concurrency cap rejection, partial timeout, retry guidance, skipped reviewers, and token-budget skipped extras. +- [x] Fold TaskTool `deep_review_concurrency_cap_reached` errors into final report `concurrency_limited` reliability signals through turn-local runtime tracking. +- [x] Update `src/web-ui/src/flow_chat/utils/codeReviewReport.ts` to summarize only meaningful reliability signals in exported Markdown. +- [x] Keep dense details collapsed or metadata-only in UI components so report readability does not regress. +- [x] Confirm all new user-facing strings are localized in `src/web-ui/src/locales/en-US/flow-chat.json`, `zh-CN`, and `zh-TW` when UI text is added. + +Verification: + +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` +- Expected: locale coverage remains complete, report formatting tests cover the new summary rows, and no layout text overflow is introduced. + +### Round 6: Final Verification And Documentation Freeze + +**Goal:** Prove code and docs still match after the addendum work. + +- [x] Re-read `docs/deep-review-design.md`, `docs/deep-review-phase2-plan.md`, and this addendum against the implemented code paths. +- [x] Update only status wording if implementation choices differ from this plan. +- [x] Run the smallest complete verification set for touched areas. + +Verification: + +- For Rust-only changes: `cargo test -p bitfun-core deep_review -- --nocapture` +- For frontend/report changes: `pnpm run lint:web && pnpm run type-check:web && pnpm --dir src/web-ui run test:run` +- For cross-boundary behavior: `cargo check --workspace --exclude bitfun-cli` + +Expected: no document claims exceed code behavior, and every intentionally deferred item remains marked deferred. 
+ +### Round 7: Adaptive Capacity Queue Design Preparation + +**Goal:** Prepare queue/wait behavior without blurring the current Deep Review boundary or surprising users. + +- [x] Define queueable capacity errors narrowly: + - Queueable: provider rate limit, provider concurrency limit, explicit `Retry-After`, local subagent cap saturation, and temporary capacity/overload errors. + - Not queueable: authentication, billing/quota exhausted, invalid model, policy violation, user cancellation, invalid reviewer tooling, and deterministic validation failures. +- [x] Define lifecycle states for reviewer launches: + - `QueuedForCapacity`: waiting for runtime/provider capacity. + - `PausedByUser`: user paused Deep Review queue while keeping normal session work available. + - `Running`: reviewer execution has started and reviewer timeout begins. + - `CapacitySkipped`: queue wait exceeded the user-visible queue policy or the user skipped the reviewer. +- [x] Split timers: + - `queue_elapsed_ms` is informational and can drive queue warnings. + - `run_elapsed_ms` is the only value counted against reviewer `timeout_seconds`. +- [x] Define user controls before runtime automation: + - Pause Deep Review queue. + - Continue queued reviewers manually. + - Cancel a queued reviewer. + - Temporarily lower or raise the Deep Review reviewer cap within the configured hard maximum. + - Skip optional extra reviewers when capacity is constrained. +- [x] Protect normal session concurrency: + - Deep Review reviewer queue must not silently use all available subagent capacity. + - If the current session already has high subagent activity, show a concise prompt recommending pause, later continue, or a lower review strategy. + - Manual continue must resume queued reviewer work without restarting completed reviewers. +- [x] Keep the UX low-noise: + - Show one compact queue notice instead of per-reviewer popups. + - Keep details collapsible in the report. 
+ - Localize all user-facing state labels and action text.
+
+Verification for the design-preparation round:
+
+- Documentation-only check: `rg -n "QueuedForCapacity|PausedByUser|CapacitySkipped|queue_elapsed_ms|run_elapsed_ms|normal session" docs/deep-review-design.md docs/deep-review-phase2-plan.md docs/deep-review-phase2-addendum.md`
+- Expected: docs describe adaptive queue as a scoped follow-up, not as current runtime behavior.
+
+Exit checks:
+
+- [x] Update this addendum with the exact status of capacity-error classification, lifecycle states, and user-control readiness.
+- [x] Update `docs/deep-review-design.md` only if the queue contract changes; do not mark queueing as complete.
+- [x] Re-read the risk register and add any new queue/session-concurrency risks before implementation proceeds.
+- [x] Confirm no runtime behavior changed unless this round explicitly includes code and tests.
+
+### Round 8: Adaptive Capacity Queue Runtime Foundation
+
+**Goal:** Add the smallest runtime primitives needed for queueing without enabling automatic waiting by default.
+
+- [x] Add a capacity-error classifier near the AI/subagent runtime boundary.
+- [x] Add structured queue state metadata for reviewer launch attempts.
+- [x] Add tests that prove only transient capacity errors are queueable.
+- [x] Add tests that prove auth, quota, invalid model, invalid tooling, validation errors, and user cancellation fail fast.
+- [x] Keep existing hard-cap behavior as the active path until queue controls, timeout accounting, and bounded queue execution exist.
+
+Verification:
+
+- `cargo test -p bitfun-core deep_review -- --nocapture`
+- Expected at Round 8 exit: classifier and state tests pass, existing cap tests still pass, and automatic queueing remains disabled until the later bounded-wait round.
+
+Exit checks:
+
+- [x] Update the status wording in all three docs from "design preparation" to "runtime foundation" only where deterministic code exists.
+- [x] Confirm implementation still treats queueing as a follow-up/safety path, not a full scheduler replacement. +- [x] Confirm no new risk allows Deep Review to consume normal session capacity invisibly. + +### Round 9: Queue UX Controls And Session-Concurrency Guard + +**Goal:** Make queue state understandable and controllable before automatic waiting is enabled. + +- [x] Add a compact, localized queue notice for Deep Review capacity waits. +- [x] Add user actions for pause, continue, cancel queued reviewer, and skip optional extras. +- [x] Add a guard that detects high active session subagent activity before Deep Review consumes reviewer capacity. +- [x] When the session is busy, prompt the user to pause Deep Review, lower strategy, or continue manually later. +- [x] Keep completed reviewer output stable when a queued review is paused and resumed. + +Status: Runtime-complete for compact queue notice, local/manual queue-control UI, backend-bound local-cap pause/continue/cancel/optional-skip controls, and launch-time session-concurrency warning. Automatic local-cap waiting is handled in Round 10; provider/adaptive queueing remains deferred. + +Verification: + +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` +- Expected: locale keys exist in `en-US`, `zh-CN`, and `zh-TW`; queue controls do not render when there is no queued state; completed reviewer summaries are preserved. + +Exit checks: + +- Update UI status in this addendum and keep user-facing copy density constraints explicit. +- Verify launch surfaces that start Deep Review, including Flow Chat and file/session entry points, show consistent queue behavior. +- Re-check theme, layout, and i18n coverage before enabling automatic waiting. + +### Round 10: Bounded Queue Execution + +**Goal:** Convert selected over-cap reviewer launches from hard rejection to bounded waiting. 
+ +- [x] Add a backend queue-state event contract and frontend binding that can render a visible waiting action bar without enabling automatic queue execution. +- [x] Enable bounded queueing for local reviewer cap saturation; provider explicit capacity errors remain classified here, are observed in Round 11b, and are not automatically requeued. +- [x] Start reviewer `timeout_seconds` only after the local-cap queued item transitions to `Running`. +- [x] Track `queue_elapsed_ms` separately from `run_elapsed_ms` in queue metadata and capacity-skip results. +- [x] Add `max_queue_wait_seconds` and convert expired local-cap queue items to `CapacitySkipped`. +- [x] Surface queue skips in the final reliability summary through the existing `concurrency_limited` signal without adding noisy report sections. + +Status: Round 10b is runtime-complete for bounded local reviewer-cap waiting, timeout-separated queue metadata, queue expiry, backend queue-state events, frontend event binding, backend-bound local-cap queue controls, and report skip propagation. It is intentionally not a full scheduler: automatic provider/adaptive queueing, user-facing override controls, and staggered dispatch remain scoped follow-ups. + +Verification: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `cargo test -p bitfun-core coordination -- --nocapture` +- `cargo test -p bitfun-events deep_review_queue_state_event_serializes_stable_contract -- --nocapture` +- `pnpm --dir src/web-ui run test:run -- src/flow_chat/utils/deepReviewQueueStateEvents.test.ts src/flow_chat/store/deepReviewActionBarStore.test.ts` +- Focused web report tests if report formatting changes. +- Expected: queued wait does not consume reviewer timeout; queue expiry is reported as capacity-limited/skipped; hard-cap fallback remains available. + +Exit checks: + +- Keep `docs/deep-review-design.md` and `docs/deep-review-phase2-plan.md` aligned on bounded local-cap queueing without claiming provider/adaptive queueing. 
+- Confirm no wording claims adaptive effective concurrency unless Round 11 is complete. +- Re-run risk review for stuck queues, hidden waiting, and session starvation. + +### Round 11: Adaptive Effective Concurrency + +**Goal:** Learn a safe runtime effective concurrency without exceeding the configured hard maximum. + +- [x] Treat configured concurrency as the hard ceiling. +- [x] Lower `effective_parallel_instances` after local capacity skips, with Round 11b extending this to explicit provider transient-capacity failures. +- [x] Respect provider `Retry-After` in the effective-concurrency policy state; automatic provider retry/requeue execution remains deferred until provider/adaptive queueing exists. +- [x] Recover concurrency cautiously after a successful reviewer observation window. +- [x] Add a bounded user override helper within the configured hard maximum; user-facing override controls remain deferred. +- [x] Keep observations short-lived and turn-local at first; do not persist provider capacity learning across projects until behavior is proven. + +Status: Round 11a is runtime-complete for turn-local effective concurrency policy, local-cap skip learning, successful-observation recovery, Retry-After-aware policy state, bounded override helpers, queue-state effective-cap metadata, backend-bound local-cap queue commands, and no cross-project persistence. Round 11b adds provider transient-capacity observation; automatic provider/adaptive queueing, user-facing override controls, and staggered dispatch remain scoped follow-ups. + +### Round 11b: Provider Capacity Error Observation + +**Goal:** Make explicit provider transient-capacity reviewer failures visible and adaptive without adding automatic redispatch or a full provider queue. + +- [x] Convert DeepReview reviewer provider rate-limit/concurrency/temporary-overload failures to `capacity_skipped` instead of an opaque Task failure. 
+- [x] Lower the turn-local effective reviewer cap after such provider transient-capacity failures. +- [x] Keep auth, quota, billing, invalid model, cancellation, policy, validation, and tooling failures fail-fast. +- [x] Emit the same queue-state/report reliability surface used by other capacity skips. +- [x] Keep automatic provider requeue/retry execution deferred; retries still require prompt/user-issued structured retry Tasks. + +Status: Round 11b is runtime-complete for explicit provider transient-capacity observation. It does not wait invisibly after provider errors, does not retry automatically, and does not persist provider capacity learning across turns/projects. + +Verification: + +- `cargo test -p bitfun-core deep_review_provider_capacity_error_builds_capacity_skipped_payload_and_lowers_effective_cap -- --nocapture` +- `cargo test -p bitfun-core deep_review_provider_quota_error_is_not_capacity_skipped -- --nocapture` +- `cargo test -p bitfun-core deep_review -- --nocapture` + +Round 11 verification: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- Expected: capacity errors reduce effective concurrency; stable success windows restore it gradually; user override is bounded and visible. + +Round 11 exit checks: + +- Update docs to distinguish configured max, effective runtime max, and user override. +- Confirm automatic adaptation cannot increase concurrency beyond user configuration. +- Confirm capacity learning does not affect unrelated sessions unexpectedly. + +### Round 12: Runtime Strategy Authority And Complexity Signal + +**Goal:** Decide and implement how backend risk scoring affects actual Deep Review launch strategy. + +- [x] Decide whether backend `auto_select_strategy()` is advisory, authoritative, or mismatch-warning only. +- [x] Add launch metadata that records frontend recommendation, backend-compatible recommendation, user override, and final selected strategy. 
+- [x] Add a measured or explicit heuristic for `max_cyclomatic_complexity_delta`, or remove it from runtime authority until measurable. +- [x] Show strategy mismatch as concise launch/prompt metadata, not as a blocking warning unless the risk is severe. + +Status: Round 12 is runtime-complete for advisory strategy-decision metadata. The backend-compatible score is represented as `mismatch_warning` metadata and never changes the configured or user-selected final strategy. `max_cyclomatic_complexity_delta` is recorded as `0` with source `not_measured`, so it is excluded from authority until a measured signal exists. Deterministic final-report UI surfacing remains intentionally low-noise and can be added later only if reports need to expose strategy mismatch beyond the existing prompt metadata. + +Verification: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts` +- Expected: strategy selection is deterministic, override behavior remains stable, and documents state whether backend scoring is advisory or authoritative. + +Exit checks: + +- Update all three docs with the final authority model. +- Verify launch UI wording does not pressure users with noisy strategy warnings. +- Confirm no strategy change expands reviewer roster beyond documented token/concurrency limits. + +### Round 13: Token Budget Byte Enforcement And Summary-First Flow + +**Goal:** Reduce timeout and queue pressure by enforcing prompt-size-aware reviewer scopes. + +- [x] Add prompt-byte or prompt-token estimation before reviewer dispatch. +- [x] Keep existing max-file and file-splitting guardrails. +- [x] Add summary-first behavior for large diffs only when the estimated reviewer prompt exceeds the configured threshold. +- [x] Record token-budget decisions in reliability metadata. +- [x] Ensure budget clipping never hides files silently; skipped or summarized files must be visible to the judge/report. 
+ +Status: Round 13 is runtime-complete for heuristic manifest-level prompt-byte estimates and full-scope summary-first decisions. `largeDiffSummaryFirst` is no longer a proxy for file splitting alone; it is enabled only when estimated reviewer prompt bytes exceed the selected budget threshold. `maxFilesPerReviewer` remains a separate split guardrail. No files are clipped from `assigned_scope`; reviewers must use the pre-generated summary for orientation and report any uncovered files in coverage notes/reliability signals. Hard prompt-byte clipping and byte-accurate prompt measurement remain deferred. + +Verification: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- Focused web report tests if summary/budget metadata changes. +- Expected: large prompt inputs are summarized or split predictably; skipped/summarized scopes are reported; small reviews are unchanged. + +Exit checks: + +- Update docs from "file guardrails only" to the exact byte-budget behavior implemented. +- Re-check risk register for under-review caused by aggressive truncation. +- Confirm queue/adaptive concurrency plan still holds with smaller reviewer scopes. + +### Round 14: Automatic Reduced-Scope Retry + +**Goal:** Move from prompt-guided retry to deterministic retry only when coverage can be bounded. + +- [x] Require structured reviewer coverage data before any bounded retry is accepted. +- [x] Retry only `partial_timeout` or explicit transient capacity failures. +- [x] Require `retry: true`, reduced scope, lower timeout, and retry-budget accounting. +- [x] Stop retry loops through budget/scope admission; unresolved reviewers continue to surface through existing partial/retry reliability paths for user decision. +- [x] Keep prompt-guided/user-decision fallback when structured coverage is missing instead of silently retrying. + +Status: Round 14 is runtime-complete for the structured retry admission gate. 
It does not replace the prompt-driven orchestrator with backend-owned automatic redispatch: the model or continuation flow must still issue the retry Task, and TaskTool only accepts that call when coverage and timeout constraints make the retry bounded. Accepted retry Tasks prepend the bounded retry scope to the actual reviewer prompt so the runtime metadata and execution instructions do not diverge silently. + +Verification: + +- `cargo test -p bitfun-core task_tool::tests::deep_review_retry -- --nocapture` +- `cargo test -p bitfun-core deep_review -- --nocapture` +- Expected: retry budget is enforced, broad-scope retry is rejected, non-retryable capacity failures fail fast, and backend-owned automatic redispatch remains explicitly deferred. + +Exit checks: + +- Update docs to say structured retry admission is runtime-complete for bounded coverage, while backend-owned automatic redispatch remains deferred. +- Confirm token cost cannot grow unbounded through retry loops. +- Confirm partial output remains lower confidence even after a retry attempt. + +### Round 15: Pre-Review Summary UI + +**Goal:** Show users the key review shape before launch without recreating a dense report. + +- [x] Add compact pre-review summary UI for file count, risk tags, selected strategy, optional reviewers, and major skipped/degraded reviewers. +- [x] Avoid cost/time cards or full reviewer lineups that make the launch dialog noisy. +- [x] Add localized copy for all visible labels. +- [x] Keep strategy override and project persistence behavior unchanged. + +Status: Round 15 is runtime-complete for the compact launch-dialog summary. It deliberately stays inside the existing consent dialog and does not add a separate dense pre-review report. Degraded reviewer quality can only be shown when manifest data carries such a signal; current UI continues to surface skipped/invalid reviewers and optional-reviewer counts. 
+ +Verification: + +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` +- Expected: summary appears only where useful, all text is localized, and the dialog remains compact. + +Exit checks: + +- Update docs to mark pre-review summary UI complete only for the shipped surfaces. +- Verify Flow Chat and other Deep Review entry points do not diverge. +- Re-check style/theme stability and mobile/compact layout behavior. + +### Round 16: Shared Context Cache Measurement And Decision + +**Goal:** Decide whether programmatic Read/GetFileDiff cache is worth the runtime complexity. + +- [x] Add measurement for duplicate reviewer Read/GetFileDiff usage. +- [x] Keep measurement local and low-noise; do not add external telemetry unless needed. +- [x] Add a local debug diagnostic summary at Deep Review report submission time so real runs can be sampled without adding report noise or storing source content. +- [ ] Evaluate real DeepReview runs with the measurement snapshot before deciding whether duplicate IO/token cost is high. +- [ ] If duplicate IO/token cost is low, keep shared context cache prompt-only and update docs with that decision. +- [ ] If cost is high, design a separate tool-result interception plan before implementation. + +Status: Round 16a is runtime-complete for local duplicate tool-use measurement, and Round 16b is runtime-complete for low-noise local diagnostics. The runtime records only parent turn id, reviewer type, tool name, normalized file path, call count, and reviewer count for DeepReview reviewer `Read`/`GetFileDiff` calls. At final `submit_code_review` time, it emits a debug summary with aggregate counts only: total calls, duplicate calls, duplicate context count, maximum duplicate call count, and maximum duplicate reviewer count. It does not store source content, diff content, or tool outputs; it does not change tool results; and it does not add report or external telemetry noise. 
Programmatic shared context reuse remains deferred until real measurements justify a separate interception/cache plan. + +Verification: + +- `cargo test -p bitfun-core shared_context_measurement_tracks_duplicate_readonly_file_context_without_content -- --nocapture` +- `cargo test -p bitfun-core call_records_deep_review_read_file_measurement_without_touching_result -- --nocapture` +- `cargo test -p bitfun-core deep_review_shared_context_diagnostics_stays_out_of_report -- --nocapture` +- `cargo test -p bitfun-core deep_review -- --nocapture` +- Optional focused runtime diagnostics tests if measurement plumbing is added. +- Expected: the decision is evidence-based, and no interception layer is introduced without a separate plan. + +Exit checks: + +- Update docs with either "deferred by evidence" or a new scoped implementation plan. +- Confirm measurement does not leak sensitive source content. +- Confirm report density does not increase for ordinary users. + +### Round 17: Project-Level Cache Privacy Decision + +**Goal:** Decide whether cross-session/project review cache should exist at all. + +- [x] Define the current-phase cache retention boundary: per-session cache lives only with session metadata and has no independent retention period. +- [x] Define current storage scope: cached reviewer outputs stay in session storage only, not project `.bitfun` shared cache state. +- [x] Define current deletion behavior: deleting or clearing the session metadata removes the cached reviewer outputs; there is no separate project cache deletion path because no project cache exists. +- [x] Keep invalidation tied to the existing review fingerprint and `packet_id` matching; do not widen reuse beyond the verified session manifest. +- [x] Keep user visibility limited to existing cache hit/miss reliability signals; do not add dense cache-management UI until project-level persistence is approved. +- [x] Do not implement project-level persistence until privacy/product rules are explicit. 
+ +Status: Round 17 is documentation-complete for the conservative privacy boundary. The implementation remains per-session only. Cross-session/project-level review cache is intentionally not implemented and should be treated as product-decision-required, because reviewer outputs can contain sensitive findings, source summaries, and model-derived judgments. A future project-level cache plan must first define retention duration, invalidation keys across rename/model/strategy/roster changes, deletion semantics, and user visibility/control. + +Verification: + +- Documentation-only until product approval. +- Expected: docs explicitly choose per-session-only for the current phase and do not mark project-level cache as implementable without deletion semantics. + +Exit checks: + +- Update all three docs to remove ambiguity around cache scope. +- Confirm project-level cache is not marked implementable without retention and deletion rules. +- Re-check compliance/privacy risks before any code work starts. + +### Round 18: Final Reconciliation And Release Gate + +**Goal:** Ensure implementation, tests, and documents agree before calling the Deep Review plan complete. + +- [x] Re-read `docs/deep-review-design.md`, `docs/deep-review-phase2-plan.md`, and this addendum against code. +- [x] For every item, mark exactly one state: runtime-complete, safety net, prompt-guided, scoped follow-up, deferred, or product-decision-required. +- [x] Run the smallest complete verification set for touched areas. +- [x] Run `rg` checks for stale wording such as "complete" near deferred scheduler/cache/retry/token-budget claims. +- [x] Update risk register with any newly observed implementation risks. + +Status: Documentation reconciliation and release-gate verification are complete. 
The three documents now distinguish local-cap queueing from provider/adaptive queueing, structured retry admission from backend-owned retry redispatch, per-session cache from product-level cache persistence, duplicate tool-use measurement from programmatic shared context reuse, and heuristic prompt-byte metadata from hard byte-level prompt clipping. The remaining open items are intentionally scoped follow-ups or product-decision-required work, not hidden completion claims. Verification covered Rust Deep Review policy/tool behavior, cross-boundary Rust compilation, frontend lint/type-check/test coverage, documentation hygiene, and stale wording checks. + +Verification: + +- Rust core changes: `cargo test -p bitfun-core deep_review -- --nocapture` (post-Round 11b: 105 passed, 0 failed) +- Cross-boundary Rust changes: `cargo check --workspace --exclude bitfun-cli` +- Frontend/report/UI changes: `pnpm run lint:web && pnpm run type-check:web && pnpm --dir src/web-ui run test:run` (post-rebase: 67 files passed, 391 tests passed) +- Documentation hygiene: `git diff --check` + +Exit checks: + +- No document claim exceeds code behavior. +- No completed item lacks focused verification. +- No new user-facing copy lacks locale coverage. +- No queue/retry/cache/token feature introduces hidden user confusion or undocumented privacy risk. + +## Completion Criteria + +- Existing docs no longer state that queued dispatch, automatic retry redispatch, cache write-through, or hard prompt-byte clipping/enforcement are complete unless code implements them. +- Per-session incremental cache can only return data by a verified packet id and matching fingerprint. +- Concurrency cap behavior is either user/report-visible as a skipped/rejected reviewer or is converted into bounded backpressure. +- Any future bounded backpressure keeps queue time separate from execution timeout and keeps Deep Review capacity from starving normal user session work. 
+- Retry behavior is explicit: backend-owned redispatch is prompt-guided/deferred, while structured retry admission is runtime-enforced for model-issued retry Tasks. +- User-facing additions remain localized and report density remains controlled. diff --git a/docs/deep-review-phase2-plan.md b/docs/deep-review-phase2-plan.md new file mode 100644 index 000000000..d67fdb3ce --- /dev/null +++ b/docs/deep-review-phase2-plan.md @@ -0,0 +1,338 @@ +# Deep Review Strategy Engine - Execution Plan (Phase 2) + +## Scope + +This plan covers the remaining work items identified by comparing `deep-review-design.md` against commit `9d97b88e81`. It is strictly bounded by the original design document - no speculative additions. + +## Status Reconciliation + +This document now distinguishes between three implementation levels: + +- **Complete**: Runtime behavior is deterministic and covered by tests. +- **Safety net / prompt-guided**: Runtime has policy parsing, manifest data, or a protective check, but the orchestrator model still owns sequencing. +- **Deferred**: The document intentionally keeps the item out of the current implementation boundary. + +The follow-up implementation plan for the remaining semantic gaps is tracked in `docs/deep-review-phase2-addendum.md`. +The addendum is the live progress ledger for the remaining rounds. After each implementation round, update the addendum status first, then reconcile this document and `docs/deep-review-design.md` only when the high-level state changes. 
+ +## Current State Summary + +| Component | Frontend (TS) | Backend (Rust) | +|---|---|---| +| Change Risk Auto-Classification | `recommendReviewStrategyForTarget()` complete; manifest strategy-decision metadata records frontend/backend-compatible recommendations, user override, final strategy, mismatch state, and severity | `ChangeRiskFactors` struct + `auto_select_strategy()` **implemented as pure policy helper**; backend-compatible scoring is advisory/mismatch-warning metadata only, and measured complexity delta remains deferred | +| Predictive Timeout | `predictTimeoutSeconds()` complete | `predictive_timeout()` complete | +| Dynamic Concurrency Control | `computeConcurrencyPolicy()` complete, prompt rules emitted | `DeepReviewConcurrencyPolicy` parsing, bounded TaskTool local-cap waiting, backend-bound local-cap queue controls, turn-local effective cap learning after local capacity skips, and explicit provider transient-capacity skip conversion **implemented**; capacity-error classification, queue-state metadata/event contract, compact queue notice, local/manual queue controls, and active-session concurrency warning **implemented**; automatic provider requeue/retry execution, backend stagger scheduling, and user-facing override controls are deferred, and capacity skips are folded into final report reliability signals | +| Retry Budget | Not applicable (backend only) | `max_retries_per_role` tracking, TaskTool retry guidance, structured retry admission, and bounded retry-scope prompt injection **implemented**; backend-owned automatic redispatch is deferred, and retry guidance uses the effective manifest policy when available | +| Partial Result Capture | Prompt rules reference `partial_timeout` | `SubagentResultStatus::PartialTimeout` + coordinator grace-period capture complete, limited to final text returned inside the grace window | +| Incremental Review Cache | Fingerprint + plan generation complete, prompt rules emitted | Per-session 
`DeepReviewIncrementalCache`, metadata storage, TaskTool cache-hit read path, completed-reviewer write-through, packet-key alignment, and hit/miss report signals **implemented**; project-level persistence is product-decision-required and deferred | +| Shared Context Cache | Plan generation complete, prompt rules emitted | **Prompt-only with local duplicate Read/GetFileDiff measurement and aggregate debug diagnostics; result reuse deferred** | +| Token Budget Plan | Plan generation complete, prompt rules emitted; heuristic max reviewer prompt-byte estimate, per-mode byte threshold, and full-scope summary-first decision metadata **implemented** | File-split/max-file style guardrails **implemented**; hard prompt-byte clipping and byte-accurate enforcement are deferred | +| Pre-Review Summary | Data + prompt block complete; compact launch-dialog summary **implemented** | User-facing compact consent summary implemented; separate dense pre-review report remains deferred | +| Work Packet Batch Scheduling | `launchBatch` + `staggerSeconds` in data model, prompt rules emitted | Prompt-guided batching plus hard-cap safety net **implemented**; deterministic backend batch dispatcher deferred | +| Compression Contract | Not applicable (backend only) | Contract generation + prompt injection complete | + +**Key insight**: The frontend has built comprehensive data structures, plan generators, and prompt rules for all remaining items. The backend now reads several of those fields and enforces hard safety nets, including structured retry admission, but deterministic scheduling, backend-owned retry redispatch, project-level cache reuse, and byte-level budget enforcement remain open. + +--- + +## Plan Items + +### P2-1: Backend ChangeRiskFactors + auto_select_strategy + +**Design ref**: Section 1.1 + +**What**: Add `ChangeRiskFactors` struct and `auto_select_strategy()` method to `deep_review_policy.rs`. 
+ +**Files**: +- `src/crates/core/src/agentic/deep_review_policy.rs` - add struct + method +- `src/crates/core/src/agentic/deep_review_policy.rs` - add unit tests + +**Design spec** (verbatim from doc): +```rust +pub struct ChangeRiskFactors { + pub file_count: usize, + pub total_lines_changed: usize, + pub files_in_security_paths: usize, + pub max_cyclomatic_complexity_delta: usize, + pub cross_crate_changes: usize, +} +``` +Score formula: `file_count + total_lines_changed / 100 + files_in_security_paths * 3 + cross_crate_changes * 2` +Thresholds: `0..=5` -> Quick, `6..=20` -> Normal, `_` -> Deep + +**Risk**: Low. Pure computation, no side effects. The frontend already computes this independently; the backend version serves as a validation/override path. + +**Uncertainty**: The design mentions `max_cyclomatic_complexity_delta` requiring "a lightweight AST pass or heuristic". This is non-trivial. Current launch metadata records `0` with source `not_measured`, and the field is excluded from runtime authority until a measured signal exists. + +**Verification**: `cargo test -p bitfun-core deep_review -- --nocapture` + +--- + +### P2-2: Backend DeepReviewConcurrencyPolicy + Cap Safety Net + +**Design ref**: Section 1.3 + +**What**: Add `DeepReviewConcurrencyPolicy` to Rust policy and enforce `max_parallel_instances` before TaskTool launches Deep Review subagents. This is a cap safety net, not a deterministic backend batch scheduler. 
+ +**Files**: +- `src/crates/core/src/agentic/deep_review_policy.rs` - add struct + `effective_max_same_role_instances()` +- `src/crates/core/src/agentic/coordination/coordinator.rs` - future queue/stagger scheduler if backend-owned batching is added +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` - read concurrency policy from manifest + +**Design spec**: +```rust +pub struct DeepReviewConcurrencyPolicy { + pub max_parallel_instances: usize, // default: 4 + pub stagger_seconds: u64, // default: 0 + pub batch_extras_separately: bool, // default: true +} +``` +`effective_max_same_role_instances`: `max(1, max_parallel_instances / role_count).min(existing_max)` + +**Launch strategy**: The prompt tells the LLM to respect `launch_batch`, while TaskTool now bounded-waits when local reviewer capacity is saturated, converts expired waits to `CapacitySkipped`, and converts explicit provider transient-capacity reviewer failures to `capacity_skipped` with turn-local effective-cap learning. This is still not a backend batch scheduler: `staggerSeconds`, batch lifecycle ordering, automatic provider requeue/retry execution, and user-facing effective-cap overrides remain deferred. + +**Risk**: Medium. The coordinator currently does fire-and-forget parallel dispatch. Adding batching requires restructuring the dispatch flow to wait for batch completion before launching the next. This is the most architecturally complex item. + +**Approach**: Two sub-steps: +1. Add the policy struct and `with_run_manifest_execution_policy` parsing (low risk). +2. Add TaskTool cap enforcement as the first safety net, then bounded local-cap waiting once queue state and report propagation are verified. Backend-bound pause/continue/cancel/optional-skip controls are implemented for local-cap waits; provider/adaptive queueing remains an addendum follow-up. 
+ +**Adaptive queue follow-up boundary**: The implemented queue path is deliberately narrow: it waits only for local reviewer-cap saturation, separates queue time from reviewer runtime, emits queue-state events, supports backend-bound local-cap pause/continue/cancel/optional-skip controls, learns a turn-local effective cap after local capacity skips or explicit provider transient-capacity failures, and reports those skips as `concurrency_limited`. Future queueing must stay narrower than a full backend DAG scheduler unless explicitly redesigned. Automatic provider requeue/retry execution and user-facing overrides must remain visible, timeout-separated, and isolated from normal user session concurrency. + +**Uncertainty**: The design implies the coordinator itself should batch, but actual subagent launch goes through `task_tool` invoked by the orchestrator LLM. Current implementation chooses tool-level local-cap waiting as the minimal smoother path. Deterministic backend batching remains a separate follow-up. + +**Verification**: `cargo test -p bitfun-core deep_review -- --nocapture` + `cargo test -p bitfun-core coordination -- --nocapture` + +--- + +### P2-3: Backend Retry Budget And Structured Retry Admission + +**Design ref**: Section 1.5 + +**What**: Track retry budget, return retry guidance when a reviewer Task returns `partial_timeout`, and reject unsafe retry reviewer Tasks unless they include structured coverage, reduced scope, retryable source status, and lower timeout. Accepted retry Tasks also prepend a bounded retry-scope block to the reviewer prompt. Automatic backend redispatch with reduced scope and downgraded strategy is not implemented. 
+ +**Files**: +- `src/crates/core/src/agentic/deep_review_policy.rs` - `retries_used` tracking (already done) +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` - retry guidance, budget checks, and structured retry admission +- `src/crates/core/src/agentic/agents/prompts/deep_review_agent.md` - already has retry instructions + +**Design spec**: +1. Check `retries_used[role] < max_retries_per_role` +2. Re-dispatch with: reduced scope (only unreviewed files), timeout / 2, strategy downgraded one level +3. Increment `retries_used[role]` +4. Set `is_retry: true` on the retry Task call and include structured `retry_coverage` + +**Risk**: Low-Medium. The tracking structures and retry admission gate are already in place. The remaining risk is model dependence: the orchestrator must read the guidance and explicitly issue a retry Task. + +**Uncertainty**: "Reduced scope (only files not yet reviewed)" requires knowing which files were already covered by the partial output. TaskTool now requires explicit structured coverage before accepting a retry and injects the accepted retry scope into the reviewer prompt, but it does not infer coverage from free-form partial output or launch the retry by itself. Until backend-owned coverage extraction and redispatch exist, this remains prompt-guided retry with deterministic admission. + +**Verification**: `cargo test -p bitfun-core deep_review -- --nocapture` + +--- + +### P2-4: Backend Incremental Review Cache Primitives + +**Design ref**: Part 5, "Advanced (Lower Priority)" item 14 + +**What**: Provide the cache data model, session metadata field, TaskTool cache-hit read path, completed-reviewer write-through, and packet-id key alignment. Cross-session/project-level cache reuse remains deferred. 
+ +**Files**: +- `src/crates/core/src/agentic/session/session_manager.rs` - cache storage (in session metadata) +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` - cache read before dispatch +- `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` - completed reviewer output write-through and cache hit/miss report signals + +**Design spec**: +- Cache key: `incremental-review:{fingerprint}` (already computed in frontend) +- Store: completed reviewer outputs keyed by `packet_id` +- Invalidation: `target_file_set_changed`, `reviewer_roster_changed`, `strategy_changed` (already listed in frontend plan) +- On cache hit: skip dispatch for cached packets, inject cached output into the judge's context + +**Risk**: Medium. Cache invalidation correctness is critical because stale cache produces wrong reviews. The implemented scope stays per-session, aligns read/write on `packet_id`, and invalidates by fingerprint; project-level persistence is intentionally deferred. + +**Approach**: Store cache in `SessionMetadata` first. On `buildEffectiveReviewTeamManifest`, the frontend computes the fingerprint. The backend can read matching cache data and skip matching packets only after completed reviewer outputs are written using the same `packet_id` keys that work packets use. + +**Uncertainty**: Cache storage location. Session metadata is per-session, but incremental review spans sessions. Need to decide: store in project-level storage (`/.bitfun/review-cache/`) or in the previous session's metadata? Project-level storage is more natural for cross-session reuse but requires a new storage path. + +**Decision needed**: Cache persistence scope: per-session (simpler, only works within continuation) vs. per-project (cross-session, requires new storage). The addendum keeps per-session as the initial closure target. 
+ +**Verification**: `cargo test -p bitfun-core deep_review -- --nocapture` + +--- + +### P2-5: Backend Shared Context Cache + +**Design ref**: Part 5, "Advanced (Lower Priority)" item 13 + +**What**: When multiple reviewers need to read the same file, cache the first read's result and reuse it for subsequent reviewers. + +**Files**: +- `src/crates/core/src/agentic/coordination/coordinator.rs` - shared context cache during subagent execution +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` - inject cache context into subagent sessions + +**Risk**: High. This requires intercepting tool calls (Read, GetFileDiff) within subagent sessions and caching their results. This is a deep architectural change to the tool pipeline. + +**Approach**: The prompt already instructs reviewers to "reuse read-only context by cache_key". For initial implementation, the prompt-level instruction (already emitted) is the primary mechanism. Local runtime measurement now records duplicate reviewer `Read`/`GetFileDiff` calls by parent turn, reviewer type, tool name, and normalized file path only. Final Deep Review submission emits aggregate debug diagnostics with counts only, not file content, diffs, or tool outputs. Programmatic enforcement would require a tool-call interception layer and should not be implemented until measurement shows material duplicate IO/token cost. + +**Recommendation**: **Defer programmatic enforcement to a later phase pending measurement.** The prompt rules are already comprehensive and the LLM can follow them. The return-on-investment for programmatic enforcement must be proven against duplicate-call measurements because interception is a deep tool-pipeline change. + +**Verification**: Manual testing with `cargo build -p bitfun-desktop` + deep review on a multi-reviewer change. 
+ +--- + +### P2-6: Backend Token Budget Enforcement Boundary + +**Design ref**: Part 5, "Advanced (Lower Priority)" + +**What**: Enforce the low-risk max-file/file-splitting boundary first, then add heuristic prompt-byte estimation that can trigger summary-first orientation without clipping the assigned file scope. + +**Files**: +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` - scope truncation +- `src/crates/core/src/agentic/deep_review_policy.rs` - budget policy parsing + +**Risk**: Medium. `maxFilesPerReviewer` enforcement requires splitting file lists passed to subagent Tasks. `maxPromptBytesPerReviewer` cannot be byte-accurate without generating the final prompt, so the implemented boundary is a manifest heuristic that chooses summary-first orientation while preserving full file visibility. + +**Approach**: +1. `maxFilesPerReviewer`: Split reviewer work packets by file group, without silently dropping files. +2. `largeDiffSummaryFirst`: Enable only when the estimated reviewer prompt bytes exceed the configured threshold, and use the pre-generated diff summary for orientation while keeping `assigned_scope.files` intact. +3. `maxPromptBytesPerReviewer`: Record the selected threshold and heuristic estimate in the manifest; defer byte-accurate hard clipping. + +**Recommendation**: Treat heuristic estimate plus full-scope summary-first metadata as the production boundary for this phase. Hard prompt clipping and mandatory generated summaries remain deferred until prompt-size measurement is more precise. + +**Verification**: `cargo test -p bitfun-core deep_review -- --nocapture` + +--- + +### P2-7: Pre-Review Summary UI Display (Optional) + +**Design ref**: Part 5 + +**What**: Show a compact pre-review summary in the launch confirmation dialog before starting the review. The shipped surface includes file count, risk areas, selected strategy, optional-reviewer count, summary-first state, and skipped-reviewer warnings. 
+ +**Files**: +- `src/web-ui/src/flow_chat/components/DeepReviewConsentDialog.tsx` - compact summary rows in the launch dialog +- `src/web-ui/src/locales/*/flow-chat.json` - localized labels for the compact summary + +**Risk**: Low. Purely additive UI. The data is already computed in `buildPreReviewSummary()`. + +**Decision**: Use the existing launch confirmation dialog. A Review Team page card or inline Flow Chat preview remains deferred to avoid adding another dense Deep Review surface. + +**Verification**: `pnpm run lint:web && pnpm run type-check:web && pnpm --dir src/web-ui run test:run` + +--- + +## Current Execution Order + +``` +Phase A (backend policy foundation): DONE + P2-1: ChangeRiskFactors + auto_select_strategy pure helper + P2-1b: advisory strategy-decision metadata with mismatch-warning authority + P2-6: max-file/file-split guardrails plus heuristic prompt-byte summary-first metadata + +Phase B (backend dispatch enforcement): PARTIAL + P2-2: ConcurrencyPolicy + bounded local-cap waiting + explicit provider transient-capacity skip conversion + turn-local effective learning is done; automatic provider/adaptive queueing and staggered backend dispatch are deferred + P2-3: Retry budget + guidance + structured retry admission + bounded retry prompt injection are done; backend-owned redispatch is deferred + +Phase C (backend caching): DONE for per-session scope + P2-4: Cache primitives, packet-id read/write path, and hit/miss reporting are done; project-level persistence is deferred + +Phase D (optional/lower priority): PARTIAL + P2-7: Compact pre-review consent summary is done; separate dense preview remains deferred + P2-5: Shared context cache + P2-6 follow-up: hard prompt-byte clipping and byte-accurate enforcement +``` + +## Historical Execution Order (Superseded Original Labels) + +``` +These labels are retained only to explain the original plan shape. 
They are not +current completion truth; use "Current Execution Order" and the addendum status +table above instead. + +Phase A (Backend policy foundation): original target + P2-1: ChangeRiskFactors + auto_select_strategy + P2-6: Token Budget - maxFilesPerReviewer only + +Phase B (Backend dispatch enforcement): original target, now partially scoped + P2-2: ConcurrencyPolicy + original backend batching target (current runtime has bounded local-cap waiting; backend batch/stagger scheduling is deferred) + P2-3: Original retry execution target (current runtime has structured retry admission; backend-owned redispatch is deferred) + +Phase C (Backend caching - higher risk): original target + P2-4: Incremental review cache (current runtime is per-session only; project-level persistence is product-decision-required) + +Phase D (Optional / lower priority): + P2-7: Pre-review summary UI (current runtime has compact consent summary) + P2-5: Shared context cache (current runtime has prompt rules plus duplicate-call measurement; result reuse is deferred) +``` + +## Implementation Summary + +### Changes Made + +| File | Changes | +|---|---| +| `deep_review_policy.rs` | `ChangeRiskFactors` struct, `auto_select_strategy()`, `DeepReviewConcurrencyPolicy` struct + `from_manifest()` + `effective_max_same_role_instances()` + `check_launch_allowed()`, `DeepReviewIncrementalCache` struct + `from_value()`/`to_value()`/`matches_manifest()`, `deep_review_active_reviewer_count()` / `deep_review_has_judge_been_launched()` / `deep_review_retries_used()` / `deep_review_max_retries_per_role()` / cap-rejection tracking free functions, and shared-context duplicate measurement snapshots | +| `framework.rs` / `tool_pipeline.rs` / `code_review_tool.rs` | DeepReview reviewer `Read`/`GetFileDiff` duplicate measurement, parent-turn context propagation, and aggregate report-submission debug diagnostics without storing source/diff/tool-result content | +| `task_tool.rs` | Concurrency policy cap 
enforcement before subagent launch, cap-rejection runtime tracking, incremental cache hit check by resolved packet id when matching cache data is present, retry guidance hint on partial_timeout, structured retry admission, bounded retry-scope prompt injection, and DeepReview reviewer context tagging | +| `code_review_tool.rs` | Runtime reliability signal filling for cap rejections, cache hit/miss reporting, partial reviewer status, retry guidance, skipped reviewers, and token-budget tradeoffs | +| `reviewTargetClassifier.ts` | Path-domain classification plus reviewer applicability registry for conditional reviewer activation | +| `session/types.rs` | `deep_review_cache: Option` field on `SessionMetadata` | +| `persistence/manager.rs` | Preserve `deep_review_cache` when loading existing session metadata | +| `coordinator.rs` | Initialize `deep_review_cache: None` for new subagent sessions | +| `deep-review-design.md` | "Implementation Additions" section (ContextHealthSnapshot, ModelCapabilityProfile, Extended Path Classification), reconciled "Remaining / Future Work" | + +### Known Semantic Gaps + +| Area | Current behavior | Follow-up needed | +|---|---|---| +| Strategy authority | Manifest metadata records frontend recommendation, backend-compatible recommendation, user override, final strategy, mismatch state, and severity | Measured complexity delta only; backend scoring remains advisory and must not expand reviewer roster or override user/team strategy | +| Batched dispatch | Tool-level local-cap waiting handles reviewer saturation, but backend batch ordering and `staggerSeconds` are still prompt-guided | Add deterministic backend batch/stagger scheduling only if prompt-guided ordering remains unreliable | +| Retry | Budget, guidance, structured retry admission, and bounded retry-scope prompt injection are emitted/enforced; guidance uses effective manifest policy when available | Backend-owned automatic reduced-scope redispatch, or keep the current 
prompt-guided status wording | +| Pre-review summary UI | Compact launch summary shows file count, risk areas, selected strategy, optional-reviewer count, summary-first marker, and skipped-reviewer warnings | Separate dense pre-review report only if product later needs it | +| Incremental cache | Per-session data model, metadata field, packet-id read/write path, and hit/miss reporting exist | Project-level persistence and retention/privacy policy only | +| Token budget | File-split/max-file guardrails, heuristic prompt-byte estimates, full-scope summary-first metadata, and context-pressure warnings exist | Hard prompt-byte clipping and byte-accurate enforcement only; summary-first must keep assigned files visible | +| Shared context cache | Prompt rules plus local duplicate Read/GetFileDiff measurement and aggregate debug diagnostics | Tool-result interception only if real-run measurement shows material duplicate IO/token cost | +| Observability | Report reliability signals cover cache hit/miss, concurrency cap rejection, partial timeout, retry guidance, skipped reviewers, and token-budget tradeoffs | External telemetry/dashboard metrics only if needed later | + +### Latest Release-Gate Verification + +Post-Round 11b reconciliation re-ran the current release gate after document/code review. The live progress ledger remains `docs/deep-review-phase2-addendum.md`. + +| Check | Result | +|---|---| +| `cargo test -p bitfun-core deep_review -- --nocapture` | 105 passed, 0 failed | +| `cargo check --workspace --exclude bitfun-cli` | Pass (warnings only, pre-existing) | +| `pnpm run lint:web` | Pass | +| `pnpm run type-check:web` | Pass | +| `pnpm --dir src/web-ui run test:run` | 67 files passed, 391 tests passed | +| `git diff --check` | Pass | + +## Decisions Remaining After Status Reconciliation + +1. **P2-1 strategy authority**: Backend-compatible scoring is now a mismatch-warning advisory signal. 
The final strategy remains the configured team strategy or explicit user override, and advisory mismatch metadata must not expand reviewer roster or silently change token/concurrency cost. + - **Recommendation**: Keep this as the production boundary unless a future product decision explicitly asks for authoritative auto-selection. Add measured complexity delta before giving this policy any stronger authority. + +2. **P2-2 batching approach**: Should the next step stay with tool-level local-cap waiting, or add broader provider/adaptive scheduling? + - **Recommendation**: Keep local-cap waiting, explicit provider transient-capacity skip conversion, and turn-local effective-cap learning as the production boundary for this phase. Add automatic provider/adaptive queueing only as a narrow follow-up with visible backend-bound controls, queue-time/execution-time separation, user override bounds, and protection for normal user session concurrency. + +3. **P2-4 cache persistence scope**: Per-session is implemented. Per-project reuse remains a future decision because it needs retention/privacy, invalidation, deletion, and user-visibility boundaries. + - **Recommendation**: Keep per-session as the production boundary for this phase. Cached reviewer outputs have no independent retention period beyond session metadata; add per-project only after explicit product approval and deletion semantics. + +4. **P2-5 shared context cache**: Accept prompt-only approach for now, or invest in programmatic enforcement? + - **Recommendation**: Prompt-only plus local duplicate measurement and aggregate debug diagnostics for this phase. The prompt rules are already emitted and comprehensive; programmatic enforcement remains deferred until real-run measurement data justifies a separate interception/cache plan. + +5. **P2-6 token budget scope**: The current production boundary is heuristic prompt-byte estimation plus full-scope summary-first metadata. Should future work add hard prompt clipping? 
+ - **Recommendation**: Keep hard clipping deferred until byte-accurate prompt measurement exists, and never hide files without explicit coverage metadata. + +6. **P2-7 pre-review summary UI**: Where to display? + - **Decision**: Use the existing launch confirmation dialog for a compact summary. Defer separate Review Team page or inline Flow Chat preview surfaces until product needs a denser pre-review view. + +## Verification Commands + +| Phase | Command | +|---|---| +| Phase A | `cargo test -p bitfun-core deep_review -- --nocapture` | +| Phase B | `cargo test -p bitfun-core deep_review -- --nocapture && cargo test -p bitfun-core coordination -- --nocapture` | +| Phase C | `cargo test -p bitfun-core deep_review -- --nocapture` | +| All phases (frontend) | `pnpm run lint:web && pnpm run type-check:web && pnpm --dir src/web-ui run test:run` | +| All phases (full Rust) | `cargo check --workspace && cargo test --workspace` | +| Integration smoke | `cargo build -p bitfun-desktop` + manual deep review | From fda72f28e54c305868613db857bd338240506962 Mon Sep 17 00:00:00 2001 From: limit_yan Date: Sat, 9 May 2026 18:20:38 +0800 Subject: [PATCH 4/6] feat(deep-review): add capacity controls and cost plan --- docs/deep-review-design.md | 10 +- docs/deep-review-phase2-addendum.md | 10 + docs/deep-review-phase2-plan.md | 8 + docs/deep-review-phase3-followup-plan.md | 436 +++++++ ...05-09-deep-review-phase3-execution-plan.md | 1092 +++++++++++++++++ .../core/src/agentic/deep_review_policy.rs | 310 ++++- .../tools/implementations/code_review_tool.rs | 114 +- .../tools/implementations/task_tool.rs | 66 +- .../agents/components/ReviewTeamPage.tsx | 2 + .../btw/DeepReviewActionBar.i18n.test.ts | 4 + .../btw/DeepReviewActionBar.test.tsx | 37 + .../components/btw/DeepReviewActionBar.tsx | 182 ++- .../config/components/ReviewConfig.tsx | 93 +- src/web-ui/src/locales/en-US/flow-chat.json | 4 + .../src/locales/en-US/settings/review.json | 24 + src/web-ui/src/locales/zh-CN/flow-chat.json 
| 4 + .../src/locales/zh-CN/settings/review.json | 24 + src/web-ui/src/locales/zh-TW/flow-chat.json | 4 + .../src/locales/zh-TW/settings/review.json | 24 + .../shared/services/reviewTeamService.test.ts | 100 ++ .../src/shared/services/reviewTeamService.ts | 133 +- 21 files changed, 2555 insertions(+), 126 deletions(-) create mode 100644 docs/deep-review-phase3-followup-plan.md create mode 100644 docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md diff --git a/docs/deep-review-design.md b/docs/deep-review-design.md index ec60fc57c..97057c836 100644 --- a/docs/deep-review-design.md +++ b/docs/deep-review-design.md @@ -56,6 +56,8 @@ The detailed execution order and per-round exit checks are tracked in `docs/deep - **Project-level incremental review cache**: Per-session cache read/write support is implemented and keyed by `packet_id`; cross-session/project-level persistence remains product-decision-required and deferred. Current cached reviewer outputs live only with session metadata and are deleted with that session metadata. - **Shared context cache**: Frontend plan generation, prompt rules, local duplicate `Read`/`GetFileDiff` measurement, and aggregate debug diagnostics exist, but backend result reuse is not programmatically enforced. - **Token budget enforcement**: File splitting, max-file style limits, heuristic prompt-byte estimates, and full-scope `largeDiffSummaryFirst` decisions are present in manifest policy. Hard prompt-byte clipping and byte-accurate enforcement remain deferred, and any summary-first path must keep unreviewed files visible in coverage notes/reliability signals. +- **Cost-aware review depth**: Quick/default strategies still need a product-level depth contract that makes `quick` a high-risk gate, `normal` a risk-expanded review, and `deep` the explicit full-depth path. This should reduce slow-model time and token use without hiding changed files from coverage metadata. 
+- **Shared evidence pack**: Duplicate-tool diagnostics can show repeated `Read`/`GetFileDiff` work, but reviewers still rediscover common facts. A source-agnostic evidence pack should precompute changed files, hunk hints, domain/risk tags, packet ids, and cheap contract hints once per run so subagents spend more tokens on judgment than discovery. - **Pre-review summary UI**: Compact launch-dialog summary is implemented. A separate dense pre-review report remains deferred unless product later needs it. - **Work packet batched scheduling**: Frontend work packet data structure and prompt rules are complete; backend `launchBatch` / `staggerSeconds` / `batchExtrasSeparately` scheduling remains prompt-driven except for TaskTool's hard concurrency cap. - **Conditional reviewer extensibility**: Path-domain classification and reviewer applicability rules now support the current Frontend Reviewer; future conditional reviewer families should extend the registry and add focused tests. @@ -103,6 +105,8 @@ Key components: | **Error fallback** | Retry budget, guidance, structured retry admission, and bounded retry-scope prompt injection exist, but backend-owned automatic reduced-scope redispatch is not implemented | Retry launch behavior still depends on the orchestrator model | | **Context management** | Shared context cache is prompt-only with local duplicate Read/GetFileDiff measurement and aggregate debug diagnostics; backend result reuse is not enforced | Reviewers may duplicate IO and token usage until real-run measurements justify an interception/cache plan | | **Strategy selection** | Frontend recommendation, backend-compatible recommendation, user override, final strategy, mismatch state, and mismatch severity are recorded as launch metadata; runtime launch still follows configured/user-selected strategy | Users may still over- or under-review, but the product now has non-blocking metadata to explain the tradeoff without silently changing token/concurrency cost | +| 
**Review cost by strategy** | Quick and normal modes have budget metadata, but role prompts can still perform broad discovery unless a scope-depth contract is explicit | Slow models and large diffs can consume excessive time/tokens before reaching high-risk findings | +| **Repeated evidence discovery** | Duplicate `Read`/`GetFileDiff` calls are measured, but reviewers do not yet receive a shared evidence pack with hunk/risk/contract hints | Parallel subagents can spend their first turns reading the same files and git facts instead of reasoning | ### Scenario Breakdown @@ -110,7 +114,7 @@ Key components: |----------|-------|-------|------------------|---------| | A: Small change | < 5 | < 200 | 4 always-on reviewers, optional frontend only when applicable | Can still be over-provisioned if the user chooses a deeper strategy | | B: Medium change | 5-20 | 200-1000 | 4 always-on reviewers with predictive timeout and local-cap backpressure | Logic-heavy reviewers may still return partial output on slow models | -| C: Large change | 20-50 | 1000+ | File split can create multiple reviewer packets plus judge; local reviewer-cap waiting is bounded | Provider/adaptive queueing, backend batch/stagger scheduling, and programmatic shared context reuse remain deferred | +| C: Large change | 20-50 | 1000+ | File split can create multiple reviewer packets plus judge; local reviewer-cap waiting is bounded | Cost-aware high-risk-first scope and shared evidence packs are still needed before adding heavier scheduler behavior | | D: Any + slow model | Any | Any | Predictive timeout, partial capture, and structured retry admission exist | Backend-owned retry redispatch is still prompt-guided/deferred | | E: Any + rate limit | Any | Any | Local cap pressure is bounded and visible; explicit provider transient-capacity reviewer failures become `capacity_skipped` and lower the turn-local effective cap | Provider-side automatic queueing/retry execution is not implemented | @@ -899,6 +903,8 
@@ ReviewFrontend: { 4. **Runtime strategy authority**: Keep backend `auto_select_strategy()` as advisory/mismatch-warning metadata. Only revisit authoritative auto-selection after measured complexity delta exists and product explicitly accepts strategy changes that can alter token/concurrency cost. 5. **Token and context budgets**: Keep heuristic prompt-byte estimates and full-scope summary-first metadata as the current boundary. Add hard clipping or byte-accurate enforcement only after it can preserve explicit coverage for every file. 6. **Operational evidence**: Keep the implemented report reliability surfaces for partial timeouts, retry guidance, cache hits/misses, skipped reviewers, token-budget tradeoffs, and TaskTool cap rejections. Keep shared-context duplicate measurement local and non-reporting; final Deep Review submission may emit aggregate debug counts for local sampling, but real runs must show that programmatic reuse is worth the runtime complexity before adding interception or cache reuse. Add external telemetry only if product diagnostics require it. +7. **Cost-aware scope depth**: Add a manifest-level depth profile before broadening runtime scheduling. `quick` should focus only on high-risk hunks and direct contract/security/config/concurrency paths, `normal` should review changed code plus one-hop high-risk context, and `deep` remains the full-depth option. Reports must label reduced-depth coverage honestly. +8. **Shared evidence pack**: Precompute compact source-agnostic evidence once per run and pass it to every reviewer. Start with metadata, hunk hints, domain/risk tags, packet ids, and cheap contract hints; keep full `Read` output reuse deferred until duplicate-call measurements prove it is worth the tool-pipeline complexity. ### Superseded Next Phase: Strategy Engine Foundation @@ -988,7 +994,7 @@ The original design defined a simple `hasFrontendFiles()` boolean check. The imp ### Advanced (Lower Priority) -13. 
**Shared context cache** - Programmatic reuse remains deferred; current runtime measures duplicate `Read`/`GetFileDiff` calls and emits aggregate local debug diagnostics at report submission. +13. **Shared context cache** - Programmatic reuse remains deferred; current runtime measures duplicate `Read`/`GetFileDiff` calls and emits aggregate local debug diagnostics at report submission. The next lower-risk step is a shared evidence pack, not cross-subagent full-file result caching. 14. **Incremental review caching** - Per-session packet cache is implemented; project-level follow-up reuse remains product-decision-required. ## Verification diff --git a/docs/deep-review-phase2-addendum.md b/docs/deep-review-phase2-addendum.md index b890d7850..9313c0bad 100644 --- a/docs/deep-review-phase2-addendum.md +++ b/docs/deep-review-phase2-addendum.md @@ -29,6 +29,8 @@ | Incremental cache | Per-session cache struct, metadata field, `packet_id` read path, completed-reviewer write-through, and hit/miss report signals exist | Keep project-level cache product-decision-required and deferred | | Token budget | File-split/max-file style guardrails, heuristic per-reviewer prompt-byte estimates, summary-first full-scope decisions, and token-budget warnings exist in the launch manifest | Keep hard prompt clipping and generated byte-accurate prompt enforcement deferred; summary-first must never remove `assigned_scope` files silently | | Shared context cache | Prompt-only reuse plus local duplicate Read/GetFileDiff measurement | Keep interception/cache reuse deferred unless measured duplicate cost becomes a bottleneck | +| Cost-aware strategy depth | Strategy levels, prompt-byte estimates, and summary-first metadata exist, but quick/default do not yet have a strict high-risk-first depth contract | Add an explicit review-depth profile before broadening scheduling: quick = high-risk-only, normal = risk-expanded, deep = full-depth | +| Shared evidence pack | Duplicate `Read`/`GetFileDiff` 
measurement exists, but reviewers do not yet receive a shared metadata pack with changed files, hunk hints, risk tags, and cheap contract hints | Add source-agnostic evidence metadata first; defer full tool-result reuse until measurements justify a tool-pipeline cache | | Observability | User/report-facing reliability summaries exist for cache, skipped-reviewer, token-budget, partial-timeout, retry, and runtime cap rejections | Keep external telemetry deferred unless diagnostics require it | | First-run consent dialog | Dialog is compact, localized, preserves strategy choice, and shows skipped-reviewer warnings only when present | Keep as implemented; future changes must preserve low-density copy, theme stability, and locale coverage | @@ -55,6 +57,8 @@ Use these labels consistently when comparing code and documents: | Deep Review can compete with normal session concurrency | Local cap waiting does not start reviewer execution until capacity is acquired, launch surfaces warn when the target session already has high active Task/subagent activity, and explicit provider transient-capacity reviewer failures lower the turn-local effective cap; automatic provider/adaptive queueing is not implemented | A large review should not silently consume capacity needed by the user's active session work | Future scheduler must reserve or deprioritize Deep Review capacity and reuse the warning plus backend-bound queue controls | | Strategy advice can feel like hidden override pressure | Strategy mismatch is recorded in manifest/prompt metadata, but the final launch strategy remains the team default or explicit user override | If the advisory policy silently changes reviewer roster or strategy, users may lose control of token/concurrency tradeoffs | Keep backend-compatible scoring non-blocking, avoid roster expansion from mismatch metadata, and only surface concise report/launch notes when useful | | Token budget can under-review silently | Prompt-byte pressure now enables a 
`summary_first_full_scope` decision, but it keeps every `assigned_scope` file visible | If budget logic clips files without reporting, the judge may overtrust incomplete coverage | Keep summary-first as orientation only; any uncovered file must be reported in coverage notes/reliability signals | +| Quick/default review can over-spend on low-risk breadth | Large diffs and slow models can push every reviewer into broad discovery before high-risk findings are decided | Users wait longer and spend more tokens without proportional quality gain | Add cost-aware depth profiles and label coverage as high-risk-only, risk-expanded, or full-depth | +| Shared evidence can become a hidden context blob | A naive shared pack could include full diffs or source text and recreate the token/privacy problem it tries to solve | Token usage and privacy risk increase instead of decrease | Keep evidence metadata-first and content-free by default; use targeted reads for confirmation | | Capacity-error classification can be too broad | Current code classifies capacity causes, uses the local-cap path for bounded waiting, and converts explicit provider transient-capacity reviewer failures to `capacity_skipped`; provider capacity errors are not automatically requeued yet | Misclassifying auth, quota, invalid model, cancellation, or tooling errors as queueable can cause endless waiting | Queue/skip only explicit transient/capacity failures such as rate limit, provider concurrency limit, temporary overload, or local cap saturation | | Per-session cache retention semantics are still product-light | Cache stays session-scoped; project-level persistence is deferred | Reviewer outputs can contain sensitive code findings, and future persistence needs deletion/retention rules | Define privacy, invalidation, and deletion semantics before cross-session/project cache | @@ -71,6 +75,8 @@ Use these labels consistently when comparing code and documents: | Automatic retry redispatch | **Partial 
runtime-complete for structured retry admission; scoped follow-up for backend-owned redispatch** | TaskTool now rejects retry calls without structured coverage, non-retryable source status, broad scope, non-lowered timeout, or exhausted retry budget, and prepends the accepted retry scope to the reviewer prompt. The orchestrator still has to issue the retry Task; backend-owned redispatch remains deferred. | | Pre-review summary UI | **Closed, runtime-complete for compact consent summary; richer preview deferred** | The launch dialog now shows concise file count, risk areas, reviewer-call count, optional-reviewer count, selected strategy, summary-first marker, and skipped-reviewer warnings without restoring dense cost/time cards or full reviewer lineups. | | Programmatic shared context cache | **Deferred pending measured need** | Prompt rules, local duplicate Read/GetFileDiff measurement, and aggregate debug diagnostics exist. Tool-result interception remains a separate deep runtime change. | +| Cost-aware review depth | **Scoped follow-up** | Quick/default should reduce breadth by focusing on high-risk hunks, direct contracts, security/config/concurrency/persistence paths, and applicable optional reviewers. Deep remains the explicit full-depth mode. | +| Shared evidence pack | **Scoped follow-up before cache reuse** | Precompute compact metadata once per run so reviewers start from the same changed-file, hunk, risk, packet, and contract hints. Do not reuse full `Read` outputs across subagents until diagnostics justify a separate cache design. | | Project-level cache and retention policy | **Product-decision-required / deferred** | Current production boundary is per-session only; cross-session persistence needs explicit retention, deletion, and visibility approval. 
| | First-run consent dialog and invalid custom reviewer reporting | **No immediate follow-up** | Current code and docs now agree: the dialog is compact/localized, and invalid review agents surface as `invalid_tooling`. | @@ -85,6 +91,8 @@ In scope: - Preserve normal user session responsiveness: Deep Review queueing must not silently consume all available subagent capacity, and high session concurrency should produce a clear pause/continue choice instead of hidden waiting. - Make retry status explicit: structured retry admission is runtime-enforced, while backend-owned redispatch remains prompt-guided/deferred. - Add metrics/report surfaces for cap rejection, partial timeout, retry, cache hit/miss, skipped reviewers, and token-budget decisions. +- Add cost-aware review-depth profiles for quick/default/deep so time and token cost are proportional to risk. +- Add a shared evidence pack that reduces repeated discovery without adding full source/diff content to diagnostics. Out of scope: @@ -92,6 +100,8 @@ Out of scope: - Programmatic shared Read/GetFileDiff cache. - Hard prompt-byte clipping or byte-accurate prompt enforcement. - Large diff summary generation as a mandatory pre-review step. +- Treating quick/default reduced-depth review as full coverage. +- Caching full `Read` tool results across reviewer sessions before duplicate-call diagnostics justify it. - Replacing the prompt-driven DeepReview orchestrator with a full backend-owned DAG scheduler. - Implementing automatic adaptive queueing in the current UI/control round. 
diff --git a/docs/deep-review-phase2-plan.md b/docs/deep-review-phase2-plan.md index d67fdb3ce..6209f25e5 100644 --- a/docs/deep-review-phase2-plan.md +++ b/docs/deep-review-phase2-plan.md @@ -291,6 +291,8 @@ Phase D (Optional / lower priority): | Incremental cache | Per-session data model, metadata field, packet-id read/write path, and hit/miss reporting exist | Project-level persistence and retention/privacy policy only | | Token budget | File-split/max-file guardrails, heuristic prompt-byte estimates, full-scope summary-first metadata, and context-pressure warnings exist | Hard prompt-byte clipping and byte-accurate enforcement only; summary-first must keep assigned files visible | | Shared context cache | Prompt rules plus local duplicate Read/GetFileDiff measurement and aggregate debug diagnostics | Tool-result interception only if real-run measurement shows material duplicate IO/token cost | +| Cost-aware strategy depth | Strategy metadata and token-budget hints exist, but quick/default reviewer prompts may still perform broad discovery on large changes | Add a manifest depth profile so quick means high-risk-only, normal means risk-expanded, and deep means full-depth | +| Shared evidence pack | Duplicate-tool measurement exists, but reviewers still rediscover changed files, hunk locations, risk tags, and cheap contract facts independently | Precompute compact source-agnostic evidence once per run before considering programmatic `Read`/`GetFileDiff` result reuse | | Observability | Report reliability signals cover cache hit/miss, concurrency cap rejection, partial timeout, retry guidance, skipped reviewers, and token-budget tradeoffs | External telemetry/dashboard metrics only if needed later | ### Latest Release-Gate Verification @@ -326,6 +328,12 @@ Post-Round 11b reconciliation re-ran the current release gate after document/cod 6. **P2-7 pre-review summary UI**: Where to display? 
- **Decision**: Use the existing launch confirmation dialog for a compact summary. Defer separate Review Team page or inline Flow Chat preview surfaces until product needs a denser pre-review view. +7. **Cost-aware review depth**: Should quick/default reduce depth automatically for large changes? + - **Recommendation**: Yes, but only through explicit scope wording and coverage metadata. Quick should be a high-risk gate, normal should be risk-expanded, and deep should remain full-depth. Reduced-depth modes must not hide changed files or suppress severe categories such as security, persistence, auth, concurrency, data loss, migrations, cross-boundary APIs, and i18n contract drift. + +8. **Shared evidence before shared tool cache**: Should the runtime cache full tool results across parallel reviewers now? + - **Recommendation**: Not yet. First add a shared evidence pack with changed files, hunk hints, risk/domain tags, packet ids, and cheap contract hints. Keep programmatic result reuse limited to immutable diff/stat data until aggregate duplicate-call measurements prove that full tool-result interception is worth the complexity. + ## Verification Commands | Phase | Command | diff --git a/docs/deep-review-phase3-followup-plan.md b/docs/deep-review-phase3-followup-plan.md new file mode 100644 index 000000000..2586d6c51 --- /dev/null +++ b/docs/deep-review-phase3-followup-plan.md @@ -0,0 +1,436 @@ +# Deep Review Phase 3 Follow-up Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use `superpowers:executing-plans` to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Finish the remaining Deep Review follow-ups with low-noise diagnostics, visible capacity handling, bounded retry controls, and no project-level cache expansion. + +**Architecture:** Keep Deep Review as a prompt-driven orchestrator with deterministic runtime guardrails. Add only narrow runtime automation where the user can see, pause, retry, or disable it. 
Do not introduce a full backend DAG scheduler, project-wide review cache, or hidden concurrency escalation. + +**Tech Stack:** Rust core (`bitfun-core`), events (`bitfun-events`), desktop API, shared React frontend (`src/web-ui`), Vitest, Cargo tests. + +--- + +## Confirmed Product Decisions + +1. **Provider transient capacity queue is allowed** when the error is narrowly classified as rate limit, provider concurrency limit, `Retry-After`, or temporary overload. +2. **Automatic reviewer retry stays manual by default.** Add an explicit retry action for unresolved bounded slices. The user can choose "allow bounded retries without asking again"; after that, Deep Review may run small automatic retries within the configured retry budget and must never form an infinite loop. +3. **Project-level review cache remains deferred.** Current per-session cache behavior stays the production boundary. + +## Non-Negotiable UX Constraints + +- No hidden long wait. Provider or local capacity queueing must be visible, pauseable, cancellable, and timeout-separated from reviewer execution. +- No automatic increase of maximum concurrency. The runtime may lower effective concurrency temporarily, but increasing configured concurrency requires explicit user action. +- No noisy diagnostics. Runtime metrics must be aggregate, per-turn, and logged or surfaced at completion only unless they drive an existing compact queue notice. +- No source, diff, or model output content in diagnostics. Store counts, durations, reason categories, and reviewer ids only. +- No disruptive settings expansion. Prefer adding a small "Review capacity and retry" subsection to existing Review settings over creating a new surface. + +## Current Instrumentation Audit + +| Signal | Current coverage | Current risk | Phase 3 action | +|---|---|---|---| +| Duplicate `Read` / `GetFileDiff` usage | Present. 
`Tool::call` records only DeepReview reviewer `Read` and `GetFileDiff` calls by parent turn, reviewer type, tool name, and normalized path. `submit_code_review` logs aggregate debug diagnostics once. | Per-tool-call in-memory update is acceptable, but it does not produce a durable product decision snapshot. | Keep the current low-content measurement. Add a final aggregate diagnostics object only when a run completes. | +| Local capacity queue wait | Present. Queue state events include status, reason, queue elapsed, run elapsed, effective cap, and max queue wait. | UI events are emitted while waiting; this is needed for the live queue notice but should not become per-event logging. | Keep live events for UI. Add completion-time aggregate counters instead of logging every event. | +| Provider transient capacity failure | Present as `capacity_skipped` with effective-cap learning and report reliability folding. | It skips immediately; it does not yet short-queue and reattempt within a small window. | Add short, visible provider queue retry before final `capacity_skipped`. | +| Concurrency-limited report signal | Present as a final `concurrency_limited` reliability signal. | It explains that concurrency limited coverage, but does not guide the user to a safer next run setting. | Add action-bar/report CTA to run slower next time or open Review settings. | +| Retry guidance | Present as assistant-facing retry guidance and TaskTool structured retry admission. | No explicit user-facing retry action; no persistent "do not ask again" bounded retry preference. | Add explicit retry action and a persisted bounded-auto-retry setting. | +| Queue control actions | Present for local-cap waits: pause, continue, cancel, skip optional. | Provider queue must reuse the same visible control model. | Extend queue state reason/control handling to provider capacity waits. 
| | Token/runtime cost of retry | Partially controlled by `max_retries_per_role`, reduced scope, and lower timeout admission. | A future auto retry could extend review duration unexpectedly. | Add a per-run retry elapsed guard and a retry count display, and stop after one bounded retry per packet unless the configured budget explicitly allows more. | | Project-level cache metrics | Not needed for current boundary. | A cache plan could accidentally imply persistence approval. | Keep project-level cache out of scope. | + +## Recommended User Experience + +### Capacity Problem Guidance + +When Deep Review sees repeated capacity pressure or a provider transient capacity error: + +- Show a compact action-bar notice: "Review capacity is constrained. Queue time does not count against reviewer runtime." +- Offer actions: + - `Wait briefly` / `Continue queue` + - `Pause Deep Review` + - `Cancel queued reviewers` + - `Skip optional extras` + - `Run slower next time` + - `Open Review settings` +- `Run slower next time` should lower Deep Review's configured max parallel reviewers by one, bounded to at least one. It must be explicit and reversible in settings. +- If the target session is busy, prefer "Pause Deep Review" over consuming more subagent capacity. + +This is better than only telling users to edit a numeric setting because it keeps the recovery path local to the failure, while still exposing the persistent setting for users who want control. + +### Retry Experience + +When a reviewer ends as `partial_timeout` or `capacity_skipped` and has structured coverage: + +- Show a `Retry unresolved slice` action near the report/action bar. +- The first click retries only the uncovered files with a lower timeout. +- Offer an inline checkbox or secondary action: `Allow bounded automatic retries for future Deep Reviews`. +- Persist the preference in Review settings and allow the user to turn it off.
+- Auto retry must stop when any of these is true: + - retry budget for the role or packet is exhausted; + - retry scope is not smaller than the source packet; + - retry source status is not `partial_timeout` or transient `capacity_skipped`; + - overall Deep Review elapsed guard is exceeded; + - retry fails with auth, quota, billing, invalid model, policy, invalid tooling, user cancellation, or validation error. + +### Cost And Scope Experience + +Large changes should not force every strategy to perform a full-depth audit. The product should make the selected strategy's review depth explicit and should prefer a fast, high-risk pass before spending time and tokens on broad exploration. + +- `quick`: run a high-risk gate. Review only changed hunks, direct contract/security/config/concurrency paths, and required locale/API consistency checks. Optional reviewers and broad dependency exploration stay off unless the risk classifier marks a matching area. +- `normal`: run a risk-expanded review. Start from changed hunks, include one-hop dependencies only for high-risk domains, and keep optional specialists conditional. This should be the default balance for most product changes. +- `deep`: run full breadth and depth. Permit wider dependency tracing, more reviewer packets, and deeper role-specific exploration when the user explicitly chooses thoroughness. +- For slow models or very large diffs, offer a compact `Fast high-risk scan first` path and a later `Deepen selected areas` follow-up instead of silently widening the first run. +- The launch summary should describe the depth as `High-risk only`, `Risk-expanded`, or `Full-depth` rather than showing dense token estimates. + +This approach reduces negative user impact because the default path still finds severe issues first, while users can deliberately pay for deeper coverage when needed. 
+ +## Data Model Additions + +### Per-Turn Runtime Diagnostics + +Add a small aggregate diagnostics structure under the Deep Review runtime path. It should be in-memory during the turn and optionally attached to final report metadata. + +Fields: + +- `queue_wait_count` +- `queue_wait_total_ms` +- `queue_wait_max_ms` +- `provider_capacity_queue_count` +- `provider_capacity_retry_count` +- `provider_capacity_retry_success_count` +- `capacity_skip_count` +- `effective_parallel_min` +- `effective_parallel_final` +- `manual_queue_action_count` +- `manual_retry_count` +- `auto_retry_count` +- `auto_retry_suppressed_reason_counts` +- `shared_context_total_calls` +- `shared_context_duplicate_calls` +- `shared_context_duplicate_context_count` + +Rules: + +- Update counters in memory only at state transitions or final submission. +- Emit at most one debug log line at final `submit_code_review`. +- Attach only aggregate fields to report metadata; do not render them by default unless they affect coverage reliability. +- Do not store source text, diff text, reviewer output, provider raw body, or full file contents. + +### Review Settings Additions + +Persist these under the default review team config, not as global session behavior: + +- `max_parallel_reviewers`: default `4`, min `1`, max `16`. +- `max_queue_wait_seconds`: default `60`, min `0`, max `600`. +- `allow_provider_capacity_queue`: default `true`. +- `allow_bounded_auto_retry`: default `false`. +- `auto_retry_elapsed_guard_seconds`: default `180`, min `30`, max `900`. + +Global `ai.subagent_max_concurrency` already exists in Rust config and affects all subagent use. It should not be the primary user-facing control for Deep Review because it can affect normal sessions. If exposed in the UI later, label it as an advanced global setting. 
+ +### Cost-Aware Scope Profile + +Add a manifest-level scope profile that controls review depth without changing the configured team roster: + +- `review_depth`: `high_risk_only | risk_expanded | full_depth` +- `risk_focus_tags`: stable tags such as `security`, `api_contract`, `data_loss`, `concurrency`, `persistence`, `i18n`, `frontend_platform_boundary`, `cross_crate`, and `generated_or_low_risk` +- `max_dependency_hops`: `0` for quick, `1` for normal high-risk paths, and `unbounded_or_policy_limited` for deep +- `optional_reviewer_policy`: `risk_matched_only | configured | full` +- `allow_broad_tool_exploration`: false for quick, limited for normal, true for deep +- `coverage_expectation`: a short string for the judge/report so reduced-depth reviews cannot be mistaken for full coverage + +Rules: + +- The profile may shrink reviewer depth and optional reviewer activation, but it must not hide changed files from coverage metadata. +- Quick/default high-risk behavior should be fail-open for severe categories: security, data loss, migrations, auth, cross-boundary API, concurrency, and persistence changes remain in scope. +- If a reviewer skips broad exploration because of the profile, the judge must preserve that as an explicit coverage limitation. +- User-selected `deep` overrides reduced-depth defaults. + +### Shared Evidence Pack + +Create a source-agnostic evidence pack once before subagent launch so each reviewer starts from the same compact facts instead of rediscovering them through repeated tools. 
+ +Initial fields: + +- changed file list, diff stat, and domain tags +- per-file hunk ranges and changed symbols when available +- risk focus tags and strategy depth profile +- review packet ids and assigned scope +- relevant git metadata such as base/head refs or source label when available +- compact contract hints, such as changed Tauri command names or locale key names, when cheaply derivable + +Rules: + +- The evidence pack should prefer metadata, hunk ranges, and short summaries over full file contents. +- It must remain source-agnostic: Git is one source, not the abstraction. +- Subagents should read full files only to confirm a suspected issue or inspect a specific missing context. +- Start with manifest/prompt consumption plus current duplicate-tool diagnostics. Programmatic tool-result reuse should be limited to immutable, content-addressed `GetFileDiff`/diff-stat results first, and only after measured duplicate cost justifies it. +- Do not store source text, full diffs, or reviewer output in shared diagnostics. + +## Implementation Rounds + +### Round 1: Low-Frequency Runtime Diagnostics + +**Goal:** Confirm real run behavior without adding hot-path logging or UI noise. + +**Status:** Implemented for the runtime signals that already exist today: local queue terminal waits, capacity skips, effective concurrency transitions, and shared-context reuse measurements. Provider capacity retry and auto-retry suppression counters are available in the aggregate diagnostics shape, but remain pending until Rounds 3 and 4 introduce those runtime transitions. 
+ +**Files:** + +- Modify: `src/crates/core/src/agentic/deep_review_policy.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` +- Test: `src/crates/core/src/agentic/deep_review_policy.rs` +- Test: `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` + +Steps: + +- [x] Add a per-turn `DeepReviewRuntimeDiagnostics` aggregate in the existing budget tracker. +- [x] Record only current state transitions: local queue terminal wait, capacity skipped, and effective concurrency changes. +- [x] Keep provider capacity retry, retry accepted, and retry suppressed counters as aggregate-only fields for Rounds 3 and 4. +- [x] Merge current shared-context measurement snapshot into the aggregate at final submission. +- [x] Log one debug line at final `submit_code_review` when diagnostics are non-empty. +- [x] Add tests proving duplicate `Read` / `GetFileDiff` counts remain content-free. +- [x] Add tests proving repeated queue events do not create repeated final diagnostics rows. + +Verification: + +- `cargo test -p bitfun-core deep_review_shared_context_diagnostics_stays_out_of_report -- --nocapture` +- `cargo test -p bitfun-core deep_review -- --nocapture` + +Exit criteria: + +- Diagnostics are aggregate-only. +- No per-second queue event is written as a log line. +- Report UI remains unchanged unless a reliability signal already exists. + +### Round 2: Settings And Capacity Guidance + +**Goal:** Give users a clear recovery path without automatic max-concurrency changes. + +**Status:** Implemented for persisted default Review Team capacity/retry settings, the compact Review settings subsection, and the capacity-queue recovery actions `Run slower next time` and `Open Review settings`. The runtime still treats provider transient queueing as a later Round 3 concern, and retry execution remains pending Round 4. 
No Rust global config schema change was required because these fields are scoped to the default Review Team config. + +**Files:** + +- Modify: `src/web-ui/src/shared/services/reviewTeamService.ts` +- Modify: `src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx` +- Modify: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` +- Modify: `src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx` +- Modify: `src/web-ui/src/locales/en-US/settings/review.json` +- Modify: `src/web-ui/src/locales/zh-CN/settings/review.json` +- Modify: `src/web-ui/src/locales/zh-TW/settings/review.json` +- Modify: `src/web-ui/src/locales/en-US/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-CN/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-TW/flow-chat.json` +- Test: `src/web-ui/src/shared/services/reviewTeamService.test.ts` +- Test: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx` +- Test: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts` + +Steps: + +- [x] Persist Deep Review concurrency settings under the default review team config. +- [x] Keep existing defaults unchanged: 4 parallel reviewers, 60s max queue wait. +- [x] Add a compact Review settings subsection named "Capacity and retry". +- [x] Add `max_parallel_reviewers` and `max_queue_wait_seconds` controls with bounds and localized descriptions. +- [x] Add `allow_provider_capacity_queue` and `allow_bounded_auto_retry` toggles. +- [x] Add a `Run slower next time` action path from capacity-limited UI/report to lower `max_parallel_reviewers` by one. +- [x] Keep global `ai.subagent_max_concurrency` out of the normal Review settings path. + +Verification: + +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts` + +Exit criteria: + +- Users can recover from capacity pressure without editing raw config. +- Settings changes are explicit and reversible. 
+- Existing Review Team strategy/model behavior is unchanged. + +### Round 3: Short Provider Capacity Queue + +**Goal:** Automatically wait briefly for provider transient capacity without hiding the wait or stealing normal session capacity. + +**Files:** + +- Modify: `src/crates/core/src/agentic/deep_review_policy.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Modify: `src/crates/events/src/agentic.rs` +- Modify: `src/crates/core/src/agentic/events/types.rs` +- Modify: `src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.ts` +- Modify: `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts` +- Modify: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` +- Test: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Test: `src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.test.ts` +- Test: `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts` + +Steps: + +- [ ] Treat only provider rate limit, provider concurrency limit, `Retry-After`, and temporary overload as provider-queueable. +- [ ] Before returning `capacity_skipped`, perform a short queue wait bounded by `min(Retry-After, max_queue_wait_seconds)`. +- [ ] Re-attempt the reviewer once after the short wait if the user has not paused/cancelled the queue. +- [ ] Emit the existing queue state event with provider-specific reason and aggregate diagnostics counters. +- [ ] Keep reviewer runtime timeout starting only after the re-attempt begins. +- [ ] If the short provider queue expires, return `capacity_skipped` with the same reliability signal used today. +- [ ] Fail fast for auth, quota, billing, invalid model, policy, invalid tooling, validation, and cancellation. 
+ +Verification: + +- `cargo test -p bitfun-core deep_review_provider_capacity_error_builds_capacity_skipped_payload_and_lowers_effective_cap -- --nocapture` +- Add and run focused tests for provider queue success, queue expiry, pause, cancel, and non-queueable provider errors. +- `pnpm --dir src/web-ui run test:run -- src/flow_chat/utils/deepReviewQueueStateEvents.test.ts src/flow_chat/store/deepReviewActionBarStore.test.ts` + +Exit criteria: + +- Provider queue is visible. +- Queue time does not count against reviewer timeout. +- Provider queue never loops indefinitely. + +### Round 4: Explicit Retry Action And Bounded Auto-Retry Preference + +**Goal:** Let users recover partial reviewers without extending reviews indefinitely. + +**Files:** + +- Modify: `src/crates/core/src/agentic/deep_review_policy.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Modify: `src/web-ui/src/flow_chat/utils/codeReviewReport.ts` +- Modify: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` +- Modify: `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts` +- Modify: `src/web-ui/src/flow_chat/services/DeepReviewService.ts` +- Modify: `src/web-ui/src/locales/en-US/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-CN/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-TW/flow-chat.json` +- Test: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Test: `src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts` +- Test: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx` + +Steps: + +- [ ] Add report metadata for retryable unresolved packets: source packet id, source status, covered files, unresolved files, retry timeout. +- [ ] Show `Retry unresolved slice` only when the runtime has structured coverage. +- [ ] When clicked, launch a retry Task with `retry: true`, reduced `retry_scope_files`, lower timeout, and source coverage metadata. 
+- [ ] Add `Allow bounded automatic retries for future Deep Reviews` as an explicit user action. +- [ ] Persist the preference to Review settings. +- [ ] Auto-retry only one bounded unresolved slice at a time and respect `max_retries_per_role`. +- [ ] Stop auto-retry when elapsed guard or budget is exhausted. +- [ ] Surface unresolved status if retry is suppressed or fails non-transiently. + +Verification: + +- `cargo test -p bitfun-core task_tool::tests::deep_review_retry -- --nocapture` +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `pnpm --dir src/web-ui run test:run -- src/flow_chat/utils/codeReviewReport.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx` + +Exit criteria: + +- Default behavior is manual retry. +- The "do not ask again" preference is explicit, reversible, and bounded. +- No retry loop can exceed configured role/packet budget or elapsed guard. + +### Round 5: Cost-Aware Scope And Shared Evidence Planning + +**Goal:** Reduce default review time and token use for large changes by narrowing quick/default review depth and precomputing shared evidence once. + +**Files:** + +- Modify: `src/web-ui/src/shared/services/reviewTeamService.ts` +- Modify: `src/web-ui/src/shared/services/reviewTeamService.test.ts` +- Modify: `src/crates/core/src/agentic/agents/prompts/deep_review_agent.md` +- Modify: `src/crates/core/src/agentic/agents/prompts/review_*_agent.md` +- Modify: `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` +- Modify: `docs/deep-review-design.md` +- Modify: `docs/deep-review-phase2-plan.md` +- Modify: `docs/deep-review-phase2-addendum.md` +- Modify: `docs/deep-review-phase3-followup-plan.md` + +Steps: + +- [ ] Add `review_depth`, `risk_focus_tags`, `max_dependency_hops`, and `coverage_expectation` to the Deep Review manifest. +- [ ] Map `quick` to high-risk-only review, `normal` to risk-expanded review, and `deep` to full-depth review. 
+- [ ] Keep optional reviewers risk-matched in quick/default paths instead of running every configured extra reviewer by default. +- [ ] Add a compact shared evidence pack to the manifest with changed files, hunk ranges, domain tags, packet ids, and cheap contract hints. +- [ ] Update reviewer prompts so subagents start from the evidence pack and call `Read`/`GetFileDiff` only for confirmation or missing context. +- [ ] Update Judge/report wording so reduced-depth reviews are clearly marked as high-risk or risk-expanded coverage, not full coverage. +- [ ] Keep programmatic cross-subagent tool-result reuse deferred unless duplicate-tool diagnostics show material repeated `Read`/`GetFileDiff` cost. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts` +- `cargo test -p bitfun-core deep_review -- --nocapture` +- Focused prompt/manifest snapshot tests if prompt packet generation changes. + +Exit criteria: + +- Quick/default reviews focus on high-risk items without silently dropping file coverage metadata. +- Deep reviews still provide full-depth behavior when explicitly selected. +- Subagents receive enough shared evidence to reduce discovery tool calls. +- The report distinguishes reduced-depth coverage from full-depth review. + +### Round 6: Documentation Reconciliation And Product Risk Review + +**Goal:** Keep docs aligned with code and leave deferred items explicit. + +**Files:** + +- Modify: `docs/deep-review-design.md` +- Modify: `docs/deep-review-phase2-plan.md` +- Modify: `docs/deep-review-phase2-addendum.md` +- Modify: `docs/deep-review-phase3-followup-plan.md` + +Steps: + +- [ ] Update status wording after each completed round. +- [ ] Mark provider short queue as runtime-complete only after tests prove visible bounded behavior. +- [ ] Mark bounded auto retry as runtime-complete only after the setting and loop guards exist. +- [ ] Keep project-level cache as product-decision-required/deferred. 
+- [ ] Keep programmatic shared context cache deferred unless real diagnostics justify it. +- [ ] Keep cost-aware depth profiles explicit so quick/default reduced-depth behavior cannot be mistaken for full review. +- [ ] Add a short "measured outcome" section once real run data is sampled. + +Verification: + +- `rg -n "project-level cache.*implemented|automatic retry.*complete|provider/adaptive queue.*complete|hard prompt.*complete" docs/deep-review-design.md docs/deep-review-phase2-plan.md docs/deep-review-phase2-addendum.md docs/deep-review-phase3-followup-plan.md` +- Expected: no stale wording claims deferred work is complete. + +Exit criteria: + +- Docs distinguish implemented behavior from scoped follow-up. +- Remaining risks are named with an owner decision or an implementation round. + +## Open Risks + +| Risk | Why it matters | Mitigation in this plan | +|---|---|---| +| Diagnostics overhead grows | Per-tool or per-second logging can slow large reviews and create noisy logs. | Aggregate in memory; final debug line only; no content capture. | +| Provider queue feels like a hang | Automatic wait without explanation can look stuck. | Reuse compact queue notice and controls; bounded wait; queue time separated from runtime. | +| Users change the wrong concurrency setting | Global subagent concurrency affects normal sessions. | Prefer Deep Review max parallel reviewer setting; label global concurrency as advanced if exposed later. | +| Retry extends reviews too long | Auto retry can silently double runtime. | Manual by default; opt-in auto retry; role budget, packet budget, smaller scope, lower timeout, elapsed guard. | +| Auto retry repeats bad context | Retrying without coverage can repeat the same large scope. | Require structured coverage and smaller retry scope. | +| Capacity failures are misclassified | Queueing quota/auth/model errors wastes user time. | Keep classifier narrow and fail fast for non-transient categories. 
| +| Settings surface becomes dense | Review settings already controls strategy, models, members, execution. | Add one compact "Capacity and retry" section; no new page. | +| Project cache leaks sensitive findings | Reviewer outputs can contain code summaries and security findings. | Keep project-level cache deferred; no new storage path in Phase 3. | +| Reduced-depth review misses low-risk regressions | Quick/default paths may skip broad dependency exploration to save time and tokens. | Make depth explicit, preserve file coverage metadata, keep severe categories in scope, and offer `Deepen selected areas` when needed. | +| Evidence pack becomes stale or too heavy | Shared facts can drift from actual tool results or recreate the same token burden in the manifest. | Build the pack immediately before launch, keep it metadata-first, and require reviewers to confirm specific issues with targeted reads. | +| Tool-result reuse changes semantics | Reusing full `Read` results across isolated subagents could hide file changes or leak too much context. | Start with shared evidence and duplicate-call measurement; limit any future programmatic reuse to immutable diff/stat data first. | + +## Full Release Gate + +Run after all selected Phase 3 rounds are implemented: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `cargo check --workspace --exclude bitfun-cli` +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` +- `git diff --check` + +Expected: + +- Diagnostics are aggregate and low-frequency. +- Provider transient queue is short, visible, and bounded. +- Users have a clear path to lower Deep Review parallelism without touching global subagent capacity. +- Retry defaults to explicit user action; bounded auto retry exists only after opt-in. +- Quick/default reviews use high-risk or risk-expanded scope profiles, while deep remains full-depth. 
+- Shared evidence reduces repeated discovery work without storing source content in diagnostics. +- Project-level cache remains unimplemented and explicitly deferred. diff --git a/docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md b/docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md new file mode 100644 index 000000000..c152d6ea5 --- /dev/null +++ b/docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md @@ -0,0 +1,1092 @@ +# Deep Review Phase 3 Execution Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use `superpowers:executing-plans` to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Execute the remaining Deep Review Phase 3 work from `docs/deep-review-phase3-followup-plan.md` with low-frequency diagnostics, visible capacity recovery, explicit retry controls, and no project-level cache expansion. + +**Architecture:** Keep Deep Review orchestration in the existing review manifest, TaskTool, runtime policy, events, and Flow Chat action-bar paths. Add narrowly scoped runtime state and user controls instead of a new scheduler, hidden concurrency escalation, or persistent project review cache. Each round must leave the product in a working state with tests proving that queueing, retry, and settings behavior remain bounded. + +**Tech Stack:** Rust core (`bitfun-core`), `bitfun-events`, shared React frontend (`src/web-ui`), i18next locale JSON, Vitest, Cargo tests. + +--- + +## Source Of Truth + +- Primary design document: `docs/deep-review-phase3-followup-plan.md` +- Related design documents: + - `docs/deep-review-design.md` + - `docs/deep-review-phase2-plan.md` + - `docs/deep-review-phase2-addendum.md` + +This plan intentionally does not implement project-level review cache, global automatic concurrency increase, a backend DAG scheduler, or high-frequency telemetry. 
+ +## Current Code Assumptions + +- `src/crates/core/src/agentic/deep_review_policy.rs` owns Deep Review budget tracking, retry admission, effective concurrency state, capacity classification, and queue state payload types. +- `src/crates/core/src/agentic/tools/implementations/task_tool.rs` owns reviewer Task launch, queue wait behavior, retry coverage validation, provider capacity handling, and queue state event emission. +- `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` owns final `submit_code_review` reliability folding and shared-context diagnostics logging. +- `src/web-ui/src/shared/services/reviewTeamService.ts` owns default Review Team config, manifest construction, execution policy persistence, and concurrency policy defaults. +- `src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx` owns the Review settings UI. +- `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` and `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts` own live Deep Review queue and recovery controls. +- No `ReviewConfig.test.tsx` exists today, so Round 2 should add one only if component-level coverage is required after the service tests are in place. + +## Execution Rules + +- Do not stage unrelated untracked files. +- Keep all logs in English and without emoji. +- Use existing queue/action-bar UI patterns before adding new surfaces. +- Add locale entries for every new user-facing string in `en-US`, `zh-CN`, and `zh-TW`. +- Use aggregate counters and final snapshots only. Do not store source text, diff text, reviewer output, provider raw body, or full file contents in diagnostics. +- Queue time must not be counted as reviewer runtime timeout. +- Automatic retry is disabled by default and must stay bounded after the user enables it. +- Quick/default review paths should focus on high-risk coverage first; only `deep` should imply full-depth exploration across broad dependencies. 
+- Shared review evidence should be generated once and passed to reviewers as compact metadata before adding any deeper tool-result cache. +- After each round, run the listed focused verification and update the status wording in `docs/deep-review-phase3-followup-plan.md`. + +## Commit Strategy + +Use one commit per completed round unless two adjacent rounds are very small and verified together. + +Suggested commit boundaries: + +1. `feat(deep-review): add aggregate runtime diagnostics` +2. `feat(review-team): expose capacity and retry settings` +3. `feat(deep-review): queue transient provider capacity errors` +4. `feat(deep-review): add bounded retry controls` +5. `feat(deep-review): add cost-aware review scope` +6. `docs(deep-review): reconcile phase three status` + +--- + +## Round 0: Preflight And Baseline + +**Goal:** Prove the branch starts from a known state and prevent unrelated files from entering the work. + +**Files:** + +- Read: `docs/deep-review-phase3-followup-plan.md` +- Read: `src/crates/core/src/agentic/deep_review_policy.rs` +- Read: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Read: `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` +- Read: `src/web-ui/src/shared/services/reviewTeamService.ts` +- Read: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` + +- [x] **Step 0.1: Confirm git scope** + +Run: + +```powershell +git status --short --branch +``` + +Expected: + +- Current branch is the Deep Review feature branch. +- Pre-existing unrelated untracked files remain untracked. +- Only files from this plan are staged or committed during execution. 
+ +- [x] **Step 0.2: Run the smallest baseline tests** + +Run: + +```powershell +cargo test -p bitfun-core deep_review -- --nocapture +pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts src/flow_chat/store/deepReviewActionBarStore.test.ts src/flow_chat/utils/deepReviewQueueStateEvents.test.ts +``` + +Expected: + +- Existing Deep Review Rust tests pass. +- Existing Review Team and Deep Review action-bar focused web tests pass. +- If an unrelated failure appears, capture the failing test name and do not mix that repair with Phase 3 implementation unless it blocks this work. + +- [x] **Step 0.3: Confirm feature boundaries** + +Run: + +```powershell +rg -n "project-level cache|global subagent|max_parallel|auto retry|provider capacity|shared_context" docs/deep-review-phase3-followup-plan.md +``` + +Expected: + +- Project-level cache is marked deferred. +- Global subagent concurrency is not the primary user-facing Deep Review setting. +- Provider transient queue and bounded retry remain the active Phase 3 scope. + +--- + +## Round 1: Low-Frequency Runtime Diagnostics + +**Goal:** Add aggregate runtime diagnostics for real-run analysis without hot-path logging or report noise. 
+ +**Files:** + +- Modify: `src/crates/core/src/agentic/deep_review_policy.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` +- Test: `src/crates/core/src/agentic/deep_review_policy.rs` +- Test: `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` +- Docs: `docs/deep-review-phase3-followup-plan.md` + +- [x] **Step 1.1: Write diagnostics tracker tests first** + +Add tests in `src/crates/core/src/agentic/deep_review_policy.rs` near the existing Deep Review budget tracker tests: + +```rust +#[test] +fn runtime_diagnostics_records_queue_and_capacity_transitions_as_counts() { + let tracker = DeepReviewBudgetTracker::new(); + + tracker.record_runtime_queue_wait("turn-runtime", 1_250); + tracker.record_runtime_queue_wait("turn-runtime", 2_500); + tracker.record_runtime_capacity_skip( + "turn-runtime", + DeepReviewCapacityQueueReason::ProviderConcurrencyLimit, + ); + + let diagnostics = tracker + .runtime_diagnostics_snapshot("turn-runtime") + .expect("runtime diagnostics should exist"); + + assert_eq!(diagnostics.queue_wait_count, 2); + assert_eq!(diagnostics.queue_wait_total_ms, 3_750); + assert_eq!(diagnostics.queue_wait_max_ms, 2_500); + assert_eq!(diagnostics.capacity_skip_count, 1); + assert_eq!(diagnostics.provider_capacity_queue_count, 0); +} + +#[test] +fn runtime_diagnostics_merges_shared_context_without_content() { + let tracker = DeepReviewBudgetTracker::new(); + + tracker.record_shared_context_tool_use( + "turn-runtime-shared", + "ReviewSecurity", + "Read", + "src/lib.rs", + ); + tracker.record_shared_context_tool_use( + "turn-runtime-shared", + "ReviewArchitecture", + "Read", + "src/lib.rs", + ); + + let diagnostics = tracker + .runtime_diagnostics_snapshot("turn-runtime-shared") + .expect("runtime diagnostics should exist"); + + assert_eq!(diagnostics.shared_context_total_calls, 2); + 
assert_eq!(diagnostics.shared_context_duplicate_context_count, 1); + assert!(!format!("{diagnostics:?}").contains("fn ")); +} +``` + +Run: + +```powershell +cargo test -p bitfun-core runtime_diagnostics_ -- --nocapture +``` + +Expected: + +- Fails because `DeepReviewRuntimeDiagnostics` and recorder methods do not exist yet. + +- [x] **Step 1.2: Add the aggregate diagnostics type** + +In `src/crates/core/src/agentic/deep_review_policy.rs`, add a serializable diagnostics struct near the effective concurrency structs: + +```rust +#[derive(Debug, Clone, Default, Serialize, PartialEq, Eq)] +pub struct DeepReviewRuntimeDiagnostics { + pub queue_wait_count: usize, + pub queue_wait_total_ms: u64, + pub queue_wait_max_ms: u64, + pub provider_capacity_queue_count: usize, + pub provider_capacity_retry_count: usize, + pub provider_capacity_retry_success_count: usize, + pub capacity_skip_count: usize, + pub effective_parallel_min: Option<usize>, + pub effective_parallel_final: Option<usize>, + pub manual_queue_action_count: usize, + pub manual_retry_count: usize, + pub auto_retry_count: usize, + pub auto_retry_suppressed_reason_counts: BTreeMap<String, usize>, + pub shared_context_total_calls: usize, + pub shared_context_duplicate_calls: usize, + pub shared_context_duplicate_context_count: usize, +} +``` + +Add `runtime_diagnostics: DeepReviewRuntimeDiagnostics` to the per-turn budget record. Use `BTreeMap` for deterministic test output. 
+- [x] **Step 1.3: Add recorder and snapshot methods** + +In `DeepReviewBudgetTracker`, add methods with these names and behavior: + +```rust +pub fn record_runtime_queue_wait(&self, parent_dialog_turn_id: &str, queue_elapsed_ms: u64) +pub fn record_runtime_provider_capacity_queue(&self, parent_dialog_turn_id: &str) +pub fn record_runtime_provider_capacity_retry(&self, parent_dialog_turn_id: &str) +pub fn record_runtime_provider_capacity_retry_success(&self, parent_dialog_turn_id: &str) +pub fn record_runtime_capacity_skip( + &self, + parent_dialog_turn_id: &str, + reason: DeepReviewCapacityQueueReason, +) +pub fn record_runtime_manual_queue_action(&self, parent_dialog_turn_id: &str) +pub fn record_runtime_manual_retry(&self, parent_dialog_turn_id: &str) +pub fn record_runtime_auto_retry(&self, parent_dialog_turn_id: &str) +pub fn record_runtime_auto_retry_suppressed( + &self, + parent_dialog_turn_id: &str, + reason: &str, +) +pub fn runtime_diagnostics_snapshot( + &self, + parent_dialog_turn_id: &str, +) -> Option<DeepReviewRuntimeDiagnostics> +``` + +Snapshot behavior: + +- Merge current shared-context measurement at snapshot time. +- Return `None` when every counter and optional effective parallel field is empty. +- Do not include raw paths in the returned diagnostics. + +- [x] **Step 1.4: Wire diagnostics at state transitions only** + +Update `src/crates/core/src/agentic/tools/implementations/task_tool.rs`: + +- When local queue emits a terminal wait result, record `record_deep_review_runtime_queue_wait`. +- When provider capacity is classified and skipped, record `record_deep_review_runtime_capacity_skip`. +- Do not record every one-second queue event as a log or diagnostics row. + +Update `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs`: + +- Replace shared-context-only final debug logging with a runtime diagnostics final snapshot. 
+- Keep the report payload free of `runtime_diagnostics` unless a later round explicitly needs it for a user-visible reliability signal. + +- [x] **Step 1.5: Run focused verification** + +Run: + +```powershell +cargo test -p bitfun-core runtime_diagnostics_ -- --nocapture +cargo test -p bitfun-core deep_review_shared_context_diagnostics_stays_out_of_report -- --nocapture +cargo test -p bitfun-core deep_review -- --nocapture +``` + +Expected: + +- Diagnostics tests pass. +- Shared-context diagnostics remain out of report content. +- Existing Deep Review tests pass. + +- [x] **Step 1.6: Update Phase 3 status wording** + +In `docs/deep-review-phase3-followup-plan.md`, mark Round 1 as implemented only after Step 1.5 passes. Keep provider queue, retry action, and settings work as pending. + +- [ ] **Step 1.7: Commit Round 1** + +Run: + +```powershell +git add src/crates/core/src/agentic/deep_review_policy.rs src/crates/core/src/agentic/tools/implementations/task_tool.rs src/crates/core/src/agentic/tools/implementations/code_review_tool.rs docs/deep-review-phase3-followup-plan.md +git commit -m "feat(deep-review): add aggregate runtime diagnostics" +``` + +Expected: + +- Commit includes only Round 1 files. + +--- + +## Round 2: Review Capacity And Retry Settings + +**Goal:** Give users explicit Deep Review capacity and retry controls without exposing global subagent concurrency as the normal path. + +**Status:** Implemented and locally verified. The settings fit existing component styles without `ReviewConfig.scss` changes, and the persistent controls are scoped to the default Review Team config rather than Rust global config. Round 2 commit is intentionally still pending until requested. 
+ +**Files:** + +- Modify: `src/web-ui/src/shared/services/reviewTeamService.ts` +- Modify: `src/web-ui/src/shared/services/reviewTeamService.test.ts` +- Modify: `src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx` +- Modify: `src/web-ui/src/infrastructure/config/components/ReviewConfig.scss` +- Modify: `src/web-ui/src/locales/en-US/settings/review.json` +- Modify: `src/web-ui/src/locales/zh-CN/settings/review.json` +- Modify: `src/web-ui/src/locales/zh-TW/settings/review.json` +- Docs: `docs/deep-review-phase3-followup-plan.md` + +- [x] **Step 2.1: Write service tests for persisted concurrency settings** + +Add tests in `src/web-ui/src/shared/services/reviewTeamService.test.ts` covering: + +```ts +it('loads default concurrency and retry settings when config is missing', async () => { + vi.mocked(configAPI.getConfig).mockRejectedValueOnce( + new Error("Config path 'ai.review_teams.default' not found"), + ); + + const team = await loadDefaultReviewTeam(WORKSPACE_PATH); + + expect(team.concurrencyPolicy).toEqual({ + maxParallelInstances: 4, + staggerSeconds: 0, + maxQueueWaitSeconds: 60, + batchExtrasSeparately: true, + allowProviderCapacityQueue: true, + allowBoundedAutoRetry: false, + autoRetryElapsedGuardSeconds: 180, + }); +}); + +it('clamps saved concurrency and retry settings to supported bounds', async () => { + vi.mocked(configAPI.getConfig).mockResolvedValueOnce({ + extra_subagent_ids: [], + strategy_level: 'normal', + member_strategy_overrides: {}, + reviewer_timeout_seconds: 600, + judge_timeout_seconds: 600, + reviewer_file_split_threshold: 20, + max_same_role_instances: 3, + max_retries_per_role: 1, + max_parallel_reviewers: 99, + max_queue_wait_seconds: 999, + allow_provider_capacity_queue: false, + allow_bounded_auto_retry: true, + auto_retry_elapsed_guard_seconds: 1, + }); + + const team = await loadDefaultReviewTeam(WORKSPACE_PATH); + + expect(team.concurrencyPolicy.maxParallelInstances).toBe(16); + 
expect(team.concurrencyPolicy.maxQueueWaitSeconds).toBe(600); + expect(team.concurrencyPolicy.allowProviderCapacityQueue).toBe(false); + expect(team.concurrencyPolicy.allowBoundedAutoRetry).toBe(true); + expect(team.concurrencyPolicy.autoRetryElapsedGuardSeconds).toBe(30); +}); +``` + +Use the existing `WORKSPACE_PATH` fixture constant if present. If no constant exists, introduce `const WORKSPACE_PATH = 'D:/workspace/project-a';` once at the top of the test file. + +Run: + +```powershell +pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts +``` + +Expected: + +- Fails until the persisted fields and normalization logic exist. + +- [x] **Step 2.2: Extend Review Team concurrency types** + +In `src/web-ui/src/shared/services/reviewTeamService.ts`, extend `ReviewTeamStoredConfig`, `ReviewTeamConcurrencyPolicy`, `ReviewTeam`, and `ReviewTeamRunManifest` handling with: + +```ts +allowProviderCapacityQueue: boolean; +allowBoundedAutoRetry: boolean; +autoRetryElapsedGuardSeconds: number; +``` + +Persist using snake_case fields: + +```ts +max_parallel_reviewers +max_queue_wait_seconds +allow_provider_capacity_queue +allow_bounded_auto_retry +auto_retry_elapsed_guard_seconds +``` + +Keep manifest shape camelCase: + +```ts +concurrencyPolicy: { + maxParallelInstances, + staggerSeconds, + maxQueueWaitSeconds, + batchExtrasSeparately, + allowProviderCapacityQueue, + allowBoundedAutoRetry, + autoRetryElapsedGuardSeconds, +} +``` + +- [x] **Step 2.3: Add save helper for concurrency settings** + +Add a service function near `saveDefaultReviewTeamExecutionPolicy`: + +```ts +export async function saveDefaultReviewTeamConcurrencyPolicy( + concurrencyPolicy: ReviewTeamConcurrencyPolicy, +): Promise<void> +``` + +Implementation requirements: + +- Load the current default review team config. +- Preserve existing extra reviewers, strategy, member overrides, and execution policy values. +- Save only normalized supported bounds. 
+- Do not write global `ai.subagent_max_concurrency`. + +- [x] **Step 2.4: Add compact settings UI** + +In `src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx`: + +- Import `Switch` from `@/component-library`. +- Add `savingConcurrencyKey` state. +- Add handlers for numeric concurrency settings and boolean toggles. +- Add one compact section titled by locale key `capacity.title`. + +Controls: + +- `max_parallel_reviewers`: `NumberInput`, min `1`, max `16`. +- `max_queue_wait_seconds`: `NumberInput`, min `0`, max `600`. +- `allow_provider_capacity_queue`: `Switch`. +- `allow_bounded_auto_retry`: `Switch`. +- `auto_retry_elapsed_guard_seconds`: `NumberInput`, min `30`, max `900`, disabled when bounded auto retry is off. + +Keep visual density consistent with the existing execution policy rows. + +- [x] **Step 2.5: Add locale entries** + +Add matching keys to: + +- `src/web-ui/src/locales/en-US/settings/review.json` +- `src/web-ui/src/locales/zh-CN/settings/review.json` +- `src/web-ui/src/locales/zh-TW/settings/review.json` + +Required key structure: + +```json +{ + "capacity": { + "title": "...", + "maxParallelReviewers": { + "label": "...", + "description": "..." + }, + "maxQueueWaitSeconds": { + "label": "...", + "description": "..." + }, + "allowProviderCapacityQueue": { + "label": "...", + "description": "..." + }, + "allowBoundedAutoRetry": { + "label": "...", + "description": "..." + }, + "autoRetryElapsedGuardSeconds": { + "label": "...", + "description": "..." + } + } +} +``` + +- [x] **Step 2.6: Run Round 2 verification** + +Run: + +```powershell +pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts +pnpm run lint:web +pnpm run type-check:web +``` + +Expected: + +- Service tests pass. +- Lint and type-check pass. +- No new untranslated literal appears in `ReviewConfig.tsx`. 
+ +- [ ] **Step 2.7: Update status and commit** + +Update `docs/deep-review-phase3-followup-plan.md` to mark settings persistence and UI as implemented after Step 2.6 passes. + +Run: + +```powershell +git add src/web-ui/src/shared/services/reviewTeamService.ts src/web-ui/src/shared/services/reviewTeamService.test.ts src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx src/web-ui/src/infrastructure/config/components/ReviewConfig.scss src/web-ui/src/locales/en-US/settings/review.json src/web-ui/src/locales/zh-CN/settings/review.json src/web-ui/src/locales/zh-TW/settings/review.json docs/deep-review-phase3-followup-plan.md +git commit -m "feat(review-team): expose capacity and retry settings" +``` + +Expected: + +- Commit contains only Round 2 files. + +--- + +## Round 3: Short Provider Capacity Queue + +**Goal:** Reattempt narrowly classified provider transient capacity errors after a visible bounded wait. + +**Files:** + +- Modify: `src/crates/core/src/agentic/deep_review_policy.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Modify: `src/crates/events/src/agentic.rs` +- Modify: `src/crates/core/src/agentic/events/types.rs` +- Modify: `src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.ts` +- Modify: `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts` +- Modify: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` +- Modify: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss` +- Modify: `src/web-ui/src/locales/en-US/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-CN/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-TW/flow-chat.json` +- Test: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Test: `src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.test.ts` +- Test: `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts` +- Test: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx` +- Docs: 
`docs/deep-review-phase3-followup-plan.md` + +- [ ] **Step 3.1: Write provider queue classification tests** + +In `src/crates/core/src/agentic/deep_review_policy.rs`, add tests proving: + +- `rate_limit`, `provider_concurrency_limit`, `retry_after`, and temporary overload are queueable. +- auth, quota, billing, invalid model, invalid tooling, policy, validation, and cancellation are not queueable. +- `Retry-After` values are bounded by `max_queue_wait_seconds`. + +Run: + +```powershell +cargo test -p bitfun-core capacity_error -- --nocapture +``` + +Expected: + +- Existing classification tests pass. +- New max-wait bounding tests fail until implemented. + +- [ ] **Step 3.2: Add provider queue decision helpers** + +In `src/crates/core/src/agentic/deep_review_policy.rs`, add helpers: + +```rust +pub fn provider_capacity_queue_wait( + reason: DeepReviewCapacityQueueReason, + retry_after_seconds: Option<u64>, + max_queue_wait_seconds: u64, + allow_provider_capacity_queue: bool, +) -> Option<Duration> +``` + +Rules: + +- Return `None` when `allow_provider_capacity_queue` is false. +- Return `None` for non-provider capacity reasons unless the reason is `RetryAfter`. +- Return `None` when `max_queue_wait_seconds` is `0`. +- Return `Some(duration)` bounded by `max_queue_wait_seconds`. +- Never return an unbounded duration. + +- [ ] **Step 3.3: Convert provider skip into one visible queue reattempt** + +In `src/crates/core/src/agentic/tools/implementations/task_tool.rs`: + +- When a reviewer fails with queueable provider capacity, emit a provider queue state event before final skip. +- Wait for the bounded provider queue duration. +- Respect existing pause, continue, cancel, and skip controls if they are stored by parent turn. +- Reattempt the reviewer once after the wait. +- Start reviewer runtime timeout only for the actual execution attempt. +- On second provider capacity failure or queue expiry, return the existing `capacity_skipped` payload. 
+ +Required queue result fields: + +```json +{ + "status": "capacity_skipped", + "reason": "provider_concurrency_limit", + "queue_elapsed_ms": 60000 +} +``` + +- [ ] **Step 3.4: Extend event and UI reason mapping** + +In event types and web queue parsers, add provider-specific reason display without creating a new banner component: + +- `provider_rate_limit` +- `provider_concurrency_limit` +- `retry_after` +- `provider_temporary_overload` + +UI copy must say queue time does not count against reviewer runtime. + +- [ ] **Step 3.5: Add action-bar tests** + +Add or update tests proving: + +- Provider queue notice renders as a compact action-bar state. +- Pause and cancel actions remain available. +- Provider queue reason text is localized. +- Repeated queue events update the same state rather than appending duplicate notices. + +Run: + +```powershell +pnpm --dir src/web-ui run test:run -- src/flow_chat/utils/deepReviewQueueStateEvents.test.ts src/flow_chat/store/deepReviewActionBarStore.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx +``` + +Expected: + +- New provider reason tests pass. +- Existing local capacity queue tests still pass. + +- [ ] **Step 3.6: Run Rust verification** + +Run: + +```powershell +cargo test -p bitfun-core deep_review_provider_capacity_error_builds_capacity_skipped_payload_and_lowers_effective_cap -- --nocapture +cargo test -p bitfun-core deep_review -- --nocapture +``` + +Expected: + +- Existing capacity-skip behavior remains compatible after bounded provider queueing. +- Deep Review suite passes. + +- [ ] **Step 3.7: Update status and commit** + +Update `docs/deep-review-phase3-followup-plan.md` to mark short provider capacity queue as implemented only after Steps 3.5 and 3.6 pass. 
+ +Run: + +```powershell +git add src/crates/core/src/agentic/deep_review_policy.rs src/crates/core/src/agentic/tools/implementations/task_tool.rs src/crates/events/src/agentic.rs src/crates/core/src/agentic/events/types.rs src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.ts src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss src/web-ui/src/locales/en-US/flow-chat.json src/web-ui/src/locales/zh-CN/flow-chat.json src/web-ui/src/locales/zh-TW/flow-chat.json src/web-ui/src/flow_chat/utils/deepReviewQueueStateEvents.test.ts src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx docs/deep-review-phase3-followup-plan.md +git commit -m "feat(deep-review): queue transient provider capacity errors" +``` + +Expected: + +- Commit contains provider queue runtime, UI, locale, tests, and status doc only. + +--- + +## Round 4: Explicit Retry Action And Bounded Auto-Retry Preference + +**Goal:** Let users retry unresolved reviewer slices manually by default, and only run bounded automatic retries after explicit opt-in. 
+ +**Files:** + +- Modify: `src/crates/core/src/agentic/deep_review_policy.rs` +- Modify: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Modify: `src/web-ui/src/flow_chat/utils/codeReviewReport.ts` +- Modify: `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts` +- Modify: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` +- Modify: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss` +- Modify: `src/web-ui/src/flow_chat/services/DeepReviewService.ts` +- Modify: `src/web-ui/src/shared/services/reviewTeamService.ts` +- Modify: `src/web-ui/src/locales/en-US/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-CN/flow-chat.json` +- Modify: `src/web-ui/src/locales/zh-TW/flow-chat.json` +- Test: `src/crates/core/src/agentic/deep_review_policy.rs` +- Test: `src/crates/core/src/agentic/tools/implementations/task_tool.rs` +- Test: `src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts` +- Test: `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts` +- Test: `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx` +- Test: `src/web-ui/src/flow_chat/services/DeepReviewService.test.ts` +- Docs: `docs/deep-review-phase3-followup-plan.md` + +- [ ] **Step 4.1: Write retry metadata parser tests** + +In `src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts`, add cases for: + +- `partial_timeout` packet with unresolved files returns a retryable slice. +- transient `capacity_skipped` packet with unresolved files returns a retryable slice. +- non-transient `capacity_skipped` returns no retry action. +- retry scope must be smaller than original assigned scope. 
+ +Expected retryable shape: + +```ts +{ + sourcePacketId: 'ReviewFrontend-1', + sourceStatus: 'partial_timeout', + reviewerId: 'ReviewFrontend', + retryScopeFiles: ['src/web-ui/src/App.tsx'], + coveredFiles: ['src/web-ui/src/index.tsx'], + retryTimeoutSeconds: 300, +} +``` + +Run: + +```powershell +pnpm --dir src/web-ui run test:run -- src/flow_chat/utils/codeReviewReport.test.ts +``` + +Expected: + +- Fails until retry metadata extraction exists. + +- [ ] **Step 4.2: Add retry action-bar state** + +In `src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts`, add state for: + +```ts +retryableSlices: DeepReviewRetryableSlice[]; +retryInProgress: boolean; +autoRetryPreferenceVisible: boolean; +``` + +Actions: + +- `setRetryableSlices` +- `startManualRetry` +- `finishManualRetry` +- `setAutoRetryPreferenceVisible` + +Do not show retry controls when there is no structured coverage. + +- [ ] **Step 4.3: Add explicit retry UI** + +In `DeepReviewActionBar.tsx`: + +- Add `Retry unresolved slice` button. +- Add secondary action `Allow bounded automatic retries for future Deep Reviews`. +- Keep the copy compact. +- Keep button disabled while a retry is already in progress. +- Show unresolved status when retry is suppressed by bounds. + +Locale keys must live in `flow-chat.json` for all three locales. + +- [ ] **Step 4.4: Wire manual retry service call** + +In `src/web-ui/src/flow_chat/services/DeepReviewService.ts`: + +- Add a method or helper that launches a Deep Review retry from a retryable slice. +- Pass `retry: true`, `retry_coverage`, `retry_scope_files`, source status, source packet id, covered files, and lower timeout. +- Do not alter the original report or hide unresolved findings while retry is running. + +- [ ] **Step 4.5: Add bounded auto-retry admission** + +In Rust policy and TaskTool: + +- Use `allowBoundedAutoRetry` from manifest concurrency policy. +- Record `manual_retry_count`, `auto_retry_count`, and suppressed reasons. 
+- Permit automatic retry only when: + - preference is true; + - source status is `partial_timeout` or transient `capacity_skipped`; + - retry scope is non-empty and smaller than source scope; + - role retry budget remains; + - elapsed guard remains; + - failure type is not auth, quota, billing, invalid model, policy, invalid tooling, validation, or cancellation. + +Use stable suppression reason strings: + +```rust +"preference_disabled" +"budget_exhausted" +"scope_not_reduced" +"elapsed_guard_exceeded" +"non_retryable_status" +"non_transient_error" +"missing_coverage" +``` + +- [ ] **Step 4.6: Persist auto-retry preference from the UI** + +Use the Round 2 `saveDefaultReviewTeamConcurrencyPolicy` helper to update `allowBoundedAutoRetry`. + +Requirements: + +- The first retry remains manual. +- The opt-in action updates settings explicitly. +- Settings can disable the preference later. + +- [ ] **Step 4.7: Run Round 4 verification** + +Run: + +```powershell +cargo test -p bitfun-core task_tool -- --nocapture +cargo test -p bitfun-core deep_review -- --nocapture +pnpm --dir src/web-ui run test:run -- src/flow_chat/utils/codeReviewReport.test.ts src/flow_chat/store/deepReviewActionBarStore.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/flow_chat/services/DeepReviewService.test.ts +pnpm run lint:web +pnpm run type-check:web +``` + +Expected: + +- Manual retry works for structured unresolved slices. +- Bounded auto retry is disabled by default. +- Enabled auto retry stops at configured guards. +- No missing locale keys are introduced. + +- [ ] **Step 4.8: Update status and commit** + +Update `docs/deep-review-phase3-followup-plan.md` to mark retry controls as implemented after Step 4.7 passes. 
+ +Run: + +```powershell +git add src/crates/core/src/agentic/deep_review_policy.rs src/crates/core/src/agentic/tools/implementations/task_tool.rs src/web-ui/src/flow_chat/utils/codeReviewReport.ts src/web-ui/src/flow_chat/store/deepReviewActionBarStore.ts src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.scss src/web-ui/src/flow_chat/services/DeepReviewService.ts src/web-ui/src/shared/services/reviewTeamService.ts src/web-ui/src/locales/en-US/flow-chat.json src/web-ui/src/locales/zh-CN/flow-chat.json src/web-ui/src/locales/zh-TW/flow-chat.json src/web-ui/src/flow_chat/utils/codeReviewReport.test.ts src/web-ui/src/flow_chat/store/deepReviewActionBarStore.test.ts src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/web-ui/src/flow_chat/services/DeepReviewService.test.ts docs/deep-review-phase3-followup-plan.md +git commit -m "feat(deep-review): add bounded retry controls" +``` + +Expected: + +- Commit includes only retry runtime, UI, service, locales, tests, and status doc. + +--- + +## Round 5: Cost-Aware Review Scope And Shared Evidence + +**Goal:** Reduce review time and token usage on large changes by making quick/default strategies high-risk-first and by giving reviewers a shared evidence pack before they perform targeted reads. 
+ +**Files:** + +- Modify: `src/web-ui/src/shared/services/reviewTeamService.ts` +- Modify: `src/web-ui/src/shared/services/reviewTeamService.test.ts` +- Modify: `src/crates/core/src/agentic/agents/prompts/deep_review_agent.md` +- Modify: `src/crates/core/src/agentic/agents/prompts/review_business_logic_agent.md` +- Modify: `src/crates/core/src/agentic/agents/prompts/review_performance_agent.md` +- Modify: `src/crates/core/src/agentic/agents/prompts/review_security_agent.md` +- Modify: `src/crates/core/src/agentic/agents/prompts/review_architecture_agent.md` +- Modify: `src/crates/core/src/agentic/agents/prompts/review_frontend_agent.md` +- Modify: `src/crates/core/src/agentic/agents/prompts/review_quality_gate_agent.md` +- Modify: `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` +- Docs: `docs/deep-review-phase3-followup-plan.md` + +- [ ] **Step 5.1: Add manifest tests for strategy depth** + +In `src/web-ui/src/shared/services/reviewTeamService.test.ts`, add tests proving: + +- `quick` produces `reviewDepth: 'high_risk_only'`. +- `normal` produces `reviewDepth: 'risk_expanded'`. +- `deep` produces `reviewDepth: 'full_depth'`. +- Quick/default optional reviewers are included only when the target tags match their applicability rules or when the user explicitly selects deep/full coverage. +- Every reduced-depth manifest still includes file coverage metadata and a `coverageExpectation` string. + +Run: + +```powershell +pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts +``` + +Expected: + +- New tests fail until the manifest carries the scope profile. 
+ +- [ ] **Step 5.2: Add the cost-aware scope profile** + +In `src/web-ui/src/shared/services/reviewTeamService.ts`, add a manifest profile with this shape: + +```ts +type DeepReviewScopeProfile = { + reviewDepth: 'high_risk_only' | 'risk_expanded' | 'full_depth'; + riskFocusTags: string[]; + maxDependencyHops: number | 'policy_limited'; + optionalReviewerPolicy: 'risk_matched_only' | 'configured' | 'full'; + allowBroadToolExploration: boolean; + coverageExpectation: string; +}; +``` + +Mapping: + +- `quick`: high-risk only, `maxDependencyHops: 0`, optional reviewers risk-matched only, broad exploration off. +- `normal`: risk-expanded, `maxDependencyHops: 1`, optional reviewers configured but still applicability-gated, broad exploration limited. +- `deep`: full-depth, `maxDependencyHops: 'policy_limited'`, optional reviewers follow configured/deep behavior, broad exploration allowed. + +- [ ] **Step 5.3: Build a shared evidence pack** + +In `reviewTeamService.ts`, add a compact evidence pack under the run manifest: + +```ts +type DeepReviewEvidencePack = { + changedFiles: string[]; + diffStat: { + fileCount: number; + totalChangedLines: number; + }; + domainTags: string[]; + riskFocusTags: string[]; + packetIds: string[]; + hunkHints: Array<{ + filePath: string; + changedLineCount: number; + }>; + contractHints: Array<{ + kind: 'i18n_key' | 'tauri_command' | 'api_contract' | 'config_key'; + value: string; + filePath: string; + }>; +}; +``` + +Rules: + +- Do not include full file text or full diff text. +- Derive contract hints only from already available changed files, target classification, and cheap key/name extraction. +- Keep the pack source-agnostic. If the target is not Git-based, file lists and source labels are still valid. + +- [ ] **Step 5.4: Update reviewer prompt boundaries** + +Update `deep_review_agent.md` and specialist prompts so reviewers: + +- Start from the shared evidence pack. +- Treat quick mode as high-risk gate, not broad audit. 
+- Treat normal mode as risk-expanded, one-hop review. +- Use `Read` and `GetFileDiff` for targeted confirmation, not initial discovery. +- Report coverage limitations when the scope profile prevents broad exploration. + +Update `review_quality_gate_agent.md` so the judge does not treat a high-risk-only pass as full-depth coverage. + +- [ ] **Step 5.5: Fold reduced-depth status into final report metadata** + +In `code_review_tool.rs`, preserve `reviewDepth`, `coverageExpectation`, and reduced-depth reliability notes when the manifest provides them. + +Do not create a dense report section; use existing reliability/coverage wording. + +- [ ] **Step 5.6: Run focused verification** + +Run: + +```powershell +pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts +cargo test -p bitfun-core deep_review -- --nocapture +``` + +Expected: + +- Manifest tests pass. +- Deep Review Rust tests pass. +- No prompt or report claim says quick/default is full-depth. + +- [ ] **Step 5.7: Update status and commit** + +Update `docs/deep-review-phase3-followup-plan.md` after Step 5.6 passes. 
+ +Run: + +```powershell +git add src/web-ui/src/shared/services/reviewTeamService.ts src/web-ui/src/shared/services/reviewTeamService.test.ts src/crates/core/src/agentic/agents/prompts/deep_review_agent.md src/crates/core/src/agentic/agents/prompts/review_business_logic_agent.md src/crates/core/src/agentic/agents/prompts/review_performance_agent.md src/crates/core/src/agentic/agents/prompts/review_security_agent.md src/crates/core/src/agentic/agents/prompts/review_architecture_agent.md src/crates/core/src/agentic/agents/prompts/review_frontend_agent.md src/crates/core/src/agentic/agents/prompts/review_quality_gate_agent.md src/crates/core/src/agentic/tools/implementations/code_review_tool.rs docs/deep-review-phase3-followup-plan.md +git commit -m "feat(deep-review): add cost-aware review scope" +``` + +Expected: + +- Commit includes only scope-profile, evidence-pack, prompt/report, tests, and status doc changes. + +--- + +## Round 6: Documentation Reconciliation And Product Risk Review + +**Goal:** Make all Deep Review documents accurately reflect implemented behavior, deferred scope, and remaining risks. + +**Files:** + +- Modify: `docs/deep-review-design.md` +- Modify: `docs/deep-review-phase2-plan.md` +- Modify: `docs/deep-review-phase2-addendum.md` +- Modify: `docs/deep-review-phase3-followup-plan.md` +- Modify: `docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md` + +- [ ] **Step 6.1: Scan for stale completion claims** + +Run: + +```powershell +rg -n "project-level cache.*implemented|automatic retry.*complete|provider/adaptive queue.*complete|hard prompt.*complete|global.*concurrency.*automatic" docs/deep-review-design.md docs/deep-review-phase2-plan.md docs/deep-review-phase2-addendum.md docs/deep-review-phase3-followup-plan.md docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md +``` + +Expected: + +- No stale text claims deferred work is already implemented. 
+- Any matches describe a verified completed round or an explicit deferred item. + +- [ ] **Step 6.2: Reconcile status wording** + +Use these status labels consistently: + +- `Implemented` for code landed and verified. +- `Implemented with guardrails` for behavior that is active but bounded by settings, budgets, or user controls. +- `Deferred by product decision` for project-level cache and programmatic shared-context cache expansion. +- `Risk accepted for Phase 3` for known non-blocking behavior with a documented mitigation. +- `Pending implementation` for items that remain in this execution plan. + +- [ ] **Step 6.3: Add measured outcome notes** + +In `docs/deep-review-phase3-followup-plan.md`, add a concise measured outcome section after implementation: + +```markdown +## Measured Outcome Notes + +- Runtime diagnostics are emitted as one aggregate final snapshot per Deep Review turn. +- Provider transient capacity waits are visible through the existing queue action bar and bounded by Review settings. +- Retry defaults to manual action. Bounded automatic retry runs only after explicit opt-in and stops at role, packet, and elapsed guards. +- Quick/default review paths are high-risk or risk-expanded by design; deep remains the full-depth option. +- Shared evidence packs reduce repeated discovery work without storing source content in diagnostics. +- Project-level cache remains deferred and no new persistent review-output cache is introduced. +``` + +- [ ] **Step 6.4: Run full release gate** + +Run: + +```powershell +cargo test -p bitfun-core deep_review -- --nocapture +cargo check --workspace --exclude bitfun-cli +pnpm run lint:web +pnpm run type-check:web +pnpm --dir src/web-ui run test:run +git diff --check +``` + +Expected: + +- All listed verification commands pass. +- Any failure is triaged before final status is marked complete. 
+ +- [ ] **Step 6.5: Commit Round 6** + +Run: + +```powershell +git add docs/deep-review-design.md docs/deep-review-phase2-plan.md docs/deep-review-phase2-addendum.md docs/deep-review-phase3-followup-plan.md docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md +git commit -m "docs(deep-review): reconcile phase three status" +``` + +Expected: + +- Commit contains only documentation reconciliation. + +--- + +## Final Acceptance Checklist + +- [ ] Runtime diagnostics are aggregate-only and low frequency. +- [ ] Shared-context measurements remain content-free. +- [ ] Provider transient queue is short, visible, pauseable, cancellable, and bounded. +- [ ] Queue time is separated from reviewer runtime timeout. +- [ ] Deep Review max parallel reviewers can be lowered explicitly without changing global subagent concurrency. +- [ ] Manual retry exists for structured unresolved slices. +- [ ] Bounded auto retry is disabled by default and requires explicit opt-in. +- [ ] Auto retry cannot loop indefinitely. +- [ ] Quick/default strategies emphasize high-risk review instead of full-depth broad exploration. +- [ ] Shared evidence packs reduce repeated reviewer discovery work without storing source, full diff, or reviewer output content. +- [ ] All new UI strings are localized in `en-US`, `zh-CN`, and `zh-TW`. +- [ ] Project-level review cache remains deferred. +- [ ] Existing Deep Review report and Review Team behavior are not disrupted outside the planned controls. +- [ ] Full release gate passes. + +## Stop Conditions + +Stop and review the design before continuing if any of these occur: + +- A fix requires changing global `ai.subagent_max_concurrency` as the normal Deep Review recovery path. +- A runtime change stores source, diff, reviewer output, provider raw body, or full file contents in diagnostics. +- Provider queue needs more than one automatic reattempt per reviewer packet. 
+- Auto retry needs to retry a scope that is not smaller than the original packet. +- Quick/default cost reduction would hide changed files from coverage metadata instead of marking reduced-depth coverage. +- Shared evidence reuse requires caching full `Read` outputs across subagents before duplicate-call diagnostics justify it. +- A UI change introduces a new page or modal when the existing action bar or Review settings section can carry the workflow. +- A test requires broad snapshots instead of behavior assertions for queue, retry, or settings state. diff --git a/src/crates/core/src/agentic/deep_review_policy.rs b/src/crates/core/src/agentic/deep_review_policy.rs index cb78f9cce..452fab01d 100644 --- a/src/crates/core/src/agentic/deep_review_policy.rs +++ b/src/crates/core/src/agentic/deep_review_policy.rs @@ -866,6 +866,64 @@ pub struct DeepReviewEffectiveConcurrencySnapshot { pub retry_after_remaining_ms: Option, } +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct DeepReviewRuntimeDiagnostics { + pub queue_wait_count: usize, + pub queue_wait_total_ms: u64, + pub queue_wait_max_ms: u64, + pub provider_capacity_queue_count: usize, + pub provider_capacity_retry_count: usize, + pub provider_capacity_retry_success_count: usize, + pub capacity_skip_count: usize, + pub effective_parallel_min: Option, + pub effective_parallel_final: Option, + pub manual_queue_action_count: usize, + pub manual_retry_count: usize, + pub auto_retry_count: usize, + pub auto_retry_suppressed_reason_counts: BTreeMap, + pub shared_context_total_calls: usize, + pub shared_context_duplicate_calls: usize, + pub shared_context_duplicate_context_count: usize, +} + +impl DeepReviewRuntimeDiagnostics { + fn is_empty(&self) -> bool { + self.queue_wait_count == 0 + && self.queue_wait_total_ms == 0 + && self.queue_wait_max_ms == 0 + && self.provider_capacity_queue_count == 0 + && self.provider_capacity_retry_count == 0 + && 
self.provider_capacity_retry_success_count == 0 + && self.capacity_skip_count == 0 + && self.effective_parallel_min.is_none() + && self.effective_parallel_final.is_none() + && self.manual_queue_action_count == 0 + && self.manual_retry_count == 0 + && self.auto_retry_count == 0 + && self.auto_retry_suppressed_reason_counts.is_empty() + && self.shared_context_total_calls == 0 + && self.shared_context_duplicate_calls == 0 + && self.shared_context_duplicate_context_count == 0 + } + + fn observe_effective_parallel(&mut self, effective_parallel_instances: usize) { + self.effective_parallel_min = Some( + self.effective_parallel_min + .map_or(effective_parallel_instances, |current| { + current.min(effective_parallel_instances) + }), + ); + self.effective_parallel_final = Some(effective_parallel_instances); + } + + fn merge_shared_context(&mut self, snapshot: DeepReviewSharedContextMeasurementSnapshot) { + self.shared_context_total_calls = snapshot.total_calls; + self.shared_context_duplicate_calls = snapshot.duplicate_calls; + self.shared_context_duplicate_context_count = snapshot.duplicate_context_count; + } +} + #[derive(Debug, Clone)] struct DeepReviewEffectiveConcurrencyState { configured_max_parallel_instances: usize, @@ -1404,6 +1462,7 @@ struct DeepReviewTurnBudget { capacity_skips: usize, shared_context_uses: HashMap, effective_concurrency: Option, + runtime_diagnostics: DeepReviewRuntimeDiagnostics, updated_at: Instant, } @@ -1419,6 +1478,7 @@ impl DeepReviewTurnBudget { capacity_skips: 0, shared_context_uses: HashMap::new(), effective_concurrency: None, + runtime_diagnostics: DeepReviewRuntimeDiagnostics::default(), updated_at: now, } } @@ -1496,6 +1556,120 @@ impl Default for DeepReviewBudgetTracker { } impl DeepReviewBudgetTracker { + fn update_runtime_diagnostics( + &self, + parent_dialog_turn_id: &str, + update: impl FnOnce(&mut DeepReviewRuntimeDiagnostics), + ) { + if parent_dialog_turn_id.trim().is_empty() { + return; + } + + let now = Instant::now(); + if 
let Ok(last_pruned) = self.last_pruned_at.lock() { + if now.saturating_duration_since(*last_pruned) >= PRUNE_INTERVAL { + drop(last_pruned); + self.prune_stale(now); + } + } + + let mut budget = self + .turns + .entry(parent_dialog_turn_id.to_string()) + .or_insert_with(|| DeepReviewTurnBudget::new(now)); + update(&mut budget.runtime_diagnostics); + budget.updated_at = now; + } + + pub fn record_runtime_queue_wait(&self, parent_dialog_turn_id: &str, queue_elapsed_ms: u64) { + if queue_elapsed_ms == 0 { + return; + } + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.queue_wait_count = diagnostics.queue_wait_count.saturating_add(1); + diagnostics.queue_wait_total_ms = diagnostics + .queue_wait_total_ms + .saturating_add(queue_elapsed_ms); + diagnostics.queue_wait_max_ms = diagnostics.queue_wait_max_ms.max(queue_elapsed_ms); + }); + } + + pub fn record_runtime_provider_capacity_queue(&self, parent_dialog_turn_id: &str) { + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.provider_capacity_queue_count = + diagnostics.provider_capacity_queue_count.saturating_add(1); + }); + } + + pub fn record_runtime_provider_capacity_retry(&self, parent_dialog_turn_id: &str) { + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.provider_capacity_retry_count = + diagnostics.provider_capacity_retry_count.saturating_add(1); + }); + } + + pub fn record_runtime_provider_capacity_retry_success(&self, parent_dialog_turn_id: &str) { + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.provider_capacity_retry_success_count = diagnostics + .provider_capacity_retry_success_count + .saturating_add(1); + }); + } + + pub fn record_runtime_capacity_skip( + &self, + parent_dialog_turn_id: &str, + _reason: DeepReviewCapacityQueueReason, + ) { + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.capacity_skip_count = 
diagnostics.capacity_skip_count.saturating_add(1); + }); + } + + pub fn record_runtime_manual_queue_action(&self, parent_dialog_turn_id: &str) { + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.manual_queue_action_count = + diagnostics.manual_queue_action_count.saturating_add(1); + }); + } + + pub fn record_runtime_manual_retry(&self, parent_dialog_turn_id: &str) { + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.manual_retry_count = diagnostics.manual_retry_count.saturating_add(1); + }); + } + + pub fn record_runtime_auto_retry(&self, parent_dialog_turn_id: &str) { + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + diagnostics.auto_retry_count = diagnostics.auto_retry_count.saturating_add(1); + }); + } + + pub fn record_runtime_auto_retry_suppressed(&self, parent_dialog_turn_id: &str, reason: &str) { + let reason = reason.trim(); + if reason.is_empty() { + return; + } + self.update_runtime_diagnostics(parent_dialog_turn_id, |diagnostics| { + *diagnostics + .auto_retry_suppressed_reason_counts + .entry(reason.to_string()) + .or_insert(0) += 1; + }); + } + + pub fn runtime_diagnostics_snapshot( + &self, + parent_dialog_turn_id: &str, + ) -> Option { + let budget = self.turns.get(parent_dialog_turn_id)?; + let mut diagnostics = budget.runtime_diagnostics.clone(); + diagnostics.merge_shared_context(shared_context_measurement_snapshot_from_uses( + &budget.shared_context_uses, + )); + (!diagnostics.is_empty()).then_some(diagnostics) + } + pub fn record_shared_context_tool_use( &self, parent_dialog_turn_id: &str, @@ -1701,6 +1875,10 @@ impl DeepReviewBudgetTracker { .entry(parent_dialog_turn_id.to_string()) .or_insert_with(|| DeepReviewTurnBudget::new(now)); budget.capacity_skips += 1; + budget.runtime_diagnostics.capacity_skip_count = budget + .runtime_diagnostics + .capacity_skip_count + .saturating_add(1); budget.updated_at = now; } @@ -1852,9 +2030,15 @@ impl 
DeepReviewBudgetTracker { .entry(parent_dialog_turn_id.to_string()) .or_insert_with(|| DeepReviewTurnBudget::new(now)); budget.updated_at = now; - let state = budget.effective_concurrency_mut(configured_max_parallel_instances); - state.record_capacity_error(reason, retry_after, now); - state.snapshot(now) + let snapshot = { + let state = budget.effective_concurrency_mut(configured_max_parallel_instances); + state.record_capacity_error(reason, retry_after, now); + state.snapshot(now) + }; + budget + .runtime_diagnostics + .observe_effective_parallel(snapshot.effective_parallel_instances); + snapshot } pub fn record_effective_concurrency_success( @@ -1873,9 +2057,15 @@ impl DeepReviewBudgetTracker { .entry(parent_dialog_turn_id.to_string()) .or_insert_with(|| DeepReviewTurnBudget::new(now)); budget.updated_at = now; - let state = budget.effective_concurrency_mut(configured_max_parallel_instances); - state.record_success(now); - state.snapshot(now) + let snapshot = { + let state = budget.effective_concurrency_mut(configured_max_parallel_instances); + state.record_success(now); + state.snapshot(now) + }; + budget + .runtime_diagnostics + .observe_effective_parallel(snapshot.effective_parallel_instances); + snapshot } pub fn set_effective_concurrency_user_override( @@ -1895,9 +2085,15 @@ impl DeepReviewBudgetTracker { .entry(parent_dialog_turn_id.to_string()) .or_insert_with(|| DeepReviewTurnBudget::new(now)); budget.updated_at = now; - let state = budget.effective_concurrency_mut(configured_max_parallel_instances); - state.set_user_override(user_override_parallel_instances); - state.snapshot(now) + let snapshot = { + let state = budget.effective_concurrency_mut(configured_max_parallel_instances); + state.set_user_override(user_override_parallel_instances); + state.snapshot(now) + }; + budget + .runtime_diagnostics + .observe_effective_parallel(snapshot.effective_parallel_instances); + snapshot } } @@ -2024,6 +2220,48 @@ pub fn 
record_deep_review_capacity_skip(parent_dialog_turn_id: &str) { GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_capacity_skip(parent_dialog_turn_id) } +pub fn record_deep_review_runtime_queue_wait(parent_dialog_turn_id: &str, queue_elapsed_ms: u64) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER + .record_runtime_queue_wait(parent_dialog_turn_id, queue_elapsed_ms) +} + +pub fn record_deep_review_runtime_provider_capacity_queue(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_runtime_provider_capacity_queue(parent_dialog_turn_id) +} + +pub fn record_deep_review_runtime_provider_capacity_retry(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_runtime_provider_capacity_retry(parent_dialog_turn_id) +} + +pub fn record_deep_review_runtime_provider_capacity_retry_success(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER + .record_runtime_provider_capacity_retry_success(parent_dialog_turn_id) +} + +pub fn record_deep_review_runtime_capacity_skip( + parent_dialog_turn_id: &str, + reason: DeepReviewCapacityQueueReason, +) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_runtime_capacity_skip(parent_dialog_turn_id, reason) +} + +pub fn record_deep_review_runtime_manual_queue_action(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_runtime_manual_queue_action(parent_dialog_turn_id) +} + +pub fn record_deep_review_runtime_manual_retry(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_runtime_manual_retry(parent_dialog_turn_id) +} + +pub fn record_deep_review_runtime_auto_retry(parent_dialog_turn_id: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.record_runtime_auto_retry(parent_dialog_turn_id) +} + +pub fn record_deep_review_runtime_auto_retry_suppressed(parent_dialog_turn_id: &str, reason: &str) { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER + .record_runtime_auto_retry_suppressed(parent_dialog_turn_id, reason) +} + pub fn record_deep_review_shared_context_tool_use( parent_dialog_turn_id: &str, 
subagent_type: &str, @@ -2044,6 +2282,12 @@ pub fn deep_review_shared_context_measurement_snapshot( GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.shared_context_measurement_snapshot(parent_dialog_turn_id) } +pub fn deep_review_runtime_diagnostics_snapshot( + parent_dialog_turn_id: &str, +) -> Option { + GLOBAL_DEEP_REVIEW_BUDGET_TRACKER.runtime_diagnostics_snapshot(parent_dialog_turn_id) +} + pub fn try_begin_deep_review_active_reviewer( parent_dialog_turn_id: &str, max_active_reviewers: usize, @@ -3168,6 +3412,54 @@ mod tests { assert_eq!(snapshot.repeated_contexts[0].reviewer_count, 2); } + #[test] + fn runtime_diagnostics_records_queue_and_capacity_transitions_as_counts() { + let tracker = DeepReviewBudgetTracker::default(); + + tracker.record_runtime_queue_wait("turn-runtime", 1_250); + tracker.record_runtime_queue_wait("turn-runtime", 2_500); + tracker.record_runtime_capacity_skip( + "turn-runtime", + super::DeepReviewCapacityQueueReason::ProviderConcurrencyLimit, + ); + + let diagnostics = tracker + .runtime_diagnostics_snapshot("turn-runtime") + .expect("runtime diagnostics should exist"); + + assert_eq!(diagnostics.queue_wait_count, 2); + assert_eq!(diagnostics.queue_wait_total_ms, 3_750); + assert_eq!(diagnostics.queue_wait_max_ms, 2_500); + assert_eq!(diagnostics.capacity_skip_count, 1); + assert_eq!(diagnostics.provider_capacity_queue_count, 0); + } + + #[test] + fn runtime_diagnostics_merges_shared_context_without_content() { + let tracker = DeepReviewBudgetTracker::default(); + + tracker.record_shared_context_tool_use( + "turn-runtime-shared", + REVIEWER_SECURITY_AGENT_TYPE, + "Read", + "src/lib.rs", + ); + tracker.record_shared_context_tool_use( + "turn-runtime-shared", + REVIEWER_ARCHITECTURE_AGENT_TYPE, + "Read", + "src/lib.rs", + ); + + let diagnostics = tracker + .runtime_diagnostics_snapshot("turn-runtime-shared") + .expect("runtime diagnostics should exist"); + + assert_eq!(diagnostics.shared_context_total_calls, 2); + 
assert_eq!(diagnostics.shared_context_duplicate_context_count, 1); + assert!(!format!("{diagnostics:?}").contains("fn ")); + } + #[test] fn effective_concurrency_lowers_after_capacity_errors_without_exceeding_hard_cap() { let tracker = DeepReviewBudgetTracker::default(); diff --git a/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs b/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs index b298d1725..293752a42 100644 --- a/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/code_review_tool.rs @@ -7,7 +7,8 @@ use crate::agentic::context_profile::ContextProfilePolicy; use crate::agentic::coordination::get_global_coordinator; use crate::agentic::core::CompressionContract; use crate::agentic::deep_review_policy::{ - deep_review_shared_context_measurement_snapshot, DeepReviewIncrementalCache, + deep_review_runtime_diagnostics_snapshot, DeepReviewIncrementalCache, + DeepReviewRuntimeDiagnostics, }; use crate::agentic::tools::framework::{Tool, ToolResult, ToolUseContext}; use crate::service::config::get_app_language_code; @@ -27,15 +28,6 @@ struct DeepReviewCacheUpdate { miss_count: usize, } -#[derive(Debug, Clone, PartialEq, Eq)] -struct DeepReviewSharedContextDiagnostics { - total_calls: usize, - duplicate_calls: usize, - duplicate_context_count: usize, - max_duplicate_call_count: usize, - max_duplicate_reviewer_count: usize, -} - impl CodeReviewTool { pub fn new() -> Self { Self @@ -879,48 +871,60 @@ impl CodeReviewTool { } } - fn deep_review_shared_context_diagnostics( - dialog_turn_id: Option<&str>, - ) -> Option { - let dialog_turn_id = dialog_turn_id + fn log_deep_review_runtime_diagnostics(dialog_turn_id: Option<&str>) { + let Some(dialog_turn_id) = dialog_turn_id .map(str::trim) - .filter(|value| !value.is_empty())?; - let snapshot = deep_review_shared_context_measurement_snapshot(dialog_turn_id); - if snapshot.total_calls == 0 { - return None; - } - 
- Some(DeepReviewSharedContextDiagnostics { - total_calls: snapshot.total_calls, - duplicate_calls: snapshot.duplicate_calls, - duplicate_context_count: snapshot.duplicate_context_count, - max_duplicate_call_count: snapshot - .repeated_contexts - .iter() - .map(|context| context.call_count) - .max() - .unwrap_or(0), - max_duplicate_reviewer_count: snapshot - .repeated_contexts - .iter() - .map(|context| context.reviewer_count) - .max() - .unwrap_or(0), - }) - } - - fn log_deep_review_shared_context_diagnostics(dialog_turn_id: Option<&str>) { - let Some(diagnostics) = Self::deep_review_shared_context_diagnostics(dialog_turn_id) else { + .filter(|value| !value.is_empty()) + else { return; }; + let Some(DeepReviewRuntimeDiagnostics { + queue_wait_count, + queue_wait_total_ms, + queue_wait_max_ms, + provider_capacity_queue_count, + provider_capacity_retry_count, + provider_capacity_retry_success_count, + capacity_skip_count, + effective_parallel_min, + effective_parallel_final, + manual_queue_action_count, + manual_retry_count, + auto_retry_count, + auto_retry_suppressed_reason_counts, + shared_context_total_calls, + shared_context_duplicate_calls, + shared_context_duplicate_context_count, + }) = deep_review_runtime_diagnostics_snapshot(dialog_turn_id) + else { + return; + }; + let auto_retry_suppressed_reason_counts = + serde_json::to_string(&auto_retry_suppressed_reason_counts) + .unwrap_or_else(|_| "{}".to_string()); debug!( - "DeepReview shared context measurement: total_calls={}, duplicate_calls={}, duplicate_context_count={}, max_duplicate_call_count={}, max_duplicate_reviewer_count={}", - diagnostics.total_calls, - diagnostics.duplicate_calls, - diagnostics.duplicate_context_count, - diagnostics.max_duplicate_call_count, - diagnostics.max_duplicate_reviewer_count + "DeepReview runtime diagnostics: queue_wait_count={}, queue_wait_total_ms={}, queue_wait_max_ms={}, provider_capacity_queue_count={}, provider_capacity_retry_count={}, 
provider_capacity_retry_success_count={}, capacity_skip_count={}, effective_parallel_min={}, effective_parallel_final={}, manual_queue_action_count={}, manual_retry_count={}, auto_retry_count={}, auto_retry_suppressed_reason_counts={}, shared_context_total_calls={}, shared_context_duplicate_calls={}, shared_context_duplicate_context_count={}", + queue_wait_count, + queue_wait_total_ms, + queue_wait_max_ms, + provider_capacity_queue_count, + provider_capacity_retry_count, + provider_capacity_retry_success_count, + capacity_skip_count, + effective_parallel_min + .map(|value| value.to_string()) + .unwrap_or_else(|| "none".to_string()), + effective_parallel_final + .map(|value| value.to_string()) + .unwrap_or_else(|| "none".to_string()), + manual_queue_action_count, + manual_retry_count, + auto_retry_count, + auto_retry_suppressed_reason_counts, + shared_context_total_calls, + shared_context_duplicate_calls, + shared_context_duplicate_context_count ); } @@ -1215,7 +1219,7 @@ impl Tool for CodeReviewTool { &mut filled_input, context.dialog_turn_id.as_deref(), ); - Self::log_deep_review_shared_context_diagnostics(context.dialog_turn_id.as_deref()); + Self::log_deep_review_runtime_diagnostics(context.dialog_turn_id.as_deref()); if let Some(cache_update) = Self::deep_review_cache_from_completed_reviewers( &filled_input, run_manifest.as_ref(), @@ -1662,7 +1666,9 @@ mod tests { #[tokio::test] async fn deep_review_shared_context_diagnostics_stays_out_of_report() { - use crate::agentic::deep_review_policy::record_deep_review_shared_context_tool_use; + use crate::agentic::deep_review_policy::{ + deep_review_runtime_diagnostics_snapshot, record_deep_review_shared_context_tool_use, + }; let turn_id = "turn-code-review-shared-context-diagnostics"; record_deep_review_shared_context_tool_use(turn_id, "ReviewSecurity", "Read", "src/lib.rs"); @@ -1679,13 +1685,11 @@ mod tests { "src/lib.rs", ); - let diagnostics = CodeReviewTool::deep_review_shared_context_diagnostics(Some(turn_id)) + 
let diagnostics = deep_review_runtime_diagnostics_snapshot(turn_id) .expect("diagnostics should be available for measured turn"); - assert_eq!(diagnostics.total_calls, 3); - assert_eq!(diagnostics.duplicate_calls, 1); - assert_eq!(diagnostics.duplicate_context_count, 1); - assert_eq!(diagnostics.max_duplicate_call_count, 2); - assert_eq!(diagnostics.max_duplicate_reviewer_count, 2); + assert_eq!(diagnostics.shared_context_total_calls, 3); + assert_eq!(diagnostics.shared_context_duplicate_calls, 1); + assert_eq!(diagnostics.shared_context_duplicate_context_count, 1); let tool = CodeReviewTool::new(); let mut context = tool_context(Some("DeepReview")); diff --git a/src/crates/core/src/agentic/tools/implementations/task_tool.rs b/src/crates/core/src/agentic/tools/implementations/task_tool.rs index 38e5f27cb..74c80e753 100644 --- a/src/crates/core/src/agentic/tools/implementations/task_tool.rs +++ b/src/crates/core/src/agentic/tools/implementations/task_tool.rs @@ -7,11 +7,11 @@ use crate::agentic::deep_review_policy::{ deep_review_max_retries_per_role, deep_review_queue_control_snapshot, load_default_deep_review_policy, record_deep_review_capacity_skip, record_deep_review_effective_concurrency_capacity_error, - record_deep_review_effective_concurrency_success, record_deep_review_task_budget, - try_begin_deep_review_active_reviewer, DeepReviewActiveReviewerGuard, - DeepReviewCapacityQueueReason, DeepReviewConcurrencyPolicy, DeepReviewExecutionPolicy, - DeepReviewIncrementalCache, DeepReviewPolicyViolation, DeepReviewRunManifestGate, - DeepReviewSubagentRole, DEEP_REVIEW_AGENT_TYPE, + record_deep_review_effective_concurrency_success, record_deep_review_runtime_queue_wait, + record_deep_review_task_budget, try_begin_deep_review_active_reviewer, + DeepReviewActiveReviewerGuard, DeepReviewCapacityQueueReason, DeepReviewConcurrencyPolicy, + DeepReviewExecutionPolicy, DeepReviewIncrementalCache, DeepReviewPolicyViolation, + DeepReviewRunManifestGate, DeepReviewSubagentRole, 
DEEP_REVIEW_AGENT_TYPE, }; use crate::agentic::events::{ DeepReviewQueueReason, DeepReviewQueueState, DeepReviewQueueStatus, ErrorCategory, @@ -568,6 +568,7 @@ impl TaskTool { if control_snapshot.cancelled || (is_optional_reviewer && control_snapshot.skip_optional) { + record_deep_review_runtime_queue_wait(dialog_turn_id, queue_elapsed_ms); record_deep_review_capacity_skip(dialog_turn_id); clear_deep_review_queue_control_for_tool(dialog_turn_id, tool_id); Self::emit_deep_review_queue_state( @@ -626,6 +627,7 @@ impl TaskTool { try_begin_deep_review_active_reviewer(dialog_turn_id, effective_parallel_instances) { let active_reviewer_count = deep_review_active_reviewer_count(dialog_turn_id); + record_deep_review_runtime_queue_wait(dialog_turn_id, queue_elapsed_ms); clear_deep_review_queue_control_for_tool(dialog_turn_id, tool_id); Self::emit_deep_review_queue_state( session_id, @@ -652,6 +654,7 @@ impl TaskTool { reason, decision.retry_after_seconds.map(Duration::from_secs), ); + record_deep_review_runtime_queue_wait(dialog_turn_id, queue_elapsed_ms); record_deep_review_capacity_skip(dialog_turn_id); clear_deep_review_queue_control_for_tool(dialog_turn_id, tool_id); Self::emit_deep_review_queue_state( @@ -1832,6 +1835,59 @@ mod tests { assert_eq!(deep_review_capacity_skip_count(turn_id), 1); } + #[tokio::test] + async fn deep_review_capacity_queue_records_one_runtime_wait_when_ready() { + use crate::agentic::deep_review_policy::{ + deep_review_runtime_diagnostics_snapshot, try_begin_deep_review_active_reviewer, + DeepReviewConcurrencyPolicy, + }; + + let turn_id = "turn-queue-ready-diagnostics"; + let tool_id = "tool-queue-ready-diagnostics"; + let occupied = try_begin_deep_review_active_reviewer(turn_id, 1) + .expect("precondition should occupy reviewer capacity"); + let policy = DeepReviewConcurrencyPolicy { + max_parallel_instances: 1, + stagger_seconds: 0, + max_queue_wait_seconds: 1, + batch_extras_separately: true, + }; + let turn_id_owned = turn_id.to_string(); 
+ let tool_id_owned = tool_id.to_string(); + + let handle = tokio::spawn(async move { + TaskTool::wait_for_deep_review_reviewer_capacity( + "session-queue-ready-diagnostics", + &turn_id_owned, + &tool_id_owned, + "ReviewSecurity", + &policy, + false, + ) + .await + }); + + tokio::time::sleep(tokio::time::Duration::from_millis(30)).await; + drop(occupied); + + let outcome = tokio::time::timeout(tokio::time::Duration::from_millis(500), handle) + .await + .expect("queue should become ready after capacity frees") + .expect("spawned wait should not panic") + .expect("queue wait should resolve"); + match outcome { + super::DeepReviewQueueWaitOutcome::Ready { .. } => {} + super::DeepReviewQueueWaitOutcome::Skipped { .. } => { + panic!("freed capacity should allow the queued reviewer to run"); + } + } + + let diagnostics = deep_review_runtime_diagnostics_snapshot(turn_id) + .expect("runtime diagnostics should record terminal queue wait"); + assert_eq!(diagnostics.queue_wait_count, 1); + assert!(diagnostics.queue_wait_total_ms >= 20); + } + #[tokio::test] async fn deep_review_capacity_queue_pause_does_not_expire_until_continued() { use crate::agentic::deep_review_policy::{ diff --git a/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx b/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx index 089bf3971..0e6a096cf 100644 --- a/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx +++ b/src/web-ui/src/app/scenes/agents/components/ReviewTeamPage.tsx @@ -31,6 +31,7 @@ import { useSettingsStore } from '@/app/scenes/settings/settingsStore'; import { useSceneStore } from '@/app/stores/sceneStore'; import { useAgentsStore } from '../agentsStore'; import { + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY, DEFAULT_REVIEW_TEAM_EXECUTION_POLICY, DEFAULT_REVIEW_TEAM_MODEL, FALLBACK_REVIEW_TEAM_DEFINITION, @@ -167,6 +168,7 @@ const ReviewTeamPage: React.FC = () => { strategyLevel: 'normal', memberStrategyOverrides: {}, executionPolicy: { 
...DEFAULT_REVIEW_TEAM_EXECUTION_POLICY }, + concurrencyPolicy: { ...DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY }, definition: FALLBACK_REVIEW_TEAM_DEFINITION, members: [], coreMembers: [], diff --git a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts index af98398db..c76f66e69 100644 --- a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts +++ b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.i18n.test.ts @@ -42,6 +42,10 @@ const REQUIRED_ACTION_BAR_KEYS = [ 'deepReviewActionBar.capacityQueue.continueQueue', 'deepReviewActionBar.capacityQueue.cancelQueued', 'deepReviewActionBar.capacityQueue.skipOptionalQueued', + 'deepReviewActionBar.capacityQueue.runSlowerNextTime', + 'deepReviewActionBar.capacityQueue.openReviewSettings', + 'deepReviewActionBar.capacityQueue.runSlowerSaved', + 'deepReviewActionBar.capacityQueue.runSlowerFailed', 'deepReviewActionBar.capacityQueue.controlFailed', 'reviewActionBar.noIssuesFound', ]; diff --git a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx index 8b5e3e836..10d84a20b 100644 --- a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx +++ b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.test.tsx @@ -14,6 +14,7 @@ const buildRecoveryPlanMock = vi.hoisted(() => vi.fn(() => ({ summaryText: '1 completed reviewer will be preserved; 1 reviewer will be rerun', }))); const controlDeepReviewQueueMock = vi.hoisted(() => vi.fn()); +const lowerDefaultReviewTeamMaxParallelReviewersMock = vi.hoisted(() => vi.fn()); vi.mock('react-i18next', () => ({ initReactI18next: { @@ -66,6 +67,10 @@ vi.mock('@/infrastructure/api/service-api/AgentAPI', () => ({ }, })); +vi.mock('@/shared/services/reviewTeamService', () => ({ + lowerDefaultReviewTeamMaxParallelReviewers: 
lowerDefaultReviewTeamMaxParallelReviewersMock, +})); + vi.mock('@/infrastructure/event-bus', () => ({ globalEventBus: { emit: eventBusEmitMock, @@ -166,6 +171,13 @@ describeWithJsdom('DeepReviewActionBar', () => { confirmWarningMock.mockResolvedValue(true); eventBusEmitMock.mockReturnValue(false); continueDeepReviewSessionMock.mockResolvedValue(undefined); + lowerDefaultReviewTeamMaxParallelReviewersMock.mockResolvedValue({ + maxParallelInstances: 1, + maxQueueWaitSeconds: 120, + allowProviderCapacityQueue: true, + allowBoundedAutoRetry: false, + autoRetryElapsedGuardSeconds: 180, + }); useReviewActionBarStore.getState().reset(); }); @@ -419,6 +431,8 @@ describeWithJsdom('DeepReviewActionBar', () => { expect(container.textContent).toContain('Reviewers waiting for capacity'); expect(container.textContent).toContain('Queue wait does not count against reviewer runtime.'); expect(container.textContent).toContain('Your active session is busy.'); + expect(container.textContent).toContain('Run slower next time'); + expect(container.textContent).toContain('Open Review settings'); const pauseButton = Array.from(container.querySelectorAll('button')) .find((button) => button.textContent?.includes('Pause queue')); @@ -433,6 +447,29 @@ describeWithJsdom('DeepReviewActionBar', () => { capacityQueueState: { status: string }; }).capacityQueueState.status).toBe('paused_by_user'); expect(container.textContent).toContain('Queue paused'); + + const runSlowerButton = Array.from(container.querySelectorAll('button')) + .find((button) => button.textContent?.includes('Run slower next time')); + expect(runSlowerButton).toBeTruthy(); + + await act(async () => { + runSlowerButton!.dispatchEvent(new dom.window.MouseEvent('click', { bubbles: true })); + await Promise.resolve(); + }); + + expect(lowerDefaultReviewTeamMaxParallelReviewersMock).toHaveBeenCalledTimes(1); + + const openSettingsButton = Array.from(container.querySelectorAll('button')) + .find((button) => 
button.textContent?.includes('Open Review settings')); + expect(openSettingsButton).toBeTruthy(); + + await act(async () => { + openSettingsButton!.dispatchEvent(new dom.window.MouseEvent('click', { bubbles: true })); + await Promise.resolve(); + }); + + const { useSettingsStore } = await import('@/app/scenes/settings/settingsStore'); + expect(useSettingsStore.getState().activeTab).toBe('review'); }); it('sends backend queue control actions for event-driven capacity waits', async () => { diff --git a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx index a96641851..d1cc65db5 100644 --- a/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx +++ b/src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx @@ -17,6 +17,7 @@ import { RotateCcw, Eye, Minus, + Settings, } from 'lucide-react'; import { Button, Checkbox, Tooltip } from '@/component-library'; import { @@ -46,10 +47,19 @@ import { import { flowChatStore } from '../../store/FlowChatStore'; import { CodeReviewReportExportActions } from '../../tool-cards/CodeReviewReportExportActions'; import { agentAPI } from '@/infrastructure/api/service-api/AgentAPI'; +import { lowerDefaultReviewTeamMaxParallelReviewers } from '@/shared/services/reviewTeamService'; +import { useSettingsStore } from '@/app/scenes/settings/settingsStore'; +import { useSceneStore } from '@/app/stores/sceneStore'; +import type { ConfigTab } from '@/app/scenes/settings/settingsConfig'; import './DeepReviewActionBar.scss'; const log = createLogger('DeepReviewActionBar'); +function openSettingsTab(tab: ConfigTab) { + useSettingsStore.getState().setActiveTab(tab); + useSceneStore.getState().openScene('settings'); +} + const PHASE_CONFIG: Record; iconClass: string; @@ -170,6 +180,25 @@ export const ReviewActionBar: React.FC = () => { } }, [capacityQueueState, childSessionId, t]); + const handleRunSlowerNextTime = useCallback(async () => { + try { + const 
nextPolicy = await lowerDefaultReviewTeamMaxParallelReviewers(); + notificationService.success(t('deepReviewActionBar.capacityQueue.runSlowerSaved', { + count: nextPolicy.maxParallelInstances, + defaultValue: `Next Deep Review will use up to ${nextPolicy.maxParallelInstances} parallel reviewers.`, + })); + } catch (error) { + log.warn('Failed to lower DeepReview max parallel reviewers', error); + notificationService.error(t('deepReviewActionBar.capacityQueue.runSlowerFailed', { + defaultValue: 'Failed to update Review settings.', + })); + } + }, [t]); + + const handleOpenReviewSettings = useCallback(() => { + openSettingsTab('review'); + }, []); + // ---- progress tracking ---- const sessions = flowChatStore.getState().sessions; const childSession = useMemo(() => { @@ -445,7 +474,7 @@ export const ReviewActionBar: React.FC = () => { const handleOpenModelSettings = useCallback(async () => { if (!interruption) return; - globalEventBus.emit('settings:open', { tab: 'models' }); + openSettingsTab('models'); }, [interruption]); const handleViewPartialResults = useCallback(() => { @@ -710,66 +739,87 @@ export const ReviewActionBar: React.FC = () => { )}
- {supportsInlineQueueControls && ( -
- {capacityQueueState.status === 'paused_by_user' ? ( - - ) : ( - - )} - {(capacityQueueState.optionalReviewerCount ?? 0) > 0 && ( +
+ {supportsInlineQueueControls && ( + <> + {capacityQueueState.status === 'paused_by_user' ? ( + + ) : ( + + )} + {(capacityQueueState.optionalReviewerCount ?? 0) > 0 && ( + + )} - )} - -
- )} + + )} + + +
)} @@ -828,6 +878,32 @@ export const ReviewActionBar: React.FC = () => { {t(errorAttribution.description, { defaultValue: '' })} + {errorAttribution.actions.length > 0 && ( +
+ {errorAttribution.actions.map((action) => ( + + ))} +
+ )} )} diff --git a/src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx b/src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx index 753c7cc32..f12b6f731 100644 --- a/src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx +++ b/src/web-ui/src/infrastructure/config/components/ReviewConfig.tsx @@ -1,6 +1,6 @@ import React, { useCallback, useEffect, useMemo, useState } from 'react'; import { useTranslation } from 'react-i18next'; -import { Badge, Button, ConfigPageLoading, NumberInput, Select } from '@/component-library'; +import { Badge, Button, ConfigPageLoading, NumberInput, Select, Switch } from '@/component-library'; import { ConfigPageContent, ConfigPageHeader, @@ -23,12 +23,14 @@ import { removeDefaultReviewTeamMember, REVIEW_STRATEGY_DEFINITIONS, REVIEW_STRATEGY_LEVELS, + saveDefaultReviewTeamConcurrencyPolicy, saveDefaultReviewTeamExecutionPolicy, saveDefaultReviewTeamMemberStrategyOverride, saveDefaultReviewTeamStrategyLevel, type ReviewMemberStrategyLevel, type ReviewStrategyLevel, type ReviewTeam, + type ReviewTeamConcurrencyPolicy, type ReviewTeamExecutionPolicy, type ReviewTeamMember, } from '@/shared/services/reviewTeamService'; @@ -111,6 +113,7 @@ const ReviewConfig: React.FC = () => { const [subagents, setSubagents] = useState([]); const [candidateId, setCandidateId] = useState(''); const [savingPolicyKey, setSavingPolicyKey] = useState(null); + const [savingConcurrencyKey, setSavingConcurrencyKey] = useState(null); const [savingMemberId, setSavingMemberId] = useState(null); const [savingStrategyTarget, setSavingStrategyTarget] = useState(null); const [addingMember, setAddingMember] = useState(false); @@ -263,6 +266,29 @@ const ReviewConfig: React.FC = () => { } }, [loadData, notifyError, notifySuccess, t, team]); + const handleConcurrencyPolicyChange = useCallback(async ( + key: keyof ReviewTeamConcurrencyPolicy, + value: ReviewTeamConcurrencyPolicy[keyof ReviewTeamConcurrencyPolicy], + ) => { + if (!team) return; 
+ + const nextPolicy = { + ...team.concurrencyPolicy, + [key]: value, + } as ReviewTeamConcurrencyPolicy; + setSavingConcurrencyKey(key); + setTeam({ ...team, concurrencyPolicy: nextPolicy }); + try { + await saveDefaultReviewTeamConcurrencyPolicy(nextPolicy); + notifySuccess(t('messages.saved')); + } catch (error) { + await loadData(); + notifyError(error instanceof Error ? error.message : t('messages.saveFailed')); + } finally { + setSavingConcurrencyKey(null); + } + }, [loadData, notifyError, notifySuccess, t, team]); + const handleModelChange = useCallback(async (member: ReviewTeamMember, modelId: string) => { if (!team) return; @@ -444,6 +470,71 @@ const ReviewConfig: React.FC = () => { + + + void handleConcurrencyPolicyChange('maxParallelInstances', value)} + min={1} + max={16} + step={1} + size="small" + disabled={savingConcurrencyKey === 'maxParallelInstances'} + /> + + + + void handleConcurrencyPolicyChange('maxQueueWaitSeconds', value)} + min={0} + max={600} + step={15} + unit="s" + size="small" + disabled={savingConcurrencyKey === 'maxQueueWaitSeconds'} + /> + + + + void handleConcurrencyPolicyChange( + 'allowProviderCapacityQueue', + event.target.checked, + )} + disabled={savingConcurrencyKey === 'allowProviderCapacityQueue'} + /> + + + + void handleConcurrencyPolicyChange( + 'allowBoundedAutoRetry', + event.target.checked, + )} + disabled={savingConcurrencyKey === 'allowBoundedAutoRetry'} + /> + + + + void handleConcurrencyPolicyChange('autoRetryElapsedGuardSeconds', value)} + min={30} + max={900} + step={30} + unit="s" + size="small" + disabled={ + !team.concurrencyPolicy.allowBoundedAutoRetry || + savingConcurrencyKey === 'autoRetryElapsedGuardSeconds' + } + /> + + + { reviewer_file_split_threshold: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.reviewerFileSplitThreshold, max_same_role_instances: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxSameRoleInstances, max_retries_per_role: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxRetriesPerRole, + max_parallel_reviewers: 
DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxParallelInstances, + max_queue_wait_seconds: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxQueueWaitSeconds, + allow_provider_capacity_queue: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowProviderCapacityQueue, + allow_bounded_auto_retry: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowBoundedAutoRetry, + auto_retry_elapsed_guard_seconds: + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.autoRetryElapsedGuardSeconds, ...overrides, }); @@ -113,6 +122,12 @@ describe('reviewTeamService', () => { reviewer_file_split_threshold: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.reviewerFileSplitThreshold, max_same_role_instances: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxSameRoleInstances, max_retries_per_role: DEFAULT_REVIEW_TEAM_EXECUTION_POLICY.maxRetriesPerRole, + max_parallel_reviewers: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxParallelInstances, + max_queue_wait_seconds: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxQueueWaitSeconds, + allow_provider_capacity_queue: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowProviderCapacityQueue, + allow_bounded_auto_retry: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowBoundedAutoRetry, + auto_retry_elapsed_guard_seconds: + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.autoRetryElapsedGuardSeconds, }); }); @@ -148,6 +163,91 @@ describe('reviewTeamService', () => { }); }); + it('normalizes persisted capacity and retry settings into the team concurrency policy', async () => { + vi.mocked(configAPI.getConfig).mockResolvedValueOnce({ + extra_subagent_ids: [], + strategy_level: 'normal', + member_strategy_overrides: {}, + max_parallel_reviewers: 99, + max_queue_wait_seconds: 999, + allow_provider_capacity_queue: false, + allow_bounded_auto_retry: true, + auto_retry_elapsed_guard_seconds: 1, + }); + + const config = await loadDefaultReviewTeamConfig(); + const team = resolveDefaultReviewTeam(coreSubagents(), config); + + expect(team.concurrencyPolicy).toEqual({ + maxParallelInstances: 16, + staggerSeconds: 0, + maxQueueWaitSeconds: 600, + 
batchExtrasSeparately: true, + allowProviderCapacityQueue: false, + allowBoundedAutoRetry: true, + autoRetryElapsedGuardSeconds: 30, + }); + }); + + it('saves capacity and retry settings without changing unrelated review team config', async () => { + vi.mocked(configAPI.getConfig).mockResolvedValueOnce( + storedConfigWithExtra(['ExtraReviewer'], { + strategy_level: 'deep', + member_strategy_overrides: { ReviewSecurity: 'quick' }, + reviewer_timeout_seconds: 300, + }), + ); + + await saveDefaultReviewTeamConcurrencyPolicy({ + maxParallelInstances: 2, + staggerSeconds: 20, + maxQueueWaitSeconds: 45, + batchExtrasSeparately: false, + allowProviderCapacityQueue: false, + allowBoundedAutoRetry: true, + autoRetryElapsedGuardSeconds: 240, + }); + + expect(configAPI.setConfig).toHaveBeenCalledWith( + 'ai.review_teams.default', + expect.objectContaining({ + extra_subagent_ids: ['ExtraReviewer'], + strategy_level: 'deep', + member_strategy_overrides: { ReviewSecurity: 'quick' }, + reviewer_timeout_seconds: 300, + max_parallel_reviewers: 2, + max_queue_wait_seconds: 45, + allow_provider_capacity_queue: false, + allow_bounded_auto_retry: true, + auto_retry_elapsed_guard_seconds: 240, + }), + ); + }); + + it('lowers the next review max parallel reviewers without going below one', async () => { + vi.mocked(configAPI.getConfig) + .mockResolvedValueOnce(storedConfigWithExtra([], { max_parallel_reviewers: 3 })) + .mockResolvedValueOnce(storedConfigWithExtra([], { max_parallel_reviewers: 1 })); + + await expect(lowerDefaultReviewTeamMaxParallelReviewers()).resolves.toMatchObject({ + maxParallelInstances: 2, + }); + expect(configAPI.setConfig).toHaveBeenNthCalledWith( + 1, + 'ai.review_teams.default', + expect.objectContaining({ max_parallel_reviewers: 2 }), + ); + + await expect(lowerDefaultReviewTeamMaxParallelReviewers()).resolves.toMatchObject({ + maxParallelInstances: 1, + }); + expect(configAPI.setConfig).toHaveBeenNthCalledWith( + 2, + 'ai.review_teams.default', + 
expect.objectContaining({ max_parallel_reviewers: 1 }), + ); + }); + it('propagates config errors that are not missing review team config paths', async () => { const error = new Error('Config service unavailable'); vi.mocked(configAPI.getConfig).mockRejectedValueOnce(error); diff --git a/src/web-ui/src/shared/services/reviewTeamService.ts b/src/web-ui/src/shared/services/reviewTeamService.ts index d14b7d56d..48f793615 100644 --- a/src/web-ui/src/shared/services/reviewTeamService.ts +++ b/src/web-ui/src/shared/services/reviewTeamService.ts @@ -35,10 +35,14 @@ export const DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY = { staggerSeconds: 0, maxQueueWaitSeconds: 60, batchExtrasSeparately: true, + allowProviderCapacityQueue: true, + allowBoundedAutoRetry: false, + autoRetryElapsedGuardSeconds: 180, } as const; const MAX_PREDICTIVE_TIMEOUT_SECONDS = 3600; const MAX_PARALLEL_REVIEWER_INSTANCES = 16; const MAX_QUEUE_WAIT_SECONDS = 600; +const MAX_AUTO_RETRY_ELAPSED_GUARD_SECONDS = 900; const PREDICTIVE_TIMEOUT_PER_FILE_SECONDS = 15; const PREDICTIVE_TIMEOUT_PER_100_LINES_SECONDS = 30; const PREDICTIVE_TIMEOUT_BASE_SECONDS: Record = { @@ -221,6 +225,11 @@ export interface ReviewTeamStoredConfig { reviewer_file_split_threshold: number; max_same_role_instances: number; max_retries_per_role: number; + max_parallel_reviewers: number; + max_queue_wait_seconds: number; + allow_provider_capacity_queue: boolean; + allow_bounded_auto_retry: boolean; + auto_retry_elapsed_guard_seconds: number; } export interface ReviewTeamExecutionPolicy { @@ -236,6 +245,9 @@ export interface ReviewTeamConcurrencyPolicy { staggerSeconds: number; maxQueueWaitSeconds: number; batchExtrasSeparately: boolean; + allowProviderCapacityQueue: boolean; + allowBoundedAutoRetry: boolean; + autoRetryElapsedGuardSeconds: number; } export interface ReviewTeamRateLimitStatus { @@ -455,6 +467,7 @@ export interface ReviewTeam { strategyLevel: ReviewStrategyLevel; memberStrategyOverrides: Record; executionPolicy: 
ReviewTeamExecutionPolicy; + concurrencyPolicy: ReviewTeamConcurrencyPolicy; definition: ReviewTeamDefinition; members: ReviewTeamMember[]; coreMembers: ReviewTeamMember[]; @@ -930,6 +943,62 @@ function normalizeConcurrencyPolicy( typeof raw?.batchExtrasSeparately === 'boolean' ? raw.batchExtrasSeparately : DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.batchExtrasSeparately, + allowProviderCapacityQueue: + typeof raw?.allowProviderCapacityQueue === 'boolean' + ? raw.allowProviderCapacityQueue + : DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowProviderCapacityQueue, + allowBoundedAutoRetry: + typeof raw?.allowBoundedAutoRetry === 'boolean' + ? raw.allowBoundedAutoRetry + : DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowBoundedAutoRetry, + autoRetryElapsedGuardSeconds: clampInteger( + raw?.autoRetryElapsedGuardSeconds, + 30, + MAX_AUTO_RETRY_ELAPSED_GUARD_SECONDS, + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.autoRetryElapsedGuardSeconds, + ), + }; +} + +function normalizeStoredConcurrencyPolicy( + raw: unknown, +): Pick< + ReviewTeamStoredConfig, + | 'max_parallel_reviewers' + | 'max_queue_wait_seconds' + | 'allow_provider_capacity_queue' + | 'allow_bounded_auto_retry' + | 'auto_retry_elapsed_guard_seconds' +> { + const config = raw as Partial | undefined; + + return { + max_parallel_reviewers: clampInteger( + config?.max_parallel_reviewers, + 1, + MAX_PARALLEL_REVIEWER_INSTANCES, + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxParallelInstances, + ), + max_queue_wait_seconds: clampInteger( + config?.max_queue_wait_seconds, + 0, + MAX_QUEUE_WAIT_SECONDS, + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.maxQueueWaitSeconds, + ), + allow_provider_capacity_queue: + typeof config?.allow_provider_capacity_queue === 'boolean' + ? config.allow_provider_capacity_queue + : DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowProviderCapacityQueue, + allow_bounded_auto_retry: + typeof config?.allow_bounded_auto_retry === 'boolean' + ? 
config.allow_bounded_auto_retry + : DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.allowBoundedAutoRetry, + auto_retry_elapsed_guard_seconds: clampInteger( + config?.auto_retry_elapsed_guard_seconds, + 30, + MAX_AUTO_RETRY_ELAPSED_GUARD_SECONDS, + DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.autoRetryElapsedGuardSeconds, + ), }; } @@ -1036,12 +1105,27 @@ function executionPolicyFromStoredConfig( }; } +function concurrencyPolicyFromStoredConfig( + config: ReviewTeamStoredConfig, +): ReviewTeamConcurrencyPolicy { + return normalizeConcurrencyPolicy({ + maxParallelInstances: config.max_parallel_reviewers, + staggerSeconds: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.staggerSeconds, + maxQueueWaitSeconds: config.max_queue_wait_seconds, + batchExtrasSeparately: DEFAULT_REVIEW_TEAM_CONCURRENCY_POLICY.batchExtrasSeparately, + allowProviderCapacityQueue: config.allow_provider_capacity_queue, + allowBoundedAutoRetry: config.allow_bounded_auto_retry, + autoRetryElapsedGuardSeconds: config.auto_retry_elapsed_guard_seconds, + }); +} + function normalizeStoredConfig(raw: unknown): ReviewTeamStoredConfig { const extraIds = Array.isArray((raw as { extra_subagent_ids?: unknown })?.extra_subagent_ids) ? 
(raw as { extra_subagent_ids: unknown[] }).extra_subagent_ids .filter((value): value is string => typeof value === 'string') : []; const executionPolicy = normalizeExecutionPolicy(raw); + const concurrencyPolicy = normalizeStoredConcurrencyPolicy(raw); const config = raw as Partial | undefined; return { @@ -1051,6 +1135,7 @@ function normalizeStoredConfig(raw: unknown): ReviewTeamStoredConfig { config?.member_strategy_overrides, ), ...executionPolicy, + ...concurrencyPolicy, }; } @@ -1092,6 +1177,11 @@ export async function saveDefaultReviewTeamConfig( reviewer_file_split_threshold: normalizedConfig.reviewer_file_split_threshold, max_same_role_instances: normalizedConfig.max_same_role_instances, max_retries_per_role: normalizedConfig.max_retries_per_role, + max_parallel_reviewers: normalizedConfig.max_parallel_reviewers, + max_queue_wait_seconds: normalizedConfig.max_queue_wait_seconds, + allow_provider_capacity_queue: normalizedConfig.allow_provider_capacity_queue, + allow_bounded_auto_retry: normalizedConfig.allow_bounded_auto_retry, + auto_retry_elapsed_guard_seconds: normalizedConfig.auto_retry_elapsed_guard_seconds, }); } @@ -1185,6 +1275,39 @@ export async function saveDefaultReviewTeamExecutionPolicy( }); } +export async function saveDefaultReviewTeamConcurrencyPolicy( + policy: ReviewTeamConcurrencyPolicy, +): Promise { + const current = await loadDefaultReviewTeamConfig(); + const normalizedPolicy = normalizeConcurrencyPolicy(policy); + await saveDefaultReviewTeamConfig({ + ...current, + max_parallel_reviewers: normalizedPolicy.maxParallelInstances, + max_queue_wait_seconds: normalizedPolicy.maxQueueWaitSeconds, + allow_provider_capacity_queue: normalizedPolicy.allowProviderCapacityQueue, + allow_bounded_auto_retry: normalizedPolicy.allowBoundedAutoRetry, + auto_retry_elapsed_guard_seconds: normalizedPolicy.autoRetryElapsedGuardSeconds, + }); +} + +export async function lowerDefaultReviewTeamMaxParallelReviewers(): Promise { + const current = await 
loadDefaultReviewTeamConfig(); + const currentPolicy = concurrencyPolicyFromStoredConfig(current); + const nextPolicy = { + ...currentPolicy, + maxParallelInstances: Math.max(1, currentPolicy.maxParallelInstances - 1), + }; + await saveDefaultReviewTeamConfig({ + ...current, + max_parallel_reviewers: nextPolicy.maxParallelInstances, + max_queue_wait_seconds: nextPolicy.maxQueueWaitSeconds, + allow_provider_capacity_queue: nextPolicy.allowProviderCapacityQueue, + allow_bounded_auto_retry: nextPolicy.allowBoundedAutoRetry, + auto_retry_elapsed_guard_seconds: nextPolicy.autoRetryElapsedGuardSeconds, + }); + return nextPolicy; +} + export async function saveDefaultReviewTeamStrategyLevel( strategyLevel: ReviewStrategyLevel, ): Promise { @@ -1563,6 +1686,7 @@ export function resolveDefaultReviewTeam( strategyLevel: storedConfig.strategy_level, memberStrategyOverrides: storedConfig.member_strategy_overrides, executionPolicy: executionPolicyFromStoredConfig(storedConfig), + concurrencyPolicy: concurrencyPolicyFromStoredConfig(storedConfig), definition, members: [...coreMembers, ...extraMembers], coreMembers, @@ -2727,8 +2851,12 @@ export function buildEffectiveReviewTeamManifest( ); const tokenBudgetMode = options.tokenBudgetMode ?? 'balanced'; const changeStats = resolveChangeStats(target, options.changeStats); + const baseConcurrencyPolicy = normalizeConcurrencyPolicy(team.concurrencyPolicy); const concurrencyPolicy = applyRateLimitToConcurrencyPolicy( - normalizeConcurrencyPolicy(options.concurrencyPolicy), + normalizeConcurrencyPolicy({ + ...baseConcurrencyPolicy, + ...options.concurrencyPolicy, + }), options.rateLimitStatus, ); const strategyLevel = options.strategyOverride ?? 
team.strategyLevel; @@ -3050,6 +3178,9 @@ export function buildReviewTeamPromptBlock( `- stagger_seconds: ${manifest.concurrencyPolicy.staggerSeconds}`, `- max_queue_wait_seconds: ${manifest.concurrencyPolicy.maxQueueWaitSeconds}`, `- batch_extras_separately: ${manifest.concurrencyPolicy.batchExtrasSeparately ? 'yes' : 'no'}`, + `- allow_provider_capacity_queue: ${manifest.concurrencyPolicy.allowProviderCapacityQueue ? 'yes' : 'no'}`, + `- allow_bounded_auto_retry: ${manifest.concurrencyPolicy.allowBoundedAutoRetry ? 'yes' : 'no'}`, + `- auto_retry_elapsed_guard_seconds: ${manifest.concurrencyPolicy.autoRetryElapsedGuardSeconds}`, ].join('\n'); const targetLineCount = manifest.changeStats?.totalLinesChanged !== undefined From 0a9460f7f10f966fb296944a6a36924cd222456f Mon Sep 17 00:00:00 2001 From: limit_yan Date: Sat, 9 May 2026 18:25:46 +0800 Subject: [PATCH 5/6] docs(deep-review): plan architecture refactor --- .../deep-review-architecture-refactor-plan.md | 496 ++++++++++++++++++ ...p-review-nondeepreview-impact-inventory.md | 65 +++ 2 files changed, 561 insertions(+) create mode 100644 docs/deep-review-architecture-refactor-plan.md create mode 100644 docs/deep-review-nondeepreview-impact-inventory.md diff --git a/docs/deep-review-architecture-refactor-plan.md b/docs/deep-review-architecture-refactor-plan.md new file mode 100644 index 000000000..57c2cd773 --- /dev/null +++ b/docs/deep-review-architecture-refactor-plan.md @@ -0,0 +1,496 @@ +# Deep Review Architecture Refactor Plan + +## Scope + +This plan reviews the Deep Review-related changes in the local branch from commit `fce420c87284b8534cae657fce07bd8c6fb9e3ef` through `HEAD`. It is a design and execution plan only. It must not be treated as approval to change runtime behavior. + +The branch range contains unrelated product and packaging changes. 
This document focuses only on Deep Review surfaces: review team construction, target classification, launch manifests, TaskTool reviewer execution, queue/capacity handling, retry admission, incremental cache, diagnostics, report shaping, consent UI, and Flow Chat recovery/action surfaces. + +## Refactor Goals + +1. Move Deep Review-specific logic out of broad shared files where possible. +2. Separate generic subagent runtime primitives from Deep Review policy adapters. +3. Keep standard subagent behavior stable unless a change is explicitly reviewed as a product decision. +4. Reduce oversized files and repeated definitions. +5. Preserve existing Deep Review behavior during refactor rounds. +6. Keep dependencies acyclic and location choices predictable. +7. Make any non-Deep Review impact explicit and testable. +8. Keep frontend and backend Deep Review boundaries clear. +9. Avoid new performance, quality, or security risks. + +## Current Change Surface + +### Backend Core + +| File | Current line count | Deep Review responsibility currently present | Refactor pressure | +|---|---:|---|---| +| `src/crates/core/src/agentic/deep_review_policy.rs` | 3426 | Roles, default team definition, strategy profiles, execution policy, concurrency policy, queue controls, effective cap learning, capacity classifier, budget tracker, diagnostics, shared-context measurement, incremental cache, tests | Very high. This is now a feature subsystem hidden in one file. | +| `src/crates/core/src/agentic/tools/implementations/task_tool.rs` | 2245 | Generic Task tool plus Deep Review reviewer cap waits, retry admission, packet/cache lookup, provider capacity skip, queue events, tests | Very high. Shared subagent execution is coupled to Deep Review behavior. | +| `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` | 1894 | Code review submission plus Deep Review packet fallback, reliability signals, runtime diagnostics, cache write-through, report schema tests | High. 
Standard Code Review and Deep Review report behavior share one tool. | +| `src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs` | 1363 | Generic tool pipeline plus Deep Review context propagation and duplicate `Read`/`GetFileDiff` measurement | Medium. Deep Review metadata leaks into a generic pipeline. | +| `src/crates/events/src/agentic.rs` | Not measured here | Adds Deep Review queue event contract | Medium. Event is domain-specific in a shared event crate. | + +### Frontend + +| File | Current line count | Deep Review responsibility currently present | Refactor pressure | +|---|---:|---|---| +| `src/web-ui/src/shared/services/reviewTeamService.ts` | 3068 | Defaults, config persistence, backend role resolution, custom reviewer validation, strategy profiles, risk scoring, manifest building, work packets, cache plan, token budget, prompt block | Very high. This should become a directory with a stable facade. | +| `src/web-ui/src/flow_chat/services/DeepReviewService.ts` | 645 | Slash parsing, target resolution, change stats, manifest runtime signals, launch cleanup, child session launch | Medium. Launch orchestration can be split from target/manifest helpers. | +| `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` | 1279 | Shared review action bar plus Deep Review queue controls, interruption recovery, diagnostics, remediation, settings actions | Medium-high. It is under 1500 lines but is already dense. | +| `src/web-ui/src/flow_chat/utils/codeReviewReport.ts` | 870 | Report normalization, reliability notices, manifest rendering, markdown export | Medium. More growth will make report semantics hard to audit. | +| `src/web-ui/src/shared/services/reviewTargetClassifier.ts` | 319 | Source-agnostic target classification and reviewer applicability registry | Good candidate to keep as an independent module. 
| +| `src/web-ui/src/shared/services/reviewSubagentCapabilities.ts` | 43 | Shared tool contract for custom review agents | Good candidate to keep as common review-team support. | + +## Architectural Problems + +### 1. Deep Review Is A Subsystem, But Backend Code Is Still File-Oriented + +`deep_review_policy.rs` now contains independent concepts that deserve separate modules: + +- role and team definition +- manifest parsing +- execution policy and predictive timeout +- concurrency policy +- queue controls +- effective concurrency learning +- retry budget/admission data +- runtime diagnostics +- shared-context measurement +- incremental cache + +Keeping these in one file increases merge risk and makes future contributors guess whether a new helper belongs in policy, queueing, cache, or diagnostics. + +### 2. Shared Subagent Execution Has Deep Review Branches + +`TaskTool` is the canonical route for hidden subagent execution, not only Deep Review. Current Deep Review additions are mostly gated by manifest/context, but the implementation details live directly inside the generic tool. This makes it too easy for future queue/retry behavior to accidentally affect ordinary subagents. + +### 3. Frontend Review Team Assembly Has Too Many Responsibilities + +`reviewTeamService.ts` constructs config, validates custom agents, classifies risk, builds manifests, formats prompt blocks, builds work packets, estimates token budgets, and creates cache plans. The public API is useful, but the implementation is too large to reason about safely. + +### 4. Report And UI Surfaces Are Blending Standard Review With Deep Review + +`CodeReviewTool` and `CodeReviewReport` are shared by standard Code Review and Deep Review. That reuse is good, but Deep Review-only packet, cache, queue, and reliability logic should be isolated behind Deep Review-specific normalizers so standard Code Review remains easy to reason about. + +### 5. 
Some Concepts Are Repeated Across Frontend And Backend + +Strategy levels, execution policy fields, concurrency fields, retry limits, and token budget concepts exist in both TypeScript and Rust. Some duplication is expected because frontend builds launch manifests and Rust enforces runtime guardrails, but the boundaries should be explicit: + +- TypeScript owns UX defaults, manifest construction, and prompt generation. +- Rust owns enforcement, queue safety, retry admission, and final trust boundaries. +- Shared JSON field names must be centralized in manifest parser/builders, not hand-read in many locations. + +## Target Architecture + +### Backend Module Layout + +Create a Deep Review subsystem directory under core: + +```text +src/crates/core/src/agentic/deep_review/ + mod.rs + constants.rs + team_definition.rs + manifest.rs + execution_policy.rs + concurrency_policy.rs + queue.rs + retry.rs + diagnostics.rs + shared_context.rs + incremental_cache.rs + report.rs + tests/ +``` + +Responsibilities: + +- `constants.rs`: agent type constants and role families. +- `team_definition.rs`: default review team definition and strategy profile data. +- `manifest.rs`: typed accessors for `deep_review_run_manifest`, packet lookup, strategy/concurrency/cache/token budget field parsing. +- `execution_policy.rs`: timeouts, file split thresholds, retry limit config, risk helper. +- `concurrency_policy.rs`: configured cap and effective-cap calculations. +- `queue.rs`: queue state, queue controls, capacity error classification, local/provider queue decisions. +- `retry.rs`: structured retry coverage validation, retry scope prompt block, retry budget helpers. +- `diagnostics.rs`: aggregate runtime diagnostics, final low-frequency logging data. +- `shared_context.rs`: duplicate `Read`/`GetFileDiff` measurement and future evidence-pack metadata helpers. +- `incremental_cache.rs`: per-session packet cache data model and serialization. 
+- `report.rs`: Deep Review-specific reliability signal and packet metadata helpers used by `CodeReviewTool`. + +The existing `src/crates/core/src/agentic/deep_review_policy.rs` should become a compatibility facade during migration, then shrink to re-exports or be removed after imports are updated. + +### Generic Subagent Runtime Boundary + +Deep Review should not own generic subagent scheduling. Introduce a generic runtime-facing shape only after the first extraction round: + +```text +src/crates/core/src/agentic/subagent_runtime/ + mod.rs + capacity.rs + queue_state.rs + retry_admission.rs +``` + +Initial rule: do not move behavior here until it is proven generic. + +Generic candidates: + +- capacity acquisition/release guard +- queue state shape independent of Deep Review labels +- timeout separation between queue wait and running time +- bounded retry admission primitives + +Deep Review-specific adapters remain in `agentic/deep_review/queue.rs` and `agentic/deep_review/retry.rs`. + +Do not make provider-capacity auto queueing a global subagent behavior in this refactor. That is a product behavior change and needs a separate confirmation. + +### Backend Tool Facades + +Keep tool entrypoints stable: + +- `TaskTool` remains the tool registered in the registry. +- `CodeReviewTool` remains the tool registered for report submission. 
+ +But move Deep Review branches behind helper modules: + +```rust +// task_tool.rs +let deep_review_context = deep_review::manifest::Context::from_tool_context(context); +deep_review::task_adapter::prepare_launch(...); +deep_review::retry::validate_retry(...); +deep_review::queue::wait_for_reviewer_capacity(...); +``` + +```rust +// code_review_tool.rs +deep_review::report::fill_packet_metadata(...); +deep_review::report::fill_reliability_signals(...); +deep_review::incremental_cache::persist_completed_packets(...); +deep_review::diagnostics::log_final_snapshot(...); +``` + +This keeps the public tool behavior unchanged while making feature-specific code easier to test. + +### Frontend Module Layout + +Split `reviewTeamService.ts` into a directory with a facade: + +```text +src/web-ui/src/shared/services/review-team/ + index.ts + types.ts + defaults.ts + config.ts + backendDefinition.ts + strategy.ts + targetClassifier.ts + subagentCapabilities.ts + manifestBuilder.ts + workPackets.ts + tokenBudget.ts + risk.ts + promptBlock.ts + cachePlan.ts + preReviewSummary.ts +``` + +Keep the current import path working: + +```text +src/web-ui/src/shared/services/reviewTeamService.ts +``` + +The old file should become a facade exporting from `./review-team`. This preserves callers and allows incremental migration. + +### Flow Chat Deep Review Layout + +Split launch and UI helpers without changing visible behavior: + +```text +src/web-ui/src/flow_chat/deep-review/ + launch/ + commandParser.ts + targetResolver.ts + launchPrompt.ts + launchSession.ts + launchErrors.ts + action-bar/ + CapacityQueueNotice.tsx + InterruptionRecoveryPanel.tsx + RemediationControls.tsx + ReviewActionHeader.tsx + report/ + reliabilityNotices.ts + manifestSections.ts + markdown.ts +``` + +Keep current public exports from `DeepReviewService.ts`, `DeepReviewActionBar.tsx`, and `codeReviewReport.ts` during migration. 
+ +## Proposed Execution Rounds + +### Round 0: Baseline And Guardrails + +Goal: create a safe refactor baseline. + +Actions: + +- Record line counts for the oversized files listed above. +- Run focused Deep Review tests already used on this branch. +- Add a short architecture note to the PR description that this is a no-behavior-change refactor plan. +- Confirm that non-Deep Review impact inventory is tracked in `docs/deep-review-nondeepreview-impact-inventory.md`. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/flow_chat/utils/codeReviewReport.test.ts` +- `cargo test -p bitfun-core deep_review -- --nocapture` when Cargo registry access is available. + +Behavior change allowed: none. + +### Round 1: Backend Deep Review Module Extraction + +Goal: reduce `deep_review_policy.rs` without changing behavior. + +Actions: + +- Create `src/crates/core/src/agentic/deep_review/`. +- Move constants and default team definition to `constants.rs` and `team_definition.rs`. +- Move execution policy and strategy helpers to `execution_policy.rs`. +- Move concurrency, queue, diagnostics, shared context, retry, and cache into separate modules one at a time. +- Keep `deep_review_policy.rs` as a compatibility facade until imports are migrated. + +Verification: + +- Existing Rust Deep Review tests. +- `rg -n "deep_review_policy::" src/crates/core/src` to verify imports are intentionally retained or migrated. + +Behavior change allowed: none. + +Risks: + +- Moving tests can hide coverage if not migrated with the module. +- Public helper visibility may be widened accidentally. + +Mitigation: + +- Move tests with modules. +- Prefer `pub(crate)` until cross-module callers require `pub`. + +### Round 2: TaskTool Deep Review Adapter Extraction + +Goal: keep generic TaskTool free from Deep Review implementation detail. 
+ +Actions: + +- Add a `deep_review::task_adapter` module that owns: + - detecting Deep Review context + - resolving packet ids from manifests + - attaching per-session cache to manifests + - validating structured retry coverage + - preparing retry prompt prefixes + - calling queue/capacity helpers +- Leave `TaskTool` with a small orchestration call into the adapter. +- Keep ordinary subagent path unchanged. + +Verification: + +- Deep Review TaskTool tests. +- Add or preserve a regression test that a non-DeepReview Task with the same fields does not enter Deep Review queue/retry/cache paths. + +Behavior change allowed: none. + +Risks: + +- This touches generic subagent execution. +- Mistakes can alter normal hidden subagent behavior. + +Mitigation: + +- Add explicit non-DeepReview tests before moving logic. +- Do not generalize provider queueing in this round. + +### Round 3: CodeReviewTool Deep Review Report Adapter + +Goal: separate standard Code Review report behavior from Deep Review report enrichments. + +Actions: + +- Add `deep_review::report` module for: + - packet metadata fallback + - reliability signal filling + - token budget reliability notices + - runtime diagnostics logging + - per-session incremental cache write-through +- Keep `CodeReviewTool` schema and public behavior unchanged. +- Ensure standard Code Review does not receive Deep Review-only signals. + +Verification: + +- Existing `code_review_tool` Deep Review tests. +- Standard Code Review report tests or new tests proving no Deep Review-only metadata appears outside Deep Review context. + +Behavior change allowed: none. + +### Round 4: Shared Event And Tool Pipeline Containment + +Goal: prevent Deep Review-specific details from spreading through shared runtime code. + +Actions: + +- Keep current `DeepReviewQueueStateChanged` event contract stable. +- Move event payload conversion helpers to Deep Review modules. 
+- In `tool_pipeline.rs`, replace inline Deep Review context propagation with a small hook/helper. +- Keep duplicate `Read`/`GetFileDiff` measurement gated by Deep Review context. + +Verification: + +- `cargo test -p bitfun-events deep_review_queue_state_event_serializes_stable_contract -- --nocapture` +- Tool pipeline tests covering non-DeepReview tools. + +Behavior change allowed: none. + +Deferred behavior change: + +- Replacing `DeepReviewQueueStateChanged` with a generic `SubagentQueueStateChanged` event. This would affect frontend/API contracts and requires user confirmation before implementation. + +### Round 5: Frontend Review Team Service Decomposition + +Goal: shrink `reviewTeamService.ts` and make review team responsibilities discoverable. + +Actions: + +- Create `src/web-ui/src/shared/services/review-team/`. +- Move type definitions first. +- Move pure helpers next: strategy, risk, work packets, token budget, cache plan, pre-review summary. +- Move config persistence and backend definition loading separately. +- Keep `reviewTeamService.ts` as a facade so existing imports remain stable. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts` +- `pnpm run type-check:web` + +Behavior change allowed: none. + +Risks: + +- Circular imports between `types`, `manifestBuilder`, and `promptBlock`. +- Tests can pass through the facade while internal modules become poorly bounded. + +Mitigation: + +- `types.ts` must not import implementation modules. +- `manifestBuilder.ts` may import pure helpers, but helpers must not import `manifestBuilder.ts`. + +### Round 6: Frontend Flow Chat Deep Review Decomposition + +Goal: separate launch, action bar, and report concerns. + +Actions: + +- Split `DeepReviewService.ts` into command parsing, target resolution, manifest runtime-signal assembly, launch cleanup, and child-session launch. 
+- Split `DeepReviewActionBar.tsx` into capacity queue, interruption recovery, remediation controls, and generic review action layout components. +- Split `codeReviewReport.ts` into reliability notice building, manifest markdown sections, and report normalization. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/flow_chat/services/DeepReviewService.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/flow_chat/utils/codeReviewReport.test.ts` +- `pnpm run lint:web` +- `pnpm run type-check:web` + +Behavior change allowed: none. + +### Round 7: Documentation, Comments, And Ownership Cleanup + +Goal: document module boundaries without adding noisy comments. + +Actions: + +- Add module-level Rust docs for new `deep_review` modules where responsibilities are not obvious. +- Add concise TypeScript file headers only for facades and boundary modules. +- Remove duplicated constants or status wording after the extraction. +- Update `docs/deep-review-design.md` and related phase docs only if the refactor changes file ownership, not product behavior. + +Verification: + +- `rg -n "TODO|TBD|temporary|copy of|duplicate" src/crates/core/src/agentic/deep_review src/web-ui/src/shared/services/review-team` +- Full focused frontend and Rust Deep Review tests. + +Behavior change allowed: none. + +## Dependency Rules + +### Backend + +- `agentic/deep_review/*` may depend on shared core utilities and tool/report types. +- Shared `TaskTool`, `CodeReviewTool`, and `tool_pipeline` may call Deep Review adapters, but they should not own Deep Review policy data. +- `agentic/subagent_runtime/*` must not import Deep Review modules. +- `events` crate must remain data-only and should not import core Deep Review policy. +- No module in `deep_review` should depend on desktop, Tauri, or frontend-specific concepts. + +### Frontend + +- `review-team/types.ts` must be dependency-light and should not import API adapters. +- `review-team/config.ts` may import config API. 
+- `review-team/backendDefinition.ts` may import agent API. +- `review-team/manifestBuilder.ts` may import target classification, strategy, work packets, token budget, and cache plan. +- Flow Chat launch modules may import the review-team facade, not internal modules unless there is a clear reason. +- UI components must not call Tauri APIs directly. + +## Non-Goals + +- Do not introduce a new crate unless module extraction shows a stable boundary. A crate split is higher friction because many Deep Review helpers still use core session, tool, and error types. +- Do not make Deep Review queueing global subagent behavior. +- Do not change default concurrency, retry, or strategy behavior. +- Do not add project-level review cache. +- Do not replace the prompt-driven DeepReview orchestrator with a backend DAG scheduler. +- Do not replace existing event names in this refactor. + +## Behavior Change Checkpoints + +The following items would be behavior changes and must be confirmed before implementation: + +1. Moving local/provider queue behavior from Deep Review to all subagents. +2. Replacing `DeepReviewQueueStateChanged` with a generic event. +3. Changing retry from model/user-issued structured retry to backend-owned automatic redispatch. +4. Making backend risk scoring authoritative over user/team strategy. +5. Persisting review cache outside session metadata. +6. Hard-clipping prompt bytes or hiding files from coverage metadata. +7. Changing default quick/normal/deep semantics beyond what is already documented in the current cost-aware plan. + +## Quality Gates + +Minimum verification for each refactor round: + +- Rust-only round: + - `cargo test -p bitfun-core deep_review -- --nocapture` + - Add narrower tests for the moved module when possible. +- Shared tool/runtime round: + - Rust Deep Review tests. + - Non-DeepReview TaskTool or CodeReviewTool regression test. 
+- Frontend service round: + - `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts` + - `pnpm run type-check:web` +- Frontend UI/report round: + - focused component/util tests + - `pnpm run lint:web` + - `pnpm run type-check:web` + +Full release gate after all rounds: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `cargo check --workspace --exclude bitfun-cli` +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` +- `git diff --check` + +## Expected End State + +- `deep_review_policy.rs` is no longer an oversized mixed-responsibility file. +- `TaskTool` and `CodeReviewTool` contain only thin Deep Review adapter calls. +- Generic subagent runtime concepts are available without forcing Deep Review behavior onto ordinary subagents. +- `reviewTeamService.ts` is a stable facade over smaller review-team modules. +- Flow Chat Deep Review launch, report, and action-bar logic are separated. +- Non-DeepReview impact is documented and covered by focused regression tests. +- No key Deep Review behavior changes unless separately confirmed. diff --git a/docs/deep-review-nondeepreview-impact-inventory.md b/docs/deep-review-nondeepreview-impact-inventory.md new file mode 100644 index 000000000..869577d0e --- /dev/null +++ b/docs/deep-review-nondeepreview-impact-inventory.md @@ -0,0 +1,65 @@ +# Deep Review Refactor Non-DeepReview Impact Inventory + +## Purpose + +This inventory lists the shared areas touched by the current Deep Review work where future refactoring could affect non-DeepReview behavior. It supports the architecture rule that subagent runtime changes must not silently become Deep Review-specific or unexpectedly alter ordinary subagents. 
+ +## Current Shared Impact Areas + +| Area | Current Deep Review change | Non-DeepReview risk | Required mitigation | +|---|---|---|---| +| `TaskTool` | Deep Review reviewer capacity queueing, effective concurrency learning, provider capacity skip conversion, structured retry admission, packet/cache lookup | Ordinary hidden subagents could accidentally enter queue/retry/cache behavior if context gating is wrong | Keep all Deep Review logic behind explicit agent type/manifest checks; add regression tests for ordinary Task calls. | +| `tool_pipeline.rs` | Propagates Deep Review context variables and records duplicate `Read`/`GetFileDiff` measurements | Generic tool execution can become feature-aware and harder to reuse | Extract propagation and measurement into a Deep Review hook; keep the pipeline generic. | +| `CodeReviewTool` | Deep Review packet metadata fallback, reliability signals, runtime diagnostics, incremental cache write-through | Standard Code Review reports could gain Deep Review-only reliability or cache behavior | Gate enrichments by Deep Review context and add standard Code Review regression tests. | +| `bitfun-events` / agentic events | Adds Deep Review queue state event payload | Event enum becomes increasingly domain-specific | Keep current event stable for compatibility; only design generic subagent queue events after product/API review. | +| Session metadata | Adds Deep Review run manifest and per-session cache fields | Session metadata can accumulate feature-specific blobs | Keep cache per-session, content-bounded, and absent for non-DeepReview sessions. | +| Review action bar store/component | Shared `ReviewActionBar` path now includes Deep Review queue and recovery affordances | Standard Code Review UI can inherit irrelevant Deep Review states | Split queue/recovery panels and render them only for `reviewMode === 'deep'`. 
| +| Report utilities | Shared code review report helpers render manifest/cache/token-budget sections | Standard Code Review exports can become noisy or show irrelevant Deep Review sections | Keep manifest sections optional and Deep Review-gated. | +| Review settings | Adds Deep Review capacity/retry settings under Review config | Users may confuse Deep Review reviewer concurrency with global subagent concurrency | Label settings as Review Team scoped; keep global `ai.subagent_max_concurrency` out of normal Review settings. | + +## Safe Refactor Rules + +1. Generic subagent runtime modules must not import Deep Review modules. +2. Deep Review adapters may import generic runtime modules. +3. Shared tools may call Deep Review adapters only after context gating. +4. Standard Code Review must continue to work without a Deep Review manifest. +5. Deep Review queue time must not become a global subagent timeout rule unless explicitly approved. +6. Provider capacity requeue must remain Deep Review-scoped until product confirms broader behavior. +7. Diagnostics must stay aggregate-only and content-free. + +## Regression Tests To Keep Or Add + +### Backend + +- A normal `Task` tool call without `deep_review_run_manifest` does not apply Deep Review queue controls. +- A normal `Task` tool retry does not require Deep Review `retry_coverage`. +- Standard `CodeReviewTool` submission does not emit Deep Review packet metadata, cache hit/miss, or queue reliability signals. +- Deep Review queue events serialize with the existing stable event shape. +- Tool pipeline duplicate-read measurement ignores non-DeepReview `Read` and `GetFileDiff` calls. + +### Frontend + +- Standard Code Review action bar renders without capacity queue controls. +- Deep Review capacity queue controls render only when the store has Deep Review queue state. +- Standard Code Review markdown export omits Deep Review manifest/cache sections. 
+- Review settings copy distinguishes Review Team max reviewers from global subagent concurrency. + +## Behavior Changes That Need User Confirmation + +The following are not safe as pure refactors: + +1. Applying Deep Review capacity queueing to all subagents. +2. Making provider transient errors auto-queue for ordinary subagents. +3. Replacing Deep Review-specific queue events with generic subagent queue events. +4. Persisting Deep Review cache at project level. +5. Auto-retrying reviewer packets without explicit structured coverage and budget guards. +6. Making backend strategy recommendations override user-selected strategy. + +## Documentation Follow-Up + +If any refactor round touches one of the shared areas above, update this document in the same commit with: + +- the exact shared file touched; +- whether behavior changed or only ownership changed; +- the regression test that proves non-DeepReview behavior stayed stable; +- any product decision still required. From 140f4b1d9faaf17d59d7802e601d8a42c57641eb Mon Sep 17 00:00:00 2001 From: limit_yan Date: Sat, 9 May 2026 18:38:14 +0800 Subject: [PATCH 6/6] docs(deep-review): consolidate status and plans --- docs/deep-review-completed-status.md | 324 ++++++++ docs/deep-review-completed-status.zh-CN.md | 322 ++++++++ docs/deep-review-pending-plan.md | 811 +++++++++++++++++++++ docs/deep-review-pending-plan.zh-CN.md | 811 +++++++++++++++++++++ 4 files changed, 2268 insertions(+) create mode 100644 docs/deep-review-completed-status.md create mode 100644 docs/deep-review-completed-status.zh-CN.md create mode 100644 docs/deep-review-pending-plan.md create mode 100644 docs/deep-review-pending-plan.zh-CN.md diff --git a/docs/deep-review-completed-status.md b/docs/deep-review-completed-status.md new file mode 100644 index 000000000..d8c87d182 --- /dev/null +++ b/docs/deep-review-completed-status.md @@ -0,0 +1,324 @@ +# Deep Review Completed Status + +## Purpose + +This document consolidates the completed Deep Review work 
from the current design and phase documents into one standalone status reference. It separates verified implementation from planned or deferred work so future changes can be reviewed against the actual product boundary. + +## Source Documents + +This consolidation covers the following current Deep Review documents and companion local design notes: + +| Source | Role in this consolidation | +|---|---| +| `docs/deep-review-design.md` | Original strategy-engine, Architecture Reviewer, Frontend Reviewer, prompt ownership, and current implementation status. | +| `docs/deep-review-phase2-plan.md` | Phase 2 implementation status for strategy, concurrency, retry, cache, token budget, and report reliability work. | +| `docs/deep-review-phase2-addendum.md` | Truth model, status wording, risk register, completed rounds, and deferred boundaries. | +| `docs/deep-review-phase3-followup-plan.md` | Latest product decisions and the current Phase 3 split between implemented diagnostics/settings and pending provider queue, retry action, and cost scope work. | +| `docs/deep-review-architecture-refactor-plan.md` | Architecture refactor goals and module boundaries; implemented behavior from this document is not claimed here because it is a pending refactor plan. | +| `docs/deep-review-nondeepreview-impact-inventory.md` | Shared-runtime impact rules that are already documented and must continue to constrain future work. | +| `docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md` | Round-level execution status and verification history for Phase 3. | +| `docs/superpowers/specs/deep-review-design.md` and `docs/superpowers/plans/deep-review-phase2-plan.md` | Local companion copies checked for drift; they duplicate the main design and Phase 2 plan shape and do not add a different current boundary. 
| + +## Status Wording + +Use the wording below when updating or reviewing Deep Review work: + +| Wording | Meaning | +|---|---| +| Implemented | Runtime behavior exists in code and has focused verification. | +| Implemented with guardrails | Behavior exists but is intentionally bounded by settings, budgets, user controls, or trust metadata. | +| Safety net | Runtime blocks or reports an unsafe condition, but does not provide the full smoother product behavior. | +| Prompt-guided | The manifest or prompt asks the orchestrator to perform the step; weak models can still miss or mis-sequence it. | +| Deferred by product decision | Implementation is intentionally blocked until privacy, retention, UX, or product rules are approved. | +| Pending implementation | The design is agreed but code has not landed. | + +This document only lists implemented or implemented-with-guardrails behavior. Pending and deferred work is tracked in `docs/deep-review-pending-plan.md`. + +## Current Runtime Shape + +Deep Review remains a prompt-driven 5-phase orchestrator: + +1. Scope identification. +2. Parallel specialist review. +3. Sequential judge/quality gate. +4. Final report synthesis through `submit_code_review`. +5. Optional remediation through normal editing tools. + +The current reviewer team contains: + +| Reviewer | Runtime status | Scope | +|---|---|---| +| `ReviewBusinessLogic` | Always on | Correctness, business rules, data/state transitions. | +| `ReviewPerformance` | Always on | Runtime hot paths, expensive computations, payload cost. | +| `ReviewSecurity` | Always on | Exploitable trust-boundary, auth, data-handling, and security risks. | +| `ReviewArchitecture` | Always on | Layer boundaries, dependency direction, API contract shape, maintainability. | +| `ReviewFrontend` | Conditional | React, UI state, i18n, accessibility, frontend-backend contract drift, and frontend platform boundaries. 
| +| `ReviewJudge` | Sequential | Deduplicates, resolves overlap, validates evidence, and synthesizes the report. | +| Custom review agents | Optional | Must satisfy the minimum review-agent tooling contract. | + +The orchestrator is still source-agnostic. Git-backed changes, local workspace changes, or future sources should flow through a target manifest rather than assuming one Git-only abstraction. + +## Completed Reviewer And Prompt Work + +### Core Role Expansion + +- `ReviewArchitecture` is implemented as an always-on core reviewer. +- `ReviewFrontend` is implemented as a frontend-focused reviewer with conditional activation. +- Dedicated prompts exist for Architecture and Frontend reviewers. +- Existing prompts were narrowed to reduce overlap: + - Business Logic does not own UI state or layer-boundary analysis. + - Performance does not own React render optimization. + - Security focuses on exploitable risks rather than general structural boundary violations. +- The Judge prompt handles cross-reviewer overlap for Architecture/Business Logic, Architecture/Security, Frontend/Performance, and Frontend/Business Logic. +- The DeepReview orchestrator prompt contains role-specific strategy amplification and frontend strategy directives. + +### Role Metadata, Visibility, And I18n + +- Backend-provided reviewer definitions are the runtime source for frontend team resolution and review-agent visibility. +- Frontend fallback metadata remains only as degraded-mode safety behavior. +- Settings and Agents page i18n include Architecture and Frontend reviewer names. +- The Agents page Code Review Team card was adjusted to avoid clipped reviewer tags and to present a compact role summary. +- Hidden review-agent metadata is derived dynamically so review agents can stay hidden from normal agent pickers while still being visible in Review Team surfaces. 
+ +## Completed Target Classification And Conditional Dispatch + +- `ReviewFrontend` dispatch changed from always-present execution to conditional execution. +- Conditional activation is driven by target/domain classification and a reviewer applicability registry, not scattered hardcoded file checks. +- The current classifier supports frontend UI, frontend style, frontend i18n, frontend contract, desktop contract, backend core, API layer, transport, and other domain tags. +- `hasFrontendFiles()` remains backward compatible by deriving from frontend-related tags. +- The same registry is intended to support future conditional reviewers. +- Custom review subagents are included only when valid and applicable; invalid custom review agents remain explainable instead of silently disappearing. + +## Completed Custom Review-Agent Contract + +- The minimum valid custom review-agent tool set is centralized as: + - `GetFileDiff` + - `Read` +- Missing required tools are reported as `invalid_tooling`. +- Missing recommended investigation tools such as `Grep`, `Glob`, or `LS` are treated as degraded review quality, not invalid configuration. +- The UI and runtime share the same contract definition so create/edit affordances and Review Team enforcement do not drift. +- Invalid or skipped reviewers are surfaced in manifest/report metadata rather than being filtered out before the user can understand why they did not run. + +## Completed Strategy And Risk Metadata + +- Backend `ChangeRiskFactors` and `auto_select_strategy()` exist as pure policy helpers. +- Launch manifests record: + - frontend recommendation; + - backend-compatible recommendation; + - explicit user override; + - final strategy; + - mismatch state; + - mismatch severity. +- Backend scoring is advisory and mismatch-warning only. +- Backend scoring does not override the selected strategy, expand the reviewer roster, or change token/concurrency cost. 
+- `max_cyclomatic_complexity_delta` remains explicitly `not_measured`; authoritative strategy selection is not implemented.
+
+## Completed Predictive Timeout And Partial Result Capture
+
+- Launch manifests record target file count and diff line stats.
+- Effective reviewer and judge timeouts are derived from strategy and target size.
+- TaskTool honors the effective manifest policy when launching Deep Review reviewer subagents.
+- `SubagentResultStatus::PartialTimeout` exists.
+- The coordinator can preserve a timed-out subagent's final message when it arrives inside the configured grace period.
+- The limitation is explicit: arbitrary stream fragments are not reconstructed outside the grace window.
+
+## Completed Concurrency And Queue Foundation
+
+### Runtime Enforcement
+
+- `DeepReviewConcurrencyPolicy` parsing exists.
+- TaskTool bounded-waits for local reviewer-cap saturation.
+- Queue time is separated from reviewer runtime time.
+- Expired local-cap waits can become `CapacitySkipped`.
+- Turn-local effective concurrency learning lowers capacity after local capacity skips and explicit provider transient-capacity reviewer failures.
+- Successful reviewer observations can cautiously recover the effective cap.
+- Capacity skips are folded into final report reliability signals.
+
+### User-Visible Queue Controls
+
+- A backend queue-state event contract exists.
+- Compact queue notices exist in the Flow Chat action-bar path.
+- Backend-bound local-cap queue controls exist for:
+ - pause;
+ - continue;
+ - cancel;
+ - optional-extra skip.
+- A launch-time active-session concurrency warning exists so Deep Review does not silently compete with a busy user session.
+- Recovery actions include running slower next time and opening Review settings.
+
+### Current Boundary
+
+- Current queue automation is narrow and local-cap oriented.
+- Explicit provider transient-capacity reviewer failures currently become `capacity_skipped`, lower the turn-local effective cap, and feed reliability signals. +- Short automatic provider requeue/retry is pending, not implemented. +- Backend batch/stagger scheduling is pending. +- User-facing effective-cap override controls are pending. +- Deep Review queueing is not global subagent queueing. + +## Completed Retry Guardrails + +- Retry budget tracking exists. +- Reviewer timeout retry guidance exists. +- Retry guidance uses the effective manifest policy when available. +- TaskTool structured retry admission exists. +- A retry reviewer Task must include structured coverage and pass runtime checks: + - `retry: true`; + - source packet/status information; + - retryable source status; + - reduced retry scope; + - lower timeout; + - available retry budget. +- Accepted retry Tasks receive a bounded retry-scope prompt block. +- Missing coverage, broad scope, non-retryable status, non-lowered timeout, and exhausted budget are rejected. +- Backend-owned automatic redispatch remains pending. +- User-facing explicit retry action remains pending. + +## Completed Incremental Cache Boundary + +- Per-session `DeepReviewIncrementalCache` primitives exist. +- Session metadata contains the cache field. +- Existing persistence preserves the cache field. +- TaskTool can read a matching per-session cache hit by resolved `packet_id`. +- `submit_code_review` can write completed reviewer output back to the per-session cache. +- Read/write paths align on work-packet `packet_id`. +- Report reliability signals can show cache hit/miss behavior. +- The current cache has no independent retention period beyond session metadata. +- Deleting or clearing session metadata removes this cache. +- Project-level or cross-session cache is not implemented. + +## Completed Packet Metadata And Report Reliability + +- `submit_code_review` has packet metadata fallback. 
+- Missing reviewer `packet_id` values can be inferred from the manifest when possible. +- Lower-confidence fallback metadata is marked as such. +- Final report reliability signals cover: + - partial timeout; + - retry guidance; + - skipped reviewers; + - capacity/concurrency limits; + - cache hits/misses; + - token-budget tradeoffs. +- Report/export utilities keep dense reliability details collapsed or summarized. +- Standard Code Review should not receive Deep Review-only packet/cache/queue signals unless Deep Review context is present. + +## Completed Shared-Context Measurement + +- Deep Review reviewer `Read` and `GetFileDiff` calls are measured by parent turn, reviewer type, tool name, normalized path, call count, and reviewer count. +- Measurement is content-free. +- Measurement does not store source text, diff text, tool output, model output, or provider raw body. +- Final Deep Review submission can emit aggregate debug diagnostics once. +- The report remains free of raw shared-context diagnostics. +- Programmatic cross-subagent tool-result reuse is not implemented. + +## Completed Token And Context Budget Guardrails + +- Launch manifests include heuristic per-mode prompt-byte thresholds. +- Manifests include estimated maximum reviewer prompt bytes. +- Summary-first full-scope metadata exists. +- File split and max-file style guardrails exist. +- Summary-first behavior keeps every assigned file visible; it must not silently hide files from coverage metadata. +- Hard prompt-byte clipping and byte-accurate enforcement remain deferred. + +## Completed Consent, Recovery, And Settings UX + +### First-Run And Launch UX + +- The Deep Review consent dialog includes a compact pre-review summary: + - file count; + - risk areas; + - selected strategy; + - optional reviewer count; + - summary-first state; + - skipped reviewer warnings when present. +- The dialog copy was intentionally reduced to key reminders. +- User-facing copy is localized. 
+- Dense lineup/cost cards remain deferred. + +### Action Bar And Recovery + +- Deep Review action-bar surfaces support interruption/recovery states. +- Manual cancellation preserves parent summary rather than treating every stop as full review loss. +- Continue/resume controls remain visible when recovery is possible. +- Diagnostics copy actions preserve raw diagnostic metadata while keeping user-facing copy localized. + +### Review Capacity And Retry Settings + +- Default Review Team config stores: + - `max_parallel_reviewers`; + - `max_queue_wait_seconds`; + - `allow_provider_capacity_queue`; + - `allow_bounded_auto_retry`; + - `auto_retry_elapsed_guard_seconds`. +- Defaults remain conservative: + - 4 parallel reviewers; + - 60 seconds max queue wait; + - provider capacity queue allowed by policy but short runtime queue still pending; + - bounded automatic retry disabled by default; + - 180 seconds elapsed guard. +- Controls are scoped to Review Team settings. +- They do not change global `ai.subagent_max_concurrency`. + +## Completed Adaptive Runtime Support + +- Context health snapshot support exists for degraded long-running sessions. +- Model capability profile support exists for weaker model handling. +- Runtime policy can adapt context profile behavior based on model capability and session health. +- This is a guardrail layer, not a replacement for user-selected review strategy. + +## Completed Compression Contract Integration + +- `CompressionContract` and conversion from `EvidenceLedgerSummary` are complete. +- The compressor prompt already injects contract content. +- No additional Deep Review implementation is currently needed for this item. + +## Completed Non-DeepReview Impact Documentation + +The shared-impact inventory is documented and must remain active during future work: + +| Shared area | Completed boundary | +|---|---| +| `TaskTool` | Deep Review queue, retry, packet, and cache logic must stay behind explicit Deep Review context checks. 
| +| `tool_pipeline.rs` | Duplicate `Read`/`GetFileDiff` measurement is Deep Review-gated and content-free. | +| `CodeReviewTool` | Deep Review report enrichments are gated by Deep Review context. | +| `bitfun-events` | Current Deep Review queue event is stable and domain-specific; generic event replacement is a future product/API decision. | +| Session metadata | Deep Review cache is per-session and absent for non-DeepReview sessions. | +| Review action bar | Queue/recovery panels render only for Deep Review state. | +| Report utilities | Manifest/cache/token-budget sections remain optional and Deep Review-gated. | +| Review settings | Review Team capacity settings are labeled as review-scoped, not global subagent concurrency. | + +## Verification History Recorded In Source Docs + +The source documents record focused and release-gate verification, including: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `cargo check --workspace --exclude bitfun-cli` +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` +- focused frontend tests for `reviewTeamService`, Deep Review action bar/store, queue events, and report utilities; +- focused Rust tests for runtime diagnostics, cache behavior, retry admission, queue/capacity behavior, and report reliability. + +The latest consolidation in this document does not claim new runtime verification. It records already documented implementation status and separates pending work into `docs/deep-review-pending-plan.md`. 
+ +## Completed Boundary Summary + +Deep Review has moved from a prompt-only concept to a guarded runtime with: + +- always-on architecture review; +- conditional frontend review; +- backend-provided team definitions; +- data-driven reviewer applicability; +- explainable custom-reviewer validation; +- advisory strategy metadata; +- predictive timeouts; +- partial-timeout final-message capture; +- local-cap queue controls; +- structured retry admission; +- per-session packet cache; +- packet fallback; +- report reliability signals; +- content-free duplicate-tool diagnostics; +- compact launch summary; +- review-scoped capacity and retry settings. + +The completed boundary intentionally stops before automatic provider requeue, user-facing retry actions, project-level cache, hard byte clipping, programmatic shared tool-result reuse, global subagent scheduling, and large-scale architecture refactoring. diff --git a/docs/deep-review-completed-status.zh-CN.md b/docs/deep-review-completed-status.zh-CN.md new file mode 100644 index 000000000..3b997bb6d --- /dev/null +++ b/docs/deep-review-completed-status.zh-CN.md @@ -0,0 +1,322 @@ +# Deep Review 已完成状态 + +## 目的 + +本文档把当前 Deep Review 相关设计和阶段文档中已经完成的内容合并为一份独立状态说明。它只记录已经实现或带防护边界实现的行为,把后续计划和延期事项放到 `docs/deep-review-pending-plan.zh-CN.md` 中,避免后续再从多份阶段文档中拼接状态。 + +## 来源文档 + +本次合并覆盖以下 Deep Review 文档和本地伴随设计副本: + +| 来源 | 在本合并文档中的作用 | +|---|---| +| `docs/deep-review-design.md` | 原始策略引擎、Architecture Reviewer、Frontend Reviewer、Prompt 职责边界和当前实现状态。 | +| `docs/deep-review-phase2-plan.md` | Phase 2 中策略、并发、重试、缓存、Token 预算和报告可靠性状态。 | +| `docs/deep-review-phase2-addendum.md` | 事实模型、状态措辞、风险登记、已完成轮次和延期边界。 | +| `docs/deep-review-phase3-followup-plan.md` | 最新产品决策,以及 Phase 3 中已完成诊断/设置和待完成 provider queue、retry action、cost scope 的拆分。 | +| `docs/deep-review-architecture-refactor-plan.md` | 架构重构目标和模块边界;该文档中的重构仍属于待完成事项。 | +| `docs/deep-review-nondeepreview-impact-inventory.md` | 已记录的共享运行时影响规则,后续工作仍必须遵守。 | +| 
`docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md` | Phase 3 的轮次执行状态和验证历史。 | +| `docs/superpowers/specs/deep-review-design.md`、`docs/superpowers/plans/deep-review-phase2-plan.md` | 本地伴随副本,已检查无新的不同边界。 | + +## 状态措辞 + +| 措辞 | 含义 | +|---|---| +| 已实现 | 运行时代码中已有确定行为,并有聚焦验证。 | +| 已实现并带防护 | 行为已经存在,但被设置、预算、用户控制或可信元数据明确约束。 | +| 安全网 | 运行时能阻止或报告不安全状态,但还不是完整的顺滑产品体验。 | +| Prompt 引导 | Manifest 或 Prompt 要求 orchestrator 执行,但弱模型仍可能遗漏或顺序错误。 | +| 产品决策延期 | 需要隐私、保留、删除、UX 或产品规则确认后才能实现。 | +| 待实现 | 设计已明确,但代码尚未落地。 | + +本文只列出已实现或已实现并带防护的内容。待实现和延期内容见 `docs/deep-review-pending-plan.zh-CN.md`。 + +## 当前运行时形态 + +Deep Review 仍是 Prompt 驱动的 5 阶段 orchestrator: + +1. 识别审核范围。 +2. 并行启动 specialist reviewer。 +3. 顺序执行 judge/quality gate。 +4. 通过 `submit_code_review` 合成最终报告。 +5. 可选通过普通编辑工具进行修复。 + +当前审核团队包括: + +| Reviewer | 运行时状态 | 范围 | +|---|---|---| +| `ReviewBusinessLogic` | 始终启用 | 正确性、业务规则、数据和状态迁移。 | +| `ReviewPerformance` | 始终启用 | 运行时热点路径、昂贵计算、payload 成本。 | +| `ReviewSecurity` | 始终启用 | 可利用的信任边界、认证授权、数据处理和安全风险。 | +| `ReviewArchitecture` | 始终启用 | 分层边界、依赖方向、API 契约形态、可维护性。 | +| `ReviewFrontend` | 条件启用 | React、UI 状态、i18n、可访问性、前后端契约漂移和前端平台边界。 | +| `ReviewJudge` | 顺序执行 | 去重、处理重叠、验证证据、合成报告。 | +| 自定义 review agent | 可选 | 必须满足最小审核工具契约。 | + +Deep Review 仍保持 source-agnostic。Git 变更、本地 workspace 变更或未来来源都应通过 target manifest 描述,而不是绑定为 Git-only 抽象。 + +## 已完成 Reviewer 与 Prompt 工作 + +### 核心角色扩展 + +- `ReviewArchitecture` 已作为始终启用的核心 reviewer 实现。 +- `ReviewFrontend` 已作为前端聚焦 reviewer 实现,并支持条件启用。 +- Architecture 和 Frontend reviewer 都已有独立 prompt。 +- 既有 reviewer prompt 已收窄职责以减少重叠: + - Business Logic 不再负责 UI 状态或层级边界分析。 + - Performance 不再负责 React render 优化。 + - Security 聚焦可利用风险,而不是一般结构边界。 +- Judge prompt 已覆盖 Architecture/Business Logic、Architecture/Security、Frontend/Performance、Frontend/Business Logic 的重叠处理。 +- DeepReview orchestrator prompt 包含角色级策略强化和 Frontend 策略指令。 + +### 角色元数据、可见性与国际化 + +- 后端提供的 reviewer definition 是前端团队解析和 review-agent 可见性的运行时来源。 +- 前端 fallback metadata 仅作为降级安全网。 +- Settings 和 Agents 页面已包含 
Architecture、Frontend reviewer 的国际化名称。 +- Agents 页面 Code Review Team 卡片已调整,避免 reviewer tag 裁切,并使用更紧凑的角色摘要。 +- Hidden review-agent metadata 由静态非 review hidden id 与后端提供的 review-agent hidden id 动态合成。 + +## 已完成目标分类与条件派发 + +- `ReviewFrontend` 已从固定存在改为条件执行。 +- 条件启用由 target/domain classification 和 reviewer applicability registry 驱动。 +- 当前分类器支持 frontend UI、frontend style、frontend i18n、frontend contract、desktop contract、backend core、API layer、transport 等 domain tags。 +- 兼容的 `hasFrontendFiles()` 仍从 frontend 相关 tag 派生。 +- 同一 registry 可用于未来条件 reviewer。 +- 自定义 review subagent 只有在有效且适用时才会进入 manifest;无效 agent 会以可解释方式呈现,而不是静默消失。 + +## 已完成自定义 Review-Agent 契约 + +- 最小有效 review-agent 工具集已集中定义为: + - `GetFileDiff` + - `Read` +- 缺少必需工具会报告为 `invalid_tooling`。 +- 缺少 `Grep`、`Glob`、`LS` 等推荐调查工具时,只视为审核质量降级,不视为无效配置。 +- UI 和运行时共用同一契约定义,避免创建/编辑 UI 与 Review Team 执行规则漂移。 +- 无效或跳过的 reviewer 会进入 manifest/report metadata,用户能看到原因。 + +## 已完成策略与风险元数据 + +- 后端已有 `ChangeRiskFactors` 和 `auto_select_strategy()` 纯策略 helper。 +- Launch manifest 会记录: + - 前端推荐; + - 后端兼容推荐; + - 用户显式 override; + - 最终策略; + - mismatch 状态; + - mismatch 严重度。 +- 后端评分仅作为 advisory/mismatch-warning metadata。 +- 后端评分不会覆盖用户选择策略、扩展 reviewer roster 或改变 Token/并发成本。 +- `max_cyclomatic_complexity_delta` 仍明确为 `not_measured`;权威自动策略选择未实现。 + +## 已完成预测超时与部分结果捕获 + +- Launch manifest 会记录目标文件数和 diff 行数。 +- Reviewer 和 judge 的有效超时由策略和目标大小派生。 +- TaskTool 在启动 Deep Review reviewer subagent 时会遵守 manifest policy。 +- `SubagentResultStatus::PartialTimeout` 已存在。 +- 如果超时 subagent 在 grace period 内返回可用 final message,coordinator 可以保留该最终消息。 +- 当前限制明确:不会在 grace window 之外重建任意 stream fragment。 + +## 已完成并发与队列基础 + +### 运行时执行 + +- `DeepReviewConcurrencyPolicy` 解析已实现。 +- TaskTool 会针对本地 reviewer cap 饱和进行有界等待。 +- 队列等待时间与 reviewer 执行时间分离。 +- 本地 cap 等待超时可变为 `CapacitySkipped`。 +- 本轮有效并发会在本地 capacity skip 和显式 provider transient-capacity reviewer failure 后降低。 +- 成功 reviewer 观察可谨慎恢复有效 cap。 +- Capacity skip 会进入最终报告可靠性信号。 + +### 用户可见队列控制 + +- 后端队列状态事件契约已存在。 +- Flow Chat action bar 
已有紧凑队列提示。 +- 本地 cap 队列支持后端绑定的: + - pause; + - continue; + - cancel; + - optional-extra skip。 +- 启动时已有 active-session concurrency warning,避免 Deep Review 静默争用繁忙用户 session。 +- 恢复动作包括下次降低速度运行和打开 Review settings。 + +### 当前边界 + +- 当前队列自动化范围很窄,主要面向本地 cap。 +- 显式 provider transient-capacity reviewer failure 当前会转成 `capacity_skipped`,降低本轮有效 cap,并进入可靠性信号。 +- 短暂自动 provider requeue/retry 尚未实现。 +- 后端 batch/stagger scheduling 尚未实现。 +- 用户可见 effective-cap override controls 尚未实现。 +- Deep Review queueing 不是全局 subagent queueing。 + +## 已完成重试防护 + +- Retry budget tracking 已存在。 +- Reviewer timeout retry guidance 已存在。 +- Retry guidance 会优先使用有效 manifest policy。 +- TaskTool structured retry admission 已存在。 +- Retry reviewer Task 必须包含结构化 coverage 并通过运行时检查: + - `retry: true`; + - source packet/status 信息; + - 可重试 source status; + - 缩小后的 retry scope; + - 更低 timeout; + - 可用 retry budget。 +- 接受的 retry Task 会获得有界 retry-scope prompt block。 +- 缺少 coverage、scope 未缩小、source status 不可重试、timeout 未降低、budget 耗尽都会被拒绝。 +- 后端自动 redispatch 尚未实现。 +- 用户显式 retry action 尚未实现。 + +## 已完成增量缓存边界 + +- Per-session `DeepReviewIncrementalCache` primitive 已存在。 +- Session metadata 包含 cache field。 +- 既有 persistence 会保留 cache field。 +- TaskTool 可以通过解析出的 `packet_id` 命中同一 session 的缓存。 +- `submit_code_review` 可以把已完成 reviewer 输出写回 per-session cache。 +- 读写路径都按 work-packet `packet_id` 对齐。 +- 报告可靠性信号可展示 cache hit/miss。 +- 当前 cache 不具备独立于 session metadata 的保留周期。 +- 删除或清理 session metadata 会移除该 cache。 +- Project-level 或 cross-session cache 未实现。 + +## 已完成 Packet Metadata 与报告可靠性 + +- `submit_code_review` 已有 packet metadata fallback。 +- 缺失 reviewer `packet_id` 时,能在可能情况下从 manifest 推断。 +- 低置信度 fallback metadata 会被标记。 +- 最终报告可靠性信号覆盖: + - partial timeout; + - retry guidance; + - skipped reviewer; + - capacity/concurrency limit; + - cache hit/miss; + - token-budget tradeoff。 +- Report/export utilities 会摘要或折叠密集可靠性细节。 +- 标准 Code Review 不应在没有 Deep Review context 时获得 Deep Review-only packet/cache/queue 信号。 + +## 已完成共享上下文测量 + +- Deep Review reviewer 
的 `Read` 和 `GetFileDiff` 调用会按 parent turn、reviewer type、tool name、normalized path、call count、reviewer count 测量。 +- 测量不包含内容。 +- 测量不存储源码、diff、tool output、model output 或 provider raw body。 +- 最终 Deep Review submission 可输出一次 aggregate debug diagnostics。 +- 报告中不包含原始 shared-context diagnostics。 +- 跨 subagent 的 programmatic tool-result reuse 未实现。 + +## 已完成 Token 与上下文预算防护 + +- Launch manifest 包含按模式配置的 heuristic prompt-byte threshold。 +- Manifest 包含 estimated max reviewer prompt bytes。 +- Summary-first full-scope metadata 已存在。 +- File split 和 max-file 风格 guardrail 已存在。 +- Summary-first 行为仍保持每个 assigned file 可见,不能静默从 coverage metadata 中隐藏文件。 +- Hard prompt-byte clipping 和 byte-accurate enforcement 仍延期。 + +## 已完成 Consent、Recovery 与 Settings UX + +### 首次运行与启动体验 + +- Deep Review consent dialog 已包含紧凑 pre-review summary: + - 文件数; + - 风险区域; + - 选中策略; + - optional reviewer 数; + - summary-first 状态; + - 有跳过 reviewer 时显示提醒。 +- 弹框信息已精简为关键提醒。 +- 用户可见文案已国际化。 +- 密集 lineup/cost card 仍延期。 + +### Action Bar 与恢复 + +- Deep Review action bar 已支持 interruption/recovery 状态。 +- 手动取消会保留 parent summary,而不是把所有 stop 都当作完整审核丢失。 +- 可恢复时 continue/resume controls 保持可见。 +- Diagnostics copy actions 会保留原始诊断元数据,同时保持用户文案国际化。 + +### Review Capacity 与 Retry Settings + +- Default Review Team config 存储: + - `max_parallel_reviewers`; + - `max_queue_wait_seconds`; + - `allow_provider_capacity_queue`; + - `allow_bounded_auto_retry`; + - `auto_retry_elapsed_guard_seconds`。 +- 默认值保持保守: + - 4 个并行 reviewer; + - 60 秒最大队列等待; + - provider capacity queue 在配置上允许,但短 provider queue 运行时仍待实现; + - bounded automatic retry 默认关闭; + - elapsed guard 为 180 秒。 +- 控件属于 Review Team settings。 +- 不修改全局 `ai.subagent_max_concurrency`。 + +## 已完成自适应运行时支持 + +- Context health snapshot 已存在,用于降级长任务 session。 +- Model capability profile 已存在,用于弱模型处理。 +- Runtime policy 可以基于模型能力和 session health 调整 context profile。 +- 这是防护层,不替代用户选择的审核策略。 + +## 已完成 Compression Contract 集成 + +- `CompressionContract` 和从 `EvidenceLedgerSummary` 的转换已完成。 +- Compressor prompt 已注入 
contract 内容。 +- 当前不需要更多 Deep Review 实现。 + +## 已完成非 DeepReview 影响文档化 + +共享影响清单已记录,后续仍必须遵守: + +| 共享区域 | 已完成边界 | +|---|---| +| `TaskTool` | Deep Review queue、retry、packet、cache 逻辑必须在显式 Deep Review context 检查后执行。 | +| `tool_pipeline.rs` | 重复 `Read`/`GetFileDiff` 测量受 Deep Review gate 保护,且不含内容。 | +| `CodeReviewTool` | Deep Review 报告增强由 Deep Review context gate 保护。 | +| `bitfun-events` | 当前 Deep Review queue event 稳定且为领域特定;是否替换成通用事件是未来产品/API 决策。 | +| Session metadata | Deep Review cache 是 per-session 的,非 DeepReview session 不存在。 | +| Review action bar | Queue/recovery panels 只在 Deep Review 状态下渲染。 | +| Report utilities | Manifest/cache/token-budget section 可选且受 Deep Review gate 保护。 | +| Review settings | Review Team capacity settings 标记为 review-scoped,不是全局 subagent concurrency。 | + +## 来源文档中记录的验证历史 + +来源文档记录了聚焦验证和 release gate,包括: + +- `cargo test -p bitfun-core deep_review -- --nocapture` +- `cargo check --workspace --exclude bitfun-cli` +- `pnpm run lint:web` +- `pnpm run type-check:web` +- `pnpm --dir src/web-ui run test:run` +- `reviewTeamService`、Deep Review action bar/store、queue events、report utilities 的聚焦前端测试; +- runtime diagnostics、cache、retry admission、queue/capacity、report reliability 的聚焦 Rust 测试。 + +本整合文档不声称新的运行时验证,只记录既有文档中的实现状态,并把待完成事项拆到 `docs/deep-review-pending-plan.zh-CN.md`。 + +## 已完成边界总结 + +Deep Review 已从纯 Prompt 概念推进为带运行时防护的能力,包含: + +- 始终启用 Architecture reviewer; +- 条件启用 Frontend reviewer; +- 后端提供团队定义; +- 数据驱动 reviewer applicability; +- 可解释的自定义 reviewer 校验; +- advisory strategy metadata; +- predictive timeout; +- partial-timeout final-message capture; +- local-cap queue controls; +- structured retry admission; +- per-session packet cache; +- packet fallback; +- report reliability signals; +- content-free duplicate-tool diagnostics; +- compact launch summary; +- review-scoped capacity/retry settings。 + +当前已完成边界明确不包含 automatic provider requeue、用户显式 retry action、project-level cache、hard byte clipping、programmatic shared tool-result reuse、global subagent 
scheduling 和大型架构重构。 diff --git a/docs/deep-review-pending-plan.md b/docs/deep-review-pending-plan.md new file mode 100644 index 000000000..2ab2a8806 --- /dev/null +++ b/docs/deep-review-pending-plan.md @@ -0,0 +1,811 @@ +# Deep Review Pending Plan And Architecture Refactor + +## Purpose + +This document consolidates all pending and deferred Deep Review work into one executable plan. It also includes the architecture refactor design that should guide future implementation without changing current behavior unless a specific product checkpoint approves it. + +The completed behavior baseline is tracked separately in `docs/deep-review-completed-status.md`. + +## Source Documents + +This plan merges the pending work from: + +- `docs/deep-review-design.md` +- `docs/deep-review-phase2-plan.md` +- `docs/deep-review-phase2-addendum.md` +- `docs/deep-review-phase3-followup-plan.md` +- `docs/deep-review-architecture-refactor-plan.md` +- `docs/deep-review-nondeepreview-impact-inventory.md` +- `docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md` +- local companion copies under `docs/superpowers/specs/` and `docs/superpowers/plans/` that duplicate the same Deep Review design lineage. + +## Product And Architecture Boundaries + +Future implementation must stay inside these boundaries: + +1. Deep Review remains prompt-driven with deterministic guardrails. Do not replace it with a backend DAG scheduler without a separate design approval. +2. Deep Review queueing must not become global subagent queueing by accident. +3. Deep Review must not silently consume normal user-session concurrency. If the session is already busy, the product should warn, pause, or require manual continuation. +4. Queue wait time must not count against reviewer runtime timeout. +5. Provider capacity queueing must be short, visible, bounded, pauseable, and cancellable. +6. Automatic retry is manual by default and can only become bounded automatic retry after explicit user opt-in. +7. 
Automatic retry must never loop indefinitely. +8. Quick/default review depth may reduce breadth, but must not hide changed files from coverage metadata. +9. Diagnostics must be low-frequency and content-free. +10. Project-level review cache remains deferred until retention, deletion, invalidation, and user visibility rules are approved. +11. Refactor rounds must preserve existing behavior unless a behavior-change checkpoint is explicitly approved. + +## Remaining Functional Plan + +### Round 1: Short Provider Capacity Queue + +Status: Pending implementation. + +Goal: When the provider returns a narrowly classified transient capacity error, Deep Review should wait briefly and retry once before reporting `capacity_skipped`. + +In scope: + +- Treat only the following as queueable: + - provider rate limit; + - provider concurrency limit; + - explicit `Retry-After`; + - temporary overload/capacity pressure. +- Fail fast for: + - authentication; + - billing/quota exhaustion; + - invalid model; + - policy violation; + - user cancellation; + - invalid reviewer tooling; + - deterministic validation errors. +- Bound wait by `min(Retry-After, max_queue_wait_seconds)`. +- Reattempt the reviewer once when the user has not paused or cancelled. +- Emit existing queue-state events with provider-specific reasons. +- Record aggregate diagnostics counters: + - provider queue count; + - provider retry count; + - provider retry success count; + - final capacity skip count. +- Keep queue time separate from reviewer runtime timeout. +- Reuse the compact action-bar queue notice and controls. + +Risks: + +| Risk | Why it matters | Mitigation | +|---|---|---| +| Provider wait looks stuck | Users may think the review froze. | Visible queue notice, elapsed queue time, pause/continue/cancel controls. | +| Wrong error is queued | Auth/quota/model errors could wait forever. | Narrow classifier with fail-fast tests. 
| +| Deep Review starves the active session | Queued reviewers could resume when normal work needs capacity. | Preserve active-session warning and manual pause/continue. | +| Retry extends total review time | A slow model can make capacity waits expensive. | One short reattempt, max queue wait, aggregate diagnostics. | + +Verification: + +- Rust tests for queueable vs non-queueable provider errors. +- Rust tests for queue expiry, queue success, pause, cancel, and diagnostics counters. +- Frontend tests for provider queue notice, localized reason text, and queue-state updates. +- Existing `cargo test -p bitfun-core deep_review -- --nocapture`. + +Exit criteria: + +- Provider queue is visible and bounded. +- Queue time is timeout-separated. +- Non-transient errors fail fast. +- Final report remains honest when the provider queue expires. + +### Round 2: Explicit Retry Action And Bounded Auto-Retry Preference + +Status: Pending implementation. + +Goal: Give users a clear retry action for unresolved reviewer slices, while allowing future small automatic retries only after explicit opt-in. + +In scope: + +- Extract retryable unresolved packets from report metadata: + - source packet id; + - reviewer id; + - source status; + - covered files; + - unresolved files; + - retry timeout. +- Retryable sources: + - `partial_timeout`; + - transient `capacity_skipped`. +- Non-retryable sources: + - auth; + - quota/billing; + - invalid model; + - policy; + - invalid tooling; + - validation; + - cancellation; + - non-transient capacity skip. +- Add an explicit action-bar button for retrying unresolved slices. +- Add an explicit opt-in action: allow bounded automatic retries without asking again. +- Persist the opt-in through Review Team settings. +- Keep bounded automatic retry disabled by default. 
+- Auto retry may run only when: + - preference is enabled; + - source status is retryable; + - retry coverage is structured; + - retry scope is non-empty and smaller than the original scope; + - role/packet retry budget remains; + - elapsed guard remains; + - timeout is lower than the source task timeout. +- Stable suppression reasons: + - `preference_disabled`; + - `budget_exhausted`; + - `scope_not_reduced`; + - `elapsed_guard_exceeded`; + - `non_retryable_status`; + - `non_transient_error`; + - `missing_coverage`. + +Risks: + +| Risk | Why it matters | Mitigation | +|---|---|---| +| Retry loops forever | Token and time usage can explode. | Role budget, packet budget, smaller scope, elapsed guard, one-slice-at-a-time execution. | +| Retry repeats bad context | Broad retry can produce the same failure. | Structured coverage and reduced scope are mandatory. | +| User loses control | Hidden retry feels like unexpected automation. | Manual by default; opt-in is explicit and reversible in settings. | +| Report becomes noisy | Retry metadata can crowd findings. | Keep retry controls compact and show only unresolved status when needed. | + +Verification: + +- Frontend report parser tests for retryable and non-retryable slices. +- Store/action-bar tests for manual retry, disabled states, and opt-in action. +- DeepReviewService tests for retry launch metadata. +- Rust tests for retry admission, suppression reasons, and budget guards. +- Lint, type-check, focused web tests, and Rust Deep Review tests. + +Exit criteria: + +- Manual retry works for structured unresolved slices. +- Automatic retry is disabled by default. +- Opted-in automatic retry remains bounded and cannot loop indefinitely. + +### Round 3: Cost-Aware Review Scope + +Status: Pending implementation. + +Goal: Reduce review time and token use on large or slow-model changes by making quick/default strategies focus first on high-risk evidence, while keeping `deep` as the full-depth option. 
+ +Scope profile: + +```ts +type DeepReviewScopeProfile = { + reviewDepth: 'high_risk_only' | 'risk_expanded' | 'full_depth'; + riskFocusTags: string[]; + maxDependencyHops: number | 'policy_limited'; + optionalReviewerPolicy: 'risk_matched_only' | 'configured' | 'full'; + allowBroadToolExploration: boolean; + coverageExpectation: string; +}; +``` + +Strategy mapping: + +| Strategy | Review depth | Dependency context | Optional reviewers | Exploration | +|---|---|---|---|---| +| `quick` | `high_risk_only` | Changed files and direct high-risk contracts only | Risk-matched only | Broad exploration off | +| `normal` | `risk_expanded` | Changed files plus one-hop high-risk context | Configured but applicability-gated | Limited | +| `deep` | `full_depth` | Policy-limited broad context | Configured/full behavior | Allowed | + +High-risk categories that must stay in quick/default scope: + +- security; +- data loss; +- migrations; +- authentication/authorization; +- cross-boundary API contracts; +- concurrency; +- persistence; +- configuration changes; +- platform boundary violations. + +Risks: + +| Risk | Why it matters | Mitigation | +|---|---|---| +| Reduced depth misses low-risk regressions | Quick/default trades breadth for speed. | Label coverage honestly and offer a deeper path. | +| Judge overstates confidence | A high-risk-only pass is not full review. | Judge prompt and report metadata must preserve `reviewDepth` and `coverageExpectation`. | +| Optional reviewers disappear unexpectedly | Users may expect configured reviewers to run. | Use clear applicability/risk-match metadata and show skipped reasons. | +| Strategy becomes hidden override | Runtime might change user-selected intent. | Scope profile narrows depth, but does not secretly change selected strategy. | + +Verification: + +- Manifest tests for all three depth profiles. +- Tests that reduced-depth manifests preserve changed-file coverage metadata. +- Prompt updates for reviewers and judge. 
+- Report reliability tests for reduced-depth wording. +- Rust Deep Review tests to ensure report schema remains compatible. + +Exit criteria: + +- Quick is high-risk-only. +- Normal is risk-expanded. +- Deep remains full-depth. +- Reports do not claim full coverage for reduced-depth runs. + +### Round 4: Shared Evidence Pack + +Status: Pending implementation. + +Goal: Let reviewers start from compact shared facts so they spend less time and fewer tokens rediscovering the same files, hunks, and contract hints. + +Proposed manifest shape: + +```ts +type DeepReviewEvidencePack = { + changedFiles: string[]; + diffStat: { + fileCount: number; + totalChangedLines: number; + }; + domainTags: string[]; + riskFocusTags: string[]; + packetIds: string[]; + hunkHints: Array<{ + filePath: string; + changedLineCount: number; + }>; + contractHints: Array<{ + kind: 'i18n_key' | 'tauri_command' | 'api_contract' | 'config_key'; + value: string; + filePath: string; + }>; +}; +``` + +Rules: + +- Do not include full source text. +- Do not include full diff text. +- Do not include reviewer output. +- Do not include provider raw response bodies. +- Keep the evidence source-agnostic. +- Prefer metadata, hunk ranges, domain tags, risk tags, packet ids, and cheap contract hints. +- Use targeted `Read` and `GetFileDiff` calls only for confirmation or missing context. +- Defer programmatic cross-subagent `Read` output reuse until duplicate-tool diagnostics prove material repeated cost. + +Risks: + +| Risk | Why it matters | Mitigation | +|---|---|---| +| Evidence pack becomes a hidden context blob | It could recreate the token/privacy problem. | Metadata-first, content-free by default. | +| Hints become stale | Reviewers may rely on outdated metadata. | Derive pack once from the same manifest inputs and include source labels. | +| Tool-result reuse changes reviewer isolation | Shared full reads can leak or freeze context. | Defer full tool-result reuse; start with evidence only. 
| +| Contract extraction gets expensive | Heavy static analysis would hurt launch latency. | Use cheap extraction from already known changed files and names. | + +Verification: + +- Manifest tests for evidence pack structure. +- Tests proving no full source/diff content is stored in the pack. +- Prompt tests or snapshot-light assertions proving reviewers are instructed to start from evidence. +- Diagnostics comparison after real runs before any tool-result cache is designed. + +Exit criteria: + +- Reviewers receive compact shared evidence. +- No source/diff/model output is stored in diagnostics. +- Duplicate discovery should reduce without changing tool semantics. + +### Round 5: Documentation Reconciliation And Release Gate + +Status: Pending after the functional rounds above. + +Goal: Keep documents and code aligned after each functional close. + +Actions: + +- Update status wording only after verification passes. +- Mark provider queue as implemented only after visible bounded behavior is tested. +- Mark retry controls as implemented only after manual retry and opt-in guards exist. +- Keep project-level cache deferred. +- Keep programmatic shared context cache deferred unless measurements justify it. +- Keep hard prompt-byte clipping deferred. +- Scan for stale completion claims. + +Verification: + +```powershell +rg -n "project-level cache.*implemented|automatic retry.*complete|provider/adaptive queue.*complete|hard prompt.*complete|global.*concurrency.*automatic" docs/deep-review-design.md docs/deep-review-phase2-plan.md docs/deep-review-phase2-addendum.md docs/deep-review-phase3-followup-plan.md +cargo test -p bitfun-core deep_review -- --nocapture +cargo check --workspace --exclude bitfun-cli +pnpm run lint:web +pnpm run type-check:web +pnpm --dir src/web-ui run test:run +git diff --check +``` + +Exit criteria: + +- Docs distinguish implemented, guarded, prompt-guided, deferred, and pending behavior. +- No document claims exceed code behavior. 
+- No new user-facing string lacks locale coverage. +- No queue/retry/cache/token feature introduces hidden confusion or undocumented privacy risk. + +## Deferred Product Decisions + +### Project-Level Review Cache + +Status: Deferred by product decision. + +Do not implement until a separate plan defines: + +- retention duration; +- invalidation across file rename, model, strategy, roster, and prompt changes; +- deletion behavior; +- user visibility and management UI; +- whether reviewer outputs may be persisted outside session metadata; +- privacy review for source summaries and security findings. + +Current boundary: per-session cache only. + +### Programmatic Shared Tool-Result Cache + +Status: Deferred pending measured need. + +Current boundary: + +- prompt-level reuse guidance; +- content-free duplicate `Read`/`GetFileDiff` measurement; +- final aggregate diagnostics. + +Do not intercept and reuse full tool results until real-run measurements prove material duplicate cost and a separate semantics/privacy plan is approved. + +### Hard Prompt-Byte Clipping + +Status: Deferred. + +Current boundary: + +- heuristic prompt-byte estimate; +- summary-first full-scope metadata; +- file splitting/max-file guardrails. + +Do not hard-clip files from reviewer coverage unless every omitted or reduced file remains explicit in coverage/reliability metadata. + +### Backend DAG Scheduler + +Status: Deferred. + +Current boundary: + +- prompt-driven orchestration; +- TaskTool hard guardrails; +- local-cap queue controls; +- future short provider capacity queue. + +Do not replace the orchestrator with a deterministic backend workflow engine in the current plan. + +### Authoritative Runtime Strategy Selection + +Status: Deferred. + +Current boundary: + +- advisory/mismatch-warning metadata only. + +Do not let backend risk scoring override the user's selected strategy until measured complexity signals and product approval exist. 
+ +## Architecture Refactor Plan + +Status: Pending implementation. Behavior change allowed: none unless explicitly called out and approved. + +### Refactor Goals + +1. Move Deep Review-specific logic out of broad shared files where possible. +2. Separate generic subagent runtime primitives from Deep Review policy adapters. +3. Keep standard subagent behavior stable unless a change is explicitly reviewed as a product decision. +4. Reduce oversized files and repeated definitions. +5. Preserve existing Deep Review behavior during refactor rounds. +6. Keep dependencies acyclic and location choices predictable. +7. Make non-DeepReview impact explicit and testable. +8. Keep frontend and backend Deep Review boundaries clear. +9. Avoid new performance, quality, or security risks. + +### Current Refactor Pressure + +| Area | Current pressure | +|---|---| +| `src/crates/core/src/agentic/deep_review_policy.rs` | Contains role/team definition, manifest parsing, execution policy, concurrency, queue controls, effective cap learning, retry, diagnostics, shared-context measurement, cache, and tests. | +| `src/crates/core/src/agentic/tools/implementations/task_tool.rs` | Generic Task tool contains Deep Review capacity wait, retry admission, packet/cache lookup, provider capacity skip, queue event, and tests. | +| `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` | Standard Code Review and Deep Review report logic are mixed with packet fallback, reliability, diagnostics, and cache write-through. | +| `src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs` | Generic tool pipeline carries Deep Review context propagation and duplicate read/diff measurement. | +| `src/crates/events/src/agentic.rs` | Shared event crate contains Deep Review queue event payload. 
| +| `src/web-ui/src/shared/services/reviewTeamService.ts` | Config, backend definition, validation, strategy, risk, manifest, work packets, cache plan, token budget, and prompt block are in one large file. | +| `src/web-ui/src/flow_chat/services/DeepReviewService.ts` | Slash parsing, target resolution, stats, runtime signals, launch cleanup, and child-session launch are coupled. | +| `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` | Queue controls, recovery, remediation, diagnostics, and review actions are dense in one component path. | +| `src/web-ui/src/flow_chat/utils/codeReviewReport.ts` | Report normalization, reliability notices, manifest rendering, and markdown export are growing together. | + +### Target Backend Layout + +Create a Deep Review subsystem under core: + +```text +src/crates/core/src/agentic/deep_review/ + mod.rs + constants.rs + team_definition.rs + manifest.rs + execution_policy.rs + concurrency_policy.rs + queue.rs + retry.rs + diagnostics.rs + shared_context.rs + incremental_cache.rs + report.rs + task_adapter.rs + tests/ +``` + +Responsibilities: + +| Module | Responsibility | +|---|---| +| `constants.rs` | Agent type constants and role families. | +| `team_definition.rs` | Default review team definition and strategy profile data. | +| `manifest.rs` | Typed accessors for `deep_review_run_manifest`, packet lookup, strategy/concurrency/cache/token fields. | +| `execution_policy.rs` | Timeouts, file split thresholds, retry limit config, and risk helper. | +| `concurrency_policy.rs` | Configured cap and effective-cap calculations. | +| `queue.rs` | Queue state, queue controls, capacity classification, local/provider queue decisions. | +| `retry.rs` | Structured retry coverage validation, retry prompt block, retry budget helpers. | +| `diagnostics.rs` | Aggregate runtime diagnostics and low-frequency final logging data. | +| `shared_context.rs` | Duplicate `Read`/`GetFileDiff` measurement and future evidence metadata helpers. 
| +| `incremental_cache.rs` | Per-session packet cache model and serialization. | +| `report.rs` | Deep Review-specific reliability signals and packet metadata helpers for `CodeReviewTool`. | +| `task_adapter.rs` | Deep Review-specific TaskTool orchestration hooks. | + +`deep_review_policy.rs` should become a compatibility facade during migration, then shrink to re-exports or be removed after imports are updated. + +### Generic Subagent Runtime Boundary + +Introduce a generic runtime area only after the Deep Review extraction proves stable: + +```text +src/crates/core/src/agentic/subagent_runtime/ + mod.rs + capacity.rs + queue_state.rs + retry_admission.rs +``` + +Rules: + +- Do not move behavior here until it is proven generic. +- Generic modules must not import Deep Review modules. +- Deep Review adapters may import generic modules. +- Provider-capacity auto queueing must not become global subagent behavior in this refactor. + +Generic candidates: + +- capacity acquisition/release guard; +- queue state shape independent of Deep Review labels; +- timeout separation between queue wait and running time; +- bounded retry admission primitives. + +### Backend Tool Facades + +Keep public tool entrypoints stable: + +- `TaskTool` remains the registered Task tool. +- `CodeReviewTool` remains the registered report submission tool. 
+ +Move feature-specific logic behind adapters: + +```rust +let deep_review_context = deep_review::manifest::Context::from_tool_context(context); +deep_review::task_adapter::prepare_launch(...); +deep_review::retry::validate_retry(...); +deep_review::queue::wait_for_reviewer_capacity(...); +``` + +```rust +deep_review::report::fill_packet_metadata(...); +deep_review::report::fill_reliability_signals(...); +deep_review::incremental_cache::persist_completed_packets(...); +deep_review::diagnostics::log_final_snapshot(...); +``` + +Required guardrail: normal Task and standard Code Review behavior must remain unchanged when Deep Review context is absent. + +### Target Frontend Review-Team Layout + +Split `reviewTeamService.ts` into a directory with a compatibility facade: + +```text +src/web-ui/src/shared/services/review-team/ + index.ts + types.ts + defaults.ts + config.ts + backendDefinition.ts + strategy.ts + targetClassifier.ts + subagentCapabilities.ts + manifestBuilder.ts + workPackets.ts + tokenBudget.ts + risk.ts + promptBlock.ts + cachePlan.ts + preReviewSummary.ts +``` + +Keep this import path working: + +```text +src/web-ui/src/shared/services/reviewTeamService.ts +``` + +The old file should become a facade exporting from `./review-team`. + +Dependency rules: + +- `types.ts` must be dependency-light and should not import implementation modules. +- `config.ts` may import config APIs. +- `backendDefinition.ts` may import agent APIs. +- `manifestBuilder.ts` may import pure helpers. +- Pure helpers must not import `manifestBuilder.ts`. +- Flow Chat launch modules should import the facade unless a tighter boundary is justified. 
+ +### Target Flow Chat Deep Review Layout + +Split launch, action-bar, and report concerns: + +```text +src/web-ui/src/flow_chat/deep-review/ + launch/ + commandParser.ts + targetResolver.ts + launchPrompt.ts + launchSession.ts + launchErrors.ts + action-bar/ + CapacityQueueNotice.tsx + InterruptionRecoveryPanel.tsx + RemediationControls.tsx + ReviewActionHeader.tsx + report/ + reliabilityNotices.ts + manifestSections.ts + markdown.ts +``` + +Keep current public exports from: + +- `DeepReviewService.ts` +- `DeepReviewActionBar.tsx` +- `codeReviewReport.ts` + +### Refactor Execution Rounds + +#### Refactor Round 0: Baseline And Guardrails + +Actions: + +- Record current line counts for oversized files. +- Run focused Deep Review tests. +- Confirm non-DeepReview impact inventory is current. +- Do not change behavior. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/flow_chat/utils/codeReviewReport.test.ts` +- `cargo test -p bitfun-core deep_review -- --nocapture` + +#### Refactor Round 1: Backend Deep Review Module Extraction + +Actions: + +- Create `src/crates/core/src/agentic/deep_review/`. +- Move constants and team definitions first. +- Move execution policy and strategy helpers. +- Move concurrency, queue, diagnostics, shared context, retry, and cache one module at a time. +- Keep `deep_review_policy.rs` as compatibility facade. + +Verification: + +- Rust Deep Review tests. +- `rg -n "deep_review_policy::" src/crates/core/src` to confirm imports are intentional. + +Behavior change allowed: none. + +#### Refactor Round 2: TaskTool Adapter Extraction + +Actions: + +- Add `deep_review::task_adapter`. +- Move Deep Review context detection, packet id resolution, cache lookup, retry validation, retry prompt preparation, and queue/capacity calls behind the adapter. +- Add or preserve a non-DeepReview Task regression test. 
+ +Verification: + +- Deep Review TaskTool tests. +- Non-DeepReview Task test proving queue/retry/cache paths do not run without Deep Review context. + +Behavior change allowed: none. + +#### Refactor Round 3: CodeReviewTool Report Adapter + +Actions: + +- Add `deep_review::report`. +- Move packet metadata fallback, reliability signals, token budget notes, diagnostics logging, and incremental cache write-through. +- Add or preserve standard Code Review regression tests. + +Verification: + +- Deep Review report tests. +- Standard Code Review test proving Deep Review metadata is absent outside Deep Review. + +Behavior change allowed: none. + +#### Refactor Round 4: Event And Tool Pipeline Containment + +Actions: + +- Keep current Deep Review queue event contract stable. +- Move payload conversion helpers into Deep Review modules. +- Replace inline Deep Review context propagation in `tool_pipeline.rs` with a small hook/helper. +- Keep duplicate read/diff measurement Deep Review-gated. + +Verification: + +- Queue event serialization tests. +- Tool pipeline non-DeepReview regression tests. + +Behavior change allowed: none. + +Deferred behavior change: + +- Replacing `DeepReviewQueueStateChanged` with a generic `SubagentQueueStateChanged` event. + +#### Refactor Round 5: Frontend Review Team Decomposition + +Actions: + +- Create `src/web-ui/src/shared/services/review-team/`. +- Move types first. +- Move pure helpers next: strategy, risk, work packets, token budget, cache plan, pre-review summary. +- Move config persistence and backend definition loading separately. +- Keep `reviewTeamService.ts` as facade. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts` +- `pnpm run type-check:web` + +Behavior change allowed: none. + +#### Refactor Round 6: Flow Chat Deep Review Decomposition + +Actions: + +- Split command parsing, target resolution, manifest runtime signals, launch cleanup, and child-session launch. 
+- Split queue notice, interruption recovery, remediation controls, and review action layout. +- Split reliability notices, manifest markdown, and report normalization. + +Verification: + +- `pnpm --dir src/web-ui run test:run -- src/flow_chat/services/DeepReviewService.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/flow_chat/utils/codeReviewReport.test.ts` +- `pnpm run lint:web` +- `pnpm run type-check:web` + +Behavior change allowed: none. + +#### Refactor Round 7: Documentation, Comments, And Ownership Cleanup + +Actions: + +- Add module-level Rust docs where boundaries are not obvious. +- Add concise TypeScript headers only for facades and boundary modules. +- Remove duplicated constants and wording after extraction. +- Update docs only when file ownership changes. + +Verification: + +- `rg -n "TODO|TBD|temporary|copy of|duplicate" src/crates/core/src/agentic/deep_review src/web-ui/src/shared/services/review-team` +- Focused frontend and Rust Deep Review tests. + +Behavior change allowed: none. + +### Refactor Behavior-Change Checkpoints + +Stop and ask before doing any of the following: + +1. Applying Deep Review queue behavior to all subagents. +2. Making provider transient errors auto-queue for ordinary subagents. +3. Replacing Deep Review-specific queue events with generic subagent queue events. +4. Changing retry from structured model/user-issued retry to backend-owned automatic redispatch. +5. Making backend risk scoring authoritative over user/team strategy. +6. Persisting review cache outside session metadata. +7. Hard-clipping prompt bytes or hiding files from coverage metadata. +8. Changing quick/normal/deep semantics beyond the cost-aware plan. + +## Non-DeepReview Impact Requirements + +Future rounds must preserve these rules: + +1. Generic subagent runtime modules must not import Deep Review modules. +2. Deep Review adapters may import generic runtime modules. +3. 
Shared tools may call Deep Review adapters only after explicit context gating. +4. Standard Code Review must work without a Deep Review manifest. +5. Deep Review queue time must not become a global subagent timeout rule. +6. Provider capacity queueing must remain Deep Review-scoped unless product approves broader behavior. +7. Diagnostics must stay aggregate-only and content-free. + +Required regression tests: + +- Normal Task without `deep_review_run_manifest` does not apply Deep Review queue controls. +- Normal Task retry does not require Deep Review `retry_coverage`. +- Standard Code Review submission does not emit Deep Review packet/cache/queue metadata. +- Deep Review queue events serialize with the current stable shape. +- Tool pipeline duplicate-read measurement ignores non-DeepReview `Read` and `GetFileDiff` calls. +- Standard Code Review action bar renders without Deep Review queue controls. +- Deep Review queue controls render only for Deep Review state. +- Standard Code Review markdown export omits Deep Review manifest/cache sections. +- Review settings copy distinguishes Review Team max reviewers from global subagent concurrency. + +## Cross-Cutting UX And I18n Requirements + +- Every new user-facing string must be localized in `en-US`, `zh-CN`, and `zh-TW`. +- Prefer existing action bar and Review settings surfaces over new modals. +- Keep queue and retry notices compact. +- Do not show dense token/cost internals by default. +- Use default-collapsed details for reliability and coverage explanations. +- Preserve theme compatibility and compact layout behavior. +- Do not add visible text explaining implementation internals unless it helps the user make a decision. + +## Performance And Privacy Requirements + +- Diagnostics must be low-frequency. +- Runtime logs must be English-only and contain no emojis. +- Do not log or store source text, full diff text, reviewer output, provider raw body, or full file contents in diagnostics. 
+- Shared evidence must stay compact and metadata-first. +- Queue and retry automation must be bounded by settings and budgets. +- Large-change cost reduction must be transparent through coverage metadata. + +## Final Release Gate + +Run after any batch that completes pending functionality or refactor rounds: + +```powershell +cargo test -p bitfun-core deep_review -- --nocapture +cargo check --workspace --exclude bitfun-cli +pnpm run lint:web +pnpm run type-check:web +pnpm --dir src/web-ui run test:run +git diff --check +``` + +If backend, desktop API, or Tauri adapters are touched, also run the nearest desktop/backend verification required by `AGENTS.md`. + +## Stop Conditions + +Stop and re-review the design if: + +- a fix requires changing global `ai.subagent_max_concurrency` as the normal Deep Review recovery path; +- diagnostics need to store source, diff, reviewer output, provider raw body, or full file contents; +- provider queue needs more than one automatic reattempt per reviewer packet; +- auto retry needs a scope that is not smaller than the original packet; +- quick/default cost reduction would hide changed files instead of marking reduced-depth coverage; +- shared evidence reuse requires full `Read` output caching before duplicate-call diagnostics justify it; +- a UI change needs a new page or modal when the action bar or Review settings can carry the workflow; +- refactor work changes behavior outside Deep Review without a confirmed checkpoint. + +## Expected End State + +When all pending functional and refactor work is complete: + +- Provider transient queue is short, visible, bounded, and controllable. +- Retry defaults to explicit user action, and opted-in automatic retry is bounded. +- Quick/default reviews reduce time and token cost by focusing on risk, while `deep` remains full-depth. +- Reviewers start from a compact shared evidence pack. 
+- Project-level cache, full tool-result reuse, hard prompt clipping, and DAG scheduling remain deferred unless separately approved. +- Deep Review backend logic lives in a dedicated module tree. +- Shared TaskTool, CodeReviewTool, tool pipeline, and event code contain only thin Deep Review hooks. +- Frontend review-team and Flow Chat Deep Review code are split by responsibility with stable facades. +- Non-DeepReview behavior is covered by focused regression tests. +- Documentation and code status remain aligned. diff --git a/docs/deep-review-pending-plan.zh-CN.md b/docs/deep-review-pending-plan.zh-CN.md new file mode 100644 index 000000000..f9bb9264e --- /dev/null +++ b/docs/deep-review-pending-plan.zh-CN.md @@ -0,0 +1,811 @@ +# Deep Review 待完成计划与架构重构 + +## 目的 + +本文档把所有 Deep Review 待完成和延期事项合并为一份可执行计划,并纳入新的架构重构设计。除非具体产品 checkpoint 明确批准,后续实现不得改变当前行为。 + +已完成行为基线见 `docs/deep-review-completed-status.zh-CN.md`。 + +## 来源文档 + +本计划合并以下文档中的待完成内容: + +- `docs/deep-review-design.md` +- `docs/deep-review-phase2-plan.md` +- `docs/deep-review-phase2-addendum.md` +- `docs/deep-review-phase3-followup-plan.md` +- `docs/deep-review-architecture-refactor-plan.md` +- `docs/deep-review-nondeepreview-impact-inventory.md` +- `docs/superpowers/plans/2026-05-09-deep-review-phase3-execution-plan.md` +- `docs/superpowers/specs/` 和 `docs/superpowers/plans/` 下的本地伴随 Deep Review 设计副本。 + +## 产品与架构边界 + +未来实现必须保持在以下边界内: + +1. Deep Review 仍是 Prompt 驱动,并由确定性运行时防护补强。未经单独设计批准,不替换为后端 DAG scheduler。 +2. Deep Review queueing 不能意外变成全局 subagent queueing。 +3. Deep Review 不能静默消耗正常用户 session 的并发。如果 session 已繁忙,应提示、暂停或要求用户手动继续。 +4. 队列等待时间不能算入 reviewer runtime timeout。 +5. Provider capacity queueing 必须短、可见、有界、可暂停、可取消。 +6. 自动 retry 默认保持手动;只有用户显式 opt-in 后,才允许小范围有界自动 retry。 +7. 自动 retry 绝不能形成无限循环。 +8. Quick/default 审核可以降低广度,但不能从 coverage metadata 中隐藏变更文件。 +9. Diagnostics 必须低频且不含内容。 +10. Project-level review cache 在 retention、deletion、invalidation、user visibility 规则确认前保持延期。 +11. 
重构轮次必须保持现有行为,除非具体 behavior-change checkpoint 获得批准。 + +## 剩余功能计划 + +### Round 1:短 Provider Capacity Queue + +状态:待实现。 + +目标:当 provider 返回窄分类的 transient capacity error 时,Deep Review 应短暂等待并重试一次,然后再报告 `capacity_skipped`。 + +范围: + +- 仅以下错误可 queue: + - provider rate limit; + - provider concurrency limit; + - 显式 `Retry-After`; + - temporary overload/capacity pressure。 +- 以下错误必须快速失败: + - authentication; + - billing/quota exhaustion; + - invalid model; + - policy violation; + - user cancellation; + - invalid reviewer tooling; + - deterministic validation errors。 +- 等待时间受 `min(Retry-After, max_queue_wait_seconds)` 限制。 +- 如果用户未 pause/cancel,等待后只 reattempt reviewer 一次。 +- 使用 provider-specific reason 发送既有 queue-state event。 +- 记录 aggregate diagnostics counters: + - provider queue count; + - provider retry count; + - provider retry success count; + - final capacity skip count。 +- 队列等待时间与 reviewer runtime timeout 分离。 +- 复用紧凑 action-bar queue notice 和控制。 + +风险: + +| 风险 | 影响 | 缓解 | +|---|---|---| +| Provider wait 看起来像卡住 | 用户可能以为审核冻结。 | 显示队列提示、elapsed queue time、pause/continue/cancel 控制。 | +| 错误分类过宽 | Auth/quota/model 错误可能无意义等待。 | 窄分类器和 fail-fast 测试。 | +| Deep Review 挤占活跃 session | Queue 恢复时可能抢占正常工作 capacity。 | 保留 active-session warning 和手动 pause/continue。 | +| Retry 拉长整体审核 | 慢模型下 capacity wait 成本变高。 | 只短暂 reattempt 一次,受 max queue wait 和 diagnostics 约束。 | + +验证: + +- Rust 测试覆盖可 queue 与不可 queue provider error。 +- Rust 测试覆盖 queue expiry、queue success、pause、cancel、diagnostics counters。 +- 前端测试覆盖 provider queue notice、本地化 reason text、queue-state update。 +- 既有 `cargo test -p bitfun-core deep_review -- --nocapture`。 + +退出标准: + +- Provider queue 可见且有界。 +- Queue time 与 timeout 分离。 +- 非 transient error 快速失败。 +- Provider queue 过期时最终报告保持诚实。 + +### Round 2:显式 Retry Action 与有界自动 Retry 偏好 + +状态:待实现。 + +目标:为 unresolved reviewer slice 提供清晰 retry action,同时只有用户显式 opt-in 后才允许小范围自动 retry。 + +范围: + +- 从 report metadata 中提取 retryable unresolved packets: + - source packet id; + - reviewer id; + - source 
status; + - covered files; + - unresolved files; + - retry timeout。 +- 可 retry source: + - `partial_timeout`; + - transient `capacity_skipped`。 +- 不可 retry source: + - auth; + - quota/billing; + - invalid model; + - policy; + - invalid tooling; + - validation; + - cancellation; + - non-transient capacity skip。 +- 在 action bar 添加显式 retry unresolved slice 按钮。 +- 添加显式 opt-in 操作:之后允许 bounded automatic retries。 +- 通过 Review Team settings 持久化 opt-in。 +- Bounded automatic retry 默认关闭。 +- Auto retry 仅在以下条件全部满足时运行: + - preference enabled; + - source status 可 retry; + - retry coverage 是结构化的; + - retry scope 非空且小于原 scope; + - role/packet retry budget 仍可用; + - elapsed guard 仍可用; + - timeout 小于 source task timeout。 +- 稳定 suppression reasons: + - `preference_disabled`; + - `budget_exhausted`; + - `scope_not_reduced`; + - `elapsed_guard_exceeded`; + - `non_retryable_status`; + - `non_transient_error`; + - `missing_coverage`。 + +风险: + +| 风险 | 影响 | 缓解 | +|---|---|---| +| Retry 无限循环 | Token 和时间成本暴涨。 | Role budget、packet budget、scope 缩小、elapsed guard、一次只处理一个 slice。 | +| Retry 重复坏上下文 | 大范围 retry 可能重复失败。 | 必须有结构化 coverage 且 scope 缩小。 | +| 用户失去控制 | 隐式 retry 会像不可预期自动化。 | 默认手动;opt-in 明确且可在 settings 关闭。 | +| 报告变嘈杂 | Retry metadata 可能挤占 findings。 | Retry 控件保持紧凑,只在必要时展示 unresolved status。 | + +验证: + +- 前端 report parser 测试 retryable 和 non-retryable slices。 +- Store/action-bar 测试 manual retry、disabled state、opt-in action。 +- DeepReviewService 测试 retry launch metadata。 +- Rust 测试 retry admission、suppression reasons、budget guards。 +- Lint、type-check、聚焦 web tests、Rust Deep Review tests。 + +退出标准: + +- Manual retry 可用于结构化 unresolved slices。 +- Automatic retry 默认关闭。 +- Opt-in 后的 automatic retry 有界且不会循环。 + +### Round 3:成本感知审核范围 + +状态:待实现。 + +目标:在大变更或慢模型场景下降低审核时间和 Token 占用,让 quick/default 优先关注高风险证据,并保持 `deep` 为 full-depth 选项。 + +Scope profile: + +```ts +type DeepReviewScopeProfile = { + reviewDepth: 'high_risk_only' | 'risk_expanded' | 'full_depth'; + riskFocusTags: string[]; + maxDependencyHops: 
number | 'policy_limited'; + optionalReviewerPolicy: 'risk_matched_only' | 'configured' | 'full'; + allowBroadToolExploration: boolean; + coverageExpectation: string; +}; +``` + +策略映射: + +| 策略 | 审核深度 | 依赖上下文 | Optional reviewers | 探索范围 | +|---|---|---|---|---| +| `quick` | `high_risk_only` | 变更文件和直接高风险契约 | 仅 risk-matched | 关闭 broad exploration | +| `normal` | `risk_expanded` | 变更文件加一跳高风险上下文 | 已配置但仍受 applicability gate | 有限 | +| `deep` | `full_depth` | 策略限制内的广泛上下文 | configured/full 行为 | 允许 | + +Quick/default 中仍必须覆盖的高风险类别: + +- security; +- data loss; +- migrations; +- authentication/authorization; +- cross-boundary API contracts; +- concurrency; +- persistence; +- configuration changes; +- platform boundary violations。 + +风险: + +| 风险 | 影响 | 缓解 | +|---|---|---| +| 降低深度导致漏掉低风险回归 | Quick/default 用广度换速度。 | 明确标记 coverage,并提供加深路径。 | +| Judge 过度信任 | high-risk-only pass 不是 full review。 | Judge prompt 和 report metadata 必须保留 `reviewDepth` 与 `coverageExpectation`。 | +| Optional reviewers 看似意外消失 | 用户可能期待配置的 reviewer 都运行。 | 显示 applicability/risk-match metadata 和 skipped reasons。 | +| 策略变成隐藏 override | 运行时可能改变用户意图。 | Scope profile 只收窄深度,不秘密改变选中策略。 | + +验证: + +- Manifest tests 覆盖三种 depth profile。 +- 测试 reduced-depth manifest 仍保留 changed-file coverage metadata。 +- 更新 reviewer 和 judge prompts。 +- Report reliability tests 覆盖 reduced-depth wording。 +- Rust Deep Review tests 确认 report schema 兼容。 + +退出标准: + +- Quick 是 high-risk-only。 +- Normal 是 risk-expanded。 +- Deep 仍是 full-depth。 +- 报告不会把 reduced-depth run 声称为 full coverage。 + +### Round 4:Shared Evidence Pack + +状态:待实现。 + +目标:让 reviewers 从紧凑共享事实开始,减少重复发现相同文件、hunk、contract hint 所消耗的时间和 Token。 + +建议 manifest 形态: + +```ts +type DeepReviewEvidencePack = { + changedFiles: string[]; + diffStat: { + fileCount: number; + totalChangedLines: number; + }; + domainTags: string[]; + riskFocusTags: string[]; + packetIds: string[]; + hunkHints: Array<{ + filePath: string; + changedLineCount: number; + }>; + contractHints: Array<{ + kind: 
'i18n_key' | 'tauri_command' | 'api_contract' | 'config_key'; + value: string; + filePath: string; + }>; +}; +``` + +规则: + +- 不包含完整源码。 +- 不包含完整 diff。 +- 不包含 reviewer output。 +- 不包含 provider raw response body。 +- 保持 source-agnostic。 +- 优先使用 metadata、hunk ranges、domain tags、risk tags、packet ids、cheap contract hints。 +- `Read` 和 `GetFileDiff` 只用于确认或补缺上下文。 +- Programmatic cross-subagent `Read` output reuse 在 duplicate-tool diagnostics 证明成本较高前继续延期。 + +风险: + +| 风险 | 影响 | 缓解 | +|---|---|---| +| Evidence pack 变成隐藏大上下文 | 会复现 Token/隐私问题。 | 默认 metadata-first 且不含内容。 | +| Hints 过期 | Reviewer 可能依赖旧元数据。 | 从同一 manifest 输入中一次性派生,并包含 source labels。 | +| Tool-result reuse 改变 reviewer 隔离性 | 共享 full reads 可能泄漏或冻结上下文。 | 先只做 evidence,full tool-result reuse 延期。 | +| Contract extraction 过重 | 重静态分析会拖慢启动。 | 仅从已有 changed files/name 做廉价提取。 | + +验证: + +- Manifest tests 覆盖 evidence pack structure。 +- 测试 pack 不存储完整源码或 diff。 +- Prompt tests 或轻量断言证明 reviewer 会先使用 evidence。 +- 在设计 tool-result cache 前用真实运行 diagnostics 做比较。 + +退出标准: + +- Reviewer 收到紧凑 shared evidence。 +- Diagnostics 中不存储 source/diff/model output。 +- 减少重复发现,但不改变工具语义。 + +### Round 5:文档对齐与 Release Gate + +状态:在上述功能轮次后执行。 + +目标:每个功能闭环后保持文档与代码一致。 + +动作: + +- 只有验证通过后才更新状态措辞。 +- Provider queue 只有在可见有界行为验证后才能标记为已实现。 +- Retry controls 只有在 manual retry 和 opt-in guard 存在后才能标记为已实现。 +- Project-level cache 保持延期。 +- Programmatic shared context cache 在测量证明必要前保持延期。 +- Hard prompt-byte clipping 保持延期。 +- 扫描过期完成声明。 + +验证: + +```powershell +rg -n "project-level cache.*implemented|automatic retry.*complete|provider/adaptive queue.*complete|hard prompt.*complete|global.*concurrency.*automatic" docs/deep-review-design.md docs/deep-review-phase2-plan.md docs/deep-review-phase2-addendum.md docs/deep-review-phase3-followup-plan.md +cargo test -p bitfun-core deep_review -- --nocapture +cargo check --workspace --exclude bitfun-cli +pnpm run lint:web +pnpm run type-check:web +pnpm --dir src/web-ui run test:run +git diff --check +``` + +退出标准: + +- 
文档能区分已实现、带防护实现、Prompt 引导、延期和待实现行为。 +- 文档声明不超过代码事实。 +- 新用户可见字符串都有 locale 覆盖。 +- Queue/retry/cache/token 功能没有引入隐藏困惑或未记录隐私风险。 + +## 延期产品决策 + +### Project-Level Review Cache + +状态:产品决策延期。 + +实现前必须有单独计划定义: + +- retention duration; +- 文件 rename、model、strategy、roster、prompt 变化下的 invalidation; +- deletion behavior; +- 用户可见性和管理 UI; +- reviewer outputs 是否可以持久化到 session metadata 之外; +- 对 source summaries 和 security findings 的隐私评审。 + +当前边界:仅 per-session cache。 + +### Programmatic Shared Tool-Result Cache + +状态:等待真实测量证明必要。 + +当前边界: + +- prompt-level reuse guidance; +- 不含内容的重复 `Read`/`GetFileDiff` 测量; +- 最终 aggregate diagnostics。 + +在真实运行证明重复成本较高且单独语义/隐私计划获批前,不拦截和复用完整 tool result。 + +### Hard Prompt-Byte Clipping + +状态:延期。 + +当前边界: + +- heuristic prompt-byte estimate; +- summary-first full-scope metadata; +- file splitting/max-file guardrails。 + +除非每个被省略或降级的文件都明确出现在 coverage/reliability metadata 中,否则不得 hard-clip reviewer coverage。 + +### Backend DAG Scheduler + +状态:延期。 + +当前边界: + +- prompt-driven orchestration; +- TaskTool hard guardrails; +- local-cap queue controls; +- future short provider capacity queue。 + +当前计划中不把 orchestrator 替换为确定性后端 workflow engine。 + +### 权威运行时策略选择 + +状态:延期。 + +当前边界: + +- 仅 advisory/mismatch-warning metadata。 + +在 measured complexity signal 和产品批准前,后端风险评分不得覆盖用户选择策略。 + +## 架构重构计划 + +状态:待实现。允许行为变化:无,除非明确列出并获得批准。 + +### 重构目标 + +1. 尽可能把 Deep Review 特定逻辑从宽泛共享文件中迁出。 +2. 分离通用 subagent runtime primitive 与 Deep Review policy adapter。 +3. 除非作为产品决策评审,否则保持标准 subagent 行为稳定。 +4. 减少超大文件和重复定义。 +5. 在重构轮次中保持现有 Deep Review 行为。 +6. 保持依赖无环且位置选择可预测。 +7. 明确并测试所有非 DeepReview 影响。 +8. 保持前后端 Deep Review 边界清晰。 +9. 
避免引入新的性能、质量或安全风险。 + +### 当前重构压力 + +| 区域 | 当前压力 | +|---|---| +| `src/crates/core/src/agentic/deep_review_policy.rs` | 角色/团队定义、manifest 解析、execution policy、并发、queue controls、effective cap learning、retry、diagnostics、shared-context measurement、cache 和 tests 都在同一文件。 | +| `src/crates/core/src/agentic/tools/implementations/task_tool.rs` | 通用 Task tool 中混入 Deep Review capacity wait、retry admission、packet/cache lookup、provider capacity skip、queue event 和 tests。 | +| `src/crates/core/src/agentic/tools/implementations/code_review_tool.rs` | 标准 Code Review 与 Deep Review report 逻辑混合,包括 packet fallback、reliability、diagnostics、cache write-through。 | +| `src/crates/core/src/agentic/tools/pipeline/tool_pipeline.rs` | 通用 tool pipeline 携带 Deep Review context propagation 和 duplicate read/diff measurement。 | +| `src/crates/events/src/agentic.rs` | 共享 event crate 中包含 Deep Review queue event payload。 | +| `src/web-ui/src/shared/services/reviewTeamService.ts` | Config、backend definition、validation、strategy、risk、manifest、work packets、cache plan、token budget、prompt block 都在一个大文件。 | +| `src/web-ui/src/flow_chat/services/DeepReviewService.ts` | Slash parsing、target resolution、stats、runtime signals、launch cleanup、child-session launch 耦合。 | +| `src/web-ui/src/flow_chat/components/btw/DeepReviewActionBar.tsx` | Queue controls、recovery、remediation、diagnostics、review actions 集中且密集。 | +| `src/web-ui/src/flow_chat/utils/codeReviewReport.ts` | Report normalization、reliability notices、manifest rendering、markdown export 一起增长。 | + +### 目标后端结构 + +在 core 中创建 Deep Review 子系统: + +```text +src/crates/core/src/agentic/deep_review/ + mod.rs + constants.rs + team_definition.rs + manifest.rs + execution_policy.rs + concurrency_policy.rs + queue.rs + retry.rs + diagnostics.rs + shared_context.rs + incremental_cache.rs + report.rs + task_adapter.rs + tests/ +``` + +职责: + +| 模块 | 职责 | +|---|---| +| `constants.rs` | Agent type constants 和 role families。 | +| `team_definition.rs` | Default review team definition 和 
strategy profile data。 | +| `manifest.rs` | `deep_review_run_manifest`、packet lookup、strategy/concurrency/cache/token fields 的 typed accessors。 | +| `execution_policy.rs` | Timeouts、file split thresholds、retry limit config、risk helper。 | +| `concurrency_policy.rs` | Configured cap 与 effective-cap calculations。 | +| `queue.rs` | Queue state、queue controls、capacity classification、local/provider queue decisions。 | +| `retry.rs` | Structured retry coverage validation、retry prompt block、retry budget helpers。 | +| `diagnostics.rs` | Aggregate runtime diagnostics 和低频 final logging data。 | +| `shared_context.rs` | Duplicate `Read`/`GetFileDiff` measurement 和未来 evidence metadata helpers。 | +| `incremental_cache.rs` | Per-session packet cache model 和 serialization。 | +| `report.rs` | `CodeReviewTool` 使用的 Deep Review-specific reliability signals 和 packet metadata helpers。 | +| `task_adapter.rs` | Deep Review-specific TaskTool orchestration hooks。 | + +`deep_review_policy.rs` 在迁移期间应变成 compatibility facade,导入更新完成后再缩小为 re-export 或移除。 + +### 通用 Subagent Runtime 边界 + +只有在 Deep Review 抽取稳定后,才引入通用 runtime 区域: + +```text +src/crates/core/src/agentic/subagent_runtime/ + mod.rs + capacity.rs + queue_state.rs + retry_admission.rs +``` + +规则: + +- 未证明通用前,不把行为移动到这里。 +- 通用模块不得 import Deep Review 模块。 +- Deep Review adapter 可以 import 通用模块。 +- Provider-capacity auto queueing 不得在本重构中变成全局 subagent 行为。 + +可通用候选: + +- capacity acquisition/release guard; +- 与 Deep Review label 无关的 queue state shape; +- queue wait 与 running time 的 timeout separation; +- bounded retry admission primitives。 + +### 后端工具 Facade + +保持公开 tool entrypoint 稳定: + +- `TaskTool` 仍是注册的 Task tool。 +- `CodeReviewTool` 仍是注册的 report submission tool。 + +把功能特定逻辑移动到 adapter 后: + +```rust +let deep_review_context = deep_review::manifest::Context::from_tool_context(context); +deep_review::task_adapter::prepare_launch(...); +deep_review::retry::validate_retry(...); +deep_review::queue::wait_for_reviewer_capacity(...); +``` + +```rust 
+deep_review::report::fill_packet_metadata(...); +deep_review::report::fill_reliability_signals(...); +deep_review::incremental_cache::persist_completed_packets(...); +deep_review::diagnostics::log_final_snapshot(...); +``` + +必要防护:没有 Deep Review context 时,普通 Task 和标准 Code Review 行为必须不变。 + +### 目标前端 Review-Team 结构 + +把 `reviewTeamService.ts` 拆为目录,并保留 compatibility facade: + +```text +src/web-ui/src/shared/services/review-team/ + index.ts + types.ts + defaults.ts + config.ts + backendDefinition.ts + strategy.ts + targetClassifier.ts + subagentCapabilities.ts + manifestBuilder.ts + workPackets.ts + tokenBudget.ts + risk.ts + promptBlock.ts + cachePlan.ts + preReviewSummary.ts +``` + +保持以下 import path 可用: + +```text +src/web-ui/src/shared/services/reviewTeamService.ts +``` + +旧文件应变成从 `./review-team` 导出的 facade。 + +依赖规则: + +- `types.ts` 依赖应很轻,不 import 实现模块。 +- `config.ts` 可 import config APIs。 +- `backendDefinition.ts` 可 import agent APIs。 +- `manifestBuilder.ts` 可 import pure helpers。 +- Pure helpers 不得 import `manifestBuilder.ts`。 +- Flow Chat launch modules 应优先 import facade,除非有清晰边界理由。 + +### 目标 Flow Chat Deep Review 结构 + +拆分 launch、action-bar、report 职责: + +```text +src/web-ui/src/flow_chat/deep-review/ + launch/ + commandParser.ts + targetResolver.ts + launchPrompt.ts + launchSession.ts + launchErrors.ts + action-bar/ + CapacityQueueNotice.tsx + InterruptionRecoveryPanel.tsx + RemediationControls.tsx + ReviewActionHeader.tsx + report/ + reliabilityNotices.ts + manifestSections.ts + markdown.ts +``` + +继续保留当前 public exports: + +- `DeepReviewService.ts` +- `DeepReviewActionBar.tsx` +- `codeReviewReport.ts` + +### 重构执行轮次 + +#### Refactor Round 0:Baseline 与 Guardrails + +动作: + +- 记录当前超大文件行数。 +- 运行聚焦 Deep Review tests。 +- 确认 non-DeepReview impact inventory 当前有效。 +- 不改变行为。 + +验证: + +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/flow_chat/utils/codeReviewReport.test.ts` 
+- `cargo test -p bitfun-core deep_review -- --nocapture` + +#### Refactor Round 1:后端 Deep Review 模块抽取 + +动作: + +- 创建 `src/crates/core/src/agentic/deep_review/`。 +- 先移动 constants 和 team definitions。 +- 移动 execution policy 和 strategy helpers。 +- 逐个移动 concurrency、queue、diagnostics、shared context、retry、cache。 +- `deep_review_policy.rs` 保持 compatibility facade。 + +验证: + +- Rust Deep Review tests。 +- `rg -n "deep_review_policy::" src/crates/core/src` 确认 import 都是有意保留。 + +允许行为变化:无。 + +#### Refactor Round 2:TaskTool Adapter 抽取 + +动作: + +- 添加 `deep_review::task_adapter`。 +- 把 Deep Review context detection、packet id resolution、cache lookup、retry validation、retry prompt preparation、queue/capacity calls 移入 adapter。 +- 添加或保留非 DeepReview Task 回归测试。 + +验证: + +- Deep Review TaskTool tests。 +- 非 DeepReview Task 测试,证明没有 Deep Review context 时不会进入 queue/retry/cache 路径。 + +允许行为变化:无。 + +#### Refactor Round 3:CodeReviewTool Report Adapter + +动作: + +- 添加 `deep_review::report`。 +- 移动 packet metadata fallback、reliability signals、token budget notes、diagnostics logging、incremental cache write-through。 +- 添加或保留标准 Code Review 回归测试。 + +验证: + +- Deep Review report tests。 +- 标准 Code Review 测试,证明 Deep Review metadata 不会出现在 Deep Review 外。 + +允许行为变化:无。 + +#### Refactor Round 4:Event 与 Tool Pipeline 收敛 + +动作: + +- 保持当前 Deep Review queue event contract 稳定。 +- 把 payload conversion helpers 移入 Deep Review modules。 +- 用小 hook/helper 替代 `tool_pipeline.rs` 中内联 Deep Review context propagation。 +- 保持 duplicate read/diff measurement 受 Deep Review gate 保护。 + +验证: + +- Queue event serialization tests。 +- Tool pipeline 非 DeepReview 回归测试。 + +允许行为变化:无。 + +延期行为变化: + +- 把 `DeepReviewQueueStateChanged` 替换为通用 `SubagentQueueStateChanged` event。 + +#### Refactor Round 5:前端 Review Team 拆分 + +动作: + +- 创建 `src/web-ui/src/shared/services/review-team/`。 +- 先移动 types。 +- 再移动 pure helpers:strategy、risk、work packets、token budget、cache plan、pre-review summary。 +- 分别移动 config persistence 和 backend definition loading。 +- 
`reviewTeamService.ts` 保持 facade。 + +验证: + +- `pnpm --dir src/web-ui run test:run -- src/shared/services/reviewTeamService.test.ts` +- `pnpm run type-check:web` + +允许行为变化:无。 + +#### Refactor Round 6:Flow Chat Deep Review 拆分 + +动作: + +- 拆分 command parsing、target resolution、manifest runtime signals、launch cleanup、child-session launch。 +- 拆分 queue notice、interruption recovery、remediation controls、review action layout。 +- 拆分 reliability notices、manifest markdown、report normalization。 + +验证: + +- `pnpm --dir src/web-ui run test:run -- src/flow_chat/services/DeepReviewService.test.ts src/flow_chat/components/btw/DeepReviewActionBar.test.tsx src/flow_chat/utils/codeReviewReport.test.ts` +- `pnpm run lint:web` +- `pnpm run type-check:web` + +允许行为变化:无。 + +#### Refactor Round 7:文档、注释与 Ownership 清理 + +动作: + +- 为边界不明显的新 Rust `deep_review` 模块添加 module-level docs。 +- 只为 facade 和边界模块添加简洁 TypeScript file header。 +- 抽取后移除重复 constants 和状态措辞。 +- 仅当文件 ownership 变化时更新文档。 + +验证: + +- `rg -n "TODO|TBD|temporary|copy of|duplicate" src/crates/core/src/agentic/deep_review src/web-ui/src/shared/services/review-team` +- 聚焦前端和 Rust Deep Review tests。 + +允许行为变化:无。 + +### 重构行为变化 Checkpoints + +出现以下情况必须停止并询问: + +1. 把 Deep Review queue 行为应用到所有 subagents。 +2. 让普通 subagents 的 provider transient errors 自动 queue。 +3. 用通用 subagent queue event 替换 Deep Review-specific queue event。 +4. 把 retry 从结构化 model/user-issued retry 改为后端自动 redispatch。 +5. 让后端风险评分覆盖 user/team strategy。 +6. 把 review cache 持久化到 session metadata 之外。 +7. Hard-clipping prompt bytes 或从 coverage metadata 隐藏文件。 +8. 超出 cost-aware plan 改变 quick/normal/deep 语义。 + +## 非 DeepReview 影响要求 + +后续轮次必须保持以下规则: + +1. 通用 subagent runtime modules 不得 import Deep Review modules。 +2. Deep Review adapters 可以 import 通用 runtime modules。 +3. Shared tools 只有在显式 context gate 后才能调用 Deep Review adapters。 +4. 标准 Code Review 必须在没有 Deep Review manifest 时继续工作。 +5. Deep Review queue time 不能变成全局 subagent timeout rule。 +6. 
Provider capacity queueing 在产品批准前保持 Deep Review-scoped。 +7. Diagnostics 必须 aggregate-only 且不含内容。 + +必须保留或新增的回归测试: + +- 没有 `deep_review_run_manifest` 的普通 Task 不应用 Deep Review queue controls。 +- 普通 Task retry 不要求 Deep Review `retry_coverage`。 +- 标准 Code Review submission 不发出 Deep Review packet/cache/queue metadata。 +- Deep Review queue events 按当前稳定形态序列化。 +- Tool pipeline duplicate-read measurement 忽略非 DeepReview `Read` 和 `GetFileDiff` 调用。 +- 标准 Code Review action bar 不渲染 Deep Review queue controls。 +- Deep Review queue controls 只在 Deep Review 状态下渲染。 +- 标准 Code Review markdown export 不包含 Deep Review manifest/cache sections。 +- Review settings 文案区分 Review Team max reviewers 与 global subagent concurrency。 + +## 跨领域 UX 与国际化要求 + +- 每个新用户可见字符串必须覆盖 `en-US`、`zh-CN`、`zh-TW`。 +- 优先使用既有 action bar 和 Review settings,不新增不必要 modal。 +- Queue 和 retry notice 保持紧凑。 +- 默认不展示密集 Token/cost 内部细节。 +- Reliability 与 coverage 解释使用默认折叠细节。 +- 保持主题兼容和紧凑布局稳定。 +- 除非能帮助用户做决策,不增加解释实现内部逻辑的可见文本。 + +## 性能与隐私要求 + +- Diagnostics 必须低频。 +- Runtime logs 必须为英文且无 emoji。 +- Diagnostics 不得记录或存储源码、完整 diff、reviewer output、provider raw body 或完整文件内容。 +- Shared evidence 必须紧凑且 metadata-first。 +- Queue 和 retry 自动化必须受 settings 与 budgets 约束。 +- 大变更成本削减必须通过 coverage metadata 保持透明。 + +## 最终 Release Gate + +完成任一待实现功能批次或重构批次后运行: + +```powershell +cargo test -p bitfun-core deep_review -- --nocapture +cargo check --workspace --exclude bitfun-cli +pnpm run lint:web +pnpm run type-check:web +pnpm --dir src/web-ui run test:run +git diff --check +``` + +如果触及 backend、desktop API 或 Tauri adapters,还要按 `AGENTS.md` 运行相邻 desktop/backend 验证。 + +## 停止条件 + +出现以下情况必须停止并重新评审设计: + +- 修复需要把全局 `ai.subagent_max_concurrency` 作为 Deep Review 常规恢复路径; +- diagnostics 需要存储源码、diff、reviewer output、provider raw body 或完整文件内容; +- provider queue 需要对同一 reviewer packet 自动 reattempt 超过一次; +- auto retry 需要不小于原 packet 的 scope; +- quick/default 成本削减会隐藏 changed files,而不是标记 reduced-depth coverage; +- duplicate-call diagnostics 证明必要前,就需要共享完整 `Read` output cache; +- 
UI 变化需要新增页面或 modal,而 action bar/Review settings 足以承载; +- 重构影响 Deep Review 之外行为,但未经过确认 checkpoint。 + +## 预期最终状态 + +全部待完成功能和重构结束后: + +- Provider transient queue 短、可见、有界、可控制。 +- Retry 默认由用户显式触发,opt-in 后的自动 retry 仍有界。 +- Quick/default 通过风险聚焦降低时间和 Token 成本,`deep` 保持 full-depth。 +- Reviewers 从紧凑 shared evidence pack 开始。 +- Project-level cache、完整 tool-result reuse、hard prompt clipping、DAG scheduling 仍保持延期,除非单独批准。 +- Deep Review 后端逻辑位于独立模块树。 +- Shared TaskTool、CodeReviewTool、tool pipeline、event code 只包含薄 Deep Review hooks。 +- 前端 review-team 和 Flow Chat Deep Review 代码按职责拆分,并保留稳定 facade。 +- 非 DeepReview 行为有聚焦回归测试覆盖。 +- 文档和代码状态保持一致。