From 2bba6569ffd5fc6d94945effd5b57fe3015d3ef3 Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Wed, 8 Apr 2026 12:25:33 -0400
Subject: [PATCH 01/18] feat(conductor): GitButler virtual branch mode for
 workspace management

---
 .husky/post-checkout                         |  72 +++++++++
 .husky/pre-commit                            |  52 ++++--
 .husky/pre-commit-user                       |  13 ++
 scripts/dspy/optimized_state.json            |   4 +-
 scripts/gepa/generations/gen-000/baseline.md |   2 +-
 scripts/gepa/generations/gen-001/baseline.md |   2 +-
 src/cli/commands/orchestrate.ts              |   6 +
 src/cli/commands/orchestrator.ts             | 158 ++++++++++++++++---
 8 files changed, 276 insertions(+), 33 deletions(-)
 create mode 100755 .husky/post-checkout
 create mode 100755 .husky/pre-commit-user

diff --git a/.husky/post-checkout b/.husky/post-checkout
new file mode 100755
index 00000000..fd875bc5
--- /dev/null
+++ b/.husky/post-checkout
@@ -0,0 +1,72 @@
+#!/bin/sh
+# GITBUTLER_MANAGED_HOOK_V1
+# This hook auto-cleans GitButler hooks when you switch away from gitbutler/workspace.
+
+PREV_HEAD=$1
+NEW_HEAD=$2
+BRANCH_CHECKOUT=$3
+
+# Only act on branch checkouts (not file checkouts)
+if [ "$BRANCH_CHECKOUT" != "1" ]; then
+  # Run user's hook if it exists
+  if [ -x "$(dirname "$0")/post-checkout-user" ]; then
+    exec "$(dirname "$0")/post-checkout-user" "$@"
+  fi
+  exit 0
+fi
+
+# Get the new branch name
+NEW_BRANCH=$(git symbolic-ref --short HEAD 2>/dev/null)
+
+# If we just left gitbutler/workspace (and aren't coming back to it)
+PREV_BRANCH=$(git name-rev --name-only "$PREV_HEAD" 2>/dev/null | sed 's|^remotes/||')
+if echo "$PREV_BRANCH" | grep -q "gitbutler/workspace"; then
+  if [ "$NEW_BRANCH" != "gitbutler/workspace" ]; then
+    echo ""
+    echo "NOTE: You have left GitButler's managed workspace branch."
+    echo "Cleaning up GitButler hooks..."
+
+    HOOKS_DIR=$(dirname "$0")
+
+    # Restore pre-commit - but only if it's GitButler-managed
+    if [ -f "$HOOKS_DIR/pre-commit-user" ]; then
+      mv "$HOOKS_DIR/pre-commit-user" "$HOOKS_DIR/pre-commit"
+      echo "  Restored: pre-commit"
+    elif [ -f "$HOOKS_DIR/pre-commit" ]; then
+      # Only remove if it's GitButler-managed (has our signature)
+      if grep -q "GITBUTLER_MANAGED_HOOK_V1" "$HOOKS_DIR/pre-commit"; then
+        rm "$HOOKS_DIR/pre-commit"
+        echo "  Removed: pre-commit (GitButler managed)"
+      else
+        echo "  Warning: pre-commit hook is not GitButler-managed, leaving it untouched"
+      fi
+    fi
+
+    # Run user's post-checkout if it exists, then clean up
+    if [ -x "$HOOKS_DIR/post-checkout-user" ]; then
+      "$HOOKS_DIR/post-checkout-user" "$@"
+      mv "$HOOKS_DIR/post-checkout-user" "$HOOKS_DIR/post-checkout"
+      echo "  Restored: post-checkout"
+    else
+      # Only remove self if we're GitButler-managed (we should be, but check anyway)
+      if grep -q "GITBUTLER_MANAGED_HOOK_V1" "$HOOKS_DIR/post-checkout"; then
+        rm "$HOOKS_DIR/post-checkout"
+        echo "  Removed: post-checkout (GitButler managed)"
+      else
+        echo "  Warning: post-checkout hook is not GitButler-managed, leaving it untouched"
+      fi
+    fi
+
+    echo ""
+    echo "To return to GitButler mode, run: but setup"
+    echo ""
+    exit 0
+  fi
+fi
+
+# Run user's hook if it exists
+if [ -x "$(dirname "$0")/post-checkout-user" ]; then
+  exec "$(dirname "$0")/post-checkout-user" "$@"
+fi
+
+exit 0
diff --git a/.husky/pre-commit b/.husky/pre-commit
index d8089be9..869a3294 100755
--- a/.husky/pre-commit
+++ b/.husky/pre-commit
@@ -1,13 +1,43 @@
-# Use Node version from .nvmrc
-export NVM_DIR="$HOME/.nvm"
-if [ -s "$NVM_DIR/nvm.sh" ]; then
-  . "$NVM_DIR/nvm.sh"
-  nvm use 2>/dev/null
-elif [ -d "$HOME/.nvm/versions/node" ]; then
-  NODE_VER=$(cat "$(git rev-parse --show-toplevel)/.nvmrc" 2>/dev/null || echo "20")
-  NODE_PATH=$(ls -d "$HOME/.nvm/versions/node/v${NODE_VER}"* 2>/dev/null | head -1)
-  [ -n "$NODE_PATH" ] && export PATH="$NODE_PATH/bin:$PATH"
+#!/bin/sh
+# GITBUTLER_MANAGED_HOOK_V1
+# This hook is managed by GitButler to prevent accidental commits on the workspace branch.
+# Your original pre-commit hook has been preserved as 'pre-commit-user'.
+
+HOOKS_DIR=$(dirname "$0")
+
+# Run user's hook first if it exists - if it fails, stop here
+if [ -x "$HOOKS_DIR/pre-commit-user" ]; then
+  "$HOOKS_DIR/pre-commit-user" "$@" || exit $?
+fi
+
+# Get the current branch name
+BRANCH=$(git symbolic-ref --short HEAD 2>/dev/null)
+
+if [ "$BRANCH" = "gitbutler/workspace" ]; then
+  echo ""
+  echo "GITBUTLER_ERROR: Cannot commit directly to gitbutler/workspace branch."
+  echo ""
+  echo "GitButler manages commits on this branch. Please use GitButler to commit your changes:"
+  echo "  - Use the GitButler app to create commits"
+  echo "  - Or run 'but commit' from the command line"
+  echo ""
+  echo "If you want to exit GitButler mode and use normal git:"
+  echo "  - Run 'but teardown' to switch to a regular branch"
+  echo "  - Or directly check out another branch: git checkout <branch>"
+  echo ""
+  echo "If you no longer have the GitButler CLI installed, you can simply remove this hook and check out another branch:"
+  printf '  rm "%s/pre-commit"\n' "$HOOKS_DIR"
+  echo ""
+  exit 1
+fi
+
+# Not on workspace branch: the user hook already ran above, so just warn if this
+# GitButler hook is lingering
+if [ -x "$HOOKS_DIR/pre-commit-user" ]; then
+  echo ""
+  echo "WARNING: GitButler's pre-commit hook is still installed but you're not on gitbutler/workspace."
+  echo "If you're no longer using GitButler, you can restore your original hook:"
+  printf '  mv "%s/pre-commit-user" "%s/pre-commit"\n' "$HOOKS_DIR" "$HOOKS_DIR"
+  echo ""
 fi
 
-npx lint-staged
-npm run build
+exit 0
diff --git a/.husky/pre-commit-user b/.husky/pre-commit-user
new file mode 100755
index 00000000..d8089be9
--- /dev/null
+++ b/.husky/pre-commit-user
@@ -0,0 +1,13 @@
+# Use Node version from .nvmrc
+export NVM_DIR="$HOME/.nvm"
+if [ -s "$NVM_DIR/nvm.sh" ]; then
+  . 
"$NVM_DIR/nvm.sh" + nvm use 2>/dev/null +elif [ -d "$HOME/.nvm/versions/node" ]; then + NODE_VER=$(cat "$(git rev-parse --show-toplevel)/.nvmrc" 2>/dev/null || echo "20") + NODE_PATH=$(ls -d "$HOME/.nvm/versions/node/v${NODE_VER}"* 2>/dev/null | head -1) + [ -n "$NODE_PATH" ] && export PATH="$NODE_PATH/bin:$PATH" +fi + +npx lint-staged +npm run build diff --git a/scripts/dspy/optimized_state.json b/scripts/dspy/optimized_state.json index 40481fa0..4f13fb2a 100644 --- a/scripts/dspy/optimized_state.json +++ b/scripts/dspy/optimized_state.json @@ -10,7 +10,7 @@ "token_budget": 4096, "session_summary": "Frames: 50, recent activity in project", "available_frames": "- c05e94b2-3400-438b-bda1-d040c41a4d06: \"StackMemory v0.3.0 Development\" (task, score: 0.50, events: 0)\n- 722d8b90-29d2-462d-8077-a1ce0920db58: \"Test tool call storage\" (task, score: 0.50, events: 0)\n- 5fc00ed8-96a9-430f-a05e-ab83e2a411ac: \"working-on-cli\" (task, score: 0.50, events: 0)\n- 67cb9f1b-a458-4d57-82c0-c29308e00e87: \"cli-session\" (task, score: 0.50, events: 0)\n- 278f2eab-6bd7-4693-b9dc-7d583acadb8c: \"test-frame\" (task, score: 0.50, events: 0)\n- 86001f96-262a-4dd0-9320-f1545bf07b37: \"frame-1\" (task, score: 0.50, events: 0)\n- 88129690-84d0-48f5-825d-29e1f1deae86: \"frame-2\" (task, score: 0.50, events: 0)\n- 4ec15099-8217-453c-97bb-78f9806295e0: \"frame-3\" (task, score: 0.50, events: 0)\n- 67268788-f362-45f9-a68b-31801196e56b: \"test-stack-trace-capture\" (task, score: 0.50, events: 0)\n- 0f48dc4c-3a4b-4a69-ba23-95ac3be8d6e4: \"test-stack-trace-capture\" (task, score: 0.50, events: 0)\n- bd55adfc-6dc6-4dfb-a726-fda040796486: \"cli-session\" (task, score: 0.50, events: 0)\n- 7179084c-e4cc-4fda-82e1-47165c3629fb: \"cli-session\" (task, score: 0.50, events: 0)\n- 2e6692f7-3fc0-4777-8578-b1674658bbdc: \"team_share\" (tool_scope, score: 0.50, events: 0)\n- b41fbc67-c9f1-4fa6-b4f0-f2d523b93b2d: \"team_share\" (tool_scope, score: 0.50, events: 0)\n- d1eabbf4-fdab-4460-985c-5413f7307004: \"team_share\" (tool_scope, score: 0.50, events: 0)", - "key_decisions": "- Exit code is 0. The remaining 4 warnings are in `skill-storage.ts`, which is not...\n- I now have a complete picture. Here is the implementation plan.\n\n---\n\n## Impleme...\n- Perfect! I have all the key files. Let me create a comprehensive report of the s...\n- Perfect. Now I have all the information you need. Let me compile a comprehensive...\n- Now I have all the information I need. Let me provide a comprehensive analysis:\n...", + "key_decisions": "- The lint script only runs on `.ts` files, not `.js` files. The next.config.js er...\n- Task completed by agent: Add web clipper ingest pipeline. Watch a raw/ directory...\n- Task completed by agent: Build Obsidian vault adapter for frame serialization. W...\n- Task completed by agent: Wire Obsidian adapter into config + CLI. Add obsidianVa...\n- Task completed by agent: Test board end-to-end. 
Launch board, create session, se...", "reasoning": "Frame 'StackMemory v0.3.0 Development' directly matches the query topic.", "frames_to_retrieve": "[{\"frameId\": \"c05e94b2-3400-438b-bda1-d040c41a4d06\", \"priority\": 9, \"reason\": \"Direct match\", \"includeEvents\": true, \"includeAnchors\": true}]", "confidence_score": 0.9 @@ -20,7 +20,7 @@ "token_budget": 4096, "session_summary": "Frames: 50, recent activity in project", "available_frames": "- c05e94b2-3400-438b-bda1-d040c41a4d06: \"StackMemory v0.3.0 Development\" (task, score: 0.50, events: 0)\n- 722d8b90-29d2-462d-8077-a1ce0920db58: \"Test tool call storage\" (task, score: 0.50, events: 0)\n- 5fc00ed8-96a9-430f-a05e-ab83e2a411ac: \"working-on-cli\" (task, score: 0.50, events: 0)\n- 67cb9f1b-a458-4d57-82c0-c29308e00e87: \"cli-session\" (task, score: 0.50, events: 0)\n- 278f2eab-6bd7-4693-b9dc-7d583acadb8c: \"test-frame\" (task, score: 0.50, events: 0)\n- 86001f96-262a-4dd0-9320-f1545bf07b37: \"frame-1\" (task, score: 0.50, events: 0)\n- 88129690-84d0-48f5-825d-29e1f1deae86: \"frame-2\" (task, score: 0.50, events: 0)\n- 4ec15099-8217-453c-97bb-78f9806295e0: \"frame-3\" (task, score: 0.50, events: 0)\n- 67268788-f362-45f9-a68b-31801196e56b: \"test-stack-trace-capture\" (task, score: 0.50, events: 0)\n- 0f48dc4c-3a4b-4a69-ba23-95ac3be8d6e4: \"test-stack-trace-capture\" (task, score: 0.50, events: 0)\n- bd55adfc-6dc6-4dfb-a726-fda040796486: \"cli-session\" (task, score: 0.50, events: 0)\n- 7179084c-e4cc-4fda-82e1-47165c3629fb: \"cli-session\" (task, score: 0.50, events: 0)\n- 2e6692f7-3fc0-4777-8578-b1674658bbdc: \"team_share\" (tool_scope, score: 0.50, events: 0)\n- b41fbc67-c9f1-4fa6-b4f0-f2d523b93b2d: \"team_share\" (tool_scope, score: 0.50, events: 0)\n- d1eabbf4-fdab-4460-985c-5413f7307004: \"team_share\" (tool_scope, score: 0.50, events: 0)", - "key_decisions": "- Exit code is 0. The remaining 4 warnings are in `skill-storage.ts`, which is not...\n- I now have a complete picture. Here is the implementation plan.\n\n---\n\n## Impleme...\n- Perfect! I have all the key files. Let me create a comprehensive report of the s...\n- Perfect. Now I have all the information you need. Let me compile a comprehensive...\n- Now I have all the information I need. Let me provide a comprehensive analysis:\n...", + "key_decisions": "- The lint script only runs on `.ts` files, not `.js` files. The next.config.js er...\n- Task completed by agent: Add web clipper ingest pipeline. Watch a raw/ directory...\n- Task completed by agent: Build Obsidian vault adapter for frame serialization. W...\n- Task completed by agent: Wire Obsidian adapter into config + CLI. Add obsidianVa...\n- Task completed by agent: Test board end-to-end. 
Launch board, create session, se...",
     "reasoning": "Frame 'cli-session' directly matches the query topic.",
     "frames_to_retrieve": "[{\"frameId\": \"67cb9f1b-a458-4d57-82c0-c29308e00e87\", \"priority\": 9, \"reason\": \"Direct match\", \"includeEvents\": true, \"includeAnchors\": true}]",
     "confidence_score": 0.9
diff --git a/scripts/gepa/generations/gen-000/baseline.md b/scripts/gepa/generations/gen-000/baseline.md
index 4dc0ebb0..5fd37e77 100644
--- a/scripts/gepa/generations/gen-000/baseline.md
+++ b/scripts/gepa/generations/gen-000/baseline.md
@@ -112,7 +112,7 @@ When adding or renaming GitHub Actions workflows that should be triggerable via
 | Workflow | Script path | Category |
 |---|---|---|
 | `weekly-start.yml` | `voyager/scripts/content-brief.mjs` + `voyager/scripts/content-audit.mjs` + `ops/fathom-social-content.mjs` + `ops/fathom-testimonial-scan.mjs` + `ops/perplexity-citation-audit.mjs` + `commit/profound-aeo-pulse.mjs` + `voyager/scripts/generate-blog-scaffold.mjs` + `ops/ahrefs-firehose-digest.mjs` + `ops/export-dripify.mjs` + `commit/prospect-discovery.mjs` + `ops/repush-clay-leads.mjs` + `ops/snitcher-outreach.mjs` | GHA cron (Mon) |
-| `weekly-end.yml` | `diag/fathom-demo-scorecard.mjs` + `commit/feedback/collect-*.mjs` | GHA cron (Fri) |
+| `weekly-end.yml` | `diag/fathom-demo-scorecard.mjs` + `commit/feedback/collect-*.mjs` + `commit/feedback/collect-ops-feedback.mjs` + `diag/weekly-retro.mjs` | GHA cron (Fri) |
 | `anneal-keywords.yml` | `commit/anneal-keywords.mjs` | GHA cron (Sun) |
 | `g2-review-monitor.yml` | `ops/g2-to-senja.mjs` | GHA cron (Daily) |
 | `testimonial-pipeline.yml` | `commit/testimonial-pipeline.mjs` | Manual |
diff --git a/scripts/gepa/generations/gen-001/baseline.md b/scripts/gepa/generations/gen-001/baseline.md
index 4dc0ebb0..5fd37e77 100644
--- a/scripts/gepa/generations/gen-001/baseline.md
+++ b/scripts/gepa/generations/gen-001/baseline.md
@@ -112,7 +112,7 @@ When adding or renaming GitHub Actions workflows that should be triggerable via
 | Workflow | Script path | Category |
 |---|---|---|
 | `weekly-start.yml` | `voyager/scripts/content-brief.mjs` + `voyager/scripts/content-audit.mjs` + `ops/fathom-social-content.mjs` + `ops/fathom-testimonial-scan.mjs` + `ops/perplexity-citation-audit.mjs` + `commit/profound-aeo-pulse.mjs` + `voyager/scripts/generate-blog-scaffold.mjs` + `ops/ahrefs-firehose-digest.mjs` + `ops/export-dripify.mjs` + `commit/prospect-discovery.mjs` + `ops/repush-clay-leads.mjs` + `ops/snitcher-outreach.mjs` | GHA cron (Mon) |
-| `weekly-end.yml` | `diag/fathom-demo-scorecard.mjs` + `commit/feedback/collect-*.mjs` | GHA cron (Fri) |
+| `weekly-end.yml` | `diag/fathom-demo-scorecard.mjs` + `commit/feedback/collect-*.mjs` + `commit/feedback/collect-ops-feedback.mjs` + `diag/weekly-retro.mjs` | GHA cron (Fri) |
 | `anneal-keywords.yml` | `commit/anneal-keywords.mjs` | GHA cron (Sun) |
 | `g2-review-monitor.yml` | `ops/g2-to-senja.mjs` | GHA cron (Daily) |
 | `testimonial-pipeline.yml` | `commit/testimonial-pipeline.mjs` | Manual |
diff --git a/src/cli/commands/orchestrate.ts b/src/cli/commands/orchestrate.ts
index 50e75101..405c41d4 100644
--- a/src/cli/commands/orchestrate.ts
+++ b/src/cli/commands/orchestrate.ts
@@ -2045,6 +2045,11 @@ export function createConductorCommands(): Command {
       '--no-pr',
       'Disable automatic GitHub PR creation after agent success'
     )
+    .option(
+      '--workspace-mode <mode>',
+      'Workspace mode: "auto" (detect GitButler), "gitbutler", or "worktree"',
+      'auto'
+    )
     .action(async (options) => {
       // Ensure default prompt template 
exists on first start ensureDefaultPromptTemplate(); @@ -2065,6 +2070,7 @@ export function createConductorCommands(): Command { agentMode: options.mode === 'adapter' ? 'adapter' : 'cli', model: options.model, autoPR: options.pr, + workspaceMode: options.workspaceMode, }); await conductor.start(); diff --git a/src/cli/commands/orchestrator.ts b/src/cli/commands/orchestrator.ts index 05be1dc3..749edcba 100644 --- a/src/cli/commands/orchestrator.ts +++ b/src/cli/commands/orchestrator.ts @@ -86,6 +86,8 @@ export interface ConductorConfig { model?: string; /** Auto-create GitHub PRs after successful agent runs (default: true) */ autoPR?: boolean; + /** Workspace mode: 'auto' (detect GitButler), 'gitbutler', or 'worktree' (default: 'auto') */ + workspaceMode?: 'auto' | 'gitbutler' | 'worktree'; } export interface RunningIssue { @@ -205,7 +207,7 @@ function logAgentOutcome(entry: AgentOutcomeEntry): void { appendFileSync(getOutcomesLogPath(), JSON.stringify(entry) + '\n'); } -/** Best-effort PR creation via GitHub CLI after successful agent run */ +/** Best-effort PR creation via GitHub CLI (or GitButler) after successful agent run */ function createPullRequest(opts: { branch: string; baseBranch: string; @@ -214,15 +216,9 @@ function createPullRequest(opts: { filesModified: number; toolCalls: number; workspacePath: string; + useGitButler?: boolean; }): string | null { try { - // Push the branch first - execSync(`git push -u origin "${opts.branch}"`, { - cwd: opts.workspacePath, - stdio: 'pipe', - timeout: 60000, - }); - const prTitle = `feat(conductor): ${opts.issueId} β€” ${opts.title}`; const prBody = [ '## Summary', @@ -235,6 +231,38 @@ function createPullRequest(opts: { '_This PR was auto-created by StackMemory Conductor._', ].join('\n'); + if (opts.useGitButler) { + // GitButler: push branch then create PR via but cli + execSync(`but push --branch "${opts.branch}"`, { + cwd: opts.workspacePath, + stdio: 'pipe', + timeout: 60000, + }); + + const result = execSync( + `but pr create --branch "${opts.branch}" --title "${prTitle.replace(/"/g, '\\"')}" --body "${prBody.replace(/"/g, '\\"')}"`, + { + cwd: opts.workspacePath, + encoding: 'utf-8', + stdio: ['pipe', 'pipe', 'pipe'], + timeout: 30000, + } + ); + const prUrl = result.trim(); + logger.info('Created PR via GitButler', { + issueId: opts.issueId, + prUrl, + }); + return prUrl; + } + + // Standard git + gh CLI + execSync(`git push -u origin "${opts.branch}"`, { + cwd: opts.workspacePath, + stdio: 'pipe', + timeout: 60000, + }); + const result = execSync( `gh pr create --base "${opts.baseBranch}" --head "${opts.branch}" --title "${prTitle.replace(/"/g, '\\"')}" --body "${prBody.replace(/"/g, '\\"')}"`, { @@ -664,6 +692,8 @@ export class Conductor { private stateCache: Map = new Map(); private activeStatesLower: string[]; private terminalStatesLower: string[]; + /** Whether to use GitButler virtual branches instead of git worktrees */ + private useGitButler = false; /** Global rate limit backoff state */ private rateLimit: RateLimitState = { @@ -728,8 +758,37 @@ export class Conductor { } } - // Ensure workspace root exists - if (!existsSync(this.config.workspaceRoot)) { + // Detect workspace mode: GitButler virtual branches or git worktrees + const wsMode = this.config.workspaceMode || 'auto'; + if (wsMode === 'gitbutler' || wsMode === 'auto') { + try { + const butVersion = execSync('but --version', { + cwd: this.config.repoRoot, + encoding: 'utf-8', + stdio: ['pipe', 'pipe', 'pipe'], + timeout: 5000, + }).trim(); + // Check if repo is in 
GitButler mode (gitbutler/workspace branch exists)
+        const gbDir = join(this.config.repoRoot, '.git', 'gitbutler');
+        if (wsMode === 'gitbutler' || existsSync(gbDir)) {
+          this.useGitButler = true;
+          logger.info('Using GitButler virtual branches', {
+            version: butVersion,
+          });
+          console.log(`[conductor] GitButler mode (${butVersion})`);
+        }
+      } catch {
+        if (wsMode === 'gitbutler') {
+          throw new Error(
+            'GitButler CLI (but) not found. Install: brew install --cask gitbutler'
+          );
+        }
+        // auto mode: fall through to worktrees
+      }
+    }
+
+    // Ensure workspace root exists (only needed for worktree mode)
+    if (!this.useGitButler && !existsSync(this.config.workspaceRoot)) {
       mkdirSync(this.config.workspaceRoot, { recursive: true });
     }
@@ -1309,6 +1368,7 @@
           filesModified: run.filesModified,
           toolCalls: run.toolCalls,
           workspacePath: run.workspacePath,
+          useGitButler: this.useGitButler,
         });
         if (url) {
           prUrl = url;
@@ -1680,6 +1740,52 @@
   private async createWorkspace(issue: LinearIssue): Promise<string> {
     const wsKey = this.sanitizeIdentifier(issue.identifier);
+
+    if (this.useGitButler) {
+      return this.createGitButlerBranch(issue, wsKey);
+    }
+    return this.createWorktree(issue, wsKey);
+  }
+
+  private createGitButlerBranch(issue: LinearIssue, wsKey: string): string {
+    const branchName = `conductor/${wsKey}`;
+
+    try {
+      // Pull latest changes
+      execSync('but pull', {
+        cwd: this.config.repoRoot,
+        stdio: 'pipe',
+        timeout: 30000,
+      });
+    } catch {
+      // Non-fatal — may be offline
+    }
+
+    try {
+      // Create virtual branch
+      execSync(`but branch new "${branchName}"`, {
+        cwd: this.config.repoRoot,
+        stdio: 'pipe',
+        timeout: 10000,
+      });
+
+      logger.info('Created GitButler virtual branch', {
+        identifier: issue.identifier,
+        branch: branchName,
+      });
+    } catch {
+      // Branch may already exist — that's fine
+      logger.info('GitButler branch may already exist, reusing', {
+        identifier: issue.identifier,
+        branch: branchName,
+      });
+    }
+
+    // GitButler: agents work in repo root, not a separate dir
+    return this.config.repoRoot;
+  }
+
+  private createWorktree(issue: LinearIssue, wsKey: string): string {
     const wsPath = join(this.config.workspaceRoot, wsKey);
 
     if (existsSync(wsPath)) {
@@ -1690,18 +1796,15 @@
       return wsPath;
     }
 
-    // Create git worktree
     const branchName = `conductor/${wsKey}`;
 
     try {
-      // Fetch latest
       execSync('git fetch origin', {
         cwd: this.config.repoRoot,
         stdio: 'pipe',
         timeout: 30000,
       });
 
-      // Create worktree with new branch from base
       execSync(
         `git worktree add "${wsPath}" -b "${branchName}" "origin/${this.config.baseBranch}"`,
         {
@@ -1717,7 +1820,6 @@
         branch: branchName,
       });
     } catch (err) {
-      // Branch may already exist — try checking it out
      try {
        execSync(`git worktree add "${wsPath}" "${branchName}"`, {
          cwd: this.config.repoRoot,
@@ -1736,14 +1838,34 @@
 
   private async removeWorkspace(issue: LinearIssue): Promise<void> {
     const wsKey = this.sanitizeIdentifier(issue.identifier);
-    const wsPath = join(this.config.workspaceRoot, wsKey);
+    const branchName = `conductor/${wsKey}`;
 
+    if (this.useGitButler) {
+      // Unapply virtual branch (keeps it in history, just removes from workspace)
+      await this.runHook('before-remove', this.config.repoRoot, issue).catch(
+        () => {}
+      );
+      try {
+        execSync(`but unapply "${branchName}"`, {
+          cwd: this.config.repoRoot,
+          stdio: 'pipe',
+          timeout: 10000,
+        });
+      } catch {
+        // May already be unapplied
+        logger.debug('GitButler branch already unapplied', {
+          identifier: issue.identifier,
+        });
+      }
+      return;
+    }
+
+    // Worktree mode
+    const wsPath = join(this.config.workspaceRoot, wsKey);
     if (!existsSync(wsPath)) return;
 
-    // Run before_remove hook
     await this.runHook('before-remove', wsPath, issue).catch(() => {});
 
-    // Remove git worktree
     try {
       execSync(`git worktree remove "${wsPath}" --force`, {
         cwd: this.config.repoRoot,
@@ -1751,7 +1873,6 @@
         timeout: 30000,
       });
     } catch {
-      // Fallback: manual cleanup
       try {
         rmSync(wsPath, { recursive: true, force: true });
         execSync('git worktree prune', {
@@ -2679,6 +2800,7 @@
           filesModified: run.filesModified,
           toolCalls: run.toolCalls,
           workspacePath: wsPath,
+          useGitButler: this.useGitButler,
         });
         if (url) {
           prUrl = url;

From 6c756c714c84d4821f5112524c85bf4f9885f62d Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Wed, 8 Apr 2026 13:17:21 -0400
Subject: [PATCH 02/18] fix(conductor): state filter + labels flatten for
 issue dispatch

---
 src/cli/commands/orchestrator.ts | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/cli/commands/orchestrator.ts b/src/cli/commands/orchestrator.ts
index 749edcba..396b2711 100644
--- a/src/cli/commands/orchestrator.ts
+++ b/src/cli/commands/orchestrator.ts
@@ -1185,15 +1185,14 @@ export class Conductor {
 
     const allCandidates: LinearIssue[] = [];
 
-    // Fetch issues for each active state
-    // Linear API filters by state type, but we need state name matching
-    // Use 'unstarted' type which covers Todo-like states
+    // Fetch issues with unstarted state type (covers Todo-like states)
+    // Then filter by exact state name match
     const issues = await this.client.getIssues({
       teamId: this.config.teamId,
+      stateType: 'unstarted',
       limit: 50,
     });
 
-    // Filter by active state names (case-insensitive, pre-computed)
     for (const issue of issues) {
       const stateName = issue.state.name.trim().toLowerCase();
       if (this.activeStatesLower.includes(stateName)) {

From 48d1d68a5fe627075c78d8d8ac899f8bad1e39cb Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Wed, 8 Apr 2026 13:17:52 -0400
Subject: [PATCH 03/18] fix(linear): flatten labels in getIssues response

---
 src/integrations/linear/client.ts | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/integrations/linear/client.ts b/src/integrations/linear/client.ts
index 9f1967e8..a38259c7 100644
--- a/src/integrations/linear/client.ts
+++ b/src/integrations/linear/client.ts
@@ -719,7 +719,17 @@ export class LinearClient {
       first: options?.limit || 50,
     });
 
-    return result.issues.nodes;
+    // Flatten labels from { nodes: [...] } to plain array
+    return result.issues.nodes.map((issue) => ({
+      ...issue,
+      labels: Array.isArray(issue.labels)
+        ? issue.labels
+        : (
+            issue.labels as unknown as {
+              nodes: Array<{ id: string; name: string }>;
+            }
+          )?.nodes || [],
+    }));
   }
 
   /**

From 019e4997e3627ac51bc270867e24be079b12c98d Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Wed, 8 Apr 2026 20:15:58 -0400
Subject: [PATCH 04/18] feat(cross-search): multi-database frame search across
 projects (STA-480)

- Add CrossProjectSearch engine with FTS5/BM25 ranking across N databases
- Project registry (~/.stackmemory/projects.json) with CRUD + auto-discovery
- Read-only SQLite connections for safety, LIKE fallback for non-FTS databases
- 4 MCP tools: sm_cross_search, sm_cross_discover, sm_cross_register, sm_cross_list
- CLI: `stackmemory search --all-projects "query"` for cross-project search
- 17 tests: registry CRUD, multi-db FTS5 search, ranking, LIKE fallback, graceful skip
---
 src/cli/commands/search.ts                         |  39 ++
 .../__tests__/cross-project-search.test.ts         | 333 +++++++++++
 src/core/cross-search/cross-project-search.ts      | 400 +++++++++++++
 src/core/cross-search/index.ts                     |   7 +
 .../mcp/__tests__/tool-alias-registry.test.ts      | 321 ++++++++++
 .../mcp/handlers/cross-search-handlers.ts          | 225 +++++++
 src/integrations/mcp/handlers/index.ts             |  47 +-
 src/integrations/mcp/server.ts                     | 122 +++-
 src/integrations/mcp/tool-alias-registry.ts        | 556 ++++++++++++++++++
 src/integrations/mcp/tool-definitions.ts           | 121 +++-
 10 files changed, 2151 insertions(+), 20 deletions(-)
 create mode 100644 src/core/cross-search/__tests__/cross-project-search.test.ts
 create mode 100644 src/core/cross-search/cross-project-search.ts
 create mode 100644 src/core/cross-search/index.ts
 create mode 100644 src/integrations/mcp/__tests__/tool-alias-registry.test.ts
 create mode 100644 src/integrations/mcp/handlers/cross-search-handlers.ts
 create mode 100644 src/integrations/mcp/tool-alias-registry.ts

diff --git a/src/cli/commands/search.ts b/src/cli/commands/search.ts
index 7e90f266..1a09ce4a 100644
--- a/src/cli/commands/search.ts
+++ b/src/cli/commands/search.ts
@@ -8,6 +8,7 @@ import Database from 'better-sqlite3';
 import { join } from 'path';
 import { existsSync } from 'fs';
 import { z } from 'zod';
+import { CrossProjectSearch } from '../../core/cross-search/cross-project-search.js';
 
 /** Raw task row from task_cache table */
 interface TaskRow {
@@ -58,6 +59,10 @@ export function createSearchCommand(): Command {
     .argument('<query>', 'Search query')
     .option('-t, --tasks', 'Search only tasks')
     .option('-c, --context', 'Search only context')
+    .option(
+      '-a, --all-projects',
+      'Search across all registered project databases'
+    )
     .option('-l, --limit <n>', 'Limit results', '20')
     .action(async (rawQuery, options) => {
       const projectRoot = process.cwd();
@@ -86,6 +91,40 @@
         return;
       }
 
+      // Cross-project search mode
+      if (options.allProjects) {
+        console.log(
+          `\n🔍 Searching across all projects for "${rawQuery}"...\n`
+        );
+        const crossSearch = new CrossProjectSearch();
+        const results = await crossSearch.search({
+          query: rawQuery,
+          limit,
+        });
+
+        if (results.length === 0) {
+          console.log('No results found across project databases.\n');
+          console.log(
+            'Tip: Run "stackmemory search --all-projects" after "stackmemory projects scan" to discover databases.'
+ ); + return; + } + + console.log(`πŸ“ Cross-Project Results (${results.length})\n`); + for (const r of results) { + const date = new Date(r.createdAt).toLocaleDateString(); + console.log( + ` [${r.projectName}] ${r.name} (${r.type}, score: ${r.score.toFixed(3)})` + ); + if (r.digestText) { + console.log(` ${r.digestText.slice(0, 100)}`); + } + console.log(` ${date} | ${r.projectPath}`); + } + console.log(`\nFound ${results.length} results.\n`); + return; + } + const db = new Database(dbPath); const searchTasks = !options.context || options.tasks; const searchContext = !options.tasks || options.context; diff --git a/src/core/cross-search/__tests__/cross-project-search.test.ts b/src/core/cross-search/__tests__/cross-project-search.test.ts new file mode 100644 index 00000000..13d81ecb --- /dev/null +++ b/src/core/cross-search/__tests__/cross-project-search.test.ts @@ -0,0 +1,333 @@ +/** + * Tests for Cross-Project Search + * Tests project registry CRUD and cross-database FTS5 search + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { CrossProjectSearch } from '../cross-project-search.js'; +import { SQLiteAdapter } from '../../database/sqlite-adapter.js'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +describe('CrossProjectSearch', () => { + let tmpDir: string; + let crossSearch: CrossProjectSearch; + + beforeEach(() => { + tmpDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'stackmemory-cross-search-') + ); + crossSearch = new CrossProjectSearch(tmpDir); + }); + + afterEach(() => { + try { + fs.rmSync(tmpDir, { recursive: true }); + } catch { + // cleanup best-effort + } + }); + + describe('Project Registry CRUD', () => { + it('should start with empty registry', () => { + const projects = crossSearch.listProjects(); + expect(projects).toEqual([]); + }); + + it('should register a project', () => { + crossSearch.registerProject({ + name: 'test-project', + path: '/tmp/test-project', + dbPath: '/tmp/test-project/.stackmemory/context.db', + lastAccessed: Date.now(), + }); + + const projects = crossSearch.listProjects(); + expect(projects).toHaveLength(1); + expect(projects[0].name).toBe('test-project'); + }); + + it('should update existing project on re-register with same path', () => { + const entry = { + name: 'test-project', + path: '/tmp/test-project', + dbPath: '/tmp/test-project/.stackmemory/context.db', + lastAccessed: 1000, + }; + + crossSearch.registerProject(entry); + crossSearch.registerProject({ ...entry, lastAccessed: 2000 }); + + const projects = crossSearch.listProjects(); + expect(projects).toHaveLength(1); + expect(projects[0].lastAccessed).toBe(2000); + }); + + it('should unregister a project by path', () => { + crossSearch.registerProject({ + name: 'a', + path: '/tmp/a', + dbPath: '/tmp/a/.stackmemory/context.db', + lastAccessed: Date.now(), + }); + crossSearch.registerProject({ + name: 'b', + path: '/tmp/b', + dbPath: '/tmp/b/.stackmemory/context.db', + lastAccessed: Date.now(), + }); + + const removed = crossSearch.unregisterProject('/tmp/a'); + expect(removed).toBe(true); + expect(crossSearch.listProjects()).toHaveLength(1); + expect(crossSearch.listProjects()[0].name).toBe('b'); + }); + + it('should unregister a project by name', () => { + crossSearch.registerProject({ + name: 'my-app', + path: '/tmp/my-app', + dbPath: '/tmp/my-app/.stackmemory/context.db', + lastAccessed: Date.now(), + }); + + const removed = crossSearch.unregisterProject('my-app'); + expect(removed).toBe(true); + 
expect(crossSearch.listProjects()).toHaveLength(0); + }); + + it('should return false when unregistering non-existent project', () => { + const removed = crossSearch.unregisterProject('ghost'); + expect(removed).toBe(false); + }); + + it('should persist registry to disk', () => { + crossSearch.registerProject({ + name: 'persisted', + path: '/tmp/persisted', + dbPath: '/tmp/persisted/.stackmemory/context.db', + lastAccessed: Date.now(), + }); + + // Load from disk in a new instance + const crossSearch2 = new CrossProjectSearch(tmpDir); + const projects = crossSearch2.listProjects(); + expect(projects).toHaveLength(1); + expect(projects[0].name).toBe('persisted'); + }); + }); + + describe('Cross-Database Search', () => { + let projectADir: string; + let projectBDir: string; + let adapterA: SQLiteAdapter; + let adapterB: SQLiteAdapter; + + beforeEach(async () => { + // Create two project databases with frames + projectADir = path.join(tmpDir, 'project-a', '.stackmemory'); + projectBDir = path.join(tmpDir, 'project-b', '.stackmemory'); + fs.mkdirSync(projectADir, { recursive: true }); + fs.mkdirSync(projectBDir, { recursive: true }); + + const dbPathA = path.join(projectADir, 'context.db'); + const dbPathB = path.join(projectBDir, 'context.db'); + + adapterA = new SQLiteAdapter('project-a', { dbPath: dbPathA }); + adapterB = new SQLiteAdapter('project-b', { dbPath: dbPathB }); + + await adapterA.connect(); + await adapterA.initializeSchema(); + await adapterB.connect(); + await adapterB.initializeSchema(); + + // Populate project A + await adapterA.createFrame({ + run_id: 'run-a1', + project_id: 'project-a', + type: 'task', + name: 'authentication login flow', + digest_text: 'implements JWT-based auth with refresh tokens', + }); + await adapterA.createFrame({ + run_id: 'run-a1', + project_id: 'project-a', + type: 'debug', + name: 'fix database migration', + digest_text: 'resolved foreign key constraint on users table', + }); + + // Populate project B + await adapterB.createFrame({ + run_id: 'run-b1', + project_id: 'project-b', + type: 'task', + name: 'authentication OAuth integration', + digest_text: 'added Google and GitHub OAuth providers', + }); + await adapterB.createFrame({ + run_id: 'run-b1', + project_id: 'project-b', + type: 'task', + name: 'API rate limiting', + digest_text: 'token bucket algorithm for API endpoints', + }); + + await adapterA.disconnect(); + await adapterB.disconnect(); + + // Register both projects + crossSearch.registerProject({ + name: 'project-a', + path: path.join(tmpDir, 'project-a'), + dbPath: dbPathA, + lastAccessed: Date.now(), + }); + crossSearch.registerProject({ + name: 'project-b', + path: path.join(tmpDir, 'project-b'), + dbPath: dbPathB, + lastAccessed: Date.now(), + }); + }); + + it('should search across multiple databases with FTS5', async () => { + const results = await crossSearch.search({ query: 'authentication' }); + + expect(results.length).toBe(2); + // Both projects should have auth-related results + const projectNames = results.map((r) => r.projectName); + expect(projectNames).toContain('project-a'); + expect(projectNames).toContain('project-b'); + }); + + it('should rank results by BM25 score', async () => { + const results = await crossSearch.search({ query: 'authentication' }); + + // Results should be sorted by score descending + for (let i = 1; i < results.length; i++) { + expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score); + } + }); + + it('should search with term matching in digest_text', async () => { + const results 
= await crossSearch.search({ query: 'OAuth' }); + + expect(results.length).toBeGreaterThanOrEqual(1); + expect(results[0].projectName).toBe('project-b'); + }); + + it('should respect limit parameter', async () => { + const results = await crossSearch.search({ + query: 'authentication', + limit: 1, + }); + + expect(results.length).toBe(1); + }); + + it('should exclude a project when specified', async () => { + const results = await crossSearch.search({ + query: 'authentication', + excludeProject: 'project-a', + }); + + expect(results.length).toBe(1); + expect(results[0].projectName).toBe('project-b'); + }); + + it('should return empty array when no matches', async () => { + const results = await crossSearch.search({ + query: 'xyznonexistent123', + }); + + expect(results).toEqual([]); + }); + + it('should skip missing databases gracefully', async () => { + crossSearch.registerProject({ + name: 'ghost', + path: '/tmp/nonexistent', + dbPath: '/tmp/nonexistent/.stackmemory/context.db', + lastAccessed: Date.now(), + }); + + // Should not throw, just skip the missing db + const results = await crossSearch.search({ query: 'authentication' }); + expect(results.length).toBe(2); + }); + + it('should return empty array with no registered projects', async () => { + const emptySearch = new CrossProjectSearch( + fs.mkdtempSync(path.join(os.tmpdir(), 'empty-')) + ); + const results = await emptySearch.search({ query: 'test' }); + expect(results).toEqual([]); + }); + + it('should include project metadata in results', async () => { + const results = await crossSearch.search({ query: 'migration' }); + + expect(results.length).toBeGreaterThanOrEqual(1); + const result = results[0]; + expect(result.projectName).toBeDefined(); + expect(result.projectPath).toBeDefined(); + expect(result.frameId).toBeDefined(); + expect(result.name).toBeDefined(); + expect(result.type).toBeDefined(); + expect(typeof result.score).toBe('number'); + expect(typeof result.createdAt).toBe('number'); + }); + }); + + describe('LIKE fallback', () => { + let projectDir: string; + + beforeEach(async () => { + // Create a database without FTS5 table + projectDir = path.join(tmpDir, 'no-fts', '.stackmemory'); + fs.mkdirSync(projectDir, { recursive: true }); + const dbPath = path.join(projectDir, 'context.db'); + + // Manually create a minimal frames table without FTS + const Database = (await import('better-sqlite3')).default; + const db = new Database(dbPath); + db.exec(` + CREATE TABLE frames ( + rowid INTEGER PRIMARY KEY AUTOINCREMENT, + frame_id TEXT UNIQUE, + run_id TEXT, + project_id TEXT, + type TEXT DEFAULT 'task', + name TEXT DEFAULT '', + state TEXT DEFAULT 'active', + depth INTEGER DEFAULT 0, + inputs TEXT DEFAULT '{}', + outputs TEXT DEFAULT '{}', + digest_text TEXT DEFAULT '', + digest_json TEXT DEFAULT '{}', + created_at INTEGER DEFAULT 0 + ); + INSERT INTO frames (frame_id, name, type, state, digest_text, inputs, created_at) + VALUES ('f1', 'fallback test frame', 'task', 'active', 'should be found via LIKE', '{}', 1000); + `); + db.close(); + + crossSearch.registerProject({ + name: 'no-fts-project', + path: path.join(tmpDir, 'no-fts'), + dbPath, + lastAccessed: Date.now(), + }); + }); + + it('should fall back to LIKE search when FTS5 table is absent', async () => { + const results = await crossSearch.search({ query: 'fallback' }); + + expect(results.length).toBe(1); + expect(results[0].name).toBe('fallback test frame'); + expect(results[0].projectName).toBe('no-fts-project'); + }); + }); +}); diff --git 
a/src/core/cross-search/cross-project-search.ts b/src/core/cross-search/cross-project-search.ts new file mode 100644 index 00000000..10393e22 --- /dev/null +++ b/src/core/cross-search/cross-project-search.ts @@ -0,0 +1,400 @@ +/** + * Cross-Project Search Engine + * Queries frames across multiple project databases using FTS5/BM25 + * Opens read-only SQLite connections to each database for safety + */ + +import Database from 'better-sqlite3'; +import { + existsSync, + readFileSync, + writeFileSync, + mkdirSync, + readdirSync, +} from 'fs'; +import { join } from 'path'; +import { homedir } from 'os'; +import { logger } from '../monitoring/logger.js'; + +export interface ProjectEntry { + name: string; + path: string; + dbPath: string; + lastAccessed: number; +} + +export interface ProjectRegistry { + projects: ProjectEntry[]; +} + +export interface CrossSearchResult { + projectName: string; + projectPath: string; + frameId: string; + name: string; + type: string; + state: string; + digestText: string | null; + score: number; + createdAt: number; +} + +export interface CrossSearchOptions { + query: string; + limit?: number; + excludeProject?: string; +} + +/** + * Sanitize user input for FTS5 MATCH queries. + * Mirrors the logic in SQLiteAdapter.sanitizeFtsQuery(). + */ +function sanitizeFtsQuery(query: string): string { + const wantsPrefix = query.trimEnd().endsWith('*'); + + const cleaned = query + .replace(/['"(){}[\]^~*\\,]/g, ' ') + .replace(/\b(AND|OR|NOT|NEAR)\b/gi, '') + .trim(); + + const terms = cleaned.split(/\s+/).filter((t) => t.length > 0); + if (terms.length === 0) return '""'; + + const quoted = terms.map((t) => `"${t}"`); + + if (wantsPrefix) { + quoted[quoted.length - 1] = quoted[quoted.length - 1] + '*'; + } + + return quoted.join(' '); +} + +export class CrossProjectSearch { + private registryPath: string; + + constructor(registryDir?: string) { + const dir = registryDir || join(homedir(), '.stackmemory'); + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }); + } + this.registryPath = join(dir, 'projects.json'); + } + + // --- Project Registry CRUD --- + + loadRegistry(): ProjectRegistry { + if (!existsSync(this.registryPath)) { + return { projects: [] }; + } + try { + const raw = readFileSync(this.registryPath, 'utf-8'); + return JSON.parse(raw) as ProjectRegistry; + } catch { + logger.warn('Failed to parse projects.json, returning empty registry'); + return { projects: [] }; + } + } + + saveRegistry(registry: ProjectRegistry): void { + writeFileSync(this.registryPath, JSON.stringify(registry, null, 2)); + } + + registerProject(entry: ProjectEntry): void { + const registry = this.loadRegistry(); + const idx = registry.projects.findIndex( + (p) => p.path === entry.path || p.dbPath === entry.dbPath + ); + if (idx >= 0) { + registry.projects[idx] = entry; + } else { + registry.projects.push(entry); + } + this.saveRegistry(registry); + } + + unregisterProject(pathOrName: string): boolean { + const registry = this.loadRegistry(); + const before = registry.projects.length; + registry.projects = registry.projects.filter( + (p) => p.path !== pathOrName && p.name !== pathOrName + ); + if (registry.projects.length < before) { + this.saveRegistry(registry); + return true; + } + return false; + } + + listProjects(): ProjectEntry[] { + return this.loadRegistry().projects; + } + + /** + * Auto-discover projects by scanning common directories for .stackmemory/context.db + */ + discoverProjects(basePaths?: string[]): ProjectEntry[] { + const paths = basePaths || [ + 
join(homedir(), 'Dev'),
+      join(homedir(), 'dev'),
+      join(homedir(), 'Projects'),
+      join(homedir(), 'projects'),
+      join(homedir(), 'Work'),
+      join(homedir(), 'work'),
+      join(homedir(), 'code'),
+      join(homedir(), 'Code'),
+    ];
+
+    // Also check ~/.stackmemory/context.db (global/home project)
+    const homeDb = join(homedir(), '.stackmemory', 'context.db');
+    const discovered: ProjectEntry[] = [];
+
+    if (existsSync(homeDb)) {
+      discovered.push({
+        name: 'global',
+        path: homedir(),
+        dbPath: homeDb,
+        lastAccessed: Date.now(),
+      });
+    }
+
+    for (const basePath of paths) {
+      if (!existsSync(basePath)) continue;
+
+      try {
+        // Scan 3 levels deep for .stackmemory/context.db
+        this.scanForDatabases(basePath, 0, 3, discovered);
+      } catch {
+        // Skip inaccessible directories
+      }
+    }
+
+    // Merge with existing registry
+    const registry = this.loadRegistry();
+    for (const entry of discovered) {
+      const existing = registry.projects.find((p) => p.dbPath === entry.dbPath);
+      if (!existing) {
+        registry.projects.push(entry);
+      }
+    }
+    this.saveRegistry(registry);
+
+    return discovered;
+  }
+
+  private scanForDatabases(
+    dir: string,
+    depth: number,
+    maxDepth: number,
+    results: ProjectEntry[]
+  ): void {
+    if (depth > maxDepth) return;
+
+    const dbPath = join(dir, '.stackmemory', 'context.db');
+    if (existsSync(dbPath)) {
+      const name = dir.split('/').pop() || dir;
+      results.push({
+        name,
+        path: dir,
+        dbPath,
+        lastAccessed: Date.now(),
+      });
+      return; // Don't scan subdirectories of a project
+    }
+
+    // Scan subdirectories
+    try {
+      const entries = readdirSync(dir, { withFileTypes: true });
+      for (const entry of entries) {
+        if (
+          entry.isDirectory() &&
+          !entry.name.startsWith('.') &&
+          entry.name !== 'node_modules' &&
+          entry.name !== 'dist' &&
+          entry.name !== 'build'
+        ) {
+          this.scanForDatabases(
+            join(dir, entry.name),
+            depth + 1,
+            maxDepth,
+            results
+          );
+        }
+      }
+    } catch {
+      // Permission denied or other errors
+    }
+  }
+
+  // --- Cross-Project Search ---
+
+  /**
+   * Search across all registered project databases using FTS5/BM25.
+   * Opens read-only connections. Skips missing/locked databases gracefully.
+   */
+  async search(options: CrossSearchOptions): Promise<CrossSearchResult[]> {
+    const { query, limit = 20, excludeProject } = options;
+    const registry = this.loadRegistry();
+
+    if (registry.projects.length === 0) {
+      return [];
+    }
+
+    const allResults: CrossSearchResult[] = [];
+    const perDbLimit = Math.max(limit, 10); // Fetch more per-db, merge later
+
+    for (const project of registry.projects) {
+      if (excludeProject && project.name === excludeProject) continue;
+      if (!existsSync(project.dbPath)) {
+        logger.debug(`Skipping missing database: ${project.dbPath}`);
+        continue;
+      }
+
+      try {
+        const results = this.searchSingleDb(project, query, perDbLimit);
+        allResults.push(...results);
+      } catch (error) {
+        logger.debug(
+          `Skipping database ${project.dbPath}: ${error instanceof Error ? error.message : String(error)}`
+        );
+      }
+    }
+
+    // Sort all results by BM25 score descending, then limit
+    allResults.sort((a, b) => b.score - a.score);
+    return allResults.slice(0, limit);
+  }
+
+  /**
+   * Search a single project database (read-only connection).
+ */ + private searchSingleDb( + project: ProjectEntry, + query: string, + limit: number + ): CrossSearchResult[] { + let db: Database.Database | null = null; + + try { + db = new Database(project.dbPath, { + readonly: true, + fileMustExist: true, + }); + + // Check if FTS5 table exists + const hasFts = db + .prepare( + "SELECT name FROM sqlite_master WHERE type='table' AND name='frames_fts'" + ) + .get(); + + if (hasFts) { + return this.searchFts(db, project, query, limit); + } else { + return this.searchLike(db, project, query, limit); + } + } finally { + if (db) { + try { + db.close(); + } catch { + // best-effort close + } + } + } + } + + private searchFts( + db: Database.Database, + project: ProjectEntry, + query: string, + limit: number + ): CrossSearchResult[] { + const sanitized = sanitizeFtsQuery(query); + + const sql = ` + SELECT f.frame_id, f.name, f.type, f.state, f.digest_text, f.created_at, + -bm25(frames_fts, 10.0, 5.0, 2.0, 1.0) as score + FROM frames_fts fts + JOIN frames f ON f.rowid = fts.rowid + WHERE frames_fts MATCH ? + ORDER BY score DESC + LIMIT ? + `; + + const rows = db.prepare(sql).all(sanitized, limit) as Array<{ + frame_id: string; + name: string; + type: string; + state: string; + digest_text: string | null; + score: number; + created_at: number; + }>; + + return rows.map((row) => ({ + projectName: project.name, + projectPath: project.path, + frameId: row.frame_id, + name: row.name, + type: row.type, + state: row.state, + digestText: row.digest_text, + score: row.score, + createdAt: row.created_at, + })); + } + + private searchLike( + db: Database.Database, + project: ProjectEntry, + query: string, + limit: number + ): CrossSearchResult[] { + const likeParam = `%${query}%`; + const sql = ` + SELECT frame_id, name, type, state, digest_text, created_at, + CASE + WHEN name LIKE ? THEN 1.0 + WHEN digest_text LIKE ? THEN 0.8 + WHEN inputs LIKE ? THEN 0.6 + ELSE 0.5 + END as score + FROM frames + WHERE (name LIKE ? OR digest_text LIKE ? OR inputs LIKE ?) + ORDER BY score DESC + LIMIT ? 
+ `; + + const rows = db + .prepare(sql) + .all( + likeParam, + likeParam, + likeParam, + likeParam, + likeParam, + likeParam, + limit + ) as Array<{ + frame_id: string; + name: string; + type: string; + state: string; + digest_text: string | null; + score: number; + created_at: number; + }>; + + return rows.map((row) => ({ + projectName: project.name, + projectPath: project.path, + frameId: row.frame_id, + name: row.name, + type: row.type, + state: row.state, + digestText: row.digest_text, + score: row.score, + createdAt: row.created_at, + })); + } +} diff --git a/src/core/cross-search/index.ts b/src/core/cross-search/index.ts new file mode 100644 index 00000000..99d8fa83 --- /dev/null +++ b/src/core/cross-search/index.ts @@ -0,0 +1,7 @@ +export { + CrossProjectSearch, + type ProjectEntry, + type ProjectRegistry, + type CrossSearchResult, + type CrossSearchOptions, +} from './cross-project-search.js'; diff --git a/src/integrations/mcp/__tests__/tool-alias-registry.test.ts b/src/integrations/mcp/__tests__/tool-alias-registry.test.ts new file mode 100644 index 00000000..03abefe9 --- /dev/null +++ b/src/integrations/mcp/__tests__/tool-alias-registry.test.ts @@ -0,0 +1,321 @@ +/** + * Tests for Tool Alias Registry + * Verifies tool name resolution, parameter aliasing, and registry integrity + */ + +import { describe, it, expect } from 'vitest'; +import { + resolveToolAlias, + resolveParamAliases, + getAliasesForTool, + getToolsWithAliases, + getAliasRegistry, + getParamAliasRegistry, +} from '../tool-alias-registry.js'; + +describe('Tool Alias Registry', () => { + describe('resolveToolAlias', () => { + it('resolves known aliases to canonical names', () => { + const cases: [string, string][] = [ + ['sm_save', 'save_context'], + ['sm_load', 'load_context'], + ['sm_context_search', 'sm_search'], + ['search', 'sm_search'], + ['context', 'get_context'], + ['discover', 'sm_discover'], + ['fuzzy_edit', 'sm_edit'], + ['desires', 'sm_desire_paths'], + ['spawn', 'cord_spawn'], + ['delegate', 'delegate_to_model'], + ['plan', 'plan_only'], + ['linear_issues', 'linear_get_tasks'], + ['decision_search', 'provenant_search'], + ['digest', 'sm_digest'], + ['remember', 'diffmem_store_learning'], + ]; + + for (const [alias, expected] of cases) { + const result = resolveToolAlias(alias); + expect(result.canonicalName).toBe(expected); + expect(result.wasAlias).toBe(true); + expect(result.originalName).toBe(alias); + } + }); + + it('returns canonical names unchanged', () => { + const canonicals = [ + 'get_context', + 'add_decision', + 'start_frame', + 'close_frame', + 'sm_search', + 'sm_discover', + 'sm_edit', + 'sm_desire_paths', + 'create_task', + 'linear_get_tasks', + 'delegate_to_model', + 'provenant_search', + 'cord_spawn', + 'sm_digest', + ]; + + for (const name of canonicals) { + const result = resolveToolAlias(name); + expect(result.canonicalName).toBe(name); + expect(result.wasAlias).toBe(false); + expect(result.originalName).toBe(name); + } + }); + + it('returns unknown tool names unchanged', () => { + const result = resolveToolAlias('completely_made_up_tool'); + expect(result.canonicalName).toBe('completely_made_up_tool'); + expect(result.wasAlias).toBe(false); + }); + }); + + describe('resolveParamAliases', () => { + it('resolves known parameter aliases', () => { + const result = resolveParamAliases('sm_search', { + search_term: 'hello', + max: 5, + }); + expect(result.resolvedParams).toEqual({ query: 'hello', limit: 5 }); + expect(result.renames).toEqual({ + search_term: 'query', + max: 'limit', + }); 
+ }); + + it('preserves canonical params over aliases', () => { + const result = resolveParamAliases('sm_search', { + query: 'canonical value', + search_term: 'alias value', + limit: 10, + max: 20, + }); + // canonical 'query' should win over alias 'search_term' + expect(result.resolvedParams.query).toBe('canonical value'); + expect(result.resolvedParams.limit).toBe(10); + // alias values should NOT be in the result + expect(result.resolvedParams.search_term).toBeUndefined(); + expect(result.resolvedParams.max).toBeUndefined(); + }); + + it('passes through params without aliases for unknown tools', () => { + const params = { foo: 'bar', baz: 42 }; + const result = resolveParamAliases('unknown_tool', params); + expect(result.resolvedParams).toEqual(params); + expect(result.renames).toEqual({}); + }); + + it('passes through unrecognized params', () => { + const result = resolveParamAliases('sm_search', { + query: 'test', + unknown_param: 'value', + }); + expect(result.resolvedParams).toEqual({ + query: 'test', + unknown_param: 'value', + }); + expect(result.renames).toEqual({}); + }); + + it('handles empty params', () => { + const result = resolveParamAliases('sm_search', {}); + expect(result.resolvedParams).toEqual({}); + expect(result.renames).toEqual({}); + }); + + it('resolves smart_context token budget aliases', () => { + const result = resolveParamAliases('smart_context', { + search: 'test query', + token_budget: 8000, + force: true, + }); + expect(result.resolvedParams).toEqual({ + query: 'test query', + tokenBudget: 8000, + forceRefresh: true, + }); + }); + + it('resolves provenant_search aliases', () => { + const result = resolveParamAliases('provenant_search', { + text: 'architecture', + by: 'jwu', + after: '2026-01-01', + }); + expect(result.resolvedParams).toEqual({ + query: 'architecture', + actor: 'jwu', + since: '2026-01-01', + }); + }); + + it('resolves create_task name->title alias', () => { + const result = resolveParamAliases('create_task', { + name: 'My Task', + desc: 'Details here', + }); + expect(result.resolvedParams).toEqual({ + title: 'My Task', + description: 'Details here', + }); + }); + + it('resolves cord_spawn aliases', () => { + const result = resolveParamAliases('cord_spawn', { + task: 'Build feature', + instructions: 'Implement the new API', + depends_on: ['task-1'], + parent: 'root-task', + }); + expect(result.resolvedParams).toEqual({ + goal: 'Build feature', + prompt: 'Implement the new API', + blocked_by: ['task-1'], + parent_id: 'root-task', + }); + }); + + it('resolves sm_edit file path aliases', () => { + const result = resolveParamAliases('sm_edit', { + file: '/path/to/file.ts', + find: 'old code', + replace: 'new code', + }); + expect(result.resolvedParams).toEqual({ + file_path: '/path/to/file.ts', + old_string: 'old code', + new_string: 'new code', + }); + }); + }); + + describe('getAliasesForTool', () => { + it('returns all aliases for a canonical tool', () => { + const aliases = getAliasesForTool('get_context'); + expect(aliases).toContain('context'); + expect(aliases).toContain('get_ctx'); + expect(aliases).toContain('sm_context'); + expect(aliases).toContain('sm_get_context'); + expect(aliases).toContain('fetch_context'); + expect(aliases).toContain('read_context'); + }); + + it('returns empty array for tool with no aliases', () => { + const aliases = getAliasesForTool('nonexistent_tool'); + expect(aliases).toEqual([]); + }); + }); + + describe('getToolsWithAliases', () => { + it('returns unique canonical names', () => { + const tools = 
getToolsWithAliases(); + expect(tools.length).toBeGreaterThan(0); + // Should be deduplicated + expect(new Set(tools).size).toBe(tools.length); + // Should include major tools + expect(tools).toContain('get_context'); + expect(tools).toContain('sm_search'); + expect(tools).toContain('create_task'); + }); + }); + + describe('Registry integrity', () => { + it('no alias points to another alias (no chaining)', () => { + const registry = getAliasRegistry(); + for (const [alias, target] of Object.entries(registry)) { + expect(registry[target]).toBeUndefined(); + } + }); + + it('no alias shadows a canonical tool name', () => { + // Ensure aliases don't accidentally override canonical tool names + // that are used in the switch statement + const canonicalTools = [ + 'get_context', + 'add_decision', + 'start_frame', + 'close_frame', + 'add_anchor', + 'get_hot_stack', + 'create_task', + 'update_task_status', + 'get_active_tasks', + 'get_task_metrics', + 'sm_search', + 'sm_discover', + 'sm_related_files', + 'sm_session_summary', + 'sm_edit', + 'sm_digest', + 'sm_desire_paths', + 'smart_context', + 'get_summary', + 'linear_sync', + 'linear_update_task', + 'linear_get_tasks', + 'linear_status', + 'get_traces', + 'plan_only', + 'call_codex', + 'call_claude', + 'plan_gate', + 'approve_plan', + 'pending_list', + 'pending_clear', + 'pending_show', + 'delegate_to_model', + 'batch_submit', + 'batch_check', + 'cord_spawn', + 'cord_fork', + 'cord_complete', + 'cord_ask', + 'cord_tree', + 'team_context_get', + 'team_context_share', + 'team_search', + 'provenant_search', + 'provenant_log', + 'provenant_status', + 'provenant_contradictions', + 'provenant_resolve', + 'diffmem_get_user_context', + 'diffmem_store_learning', + 'diffmem_search', + 'diffmem_status', + ]; + + const registry = getAliasRegistry(); + for (const canonical of canonicalTools) { + // No alias key should equal a canonical name (it would shadow it) + if (registry[canonical]) { + // This is a problem: an alias key matches a canonical tool name + throw new Error( + `Alias "${canonical}" shadows canonical tool "${canonical}" -> "${registry[canonical]}"` + ); + } + } + }); + + it('param alias targets exist in tool schemas', () => { + const paramRegistry = getParamAliasRegistry(); + // Just verify structure - each tool has a non-empty mapping + for (const [tool, aliases] of Object.entries(paramRegistry)) { + expect(typeof tool).toBe('string'); + expect(Object.keys(aliases).length).toBeGreaterThan(0); + // Each alias should map to a string canonical param name + for (const [alias, canonical] of Object.entries(aliases)) { + expect(typeof alias).toBe('string'); + expect(typeof canonical).toBe('string'); + // Alias and canonical should differ + expect(alias).not.toBe(canonical); + } + } + }); + }); +}); diff --git a/src/integrations/mcp/handlers/cross-search-handlers.ts b/src/integrations/mcp/handlers/cross-search-handlers.ts new file mode 100644 index 00000000..dc558585 --- /dev/null +++ b/src/integrations/mcp/handlers/cross-search-handlers.ts @@ -0,0 +1,225 @@ +/** + * Cross-Project Search MCP Tool Handlers + * Enables querying frames across multiple project databases + */ + +import { + CrossProjectSearch, + type CrossSearchResult, +} from '../../../core/cross-search/cross-project-search.js'; +import { logger } from '../../../core/monitoring/logger.js'; + +export interface CrossSearchHandlerDependencies { + crossSearch?: CrossProjectSearch; +} + +export class CrossSearchHandlers { + private crossSearch: CrossProjectSearch; + + constructor(deps: 
CrossSearchHandlerDependencies) { + this.crossSearch = deps.crossSearch || new CrossProjectSearch(); + } + + /** + * sm_cross_search: Search across all registered project databases. + */ + async handleCrossSearch(args: any): Promise { + try { + const { query, limit = 20, exclude_current = false } = args; + + if (!query) { + throw new Error('query is required'); + } + + const projects = this.crossSearch.listProjects(); + if (projects.length === 0) { + return { + content: [ + { + type: 'text', + text: 'No projects registered. Use sm_cross_discover to scan for project databases, or sm_cross_register to add one manually.', + }, + ], + }; + } + + const excludeProject = exclude_current + ? this.getCurrentProjectName() + : undefined; + + const start = Date.now(); + const results = await this.crossSearch.search({ + query, + limit, + excludeProject, + }); + const elapsed = Date.now() - start; + + if (results.length === 0) { + return { + content: [ + { + type: 'text', + text: `No results found for "${query}" across ${projects.length} project databases (${elapsed}ms).`, + }, + ], + }; + } + + const text = this.formatResults(results, query, projects.length, elapsed); + + return { + content: [{ type: 'text', text }], + metadata: { + results: results.map((r) => ({ + project: r.projectName, + frameId: r.frameId, + name: r.name, + score: r.score, + })), + total: results.length, + projectsSearched: projects.length, + elapsedMs: elapsed, + }, + }; + } catch (error: unknown) { + logger.error( + 'Cross-project search failed', + error instanceof Error ? error : new Error(String(error)) + ); + throw error; + } + } + + /** + * sm_cross_discover: Auto-discover project databases. + */ + async handleCrossDiscover(args: any): Promise { + try { + const paths = args.paths as string[] | undefined; + const discovered = this.crossSearch.discoverProjects(paths); + const all = this.crossSearch.listProjects(); + + return { + content: [ + { + type: 'text', + text: + `Discovered ${discovered.length} project database(s).\n` + + `Total registered: ${all.length}\n\n` + + all + .map((p) => ` ${p.name}: ${p.path}\n db: ${p.dbPath}`) + .join('\n'), + }, + ], + metadata: { discovered: discovered.length, total: all.length }, + }; + } catch (error: unknown) { + logger.error( + 'Cross-project discover failed', + error instanceof Error ? error : new Error(String(error)) + ); + throw error; + } + } + + /** + * sm_cross_register: Register a project database manually. + */ + async handleCrossRegister(args: any): Promise { + try { + const { name, path, db_path } = args; + + if (!name || !path || !db_path) { + throw new Error('name, path, and db_path are required'); + } + + this.crossSearch.registerProject({ + name, + path, + dbPath: db_path, + lastAccessed: Date.now(), + }); + + return { + content: [ + { + type: 'text', + text: `Registered project "${name}" at ${path} (db: ${db_path})`, + }, + ], + }; + } catch (error: unknown) { + logger.error( + 'Cross-project register failed', + error instanceof Error ? error : new Error(String(error)) + ); + throw error; + } + } + + /** + * sm_cross_list: List all registered project databases. + */ + async handleCrossList(): Promise { + try { + const projects = this.crossSearch.listProjects(); + + if (projects.length === 0) { + return { + content: [ + { + type: 'text', + text: 'No projects registered. 
Use sm_cross_discover to scan for project databases.', + }, + ], + }; + } + + const text = + `Registered projects (${projects.length}):\n\n` + + projects + .map( + (p) => + ` ${p.name}\n path: ${p.path}\n db: ${p.dbPath}\n last: ${new Date(p.lastAccessed).toLocaleDateString()}` + ) + .join('\n'); + + return { + content: [{ type: 'text', text }], + metadata: { projects }, + }; + } catch (error: unknown) { + logger.error( + 'Cross-project list failed', + error instanceof Error ? error : new Error(String(error)) + ); + throw error; + } + } + + private formatResults( + results: CrossSearchResult[], + query: string, + projectCount: number, + elapsed: number + ): string { + const header = `Cross-project search: ${results.length} results for "${query}" across ${projectCount} databases (${elapsed}ms):\n\n`; + + const body = results + .map( + (r) => + `[${r.projectName}] ${r.name} (${r.type}, score: ${r.score.toFixed(3)})` + + (r.digestText ? `\n ${r.digestText.slice(0, 120)}` : '') + ) + .join('\n'); + + return header + body; + } + + private getCurrentProjectName(): string | undefined { + // Best-effort: derive from cwd + const cwd = process.cwd(); + return cwd.split('/').pop(); + } +} diff --git a/src/integrations/mcp/handlers/index.ts b/src/integrations/mcp/handlers/index.ts index a19ce7a1..b6548ab1 100644 --- a/src/integrations/mcp/handlers/index.ts +++ b/src/integrations/mcp/handlers/index.ts @@ -30,6 +30,10 @@ export { ProvenantHandlers, type ProvenantHandlerDependencies, } from './provenant-handlers.js'; +export { + CrossSearchHandlers, + type CrossSearchHandlerDependencies, +} from './cross-search-handlers.js'; import { ContextHandlers, @@ -45,6 +49,11 @@ import { ProviderHandlers } from './provider-handlers.js'; import { TeamHandlers, TeamHandlerDependencies } from './team-handlers.js'; import { CordHandlers } from './cord-handlers.js'; import { ProvenantHandlers } from './provenant-handlers.js'; +import { CrossSearchHandlers } from './cross-search-handlers.js'; +import { + resolveToolAlias, + resolveParamAliases, +} from '../tool-alias-registry.js'; // Combined dependencies interface export interface MCPHandlerDependencies @@ -69,6 +78,7 @@ export class MCPHandlerFactory { private teamHandlers?: TeamHandlers; private cordHandlers?: CordHandlers; private provenantHandlers?: ProvenantHandlers; + private crossSearchHandlers: CrossSearchHandlers; constructor(deps: MCPHandlerDependencies) { this.contextHandlers = new ContextHandlers({ @@ -109,13 +119,17 @@ export class MCPHandlerFactory { projectDir: deps.projectDir, }); } + + this.crossSearchHandlers = new CrossSearchHandlers({}); } /** - * Get handler for a specific tool + * Get handler for a specific tool. + * Resolves tool name aliases before lookup. 
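+ * @example
+ * // Illustrative only: `factory` stands for any MCPHandlerFactory
+ * // instance. Both calls dispatch to ContextHandlers.handleGetContext,
+ * // because "context" is a registered alias of get_context:
+ * factory.getHandler('context');
+ * factory.getHandler('get_context');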
*/ getHandler(toolName: string): (args: any) => Promise { - switch (toolName) { + const { canonicalName } = resolveToolAlias(toolName); + switch (canonicalName) { // Context handlers case 'get_context': return this.contextHandlers.handleGetContext.bind(this.contextHandlers); @@ -241,6 +255,24 @@ export class MCPHandlerFactory { this.provenantHandlers ); + // Cross-project search handlers + case 'sm_cross_search': + return this.crossSearchHandlers.handleCrossSearch.bind( + this.crossSearchHandlers + ); + case 'sm_cross_discover': + return this.crossSearchHandlers.handleCrossDiscover.bind( + this.crossSearchHandlers + ); + case 'sm_cross_register': + return this.crossSearchHandlers.handleCrossRegister.bind( + this.crossSearchHandlers + ); + case 'sm_cross_list': + return this.crossSearchHandlers.handleCrossList.bind( + this.crossSearchHandlers + ); + default: throw new Error(`Unknown tool: ${toolName}`); } @@ -301,13 +333,20 @@ export class MCPHandlerFactory { 'provenant_status', 'provenant_contradictions', 'provenant_resolve', + + // Cross-project search tools + 'sm_cross_search', + 'sm_cross_discover', + 'sm_cross_register', + 'sm_cross_list', ]; } /** - * Check if a tool exists + * Check if a tool exists (resolves aliases) */ hasHandler(toolName: string): boolean { - return this.getAvailableTools().includes(toolName); + const { canonicalName } = resolveToolAlias(toolName); + return this.getAvailableTools().includes(canonicalName); } } diff --git a/src/integrations/mcp/server.ts b/src/integrations/mcp/server.ts index 5b0c0bf8..8b2cfccc 100644 --- a/src/integrations/mcp/server.ts +++ b/src/integrations/mcp/server.ts @@ -52,6 +52,7 @@ import { DiffMemHandlers } from './handlers/diffmem-handlers.js'; import { GreptileHandlers } from './handlers/greptile-handlers.js'; import { CordHandlers } from './handlers/cord-handlers.js'; import { TeamHandlers } from './handlers/team-handlers.js'; +import { CrossSearchHandlers } from './handlers/cross-search-handlers.js'; import { SQLiteAdapter } from '../../core/database/sqlite-adapter.js'; import { generateChronologicalDigest, @@ -59,6 +60,10 @@ import { } from '../../core/digest/chronological-digest.js'; import { fuzzyEdit } from '../../utils/fuzzy-edit.js'; import { v4 as uuidv4 } from 'uuid'; +import { + resolveToolAlias, + resolveParamAliases, +} from './tool-alias-registry.js'; import { DEFAULT_PLANNER_MODEL, DEFAULT_IMPLEMENTER, @@ -103,6 +108,7 @@ class LocalStackMemoryMCP { | null = null; private cordHandlers: CordHandlers | null = null; private teamHandlers: TeamHandlers | null = null; + private crossSearchHandlers: CrossSearchHandlers; private pendingPlans: Map = new Map(); constructor() { @@ -203,6 +209,9 @@ class LocalStackMemoryMCP { // Initialize Greptile Handlers this.greptileHandlers = new GreptileHandlers(); + // Initialize Cross-Project Search Handlers + this.crossSearchHandlers = new CrossSearchHandlers({}); + // Initialize Cord and Team Handlers (async - best effort) this.initCordTeamHandlers(); @@ -1427,6 +1436,79 @@ class LocalStackMemoryMCP { required: ['period'], }, }, + // Cross-project search tools + { + name: 'sm_cross_search', + description: + 'Search frames across all registered project databases using FTS5/BM25. 
Returns results ranked by relevance with source project attribution.', + inputSchema: { + type: 'object', + properties: { + query: { + type: 'string', + description: 'Search query (natural language or keywords)', + }, + limit: { + type: 'number', + default: 20, + description: 'Maximum results to return', + }, + exclude_current: { + type: 'boolean', + default: false, + description: 'Exclude the current project from results', + }, + }, + required: ['query'], + }, + }, + { + name: 'sm_cross_discover', + description: + 'Auto-discover project databases by scanning common directories for .stackmemory/context.db files.', + inputSchema: { + type: 'object', + properties: { + paths: { + type: 'array', + items: { type: 'string' }, + description: 'Custom directory paths to scan', + }, + }, + }, + }, + { + name: 'sm_cross_register', + description: + 'Manually register a project database for cross-project search.', + inputSchema: { + type: 'object', + properties: { + name: { + type: 'string', + description: 'Project display name', + }, + path: { + type: 'string', + description: 'Project root directory path', + }, + db_path: { + type: 'string', + description: 'Path to the SQLite context.db file', + }, + }, + required: ['name', 'path', 'db_path'], + }, + }, + { + name: 'sm_cross_list', + description: + 'List all project databases registered for cross-project search.', + inputSchema: { + type: 'object', + properties: {}, + }, + }, ], }; } @@ -1442,7 +1524,28 @@ class LocalStackMemoryMCP { }), }), async (request) => { - const { name, arguments: args } = request.params; + const { name: rawName, arguments: rawArgs } = request.params; + + // Resolve tool name aliases (e.g., "sm_save" -> "save_context") + const aliasResolution = resolveToolAlias(rawName); + const name = aliasResolution.canonicalName; + + // Resolve parameter aliases for the canonical tool + const paramResolution = resolveParamAliases(name, rawArgs); + const args = paramResolution.resolvedParams; + + // Log alias resolution for observability + if ( + aliasResolution.wasAlias || + Object.keys(paramResolution.renames).length > 0 + ) { + logger.debug('Tool alias resolved', { + originalTool: rawName, + canonicalTool: name, + paramRenames: paramResolution.renames, + }); + } + const callId = uuidv4(); const startTime = Date.now(); @@ -1453,6 +1556,7 @@ class LocalStackMemoryMCP { tool_name: name, arguments: args, timestamp: startTime, + ...(aliasResolution.wasAlias ? { alias_from: rawName } : {}), }); } @@ -1830,6 +1934,22 @@ class LocalStackMemoryMCP { result = this.handleDesirePaths(args); break; + case 'sm_cross_search': + result = await this.crossSearchHandlers.handleCrossSearch(args); + break; + + case 'sm_cross_discover': + result = await this.crossSearchHandlers.handleCrossDiscover(args); + break; + + case 'sm_cross_register': + result = await this.crossSearchHandlers.handleCrossRegister(args); + break; + + case 'sm_cross_list': + result = await this.crossSearchHandlers.handleCrossList(); + break; + default: throw new Error(`Unknown tool: ${name}`); } diff --git a/src/integrations/mcp/tool-alias-registry.ts b/src/integrations/mcp/tool-alias-registry.ts new file mode 100644 index 00000000..48297c27 --- /dev/null +++ b/src/integrations/mcp/tool-alias-registry.ts @@ -0,0 +1,556 @@ +/** + * Tool Alias Registry + * + * Maps common misspellings, abbreviations, and variant names to canonical + * MCP tool names. Built from desire paths analysis β€” what agents try to call + * vs what actually exists. 
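+ *
+ * Resolution sketch (both lookups below exist in the registry):
+ *
+ *   resolveToolAlias('search')    // { canonicalName: 'sm_search', wasAlias: true }
+ *   resolveToolAlias('sm_search') // { canonicalName: 'sm_search', wasAlias: false }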
+ * + * Also handles parameter name aliases so agents can use common variants + * (e.g., `query` vs `search_term`) and have them resolved transparently. + */ + +export interface AliasResolution { + /** The canonical tool name */ + canonicalName: string; + /** Whether the name was an alias (false = already canonical) */ + wasAlias: boolean; + /** The original name that was looked up */ + originalName: string; +} + +export interface ParamResolution { + /** Resolved parameters with canonical names */ + resolvedParams: Record; + /** Map of param renames that were applied: original -> canonical */ + renames: Record; +} + +/** + * Static registry of tool name aliases. + * + * Key: alias name (what agents try to call) + * Value: canonical tool name (what actually exists) + * + * Organized by category for readability. + */ +const TOOL_ALIASES: Record = { + // --- Context tools --- + context: 'get_context', + get_ctx: 'get_context', + sm_context: 'get_context', + sm_get_context: 'get_context', + fetch_context: 'get_context', + read_context: 'get_context', + + record_decision: 'add_decision', + log_decision: 'add_decision', + save_decision: 'add_decision', + sm_decision: 'add_decision', + + push_frame: 'start_frame', + open_frame: 'start_frame', + begin_frame: 'start_frame', + new_frame: 'start_frame', + + pop_frame: 'close_frame', + end_frame: 'close_frame', + finish_frame: 'close_frame', + + anchor: 'add_anchor', + sm_anchor: 'add_anchor', + save_anchor: 'add_anchor', + + hot_stack: 'get_hot_stack', + stack: 'get_hot_stack', + sm_stack: 'get_hot_stack', + + // --- Task tools --- + new_task: 'create_task', + add_task: 'create_task', + sm_task: 'create_task', + sm_create_task: 'create_task', + + update_task: 'update_task_status', + set_task_status: 'update_task_status', + task_update: 'update_task_status', + + list_tasks: 'get_active_tasks', + tasks: 'get_active_tasks', + sm_tasks: 'get_active_tasks', + active_tasks: 'get_active_tasks', + + task_metrics: 'get_task_metrics', + metrics: 'get_task_metrics', + + // --- Search & Discovery --- + sm_context_search: 'sm_search', + search: 'sm_search', + context_search: 'sm_search', + sm_find: 'sm_search', + find: 'sm_search', + + discover: 'sm_discover', + sm_explore: 'sm_discover', + explore: 'sm_discover', + + related: 'sm_related_files', + find_related: 'sm_related_files', + + session_summary: 'sm_session_summary', + summary: 'sm_session_summary', + + // --- Save/Load context (old MCP server) --- + sm_save: 'save_context', + sm_context_save: 'save_context', + store_context: 'save_context', + + sm_load: 'load_context', + sm_context_load: 'load_context', + retrieve_context: 'load_context', + + // --- Linear tools --- + linear_issues: 'linear_get_tasks', + linear_list: 'linear_get_tasks', + get_linear_tasks: 'linear_get_tasks', + + linear_update: 'linear_update_task', + update_linear: 'linear_update_task', + + linear_comment: 'linear_create_comment', + comment_on_issue: 'linear_create_comment', + + linear_comments: 'linear_list_comments', + + // --- Trace tools --- + traces: 'get_traces', + sm_traces: 'get_traces', + list_traces: 'get_traces', + + trace_stats: 'get_trace_statistics', + trace_statistics: 'get_trace_statistics', + + // --- Smart context --- + smart: 'smart_context', + sm_smart: 'smart_context', + intelligent_context: 'smart_context', + + sm_summary: 'get_summary', + project_summary: 'get_summary', + + // --- Planning tools --- + plan: 'plan_only', + generate_plan: 'plan_only', + sm_plan: 'plan_only', + + codex: 'call_codex', + run_codex: 
'call_codex', + + claude: 'call_claude', + ask_claude: 'call_claude', + + gate: 'plan_gate', + plan_and_gate: 'plan_gate', + + approve: 'approve_plan', + execute_plan: 'approve_plan', + + // --- Pending tools --- + pending: 'pending_list', + list_pending: 'pending_list', + + clear_pending: 'pending_clear', + + show_pending: 'pending_show', + + // --- Edit tools --- + fuzzy_edit: 'sm_edit', + edit: 'sm_edit', + sm_fuzzy_edit: 'sm_edit', + + // --- DiffMem tools --- + user_context: 'diffmem_get_user_context', + get_user_context: 'diffmem_get_user_context', + user_memory: 'diffmem_get_user_context', + + store_learning: 'diffmem_store_learning', + learn: 'diffmem_store_learning', + remember: 'diffmem_store_learning', + + memory_search: 'diffmem_search', + search_memory: 'diffmem_search', + + diffmem: 'diffmem_status', + memory_status: 'diffmem_status', + + // --- Digest tools --- + digest: 'sm_digest', + activity_digest: 'sm_digest', + daily_digest: 'sm_digest', + + // --- Desire paths --- + desire_paths: 'sm_desire_paths', + desires: 'sm_desire_paths', + failed_tools: 'sm_desire_paths', + + // --- Provider tools --- + delegate: 'delegate_to_model', + route: 'delegate_to_model', + send_to_model: 'delegate_to_model', + + batch: 'batch_submit', + submit_batch: 'batch_submit', + + check_batch: 'batch_check', + batch_status: 'batch_check', + + // --- Team tools --- + team_get: 'team_context_get', + team_context: 'team_context_get', + get_team_context: 'team_context_get', + + team_share: 'team_context_share', + share_context: 'team_context_share', + share: 'team_context_share', + + // --- Cord tools --- + spawn: 'cord_spawn', + subtask: 'cord_spawn', + + fork: 'cord_fork', + fork_task: 'cord_fork', + + complete: 'cord_complete', + done: 'cord_complete', + finish: 'cord_complete', + + ask: 'cord_ask', + question: 'cord_ask', + + tree: 'cord_tree', + task_tree: 'cord_tree', + + // --- Greptile tools --- + pr_comments: 'greptile_pr_comments', + review_comments: 'greptile_pr_comments', + + pr_details: 'greptile_pr_details', + pr_info: 'greptile_pr_details', + + list_prs: 'greptile_list_prs', + prs: 'greptile_list_prs', + + trigger_review: 'greptile_trigger_review', + review_pr: 'greptile_trigger_review', + + search_patterns: 'greptile_search_patterns', + patterns: 'greptile_search_patterns', + + create_pattern: 'greptile_create_pattern', + add_pattern: 'greptile_create_pattern', + + greptile: 'greptile_status', + + // --- Provenant tools --- + decision_search: 'provenant_search', + search_decisions: 'provenant_search', + + log_decision_graph: 'provenant_log', + decision_log: 'provenant_log', + + decision_status: 'provenant_status', + graph_status: 'provenant_status', + + contradictions: 'provenant_contradictions', + conflicts: 'provenant_contradictions', + + resolve: 'provenant_resolve', + resolve_contradiction: 'provenant_resolve', +}; + +/** + * Parameter alias mappings per tool. + * + * Key: canonical tool name + * Value: Record mapping alias param name -> canonical param name + * + * Only tools where agents commonly send wrong param names are listed. 
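+ *
+ * Canonical params always win over their aliases (see
+ * resolveParamAliases below). For sm_search, for example:
+ *
+ *   resolveParamAliases('sm_search', { q: 'x', query: 'y' })
+ *   // resolvedParams: { query: 'y' }, renames: {}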
+ */ +const PARAM_ALIASES: Record> = { + // Agents often send `query` for search-like tools + sm_search: { + search_term: 'query', + search: 'query', + text: 'query', + q: 'query', + max: 'limit', + max_results: 'limit', + count: 'limit', + }, + sm_discover: { + search: 'query', + q: 'query', + search_query: 'query', + max: 'maxFiles', + max_files: 'maxFiles', + limit: 'maxFiles', + include: 'includePatterns', + exclude: 'excludePatterns', + }, + get_context: { + search: 'query', + q: 'query', + text: 'query', + max: 'limit', + max_results: 'limit', + count: 'limit', + }, + smart_context: { + search: 'query', + q: 'query', + tokens: 'tokenBudget', + token_budget: 'tokenBudget', + max_tokens: 'tokenBudget', + budget: 'tokenBudget', + refresh: 'forceRefresh', + force: 'forceRefresh', + }, + get_active_tasks: { + state: 'status', + max: 'limit', + max_results: 'limit', + count: 'limit', + query: 'search', + q: 'search', + }, + linear_get_tasks: { + status: 'state', + max: 'limit', + max_results: 'limit', + count: 'limit', + q: 'search', + query: 'search', + team: 'team_id', + assignee: 'assignee_id', + }, + add_decision: { + text: 'content', + decision: 'content', + value: 'content', + kind: 'type', + category: 'type', + }, + add_anchor: { + content: 'text', + value: 'text', + anchor: 'text', + kind: 'type', + category: 'type', + importance: 'priority', + weight: 'priority', + }, + start_frame: { + title: 'name', + goal: 'name', + label: 'name', + kind: 'type', + frame_type: 'type', + }, + create_task: { + name: 'title', + goal: 'title', + label: 'title', + desc: 'description', + detail: 'description', + details: 'description', + }, + sm_desire_paths: { + type: 'category', + kind: 'category', + max: 'limit', + max_results: 'limit', + lookback: 'days', + period: 'days', + }, + delegate_to_model: { + text: 'prompt', + message: 'prompt', + input: 'prompt', + tokens: 'maxTokens', + max_tokens: 'maxTokens', + temp: 'temperature', + task: 'taskType', + task_type: 'taskType', + }, + provenant_search: { + text: 'query', + search: 'query', + q: 'query', + max: 'limit', + max_results: 'limit', + from: 'since', + after: 'since', + by: 'actor', + who: 'actor', + }, + provenant_log: { + decision: 'content', + text: 'content', + value: 'content', + by: 'actor', + who: 'actor', + why: 'reasoning', + reason: 'reasoning', + rationale: 'reasoning', + }, + diffmem_store_learning: { + insight: 'content', + text: 'content', + value: 'content', + type: 'category', + kind: 'category', + }, + diffmem_search: { + text: 'query', + search: 'query', + q: 'query', + max: 'limit', + max_results: 'limit', + time: 'timeRange', + range: 'timeRange', + time_range: 'timeRange', + min_confidence: 'minConfidence', + threshold: 'minConfidence', + }, + sm_edit: { + path: 'file_path', + file: 'file_path', + find: 'old_string', + search: 'old_string', + replace: 'new_string', + replacement: 'new_string', + }, + cord_spawn: { + task: 'goal', + title: 'goal', + name: 'goal', + instructions: 'prompt', + description: 'prompt', + depends_on: 'blocked_by', + blockers: 'blocked_by', + parent: 'parent_id', + }, + cord_fork: { + task: 'goal', + title: 'goal', + name: 'goal', + instructions: 'prompt', + description: 'prompt', + depends_on: 'blocked_by', + blockers: 'blocked_by', + parent: 'parent_id', + }, + cord_complete: { + id: 'task_id', + output: 'result', + response: 'result', + answer: 'result', + }, + team_search: { + text: 'query', + search: 'query', + q: 'query', + max: 'limit', + max_results: 'limit', + }, + sm_digest: { + time: 
'period', + range: 'period', + timeframe: 'period', + }, + get_summary: { + refresh: 'forceRefresh', + force: 'forceRefresh', + }, +}; + +/** + * Resolve a tool name to its canonical form. + * Returns the canonical name and whether an alias was used. + */ +export function resolveToolAlias(name: string): AliasResolution { + const alias = TOOL_ALIASES[name]; + if (alias) { + return { canonicalName: alias, wasAlias: true, originalName: name }; + } + return { canonicalName: name, wasAlias: false, originalName: name }; +} + +/** + * Resolve parameter aliases for a given tool. + * Remaps aliased param names to canonical names. + * Original params take precedence over aliases (don't overwrite). + */ +export function resolveParamAliases( + toolName: string, + params: Record +): ParamResolution { + const aliases = PARAM_ALIASES[toolName]; + if (!aliases) { + return { resolvedParams: { ...params }, renames: {} }; + } + + const resolved: Record = {}; + const renames: Record = {}; + + // First pass: copy all canonical params + for (const [key, value] of Object.entries(params)) { + if (!aliases[key]) { + // Not an alias, keep as-is + resolved[key] = value; + } + } + + // Second pass: apply aliases (only if canonical name not already set) + for (const [key, value] of Object.entries(params)) { + const canonicalKey = aliases[key]; + if (canonicalKey && !(canonicalKey in resolved)) { + resolved[canonicalKey] = value; + renames[key] = canonicalKey; + } + } + + return { resolvedParams: resolved, renames }; +} + +/** + * Get all registered aliases for a canonical tool name. + * Useful for enriching tool descriptions. + */ +export function getAliasesForTool(canonicalName: string): string[] { + return Object.entries(TOOL_ALIASES) + .filter(([, target]) => target === canonicalName) + .map(([alias]) => alias); +} + +/** + * Get all canonical tool names that have aliases. + */ +export function getToolsWithAliases(): string[] { + return Array.from(new Set(Object.values(TOOL_ALIASES))); +} + +/** + * Get the full alias registry (for debugging/analysis). + */ +export function getAliasRegistry(): Readonly> { + return TOOL_ALIASES; +} + +/** + * Get the full param alias registry (for debugging/analysis). + */ +export function getParamAliasRegistry(): Readonly< + Record> +> { + return PARAM_ALIASES; +} diff --git a/src/integrations/mcp/tool-definitions.ts b/src/integrations/mcp/tool-definitions.ts index d1096730..9001763c 100644 --- a/src/integrations/mcp/tool-definitions.ts +++ b/src/integrations/mcp/tool-definitions.ts @@ -37,6 +37,7 @@ export class MCPToolDefinitions { ...this.getDigestTools(), ...this.getDesirePathTools(), ...this.getProvenantTools(), + ...this.getCrossSearchTools(), ]; } @@ -47,7 +48,8 @@ export class MCPToolDefinitions { return [ { name: 'get_context', - description: 'Get current project context and active frame information', + description: + 'Get current project context and active frame information. Aliases: context, get_ctx, sm_context, fetch_context', inputSchema: { type: 'object', properties: { @@ -65,7 +67,8 @@ export class MCPToolDefinitions { }, { name: 'add_decision', - description: 'Record a decision, constraint, or important information', + description: + 'Record a decision, constraint, or important information. 
Aliases: record_decision, log_decision, save_decision', inputSchema: { type: 'object', properties: { @@ -84,7 +87,8 @@ export class MCPToolDefinitions { }, { name: 'start_frame', - description: 'Start a new frame (task/subtask) on the call stack', + description: + 'Start a new frame (task/subtask) on the call stack. Aliases: push_frame, open_frame, begin_frame', inputSchema: { type: 'object', properties: { @@ -193,7 +197,7 @@ export class MCPToolDefinitions { return [ { name: 'create_task', - description: 'Create a new task', + description: 'Create a new task. Aliases: new_task, add_task, sm_task', inputSchema: { type: 'object', properties: { @@ -257,7 +261,8 @@ export class MCPToolDefinitions { }, { name: 'get_active_tasks', - description: 'Get active tasks with optional filtering', + description: + 'Get active tasks with optional filtering. Aliases: list_tasks, tasks, sm_tasks, active_tasks', inputSchema: { type: 'object', properties: { @@ -730,7 +735,7 @@ export class MCPToolDefinitions { { name: 'smart_context', description: - 'LLM-driven context retrieval - intelligently selects relevant frames based on query', + 'LLM-driven context retrieval - intelligently selects relevant frames based on query. Aliases: smart, sm_smart, intelligent_context', inputSchema: { type: 'object', properties: { @@ -775,7 +780,7 @@ export class MCPToolDefinitions { { name: 'sm_discover', description: - 'Discover relevant files based on current context. Extracts keywords from active frames and searches codebase for related files.', + 'Discover relevant files based on current context. Extracts keywords from active frames and searches codebase for related files. Aliases: discover, sm_explore, explore', inputSchema: { type: 'object', properties: { @@ -855,7 +860,7 @@ export class MCPToolDefinitions { { name: 'sm_search', description: - 'Search across StackMemory context - frames, events, decisions, and tasks.', + 'Search across StackMemory context - frames, events, decisions, and tasks. Aliases: search, context_search, sm_find, sm_context_search', inputSchema: { type: 'object', properties: { @@ -889,7 +894,7 @@ export class MCPToolDefinitions { { name: 'sm_edit', description: - "Fuzzy file edit β€” fallback when Claude Code's Edit tool fails on whitespace or indentation mismatches. Uses four-tier matching: exact, whitespace-normalized, indentation-insensitive, and line-level fuzzy (Levenshtein).", + "Fuzzy file edit β€” fallback when Claude Code's Edit tool fails on whitespace or indentation mismatches. Uses four-tier matching: exact, whitespace-normalized, indentation-insensitive, and line-level fuzzy (Levenshtein). Aliases: fuzzy_edit, edit, sm_fuzzy_edit", inputSchema: { type: 'object', properties: { @@ -1185,7 +1190,7 @@ export class MCPToolDefinitions { { name: 'delegate_to_model', description: - 'Route a prompt to a specific provider/model. Uses smart cost-based routing by default.', + 'Route a prompt to a specific provider/model. Uses smart cost-based routing by default. Aliases: delegate, route, send_to_model', inputSchema: { type: 'object', properties: { @@ -1271,7 +1276,7 @@ export class MCPToolDefinitions { { name: 'sm_digest', description: - 'Generate a chronological activity digest for a time period', + 'Generate a chronological activity digest for a time period. 
Aliases: digest, activity_digest, daily_digest', inputSchema: { type: 'object', properties: { @@ -1295,7 +1300,7 @@ export class MCPToolDefinitions { { name: 'sm_desire_paths', description: - 'Analyze failed tool calls (desire paths) β€” what agents want but cannot get. Use mode "summary" for aggregated counts or "list" for recent failures.', + 'Analyze failed tool calls (desire paths) β€” what agents want but cannot get. Use mode "summary" for aggregated counts or "list" for recent failures. Aliases: desire_paths, desires, failed_tools', inputSchema: { type: 'object', properties: { @@ -1427,7 +1432,7 @@ export class MCPToolDefinitions { { name: 'cord_spawn', description: - 'Create a subtask with clean context (spawn). Child sees only its prompt and completed blocker results.', + 'Create a subtask with clean context (spawn). Child sees only its prompt and completed blocker results. Aliases: spawn, subtask', inputSchema: { type: 'object', properties: { @@ -1554,7 +1559,7 @@ export class MCPToolDefinitions { { name: 'provenant_search', description: - 'Search the decision graph for past decisions, patterns, and context by meaning', + 'Search the decision graph for past decisions, patterns, and context by meaning. Aliases: decision_search, search_decisions', inputSchema: { type: 'object', properties: { @@ -1582,7 +1587,7 @@ export class MCPToolDefinitions { { name: 'provenant_log', description: - 'Log a decision to the graph. Use when a product or technical decision is made during a session.', + 'Log a decision to the graph. Use when a product or technical decision is made during a session. Aliases: decision_log, log_decision_graph', inputSchema: { type: 'object', properties: { @@ -1650,6 +1655,89 @@ export class MCPToolDefinitions { ]; } + /** + * Cross-project search tools + */ + getCrossSearchTools(): MCPToolDefinition[] { + return [ + { + name: 'sm_cross_search', + description: + 'Search frames across all registered project databases using FTS5/BM25. Returns results ranked by relevance with source project attribution.', + inputSchema: { + type: 'object', + properties: { + query: { + type: 'string', + description: 'Search query (natural language or keywords)', + }, + limit: { + type: 'number', + default: 20, + description: 'Maximum results to return', + }, + exclude_current: { + type: 'boolean', + default: false, + description: + 'Exclude the current project from results (useful when searching for external context)', + }, + }, + required: ['query'], + }, + }, + { + name: 'sm_cross_discover', + description: + 'Auto-discover project databases by scanning common directories for .stackmemory/context.db files.', + inputSchema: { + type: 'object', + properties: { + paths: { + type: 'array', + items: { type: 'string' }, + description: + 'Custom directory paths to scan (defaults to ~/Dev, ~/Projects, etc.)', + }, + }, + }, + }, + { + name: 'sm_cross_register', + description: + 'Manually register a project database for cross-project search.', + inputSchema: { + type: 'object', + properties: { + name: { + type: 'string', + description: 'Project display name', + }, + path: { + type: 'string', + description: 'Project root directory path', + }, + db_path: { + type: 'string', + description: + 'Path to the SQLite context.db file (e.g. 
/path/to/project/.stackmemory/context.db)',
+ },
+ },
+ required: ['name', 'path', 'db_path'],
+ },
+ },
+ {
+ name: 'sm_cross_list',
+ description:
+ 'List all project databases registered for cross-project search.',
+ inputSchema: {
+ type: 'object',
+ properties: {},
+ },
+ },
+ ];
+ }
+
 /**
 * Get tool definition by name
 */
@@ -1679,6 +1767,7 @@ export class MCPToolDefinitions {
 | 'cord'
 | 'digest'
 | 'provenant'
+ | 'cross_search'
 ): MCPToolDefinition[] {
 switch (category) {
 case 'context':
@@ -1717,6 +1806,8 @@ export class MCPToolDefinitions {
 return this.getDesirePathTools();
 case 'provenant':
 return this.getProvenantTools();
+ case 'cross_search':
+ return this.getCrossSearchTools();
 default:
 return [];
 }
From 39c1b39751aeebfe45b79333f30c2935bb8be21f Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Mon, 13 Apr 2026 16:13:52 -0400
Subject: [PATCH 05/18] feat(shared-state): add canonical instance coordination

---
 package-lock.json | 4 +-
 package.json | 2 +-
 src/cli/claude-sm.ts | 323 ++++++--
 src/cli/codex-sm.ts | 170 ++++-
 src/cli/commands/daemon.ts | 47 ++
 src/cli/commands/state.ts | 380 ++++++++++
 src/cli/index.ts | 63 ++
 src/core/session/session-manager.ts | 28 +
 src/core/shared-state/canonical-store.ts | 905 +++++++++++++++++++++++
 src/daemon/daemon-config.ts | 14 +
 src/daemon/services/github-service.ts | 158 ++++
 src/daemon/unified-daemon.ts | 32 +
 src/features/sweep/pty-wrapper.ts | 18 +-
 src/integrations/github/pr-state.ts | 209 ++++++
 14 files changed, 2250 insertions(+), 103 deletions(-)
 create mode 100644 src/cli/commands/state.ts
 create mode 100644 src/core/shared-state/canonical-store.ts
 create mode 100644 src/daemon/services/github-service.ts
 create mode 100644 src/integrations/github/pr-state.ts

diff --git a/package-lock.json b/package-lock.json
index 2d9e9c39..b0dfc90c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
 "name": "@stackmemoryai/stackmemory",
- "version": "1.10.1",
+ "version": "1.10.6",
 "lockfileVersion": 3,
 "requires": true,
 "packages": {
 "": {
 "name": "@stackmemoryai/stackmemory",
- "version": "1.10.1",
+ "version": "1.10.6",
 "hasInstallScript": true,
 "license": "BUSL-1.1",
 "dependencies": {
diff --git a/package.json b/package.json
index 40b183dc..52284729 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
 "name": "@stackmemoryai/stackmemory",
- "version": "1.10.5",
+ "version": "1.10.6",
 "description": "Lossless, project-scoped memory for AI coding tools. 
Durable context across sessions with 56 MCP tools, FTS5 search, conductor orchestrator, loop/watch monitoring, snapshot capture, pre-flight overlap checks, Claude/Codex/OpenCode wrappers, Linear sync, and automatic hooks.", "engines": { "node": ">=20.0.0", diff --git a/src/cli/claude-sm.ts b/src/cli/claude-sm.ts index 1e1cacdc..0bc7339b 100644 --- a/src/cli/claude-sm.ts +++ b/src/cli/claude-sm.ts @@ -12,10 +12,15 @@ import { spawn, execSync, execFileSync } from 'child_process'; import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; +import { fileURLToPath } from 'url'; import { program } from 'commander'; import { v4 as uuidv4 } from 'uuid'; import chalk from 'chalk'; import { initializeTracing, trace } from '../core/trace/index.js'; +import { + canonicalStateStore, + projectIdFromIdentifier, +} from '../core/shared-state/canonical-store.js'; import { getModelRouter, loadModelRouterConfig, @@ -50,7 +55,7 @@ import { getSettingsPath, } from '../utils/hook-installer.js'; -// __filename and __dirname are provided by esbuild banner for ESM compatibility +const runtimeDirname = path.dirname(fileURLToPath(import.meta.url)); interface ClaudeSMConfig { defaultWorktree: boolean; @@ -128,6 +133,9 @@ class ClaudeSM { private worktreeScriptPath: string; private claudeConfigDir: string; private smConfig: ClaudeSMConfig; + private sessionId: string; + private ownsSession: boolean; + private sessionEnded: boolean; constructor() { // Load persistent defaults @@ -151,10 +159,13 @@ class ClaudeSM { this.stackmemoryPath = this.findStackMemory(); this.worktreeScriptPath = path.join( - __dirname, + runtimeDirname, '../../scripts/claude-worktree-manager.sh' ); this.claudeConfigDir = path.join(os.homedir(), '.claude'); + this.sessionId = process.env['STACKMEMORY_SESSION'] || uuidv4(); + this.ownsSession = !process.env['STACKMEMORY_SESSION']; + this.sessionEnded = false; // Ensure config directory exists if (!fs.existsSync(this.claudeConfigDir)) { @@ -236,6 +247,24 @@ class ClaudeSM { } } + private getProjectId(): string | undefined { + const root = this.getRepoRoot() || process.cwd(); + + try { + const remote = execSync('git config --get remote.origin.url', { + cwd: root, + encoding: 'utf8', + }).trim(); + if (remote) { + return projectIdFromIdentifier(remote); + } + } catch { + // Fall back to the current path below. + } + + return projectIdFromIdentifier(root); + } + private hasUncommittedChanges(): boolean { try { const status = execSync('git status --porcelain', { encoding: 'utf8' }); @@ -281,9 +310,9 @@ class ClaudeSM { // Find GEPA scripts directory (check multiple locations) const gepaPaths = [ // From dist/src/cli -> scripts/gepa (3 levels up) - path.join(__dirname, '../../../scripts/gepa/hooks/auto-optimize.js'), + path.join(runtimeDirname, '../../../scripts/gepa/hooks/auto-optimize.js'), // From src/cli -> scripts/gepa (2 levels up, for dev mode) - path.join(__dirname, '../../scripts/gepa/hooks/auto-optimize.js'), + path.join(runtimeDirname, '../../scripts/gepa/hooks/auto-optimize.js'), // Global install location path.join( os.homedir(), @@ -295,7 +324,7 @@ class ClaudeSM { ), // npm global install path.join( - __dirname, + runtimeDirname, '..', '..', 'scripts', @@ -495,10 +524,10 @@ class ClaudeSM { // 2. 
Find templates dir (dev β†’ dist β†’ global npm) const candidateDirs = [ - path.join(__dirname, '../../templates/claude-hooks'), - path.join(__dirname, '../../../templates/claude-hooks'), + path.join(runtimeDirname, '../../templates/claude-hooks'), + path.join(runtimeDirname, '../../../templates/claude-hooks'), path.join( - __dirname, + runtimeDirname, '..', '..', '..', @@ -613,6 +642,176 @@ class ClaudeSM { console.log(chalk.gray(`\nSession ended (exit ${exitCode ?? 0})`)); } + private async publishSessionStart(): Promise { + const projectPath = process.cwd(); + const projectId = this.getProjectId(); + const branch = this.isGitRepo() ? this.getCurrentBranch() : undefined; + + await canonicalStateStore.upsertSession({ + sessionId: this.sessionId, + tool: 'claude', + projectId, + projectPath, + branch, + instanceId: this.config.instanceId, + metadata: { + task: this.config.task, + sandbox: this.config.useSandbox, + chrome: this.config.useChrome, + }, + }); + + await canonicalStateStore.upsertInstance({ + instanceId: this.config.instanceId, + tool: 'claude', + sessionId: this.sessionId, + projectId, + projectPath, + branch, + worktreePath: this.config.worktreePath, + pid: process.pid, + status: 'active', + metadata: { + task: this.config.task, + sandbox: this.config.useSandbox, + chrome: this.config.useChrome, + }, + }); + + await canonicalStateStore.appendEvent({ + type: 'session_start', + tool: 'claude', + sessionId: this.sessionId, + instanceId: this.config.instanceId, + projectId, + projectPath, + branch, + payload: { + task: this.config.task, + worktreePath: this.config.worktreePath, + sandbox: this.config.useSandbox, + chrome: this.config.useChrome, + }, + }); + + const claimResult = await canonicalStateStore.claimPaths({ + tool: 'claude', + sessionId: this.sessionId, + instanceId: this.config.instanceId, + projectId, + projectPath, + branch, + paths: [], + metadata: { + task: this.config.task, + scope: 'branch', + }, + }); + + if (claimResult.conflicts.length > 0) { + console.log(chalk.yellow('⚠️ Shared state conflict detected')); + for (const conflict of claimResult.conflicts.slice(0, 3)) { + console.log( + chalk.gray( + ` Claim ${conflict.claimId.slice(0, 8)} already owns ${conflict.branch || 'overlapping work'}` + ) + ); + } + } + } + + private async publishSessionEnd( + eventType: 'session_end' | 'session_interrupt' | 'session_terminate', + payload: Record = {} + ): Promise { + if (this.sessionEnded) { + return; + } + this.sessionEnded = true; + + const projectPath = process.cwd(); + const projectId = this.getProjectId(); + const branch = this.isGitRepo() ? this.getCurrentBranch() : undefined; + + await canonicalStateStore.appendEvent({ + type: eventType, + tool: 'claude', + sessionId: this.sessionId, + instanceId: this.config.instanceId, + projectId, + projectPath, + branch, + payload, + }); + await canonicalStateStore.releaseClaims({ + instanceId: this.config.instanceId, + reason: eventType, + }); + await canonicalStateStore.endInstance(this.config.instanceId); + if (this.ownsSession) { + await canonicalStateStore.endSession(this.sessionId); + } + } + + private async finalizeSession( + eventType: 'session_end' | 'session_interrupt' | 'session_terminate', + exitCode: number | null, + payload: Record = {} + ): Promise { + this.stopGEPAWatcher(); + + this.saveContext( + eventType === 'session_end' + ? 'Claude session ended' + : eventType === 'session_interrupt' + ? 
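+ // Label mirrors the trigger: SIGINT maps to session_interrupt and
+ // SIGTERM to session_terminate (see the signal handlers in run()).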
'Claude session interrupted' + : 'Claude session terminated', + { + action: eventType, + exitCode, + ...payload, + } + ); + + await this.publishSessionEnd(eventType, { + exitCode, + ...payload, + }); + + if (eventType === 'session_end' && process.env['LINEAR_API_KEY']) { + try { + execSync('stackmemory linear sync', { + stdio: 'ignore', + timeout: 10000, + }); + } catch { + // Non-fatal: don't block exit + } + } + + if (this.config.tracingEnabled) { + const summary = trace.getExecutionSummary(); + console.log(); + console.log(chalk.gray('─'.repeat(42))); + console.log(chalk.blue('Debug Trace Summary:')); + console.log(chalk.gray(summary)); + } + + if (eventType === 'session_end' && this.config.notifyOnDone) { + this.notifyDone(exitCode); + } + + if (this.config.worktreePath) { + console.log(); + console.log(chalk.gray('─'.repeat(42))); + console.log(chalk.blue('Session ended in worktree:')); + console.log(chalk.gray(` ${this.config.worktreePath}`)); + console.log(); + console.log(chalk.gray('To remove worktree: gd_claude')); + console.log(chalk.gray('To merge to main: cwm')); + } + } + public async run(args: string[]): Promise { // Parse arguments const claudeArgs: string[] = []; @@ -826,10 +1025,28 @@ class ClaudeSM { // Setup environment process.env['CLAUDE_INSTANCE_ID'] = this.config.instanceId; + process.env['STACKMEMORY_SESSION'] = this.sessionId; if (this.config.worktreePath) { process.env['CLAUDE_WORKTREE_PATH'] = this.config.worktreePath; } + const claudeBin = this.resolveClaudeBin(); + if (!claudeBin) { + console.error(chalk.red('❌ Claude CLI not found.')); + console.log( + chalk.gray( + ' Install Claude CLI or set an override:\n' + + ' export CLAUDE_BIN=/path/to/claude\n' + + ' claude-sm --help\n\n' + + ' Ensure PATH includes npm global bin (npm bin -g).' + ) + ); + process.exit(1); + return; + } + + await this.publishSessionStart(); console.log(chalk.gray(`πŸ€– Instance ID: ${this.config.instanceId}`)); + console.log(chalk.gray(`🧠 Session ID: ${this.sessionId.slice(0, 8)}`)); console.log(chalk.gray(`πŸ“ Working in: ${process.cwd()}`)); if (this.config.useSandbox) { @@ -932,12 +1149,6 @@ class ClaudeSM { // ── Launch ──────────────────────────────────────────────────── // Sweep PTY wrapper: next-edit predictions (falls back to direct launch) if (this.config.useSweep) { - const claudeBin = this.resolveClaudeBin(); - if (!claudeBin) { - console.error(chalk.red('Claude CLI not found.')); - process.exit(1); - return; - } console.log( chalk.cyan('[Sweep] Launching Claude with prediction bar...') ); @@ -947,6 +1158,16 @@ class ClaudeSM { claudeBin, claudeArgs, initialInput: initialInput || undefined, + onExit: async (exitCode) => { + await this.finalizeSession('session_end', exitCode); + }, + onSignal: async (signal) => { + await this.finalizeSession( + signal === 'SIGINT' ? 'session_interrupt' : 'session_terminate', + null, + { signal } + ); + }, }); // PTY wrapper is now running β€” it calls process.exit() on child exit. // Return to prevent falling through to the fallback-monitor path, @@ -974,21 +1195,6 @@ class ClaudeSM { console.log(chalk.gray('Starting Claude...')); console.log(chalk.gray('─'.repeat(42))); - const claudeBin = this.resolveClaudeBin(); - if (!claudeBin) { - console.error(chalk.red('❌ Claude CLI not found.')); - console.log( - chalk.gray( - ' Install Claude CLI or set an override:\n' + - ' export CLAUDE_BIN=/path/to/claude\n' + - ' claude-sm --help\n\n' + - ' Ensure PATH includes npm global bin (npm bin -g).' 
- ) - ); - process.exit(1); - return; - } - // Setup fallback monitor for automatic Qwen switching on Claude failures const fallbackMonitor = new FallbackMonitor({ enabled: true, @@ -1037,9 +1243,6 @@ class ClaudeSM { // Handle exit claude.on('exit', async (code) => { - // Stop GEPA watcher if running - this.stopGEPAWatcher(); - // Check if we were in fallback mode const status = fallbackMonitor.getStatus(); if (status.inFallback) { @@ -1049,63 +1252,21 @@ class ClaudeSM { ) ); } - // Save final context - this.saveContext('Claude session ended', { - action: 'session_end', - exitCode: code, - }); - - // Sync Linear on exit if configured - if (process.env['LINEAR_API_KEY']) { - try { - execSync('stackmemory linear sync', { - stdio: 'ignore', - timeout: 10000, - }); - } catch { - // Non-fatal: don't block exit - } - } - - // End tracing and show summary if enabled - if (this.config.tracingEnabled) { - const summary = trace.getExecutionSummary(); - console.log(); - console.log(chalk.gray('─'.repeat(42))); - console.log(chalk.blue('Debug Trace Summary:')); - console.log(chalk.gray(summary)); - } - - // Bell notification when done - if (this.config.notifyOnDone) { - this.notifyDone(code); - } - - // Offer to clean up worktree - if (this.config.worktreePath) { - console.log(); - console.log(chalk.gray('─'.repeat(42))); - console.log(chalk.blue('Session ended in worktree:')); - console.log(chalk.gray(` ${this.config.worktreePath}`)); - console.log(); - console.log(chalk.gray('To remove worktree: gd_claude')); - console.log(chalk.gray('To merge to main: cwm')); - } - + await this.finalizeSession('session_end', code); process.exit(code || 0); }); // Handle signals - process.on('SIGINT', () => { - this.saveContext('Claude session interrupted', { - action: 'session_interrupt', + process.on('SIGINT', async () => { + await this.finalizeSession('session_interrupt', null, { + signal: 'SIGINT', }); claude.kill('SIGINT'); }); - process.on('SIGTERM', () => { - this.saveContext('Claude session terminated', { - action: 'session_terminate', + process.on('SIGTERM', async () => { + await this.finalizeSession('session_terminate', null, { + signal: 'SIGTERM', }); claude.kill('SIGTERM'); }); diff --git a/src/cli/codex-sm.ts b/src/cli/codex-sm.ts index 1045043d..2a3a8d84 100644 --- a/src/cli/codex-sm.ts +++ b/src/cli/codex-sm.ts @@ -13,6 +13,10 @@ import { program } from 'commander'; import { v4 as uuidv4 } from 'uuid'; import chalk from 'chalk'; import { initializeTracing, trace } from '../core/trace/index.js'; +import { + canonicalStateStore, + projectIdFromIdentifier, +} from '../core/shared-state/canonical-store.js'; interface CodexConfig { instanceId: string; @@ -29,6 +33,9 @@ interface CodexConfig { class CodexSM { private config: CodexConfig; private stackmemoryPath: string; + private sessionId: string; + private ownsSession: boolean; + private sessionEnded: boolean; constructor() { this.config = { @@ -40,6 +47,9 @@ class CodexSM { }; this.stackmemoryPath = this.findStackMemory(); + this.sessionId = process.env['STACKMEMORY_SESSION'] || uuidv4(); + this.ownsSession = !process.env['STACKMEMORY_SESSION']; + this.sessionEnded = false; } private getRepoRoot(): string | null { @@ -113,6 +123,24 @@ class CodexSM { } } + private getProjectId(): string | undefined { + const root = this.getRepoRoot() || process.cwd(); + + try { + const remote = execSync('git config --get remote.origin.url', { + cwd: root, + encoding: 'utf8', + }).trim(); + if (remote) { + return projectIdFromIdentifier(remote); + } + } catch { + // 
Fall back to current path below. + } + + return projectIdFromIdentifier(root); + } + private hasUncommittedChanges(): boolean { try { const status = execSync('git status --porcelain', { encoding: 'utf8' }); @@ -220,18 +248,17 @@ class CodexSM { if (!this.config.contextEnabled) return; try { console.log(chalk.blue('πŸ“š Loading previous context...')); - const cmd = `${this.stackmemoryPath} context list --limit 5 --format json`; - const output = execSync(cmd, { encoding: 'utf8' }); - const contexts = JSON.parse(output); - if (Array.isArray(contexts) && contexts.length > 0) { - console.log(chalk.gray('Recent context loaded:')); - contexts.forEach( - (ctx: { message: string; metadata?: { timestamp?: string } }) => { - console.log( - chalk.gray(` - ${ctx.message} (${ctx.metadata?.timestamp})`) - ); - } - ); + const cmd = `${this.stackmemoryPath} context show`; + const output = execSync(cmd, { + encoding: 'utf8', + stdio: ['pipe', 'pipe', 'pipe'], + }); + const lines = output + .trim() + .split('\n') + .filter((line) => line.trim()); + if (lines.length > 3) { + console.log(chalk.gray('Context stack loaded')); } } catch { // ignore @@ -247,6 +274,111 @@ class CodexSM { } } + private async publishSessionStart(): Promise { + const projectPath = process.cwd(); + const projectId = this.getProjectId(); + const branch = this.isGitRepo() ? this.getCurrentBranch() : undefined; + + await canonicalStateStore.upsertSession({ + sessionId: this.sessionId, + tool: 'codex', + projectId, + projectPath, + branch, + instanceId: this.config.instanceId, + metadata: { + task: this.config.task, + }, + }); + + await canonicalStateStore.upsertInstance({ + instanceId: this.config.instanceId, + tool: 'codex', + sessionId: this.sessionId, + projectId, + projectPath, + branch, + worktreePath: this.config.worktreePath, + pid: process.pid, + status: 'active', + metadata: { + task: this.config.task, + }, + }); + + await canonicalStateStore.appendEvent({ + type: 'session_start', + tool: 'codex', + sessionId: this.sessionId, + instanceId: this.config.instanceId, + projectId, + projectPath, + branch, + payload: { + task: this.config.task, + worktreePath: this.config.worktreePath, + }, + }); + + const claimResult = await canonicalStateStore.claimPaths({ + tool: 'codex', + sessionId: this.sessionId, + instanceId: this.config.instanceId, + projectId, + projectPath, + branch, + paths: [], + metadata: { + task: this.config.task, + scope: 'branch', + }, + }); + + if (claimResult.conflicts.length > 0) { + console.log(chalk.yellow('⚠️ Shared state conflict detected')); + for (const conflict of claimResult.conflicts.slice(0, 3)) { + console.log( + chalk.gray( + ` Claim ${conflict.claimId.slice(0, 8)} already owns ${conflict.branch || 'overlapping work'}` + ) + ); + } + } + } + + private async publishSessionEnd( + eventType: 'session_end' | 'session_interrupt' | 'session_terminate', + payload: Record = {} + ): Promise { + if (this.sessionEnded) { + return; + } + this.sessionEnded = true; + + const projectPath = process.cwd(); + const projectId = this.getProjectId(); + const branch = this.isGitRepo() ? 
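+ // Branch is re-read at teardown rather than cached from session
+ // start, so the end event records where the session actually finished.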
this.getCurrentBranch() : undefined; + + await canonicalStateStore.appendEvent({ + type: eventType, + tool: 'codex', + sessionId: this.sessionId, + instanceId: this.config.instanceId, + projectId, + projectPath, + branch, + payload, + }); + await canonicalStateStore.releaseClaims({ + instanceId: this.config.instanceId, + reason: eventType, + }); + await canonicalStateStore.endInstance(this.config.instanceId); + if (this.ownsSession) { + await canonicalStateStore.endSession(this.sessionId); + } + } + public async run(args: string[]): Promise { const codexArgs: string[] = []; let i = 0; @@ -350,10 +482,13 @@ class CodexSM { this.loadContext(); process.env['CODEX_INSTANCE_ID'] = this.config.instanceId; + process.env['STACKMEMORY_SESSION'] = this.sessionId; if (this.config.worktreePath) process.env['CODEX_WORKTREE_PATH'] = this.config.worktreePath; + await this.publishSessionStart(); console.log(chalk.gray(`πŸ€– Instance ID: ${this.config.instanceId}`)); + console.log(chalk.gray(`🧠 Session ID: ${this.sessionId.slice(0, 8)}`)); console.log(chalk.gray(`πŸ“ Working in: ${process.cwd()}`)); console.log(); @@ -401,11 +536,14 @@ class CodexSM { process.exit(1); }); - child.on('exit', (code) => { + child.on('exit', async (code) => { this.saveContext('Codex session ended', { action: 'session_end', exitCode: code, }); + await this.publishSessionEnd('session_end', { + exitCode: code, + }); // Sync Linear on exit β€” let sync command handle auth detection // (supports API key env var, .env files, and OAuth tokens) @@ -434,17 +572,19 @@ class CodexSM { process.exit(code || 0); }); - process.on('SIGINT', () => { + process.on('SIGINT', async () => { this.saveContext('Codex session interrupted', { action: 'session_interrupt', }); + await this.publishSessionEnd('session_interrupt'); child.kill('SIGINT'); }); - process.on('SIGTERM', () => { + process.on('SIGTERM', async () => { this.saveContext('Codex session terminated', { action: 'session_terminate', }); + await this.publishSessionEnd('session_terminate'); child.kill('SIGTERM'); }); } diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index 71f43f03..ab0a209c 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -140,6 +140,7 @@ The daemon provides: const services = []; if (newStatus.services.context.enabled) services.push('context'); if (newStatus.services.linear.enabled) services.push('linear'); + if (newStatus.services.github?.enabled) services.push('github'); if (newStatus.services.maintenance?.enabled) services.push('maintenance'); if (newStatus.services.memory?.enabled) services.push('memory'); @@ -303,6 +304,26 @@ The daemon provides: } } + const gh = status.services.github; + if (gh) { + console.log( + ` GitHub: ${gh.enabled ? chalk.green('Enabled') : chalk.gray('Disabled')}` + ); + if (gh.enabled) { + console.log( + chalk.gray(` Interval: ${config.github.interval} min`) + ); + if (gh.syncCount) { + console.log(chalk.gray(` Refreshes: ${gh.syncCount}`)); + } + if (gh.lastProjectionState) { + console.log( + chalk.gray(` Last PR state: ${gh.lastProjectionState}`) + ); + } + } + } + // Maintenance service const maint = status.services.maintenance; if (maint) { @@ -844,6 +865,27 @@ function getServiceHealthChecks( }); } + const gh = status.services.github; + if (gh?.enabled) { + const intervalMs = config.github.interval * 60_000; + const overdue = gh.lastRun + ? Date.now() - gh.lastRun > intervalMs * 2 + : false; + checks.push({ + name: 'GitHub Service', + status: overdue ? 'warn' : 'ok', + detail: gh.lastRun + ? 
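+ // `overdue` above flags a gap of more than two polling intervals;
+ // either way the detail string surfaces the last refresh time.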
`Last refresh: ${formatTimeAgo(gh.lastRun)} | Refreshes: ${gh.syncCount ?? 0}${gh.lastProjectionState ? ` | Last PR state: ${gh.lastProjectionState}` : ''}` + : `Enabled (interval: ${config.github.interval}m) | No refreshes yet`, + }); + } else { + checks.push({ + name: 'GitHub Service', + status: 'ok', + detail: 'Disabled', + }); + } + // Maintenance service const maint = status.services.maintenance; if (maint?.enabled) { @@ -980,6 +1022,11 @@ function buildHealthReport( enabled: status.services.linear.enabled, lastRun: status.services.linear.lastRun, }, + { + key: 'github', + enabled: status.services.github?.enabled ?? false, + lastRun: status.services.github?.lastRun, + }, { key: 'maintenance', enabled: status.services.maintenance?.enabled ?? false, diff --git a/src/cli/commands/state.ts b/src/cli/commands/state.ts new file mode 100644 index 00000000..d015a0dd --- /dev/null +++ b/src/cli/commands/state.ts @@ -0,0 +1,380 @@ +import { Command } from 'commander'; +import chalk from 'chalk'; +import { + canonicalStateStore, + type SharedToolName, +} from '../../core/shared-state/canonical-store.js'; +import { + getCurrentRepoGitHubInfo, + refreshCurrentRepoPullRequestState, +} from '../../integrations/github/pr-state.js'; + +function parseJsonObject(input: string | undefined): Record { + if (!input) { + return {}; + } + + const parsed = JSON.parse(input); + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error('Expected a JSON object'); + } + + return parsed as Record; +} + +export function createStateCommand(): Command { + const cmd = new Command('state').description( + 'Manage canonical user-scoped shared state across instances and sessions' + ); + + const instance = cmd.command('instance').description('Manage instance state'); + instance + .command('upsert') + .requiredOption('--id ', 'Instance identifier') + .requiredOption( + '--tool ', + 'Tool name (claude|codex|opencode|stackmemory)' + ) + .option('--session ', 'Session identifier') + .option('--project ', 'Project identifier') + .option('--project-path ', 'Project path') + .option('--branch ', 'Git branch') + .option('--worktree-path ', 'Worktree path') + .option('--pid ', 'Process id') + .option('--status ', 'Status', 'active') + .option('--metadata ', 'Metadata JSON object') + .action(async (options) => { + const record = await canonicalStateStore.upsertInstance({ + instanceId: options.id, + tool: options.tool as SharedToolName, + sessionId: options.session, + projectId: options.project, + projectPath: options.projectPath, + branch: options.branch, + worktreePath: options.worktreePath, + pid: options.pid ? 
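+ // Commander delivers option values as strings; coerce --pid to a
+ // number only when it was provided.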
Number(options.pid) : undefined, + status: options.status, + metadata: parseJsonObject(options.metadata), + }); + + console.log(JSON.stringify(record, null, 2)); + }); + + instance + .command('end') + .requiredOption('--id ', 'Instance identifier') + .action(async (options) => { + await canonicalStateStore.endInstance(options.id); + console.log(chalk.green(`Ended instance ${options.id}`)); + }); + + const session = cmd.command('session').description('Manage session state'); + session + .command('upsert') + .requiredOption('--id ', 'Session identifier') + .requiredOption( + '--tool ', + 'Tool name (claude|codex|opencode|stackmemory)' + ) + .option('--project ', 'Project identifier') + .option('--project-path ', 'Project path') + .option('--branch ', 'Git branch') + .option('--instance ', 'Associated instance identifier') + .option('--status ', 'Status', 'active') + .option('--metadata ', 'Metadata JSON object') + .action(async (options) => { + const record = await canonicalStateStore.upsertSession({ + sessionId: options.id, + tool: options.tool as SharedToolName, + projectId: options.project, + projectPath: options.projectPath, + branch: options.branch, + instanceId: options.instance, + status: options.status, + metadata: parseJsonObject(options.metadata), + }); + + console.log(JSON.stringify(record, null, 2)); + }); + + session + .command('end') + .requiredOption('--id ', 'Session identifier') + .option('--status ', 'Status', 'closed') + .action(async (options) => { + await canonicalStateStore.endSession(options.id, options.status); + console.log(chalk.green(`Ended session ${options.id}`)); + }); + + cmd + .command('event') + .description('Append a shared-state event') + .requiredOption('--type ', 'Event type') + .option('--tool ', 'Tool name (claude|codex|opencode|stackmemory)') + .option('--instance ', 'Instance identifier') + .option('--session ', 'Session identifier') + .option('--project ', 'Project identifier') + .option('--project-path ', 'Project path') + .option('--branch ', 'Git branch') + .option('--payload ', 'Payload JSON object') + .action(async (options) => { + const event = await canonicalStateStore.appendEvent({ + type: options.type, + tool: options.tool as SharedToolName | undefined, + instanceId: options.instance, + sessionId: options.session, + projectId: options.project, + projectPath: options.projectPath, + branch: options.branch, + payload: parseJsonObject(options.payload), + }); + + console.log(JSON.stringify(event, null, 2)); + }); + + cmd + .command('show') + .description('Show canonical shared state for a project') + .option('--project ', 'Project identifier') + .option('--project-path ', 'Project path') + .option('--limit ', 'Recent event limit', '10') + .option('--json', 'Emit JSON output') + .action(async (options) => { + const summary = await canonicalStateStore.getProjectSummary({ + projectId: options.project, + projectPath: options.projectPath, + eventLimit: Number(options.limit), + }); + + if (options.json) { + console.log(JSON.stringify(summary, null, 2)); + return; + } + + console.log(chalk.bold('Canonical Shared State')); + console.log( + ` Active sessions: ${summary.activeSessions.length} | Active instances: ${summary.activeInstances.length} | Active claims: ${summary.activeClaims.length}` + ); + if (summary.projectId) { + console.log(` Project: ${summary.projectId}`); + } + if (summary.activeSessions.length > 0) { + console.log(chalk.bold('\nSessions')); + for (const record of summary.activeSessions) { + console.log( + ` ${record.sessionId.slice(0, 
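// Illustrative sketch (assumes commander's `parseAsync(argv, { from: 'user' })`
// form): the subcommands above can also be driven programmatically, e.g. to
// record an ad-hoc event from a script without shelling out:
import { createStateCommand } from './state.js';

async function recordProbeEvent(): Promise<void> {
  await createStateCommand().parseAsync(
    ['event', '--type', 'probe', '--tool', 'stackmemory', '--payload', '{"ok":true}'],
    { from: 'user' }
  );
}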
8)} ${record.tool} ${record.branch || ''}`.trim() + ); + } + } + if (summary.activeInstances.length > 0) { + console.log(chalk.bold('\nInstances')); + for (const record of summary.activeInstances) { + console.log( + ` ${record.instanceId} ${record.tool} ${record.branch || ''}`.trim() + ); + } + } + if (summary.activeClaims.length > 0) { + console.log(chalk.bold('\nClaims')); + for (const claim of summary.activeClaims) { + const scopes = [ + claim.branch ? `branch:${claim.branch}` : '', + ...claim.paths.map((item) => `path:${item}`), + ] + .filter(Boolean) + .join(', '); + console.log( + ` ${claim.claimId.slice(0, 8)} ${claim.tool} ${scopes || '(no scope)'}`.trim() + ); + } + } + if (summary.recentEvents.length > 0) { + console.log(chalk.bold('\nRecent events')); + for (const event of summary.recentEvents) { + console.log( + ` ${event.type} ${new Date(event.timestamp).toISOString()}` + ); + } + } + }); + + const claims = cmd + .command('claims') + .description('Manage shared ownership claims'); + + claims + .command('claim') + .requiredOption( + '--tool ', + 'Tool name (claude|codex|opencode|stackmemory)' + ) + .option('--session ', 'Session identifier') + .option('--instance ', 'Instance identifier') + .option('--project ', 'Project identifier') + .option('--project-path ', 'Project path') + .option('--branch ', 'Git branch') + .option('--path ', 'Claimed file or directory path(s)') + .option('--ttl-ms ', 'Time to live in milliseconds', '86400000') + .option('--metadata ', 'Metadata JSON object') + .option('--json', 'Emit JSON output') + .action(async (options) => { + const result = await canonicalStateStore.claimPaths({ + tool: options.tool as SharedToolName, + sessionId: options.session, + instanceId: options.instance, + projectId: options.project, + projectPath: options.projectPath, + branch: options.branch, + paths: options.path || [], + ttlMs: Number(options.ttlMs), + metadata: parseJsonObject(options.metadata), + }); + + if (options.json) { + console.log(JSON.stringify(result, null, 2)); + return; + } + + console.log(chalk.green(`Claimed ${result.record.claimId.slice(0, 8)}`)); + if (result.conflicts.length > 0) { + console.log(chalk.yellow(`Conflicts: ${result.conflicts.length}`)); + for (const conflict of result.conflicts) { + console.log( + ` ${conflict.claimId.slice(0, 8)} ${conflict.branch || ''} ${conflict.paths.join(', ')}`.trim() + ); + } + } + }); + + claims + .command('release') + .option('--claim ', 'Claim identifier') + .option('--session ', 'Session identifier') + .option('--instance ', 'Instance identifier') + .option('--project ', 'Project identifier') + .option('--project-path ', 'Project path') + .option('--branch ', 'Git branch') + .option('--reason ', 'Release reason') + .action(async (options) => { + const released = await canonicalStateStore.releaseClaims({ + claimId: options.claim, + sessionId: options.session, + instanceId: options.instance, + projectId: options.project, + projectPath: options.projectPath, + branch: options.branch, + reason: options.reason, + }); + console.log(chalk.green(`Released ${released} claim(s)`)); + }); + + claims + .command('show') + .option('--project ', 'Project identifier') + .option('--project-path ', 'Project path') + .option('--all', 'Show released and expired claims too') + .option('--json', 'Emit JSON output') + .action(async (options) => { + const records = await canonicalStateStore.listPathClaims({ + projectId: options.project, + projectPath: options.projectPath, + activeOnly: !options.all, + }); + + if (options.json) { + 
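// Illustrative sketch (hypothetical ids/paths; writes to the shared-state dir):
// two tools claiming nested paths on the same project produce a conflict on the
// second claim, because 'src/core/session' sits under 'src/core':
import { canonicalStateStore } from '../../core/shared-state/canonical-store.js';

async function demoConflict(): Promise<void> {
  const first = await canonicalStateStore.claimPaths({
    tool: 'claude',
    projectPath: '/tmp/demo',
    paths: ['src/core'],
  });
  const second = await canonicalStateStore.claimPaths({
    tool: 'codex',
    projectPath: '/tmp/demo',
    paths: ['src/core/session'],
  });
  console.log(first.conflicts.length, second.conflicts.length); // 0, then 1
}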
console.log(JSON.stringify(records, null, 2)); + return; + } + + if (records.length === 0) { + console.log(chalk.yellow('No claims found')); + return; + } + + for (const claim of records) { + const scopes = [ + claim.branch ? `branch:${claim.branch}` : '', + ...claim.paths.map((item) => `path:${item}`), + ] + .filter(Boolean) + .join(', '); + console.log( + `${claim.claimId.slice(0, 8)} ${claim.status} ${claim.tool} ${scopes || '(no scope)'}` + ); + } + }); + + const github = cmd.command('github').description('GitHub projection state'); + + github + .command('refresh') + .description('Refresh current repo branch PR state from GitHub CLI') + .option('--json', 'Emit JSON output') + .action(async (options) => { + const projection = await refreshCurrentRepoPullRequestState(); + if (!projection) { + console.log( + chalk.yellow( + 'No GitHub PR projection available for current repo/branch' + ) + ); + return; + } + + if (options.json) { + console.log(JSON.stringify(projection, null, 2)); + return; + } + + console.log(chalk.green(`Refreshed PR #${projection.prNumber}`)); + console.log(`${projection.state} ${projection.title}`); + console.log(projection.url); + }); + + github + .command('show') + .description('Show cached current repo branch PR projection') + .option('--json', 'Emit JSON output') + .action(async (options) => { + const info = getCurrentRepoGitHubInfo(); + if (!info) { + console.log(chalk.yellow('Not in a GitHub repository')); + return; + } + + const projection = await canonicalStateStore.getGitHubPullRequest({ + repo: info.repo, + branch: info.branch, + }); + if (!projection) { + console.log( + chalk.yellow('No cached GitHub PR projection for current branch') + ); + return; + } + + if (options.json) { + console.log(JSON.stringify(projection, null, 2)); + return; + } + + console.log(chalk.bold(`PR #${projection.prNumber}`)); + console.log(`${projection.state} ${projection.title}`); + console.log(`Repo: ${projection.repo}`); + console.log( + `Branch: ${projection.headRefName} -> ${projection.baseRefName}` + ); + if (projection.reviewDecision) { + console.log(`Review: ${projection.reviewDecision}`); + } + if (projection.statusCheckRollup) { + console.log(`Checks: ${projection.statusCheckRollup}`); + } + console.log(`Synced: ${new Date(projection.lastSyncedAt).toISOString()}`); + console.log(projection.url); + }); + + return cmd; +} + +export default createStateCommand; diff --git a/src/cli/index.ts b/src/cli/index.ts index deb6bf51..7ce66d2c 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -61,6 +61,7 @@ import { createPingCommand } from './commands/ping.js'; import { createAuditCommand } from './commands/audit.js'; import { createStatsCommand } from './commands/stats.js'; import { createBenchCommand } from './commands/bench.js'; +import { createStateCommand } from './commands/state.js'; import { createDigestCommands } from './commands/digest.js'; import { createTeamCommands } from './commands/team.js'; import { createDesiresCommands } from './commands/desires.js'; @@ -75,7 +76,12 @@ import chalk from 'chalk'; import * as fs from 'fs'; import * as path from 'path'; import { filterPending } from '../integrations/mcp/pending-utils.js'; +import { + getCurrentRepoGitHubInfo, + refreshCurrentRepoPullRequestState, +} from '../integrations/github/pr-state.js'; import { ProjectManager } from '../core/projects/project-manager.js'; +import { canonicalStateStore } from '../core/shared-state/canonical-store.js'; import { join } from 'path'; import { existsSync, mkdirSync } from 'fs'; import 
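// Illustrative sketch: the `github refresh` / `github show` subcommands above
// are thin wrappers over this read-then-refresh pair, which a script can call
// directly (import paths as used elsewhere in this patch):
import {
  getCurrentRepoGitHubInfo,
  refreshCurrentRepoPullRequestState,
} from '../../integrations/github/pr-state.js';
import { canonicalStateStore } from '../../core/shared-state/canonical-store.js';

async function currentPrState(): Promise<string | undefined> {
  const info = getCurrentRepoGitHubInfo();
  if (!info) return undefined; // not a GitHub repo
  const cached = await canonicalStateStore.getGitHubPullRequest(info);
  const projection = cached ?? (await refreshCurrentRepoPullRequestState());
  return projection?.state; // 'OPEN' | 'CLOSED' | 'MERGED'
}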
type { @@ -272,6 +278,28 @@ program projectPath: projectRoot, sessionId: options.session, }); + const sharedProjectState = await canonicalStateStore.getProjectSummary({ + projectId: session.projectId, + projectPath: projectRoot, + eventLimit: 5, + }); + const githubInfo = getCurrentRepoGitHubInfo(projectRoot); + let githubProjection = + githubInfo && + (await canonicalStateStore.getGitHubPullRequest({ + repo: githubInfo.repo, + branch: githubInfo.branch, + })); + + if ( + githubInfo && + (!githubProjection || + Date.now() - githubProjection.lastSyncedAt > 2 * 60 * 1000) + ) { + githubProjection = + (await refreshCurrentRepoPullRequestState(projectRoot)) || + githubProjection; + } // Auto-discover shared context on startup const contextDiscovery = await sharedContextLayer.autoDiscoverContext(); @@ -372,6 +400,24 @@ program console.log( ` Cached contexts: ${contextCount.count || 0} (global)` ); + console.log( + ` Shared sessions: ${sharedProjectState.activeSessions.length}` + ); + console.log( + ` Shared instances: ${sharedProjectState.activeInstances.length}` + ); + console.log( + ` Shared claims: ${sharedProjectState.activeClaims.length}` + ); + + const branchClaim = sharedProjectState.activeClaims.find( + (claim) => claim.branch && claim.branch === session.branch + ); + if (branchClaim) { + console.log( + ` Branch owner: ${branchClaim.tool} ${branchClaim.instanceId || branchClaim.sessionId || branchClaim.claimId.slice(0, 8)}` + ); + } // Show recent activity const recentFrames = db @@ -401,6 +447,22 @@ program }); } + if (githubProjection) { + console.log(`\n GitHub PR:`); + console.log( + ` #${githubProjection.prNumber} ${githubProjection.state} ${githubProjection.title}` + ); + console.log( + ` ${githubProjection.headRefName} -> ${githubProjection.baseRefName}` + ); + if (githubProjection.reviewDecision) { + console.log(` Review: ${githubProjection.reviewDecision}`); + } + if (githubProjection.statusCheckRollup) { + console.log(` Checks: ${githubProjection.statusCheckRollup}`); + } + } + console.log(`\n Current Session:`); console.log(` Stack depth: ${stackDepth}`); console.log(` Active frames: ${activeFrames.length}`); @@ -770,6 +832,7 @@ program.addCommand(createModelCommand()); program.addCommand(createAuditCommand()); program.addCommand(createStatsCommand()); program.addCommand(createBenchCommand()); +program.addCommand(createStateCommand()); program.addCommand(createDigestCommands()); program.addCommand(createTeamCommands()); program.addCommand(createDesiresCommands()); diff --git a/src/core/session/session-manager.ts b/src/core/session/session-manager.ts index 8b61ee6e..5880219b 100644 --- a/src/core/session/session-manager.ts +++ b/src/core/session/session-manager.ts @@ -9,6 +9,7 @@ import * as path from 'path'; import * as _crypto from 'crypto'; import { logger } from '../monitoring/logger.js'; import { SystemError, ErrorCode } from '../errors/index.js'; +import { canonicalStateStore } from '../shared-state/canonical-store.js'; // Type-safe environment variable access function _getEnv(key: string, defaultValue?: string): string { const value = process.env[key]; @@ -169,6 +170,16 @@ export class SessionManager { await this.saveSession(session); await this.setProjectActiveSession(params.projectId, session.sessionId); + await canonicalStateStore.appendEvent({ + type: 'session_created', + tool: 'stackmemory', + sessionId: session.sessionId, + projectId: session.projectId, + branch: session.branch, + payload: { + state: session.state, + }, + }); // Set as current session 
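// Illustrative sketch: the status output above treats a cached PR projection as
// fresh for two minutes and otherwise re-queries GitHub. The rule, isolated:
const PR_PROJECTION_TTL_MS = 2 * 60 * 1000;

function isProjectionStale(lastSyncedAt: number, now = Date.now()): boolean {
  return now - lastSyncedAt > PR_PROJECTION_TTL_MS;
}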
this.currentSession = session; @@ -209,6 +220,14 @@ export class SessionManager { `${session.sessionId}.json` ); await fs.writeFile(sessionPath, JSON.stringify(session, null, 2)); + await canonicalStateStore.upsertSession({ + sessionId: session.sessionId, + tool: 'stackmemory', + projectId: session.projectId, + branch: session.branch, + status: session.state, + metadata: session.metadata, + }); } async suspendSession(sessionId?: string): Promise { @@ -260,6 +279,15 @@ export class SessionManager { ); await fs.rename(sessionPath, historyPath); + await canonicalStateStore.endSession(session.sessionId, 'closed'); + await canonicalStateStore.appendEvent({ + type: 'session_closed', + tool: 'stackmemory', + sessionId: session.sessionId, + projectId: session.projectId, + branch: session.branch, + payload: {}, + }); } } diff --git a/src/core/shared-state/canonical-store.ts b/src/core/shared-state/canonical-store.ts new file mode 100644 index 00000000..77675aa6 --- /dev/null +++ b/src/core/shared-state/canonical-store.ts @@ -0,0 +1,905 @@ +import * as fs from 'fs/promises'; +import * as path from 'path'; +import * as os from 'os'; +import { createHash, randomUUID } from 'crypto'; + +export type SharedToolName = 'stackmemory' | 'claude' | 'codex' | 'opencode'; + +export interface SharedInstanceRecord { + instanceId: string; + tool: SharedToolName; + sessionId?: string; + projectId?: string; + projectPath?: string; + branch?: string; + worktreePath?: string; + pid?: number; + startedAt: number; + lastSeenAt: number; + status: 'active' | 'ended'; + metadata?: Record; +} + +export interface SharedSessionRecord { + sessionId: string; + tool: SharedToolName; + projectId?: string; + projectPath?: string; + branch?: string; + startedAt: number; + lastSeenAt: number; + status: 'active' | 'suspended' | 'closed'; + instanceIds: string[]; + metadata?: Record; +} + +export interface SharedStateEvent { + id: string; + type: string; + timestamp: number; + tool?: SharedToolName; + instanceId?: string; + sessionId?: string; + projectId?: string; + projectPath?: string; + branch?: string; + payload: Record; +} + +export interface SharedProjectSummary { + projectId?: string; + projectPath?: string; + activeSessions: SharedSessionRecord[]; + activeInstances: SharedInstanceRecord[]; + activeClaims: SharedPathClaimRecord[]; + recentEvents: SharedStateEvent[]; +} + +export interface GitHubPullRequestProjection { + repo: string; + branch: string; + projectId?: string; + projectPath?: string; + prNumber: number; + title: string; + state: 'OPEN' | 'CLOSED' | 'MERGED'; + isDraft: boolean; + url: string; + baseRefName: string; + headRefName: string; + headRefOid?: string; + mergedAt?: string; + updatedAt: string; + reviewDecision?: string; + statusCheckRollup?: string; + lastSyncedAt: number; +} + +export interface SharedPathClaimRecord { + claimId: string; + tool: SharedToolName; + sessionId?: string; + instanceId?: string; + projectId?: string; + projectPath?: string; + branch?: string; + paths: string[]; + status: 'active' | 'released' | 'expired'; + claimedAt: number; + lastSeenAt: number; + expiresAt: number; + releasedAt?: number; + releaseReason?: string; + metadata?: Record; +} + +export interface SharedPathClaimConflict { + claimId: string; + branch?: string; + paths: string[]; + sessionId?: string; + instanceId?: string; +} + +export interface SharedPathClaimResult { + record: SharedPathClaimRecord; + conflicts: SharedPathClaimConflict[]; +} + +function getBaseStateDir(): string { + const xdgState = 
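// Illustrative sketch (hypothetical values): one SharedStateEvent as it would
// be serialized into the daily JSONL event log, matching the interface above:
const exampleEvent: SharedStateEvent = {
  id: '7c9e6679-7425-40de-944b-e07fc1f90ae7',
  type: 'session_created',
  timestamp: 1767225600000,
  tool: 'stackmemory',
  sessionId: 'sess-1',
  projectId: 'a1b2c3d4e5f60718', // 16-hex id derived from the project path
  branch: 'main',
  payload: { state: 'active' },
};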
process.env['XDG_STATE_HOME']?.trim(); + if (xdgState) { + return path.join(xdgState, 'stackmemory'); + } + + const homeDir = + process.env['HOME'] || process.env['USERPROFILE'] || os.homedir(); + return path.join(homeDir, '.stackmemory'); +} + +function projectIdFromIdentifier(identifier: string): string { + return identifier + .replace(/\.git$/, '') + .replace(/[^a-zA-Z0-9-]/g, '-') + .toLowerCase() + .slice(-50); +} + +function normalizeProjectId( + projectId?: string, + projectPath?: string +): string | undefined { + if (projectId && projectId.trim()) { + return projectIdFromIdentifier(projectId.trim()); + } + if (!projectPath || !projectPath.trim()) { + return undefined; + } + + return createHash('sha1') + .update(projectPath.trim().toLowerCase()) + .digest('hex') + .slice(0, 16); +} + +async function pathExists(targetPath: string): Promise { + try { + await fs.access(targetPath); + return true; + } catch { + return false; + } +} + +export class CanonicalStateStore { + private rootDir: string; + + constructor(rootDir: string = path.join(getBaseStateDir(), 'shared-state')) { + this.rootDir = rootDir; + } + + getRootDir(): string { + return this.rootDir; + } + + async initialize(): Promise { + await fs.mkdir(this.getInstancesDir(), { recursive: true }); + await fs.mkdir(this.getSessionsDir(), { recursive: true }); + await fs.mkdir(this.getEventsDir(), { recursive: true }); + await fs.mkdir(this.getGithubDir(), { recursive: true }); + await fs.mkdir(this.getClaimsDir(), { recursive: true }); + } + + async upsertInstance( + input: Omit & { + startedAt?: number; + lastSeenAt?: number; + } + ): Promise { + await this.initialize(); + + const filePath = this.getInstanceFile(input.instanceId); + const existing = await this.readJsonFile(filePath); + const now = Date.now(); + + const record: SharedInstanceRecord = { + instanceId: input.instanceId, + tool: input.tool, + sessionId: input.sessionId ?? existing?.sessionId, + projectId: + normalizeProjectId(input.projectId, input.projectPath) ?? + existing?.projectId, + projectPath: input.projectPath ?? existing?.projectPath, + branch: input.branch ?? existing?.branch, + worktreePath: input.worktreePath ?? existing?.worktreePath, + pid: input.pid ?? existing?.pid, + startedAt: existing?.startedAt ?? input.startedAt ?? now, + lastSeenAt: input.lastSeenAt ?? now, + status: input.status ?? existing?.status ?? 
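// Illustrative sketch (worked examples, not fixtures): how the two id helpers
// above normalize identifiers. An explicit identifier is slugified; a bare
// project path is hashed to a stable 16-hex id.
projectIdFromIdentifier('git@github.com:acme/StackMemory.git');
// -> 'git-github-com-acme-stackmemory' (.git stripped, non-alphanumerics
//    collapsed to '-', lowercased, capped at the trailing 50 chars)
normalizeProjectId(undefined, '/Users/dev/stackmemory');
// -> first 16 hex chars of sha1('/users/dev/stackmemory')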
'active', + metadata: { + ...(existing?.metadata || {}), + ...(input.metadata || {}), + }, + }; + + await this.writeJsonFile(filePath, record); + + if (record.sessionId) { + await this.upsertSession({ + sessionId: record.sessionId, + tool: record.tool, + projectId: record.projectId, + projectPath: record.projectPath, + branch: record.branch, + instanceId: record.instanceId, + metadata: record.metadata, + }); + } + + return record; + } + + async endInstance(instanceId: string): Promise { + await this.initialize(); + + const filePath = this.getInstanceFile(instanceId); + const existing = await this.readJsonFile(filePath); + if (!existing) { + return; + } + + const updated: SharedInstanceRecord = { + ...existing, + status: 'ended', + lastSeenAt: Date.now(), + }; + await this.writeJsonFile(filePath, updated); + } + + async upsertSession( + input: Omit< + SharedSessionRecord, + 'startedAt' | 'lastSeenAt' | 'instanceIds' | 'status' + > & { + startedAt?: number; + lastSeenAt?: number; + instanceId?: string; + instanceIds?: string[]; + status?: SharedSessionRecord['status']; + } + ): Promise { + await this.initialize(); + + const filePath = this.getSessionFile(input.sessionId); + const existing = await this.readJsonFile(filePath); + const now = Date.now(); + const nextInstanceIds = new Set(existing?.instanceIds || []); + + if (input.instanceId) { + nextInstanceIds.add(input.instanceId); + } + for (const instanceId of input.instanceIds || []) { + nextInstanceIds.add(instanceId); + } + + const record: SharedSessionRecord = { + sessionId: input.sessionId, + tool: input.tool, + projectId: + normalizeProjectId(input.projectId, input.projectPath) ?? + existing?.projectId, + projectPath: input.projectPath ?? existing?.projectPath, + branch: input.branch ?? existing?.branch, + startedAt: existing?.startedAt ?? input.startedAt ?? now, + lastSeenAt: input.lastSeenAt ?? now, + status: input.status ?? existing?.status ?? 'active', + instanceIds: Array.from(nextInstanceIds), + metadata: { + ...(existing?.metadata || {}), + ...(input.metadata || {}), + }, + }; + + await this.writeJsonFile(filePath, record); + return record; + } + + async endSession( + sessionId: string, + status: SharedSessionRecord['status'] = 'closed' + ): Promise { + await this.initialize(); + + const filePath = this.getSessionFile(sessionId); + const existing = await this.readJsonFile(filePath); + if (!existing) { + return; + } + + const updated: SharedSessionRecord = { + ...existing, + status, + lastSeenAt: Date.now(), + }; + await this.writeJsonFile(filePath, updated); + } + + async saveGitHubPullRequest( + projection: GitHubPullRequestProjection + ): Promise { + await this.initialize(); + + const normalizedProjectId = + normalizeProjectId(projection.projectId, projection.projectPath) ?? + projection.projectId; + const filePath = this.getGitHubPullRequestFile( + projection.repo, + projection.branch + ); + const record: GitHubPullRequestProjection = { + ...projection, + projectId: normalizedProjectId, + lastSyncedAt: projection.lastSyncedAt || Date.now(), + }; + await this.writeJsonFile(filePath, record); + return record; + } + + async claimPaths( + input: Omit< + SharedPathClaimRecord, + 'claimId' | 'claimedAt' | 'lastSeenAt' | 'expiresAt' | 'status' + > & { + claimId?: string; + ttlMs?: number; + lastSeenAt?: number; + expiresAt?: number; + status?: SharedPathClaimRecord['status']; + } + ): Promise { + await this.initialize(); + await this.cleanupExpiredClaims(); + + const now = input.lastSeenAt ?? 
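// Illustrative sketch (hypothetical ids): upserting an instance that carries a
// sessionId also upserts the owning session, so session.instanceIds stays in
// sync without a separate call:
async function demoLink(): Promise<void> {
  await canonicalStateStore.upsertInstance({
    instanceId: 'codex-1',
    tool: 'codex',
    sessionId: 'sess-1',
    projectPath: '/tmp/demo',
    status: 'active',
  });
  const session = (await canonicalStateStore.listSessions()).find(
    (s) => s.sessionId === 'sess-1'
  );
  // session?.instanceIds now includes 'codex-1'
}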
Date.now(); + const existing = input.claimId + ? await this.readJsonFile( + this.getClaimFile(input.claimId) + ) + : null; + const record: SharedPathClaimRecord = { + claimId: input.claimId || randomUUID(), + tool: input.tool, + sessionId: input.sessionId ?? existing?.sessionId, + instanceId: input.instanceId ?? existing?.instanceId, + projectId: + normalizeProjectId(input.projectId, input.projectPath) ?? + existing?.projectId, + projectPath: input.projectPath ?? existing?.projectPath, + branch: input.branch ?? existing?.branch, + paths: Array.from( + new Set( + (input.paths ?? existing?.paths ?? []) + .map((item) => item.trim()) + .filter(Boolean) + ) + ), + status: input.status ?? 'active', + claimedAt: existing?.claimedAt ?? now, + lastSeenAt: now, + expiresAt: + input.expiresAt ?? + now + Math.max(1, input.ttlMs ?? 24 * 60 * 60 * 1000), + metadata: { + ...(existing?.metadata || {}), + ...(input.metadata || {}), + }, + }; + + const conflicts = ( + await this.listPathClaims({ + projectId: record.projectId, + projectPath: record.projectPath, + activeOnly: true, + }) + ) + .filter((claim) => claim.claimId !== record.claimId) + .filter((claim) => this.claimsOverlap(record, claim)) + .map((claim) => ({ + claimId: claim.claimId, + branch: claim.branch, + paths: claim.paths, + sessionId: claim.sessionId, + instanceId: claim.instanceId, + })); + + await this.writeJsonFile(this.getClaimFile(record.claimId), record); + return { record, conflicts }; + } + + async releaseClaims(options: { + claimId?: string; + instanceId?: string; + sessionId?: string; + projectId?: string; + projectPath?: string; + branch?: string; + reason?: string; + }): Promise { + await this.initialize(); + + const now = Date.now(); + let released = 0; + const claims = await this.listPathClaims({ + projectId: options.projectId, + projectPath: options.projectPath, + activeOnly: false, + }); + + for (const claim of claims) { + if (claim.status !== 'active') { + continue; + } + if (options.claimId && claim.claimId !== options.claimId) { + continue; + } + if (options.instanceId && claim.instanceId !== options.instanceId) { + continue; + } + if (options.sessionId && claim.sessionId !== options.sessionId) { + continue; + } + if (options.branch && claim.branch !== options.branch) { + continue; + } + + await this.writeJsonFile(this.getClaimFile(claim.claimId), { + ...claim, + status: 'released', + lastSeenAt: now, + releasedAt: now, + releaseReason: options.reason || claim.releaseReason, + }); + released++; + } + + return released; + } + + async listPathClaims(options?: { + projectId?: string; + projectPath?: string; + activeOnly?: boolean; + }): Promise { + await this.initialize(); + await this.cleanupExpiredClaims(); + + const projectId = normalizeProjectId( + options?.projectId, + options?.projectPath + ); + const dir = this.getClaimsDir(); + const entries = await fs.readdir(dir); + const claims = await Promise.all( + entries + .filter((entry) => entry.endsWith('.json')) + .map((entry) => + this.readJsonFile(path.join(dir, entry)) + ) + ); + + return (claims.filter(Boolean) as SharedPathClaimRecord[]) + .filter((claim) => !options?.activeOnly || claim.status === 'active') + .filter( + (claim) => + !projectId || + this.matchesProject( + claim.projectId, + claim.projectPath, + projectId, + options?.projectPath + ) + ) + .sort((a, b) => b.lastSeenAt - a.lastSeenAt); + } + + async listActiveProjectPaths(): Promise { + await this.initialize(); + + const projectPaths = new Set(); + const sessions = await this.listSessions(); + for (const 
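// Illustrative sketch: releaseClaims above releases every *active* claim that
// matches all supplied filters; an omitted filter matches anything. The
// predicate, isolated:
function matchesRelease(
  claim: { claimId: string; instanceId?: string; sessionId?: string; branch?: string },
  f: { claimId?: string; instanceId?: string; sessionId?: string; branch?: string }
): boolean {
  return (
    (!f.claimId || claim.claimId === f.claimId) &&
    (!f.instanceId || claim.instanceId === f.instanceId) &&
    (!f.sessionId || claim.sessionId === f.sessionId) &&
    (!f.branch || claim.branch === f.branch)
  );
}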
session of sessions) { + if (session.status === 'active' && session.projectPath) { + projectPaths.add(session.projectPath); + } + } + + const instances = await this.listInstances(); + for (const instance of instances) { + if (instance.status === 'active' && instance.projectPath) { + projectPaths.add(instance.projectPath); + } + } + + const pullRequests = await this.listGitHubPullRequests(); + for (const projection of pullRequests) { + if (projection.projectPath) { + projectPaths.add(projection.projectPath); + } + } + + return Array.from(projectPaths).sort(); + } + + async getGitHubPullRequest(options: { + repo: string; + branch: string; + }): Promise { + await this.initialize(); + return this.readJsonFile( + this.getGitHubPullRequestFile(options.repo, options.branch) + ); + } + + async listGitHubPullRequests(options?: { + projectId?: string; + projectPath?: string; + }): Promise { + await this.initialize(); + + const projectId = normalizeProjectId( + options?.projectId, + options?.projectPath + ); + const dir = this.getGithubDir(); + const entries = await fs.readdir(dir); + const records = await Promise.all( + entries + .filter((entry) => entry.endsWith('.json')) + .map((entry) => + this.readJsonFile(path.join(dir, entry)) + ) + ); + + return (records.filter(Boolean) as GitHubPullRequestProjection[]).filter( + (record) => + !projectId || + this.matchesProject( + record.projectId, + record.projectPath, + projectId, + options?.projectPath + ) + ); + } + + async appendEvent( + input: Omit & { + id?: string; + timestamp?: number; + } + ): Promise { + await this.initialize(); + + const event: SharedStateEvent = { + id: input.id || randomUUID(), + timestamp: input.timestamp || Date.now(), + type: input.type, + tool: input.tool, + instanceId: input.instanceId, + sessionId: input.sessionId, + projectId: normalizeProjectId(input.projectId, input.projectPath), + projectPath: input.projectPath, + branch: input.branch, + payload: input.payload || {}, + }; + + const date = new Date(event.timestamp).toISOString().slice(0, 10); + const eventFile = path.join(this.getEventsDir(), `${date}.jsonl`); + await fs.appendFile(eventFile, `${JSON.stringify(event)}\n`, 'utf8'); + + if (event.instanceId) { + const instance = await this.readJsonFile( + this.getInstanceFile(event.instanceId) + ); + if (instance) { + await this.upsertInstance({ + ...instance, + lastSeenAt: event.timestamp, + }); + } + } + + if (event.sessionId) { + const session = await this.readJsonFile( + this.getSessionFile(event.sessionId) + ); + if (session) { + await this.upsertSession({ + ...session, + lastSeenAt: event.timestamp, + instanceIds: session.instanceIds, + }); + } + } + + return event; + } + + async listSessions(): Promise { + await this.initialize(); + + const dir = this.getSessionsDir(); + const entries = await fs.readdir(dir); + const sessions = await Promise.all( + entries + .filter((entry) => entry.endsWith('.json')) + .map((entry) => + this.readJsonFile(path.join(dir, entry)) + ) + ); + + return sessions.filter(Boolean) as SharedSessionRecord[]; + } + + async listInstances(): Promise { + await this.initialize(); + + const dir = this.getInstancesDir(); + const entries = await fs.readdir(dir); + const instances = await Promise.all( + entries + .filter((entry) => entry.endsWith('.json')) + .map((entry) => + this.readJsonFile(path.join(dir, entry)) + ) + ); + + return instances.filter(Boolean) as SharedInstanceRecord[]; + } + + async getProjectSummary(options: { + projectId?: string; + projectPath?: string; + eventLimit?: number; + 
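// Illustrative sketch: appendEvent writes one JSONL file per UTC day, keyed by
// the event timestamp, so a reader only has to scan recent date-named files:
const day = new Date(1767225600000).toISOString().slice(0, 10);
// -> '2026-01-01', i.e. the event lands in events/2026-01-01.jsonl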
}): Promise { + await this.initialize(); + + const projectId = normalizeProjectId( + options.projectId, + options.projectPath + ); + const sessions = (await this.listSessions()).filter( + (session) => + session.status === 'active' && + this.matchesProject( + session.projectId, + session.projectPath, + projectId, + options.projectPath + ) + ); + const instances = (await this.listInstances()).filter( + (instance) => + instance.status === 'active' && + this.matchesProject( + instance.projectId, + instance.projectPath, + projectId, + options.projectPath + ) + ); + const activeClaims = await this.listPathClaims({ + projectId, + projectPath: options.projectPath, + activeOnly: true, + }); + + const recentEvents = await this.listRecentEvents({ + projectId, + projectPath: options.projectPath, + limit: options.eventLimit || 10, + }); + + return { + projectId, + projectPath: options.projectPath, + activeSessions: sessions.sort((a, b) => b.lastSeenAt - a.lastSeenAt), + activeInstances: instances.sort((a, b) => b.lastSeenAt - a.lastSeenAt), + activeClaims, + recentEvents, + }; + } + + private async cleanupExpiredClaims(): Promise { + const dir = this.getClaimsDir(); + const entries = await fs.readdir(dir).catch(() => [] as string[]); + const now = Date.now(); + + for (const entry of entries) { + if (!entry.endsWith('.json')) { + continue; + } + const filePath = path.join(dir, entry); + const claim = await this.readJsonFile(filePath); + if (!claim || claim.status !== 'active' || claim.expiresAt > now) { + continue; + } + await this.writeJsonFile(filePath, { + ...claim, + status: 'expired', + releasedAt: now, + releaseReason: claim.releaseReason || 'expired', + lastSeenAt: now, + }); + } + } + + private claimsOverlap( + left: SharedPathClaimRecord, + right: SharedPathClaimRecord + ): boolean { + if ( + left.branch && + right.branch && + left.branch.trim() && + right.branch.trim() && + left.branch === right.branch + ) { + return true; + } + + for (const leftPath of left.paths) { + for (const rightPath of right.paths) { + if (this.pathsOverlap(leftPath, rightPath)) { + return true; + } + } + } + + return false; + } + + private pathsOverlap(left: string, right: string): boolean { + const normalizedLeft = this.normalizeClaimPath(left); + const normalizedRight = this.normalizeClaimPath(right); + + if (!normalizedLeft || !normalizedRight) { + return false; + } + if ( + normalizedLeft === '*' || + normalizedRight === '*' || + normalizedLeft === '.' || + normalizedRight === '.' 
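// Illustrative sketch: a standalone equivalent of the private overlap test
// above, plus worked examples of what does and does not collide:
function overlap(a: string, b: string): boolean {
  const norm = (p: string) =>
    p.trim().replace(/\\/g, '/').replace(/\/\*\*$/, '').replace(/\/$/, '');
  const l = norm(a);
  const r = norm(b);
  if (!l || !r) return false;
  if (l === '*' || r === '*' || l === '.' || r === '.') return true;
  return l === r || l.startsWith(`${r}/`) || r.startsWith(`${l}/`);
}
// overlap('src/core', 'src/core/session') === true  (prefix containment)
// overlap('src/core', 'src/corelib')      === false (a '/' boundary is required)
// overlap('docs/**', 'docs')              === true  (the '/**' suffix is stripped)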
+ ) { + return true; + } + if (normalizedLeft === normalizedRight) { + return true; + } + + return ( + normalizedLeft.startsWith(`${normalizedRight}/`) || + normalizedRight.startsWith(`${normalizedLeft}/`) + ); + } + + private normalizeClaimPath(value: string): string { + return value + .trim() + .replace(/\\/g, '/') + .replace(/\/\*\*$/, '') + .replace(/\/$/, ''); + } + + async listRecentEvents(options: { + projectId?: string; + projectPath?: string; + limit?: number; + }): Promise { + await this.initialize(); + + const projectId = normalizeProjectId( + options.projectId, + options.projectPath + ); + const eventDir = this.getEventsDir(); + const eventFiles = (await fs.readdir(eventDir)) + .filter((entry) => entry.endsWith('.jsonl')) + .sort() + .reverse() + .slice(0, 7); + + const events: SharedStateEvent[] = []; + for (const entry of eventFiles) { + const filePath = path.join(eventDir, entry); + const content = await fs.readFile(filePath, 'utf8'); + const lines = content + .split('\n') + .map((line) => line.trim()) + .filter(Boolean); + + for (let i = lines.length - 1; i >= 0; i--) { + try { + const parsed = JSON.parse(lines[i]) as SharedStateEvent; + if ( + this.matchesProject( + parsed.projectId, + parsed.projectPath, + projectId, + options.projectPath + ) + ) { + events.push(parsed); + } + } catch { + // Skip malformed event lines. + } + if (events.length >= (options.limit || 20)) { + return events; + } + } + } + + return events; + } + + private matchesProject( + candidateProjectId: string | undefined, + candidateProjectPath: string | undefined, + projectId: string | undefined, + projectPath: string | undefined + ): boolean { + if (projectId && candidateProjectId) { + return candidateProjectId === projectId; + } + if (projectPath && candidateProjectPath) { + return candidateProjectPath === projectPath; + } + return !projectId && !projectPath; + } + + private getInstancesDir(): string { + return path.join(this.rootDir, 'instances'); + } + + private getSessionsDir(): string { + return path.join(this.rootDir, 'sessions'); + } + + private getEventsDir(): string { + return path.join(this.rootDir, 'events'); + } + + private getGithubDir(): string { + return path.join(this.rootDir, 'github', 'pull-requests'); + } + + private getClaimsDir(): string { + return path.join(this.rootDir, 'claims'); + } + + private getInstanceFile(instanceId: string): string { + return path.join(this.getInstancesDir(), `${instanceId}.json`); + } + + private getSessionFile(sessionId: string): string { + return path.join(this.getSessionsDir(), `${sessionId}.json`); + } + + private getGitHubPullRequestFile(repo: string, branch: string): string { + const slug = `${repo}__${branch}` + .replace(/[\\/]/g, '__') + .replace(/[^a-zA-Z0-9_.-]/g, '-'); + return path.join(this.getGithubDir(), `${slug}.json`); + } + + private getClaimFile(claimId: string): string { + return path.join(this.getClaimsDir(), `${claimId}.json`); + } + + private async readJsonFile(filePath: string): Promise { + if (!(await pathExists(filePath))) { + return null; + } + + const content = await fs.readFile(filePath, 'utf8'); + return JSON.parse(content) as T; + } + + private async writeJsonFile(filePath: string, value: unknown): Promise { + const dir = path.dirname(filePath); + await fs.mkdir(dir, { recursive: true }); + + const tempPath = `${filePath}.${process.pid}.tmp`; + await fs.writeFile(tempPath, JSON.stringify(value, null, 2)); + await fs.rename(tempPath, filePath); + } +} + +export const canonicalStateStore = new CanonicalStateStore(); +export { 
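// Illustrative sketch: writeJsonFile above gets crash safety from the classic
// write-temp-then-rename pattern (rename is atomic within a single POSIX
// volume), so concurrent readers never observe a half-written record:
import * as fs from 'fs/promises';
import * as path from 'path';

async function atomicWriteJson(filePath: string, value: unknown): Promise<void> {
  await fs.mkdir(path.dirname(filePath), { recursive: true });
  const tempPath = `${filePath}.${process.pid}.tmp`;
  await fs.writeFile(tempPath, JSON.stringify(value, null, 2));
  await fs.rename(tempPath, filePath);
}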
+ getBaseStateDir as getCanonicalStateBaseDir, + normalizeProjectId, + projectIdFromIdentifier, +}; diff --git a/src/daemon/daemon-config.ts b/src/daemon/daemon-config.ts index 32e2d2f6..1e846a6a 100644 --- a/src/daemon/daemon-config.ts +++ b/src/daemon/daemon-config.ts @@ -65,6 +65,7 @@ export interface DaemonConfig { version: string; context: ContextServiceConfig; linear: LinearServiceConfig; + github: DaemonServiceConfig; maintenance: MaintenanceServiceConfig; memory: MemoryServiceConfig; fileWatch: FileWatchConfig; @@ -87,6 +88,10 @@ export const DEFAULT_DAEMON_CONFIG: DaemonConfig = { retryAttempts: 3, retryDelay: 30000, }, + github: { + enabled: false, + interval: 5, + }, maintenance: { enabled: true, interval: 360, // 6 hours @@ -124,6 +129,12 @@ export interface DaemonStatus { services: { context: { enabled: boolean; lastRun?: number; saveCount?: number }; linear: { enabled: boolean; lastRun?: number; syncCount?: number }; + github: { + enabled: boolean; + lastRun?: number; + syncCount?: number; + lastProjectionState?: string; + }; maintenance: { enabled: boolean; lastRun?: number; @@ -200,6 +211,7 @@ export function loadDaemonConfig(): DaemonConfig { ...config, context: { ...DEFAULT_DAEMON_CONFIG.context, ...config.context }, linear: { ...DEFAULT_DAEMON_CONFIG.linear, ...config.linear }, + github: { ...DEFAULT_DAEMON_CONFIG.github, ...config.github }, maintenance: { ...DEFAULT_DAEMON_CONFIG.maintenance, ...config.maintenance, @@ -223,6 +235,7 @@ export function saveDaemonConfig(config: Partial): void { ...config, context: { ...currentConfig.context, ...config.context }, linear: { ...currentConfig.linear, ...config.linear }, + github: { ...currentConfig.github, ...config.github }, maintenance: { ...currentConfig.maintenance, ...config.maintenance }, memory: { ...currentConfig.memory, ...config.memory }, fileWatch: { ...currentConfig.fileWatch, ...config.fileWatch }, @@ -241,6 +254,7 @@ export function readDaemonStatus(): DaemonStatus { services: { context: { enabled: false }, linear: { enabled: false }, + github: { enabled: false }, maintenance: { enabled: false }, memory: { enabled: false }, fileWatch: { enabled: false }, diff --git a/src/daemon/services/github-service.ts b/src/daemon/services/github-service.ts new file mode 100644 index 00000000..f1892281 --- /dev/null +++ b/src/daemon/services/github-service.ts @@ -0,0 +1,158 @@ +import { existsSync } from 'fs'; +import { join } from 'path'; +import { homedir } from 'os'; +import type { DaemonServiceConfig } from '../daemon-config.js'; +import { refreshCurrentRepoPullRequestState } from '../../integrations/github/pr-state.js'; +import { canonicalStateStore } from '../../core/shared-state/canonical-store.js'; + +export interface GitHubServiceState { + lastSyncTime: number; + syncCount: number; + errors: string[]; + nextSyncTime?: number; + lastProjectionState?: string; + lastProjectsScanned?: number; +} + +export class DaemonGitHubService { + private config: DaemonServiceConfig; + private state: GitHubServiceState; + private intervalId?: NodeJS.Timeout; + private isRunning = false; + private onLog: (level: string, message: string, data?: unknown) => void; + + constructor( + config: DaemonServiceConfig, + onLog: (level: string, message: string, data?: unknown) => void + ) { + this.config = config; + this.onLog = onLog; + this.state = { + lastSyncTime: 0, + syncCount: 0, + errors: [], + }; + } + + async start(): Promise { + if (this.isRunning || !this.config.enabled) { + return; + } + + if (!this.isGitHubConfigured()) { + 
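// Illustrative sketch: the new github block ships disabled (enabled: false,
// interval: 5 minutes) and is merged shallowly over the defaults, so a saved
// config only needs the keys it overrides:
const merged = { ...{ enabled: false, interval: 5 }, ...{ enabled: true } };
// -> { enabled: true, interval: 5 }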
this.onLog('WARN', 'GitHub CLI not configured, skipping github service'); + return; + } + + this.isRunning = true; + const intervalMs = this.config.interval * 60 * 1000; + + this.onLog('INFO', 'GitHub service started', { + interval: this.config.interval, + }); + + await this.performSync(); + + this.intervalId = setInterval(async () => { + await this.performSync(); + }, intervalMs); + } + + stop(): void { + if (this.intervalId) { + clearInterval(this.intervalId); + this.intervalId = undefined; + } + this.isRunning = false; + this.onLog('INFO', 'GitHub service stopped'); + } + + getState(): GitHubServiceState { + return { + ...this.state, + nextSyncTime: this.isRunning + ? this.state.lastSyncTime + this.config.interval * 60 * 1000 + : undefined, + }; + } + + async forceSync(): Promise { + await this.performSync(); + } + + private async performSync(): Promise { + if (!this.isRunning) return; + + try { + const projectRoots = await this.getProjectRoots(); + this.state.lastProjectsScanned = projectRoots.length; + if (projectRoots.length === 0) { + this.onLog('DEBUG', 'No active project roots found for GitHub sync'); + return; + } + + let synced = false; + for (const projectRoot of projectRoots) { + const projection = + await refreshCurrentRepoPullRequestState(projectRoot); + if (!projection) { + this.onLog('DEBUG', 'No GitHub PR projection available', { + projectRoot, + }); + continue; + } + + synced = true; + this.state.syncCount++; + this.state.lastSyncTime = Date.now(); + this.state.lastProjectionState = projection.state; + + this.onLog('INFO', 'GitHub PR projection refreshed', { + projectRoot, + repo: projection.repo, + branch: projection.branch, + prNumber: projection.prNumber, + state: projection.state, + }); + } + if (!synced) { + this.state.lastSyncTime = Date.now(); + } + } catch (err) { + const errorMsg = err instanceof Error ? 
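// Illustrative sketch: start() above runs one sync immediately, then repeats on
// a fixed timer; the schedule, isolated (interval is configured in minutes):
async function schedule(
  intervalMinutes: number,
  run: () => Promise<void>
): Promise<NodeJS.Timeout> {
  await run(); // initial sync at startup
  return setInterval(() => void run(), intervalMinutes * 60 * 1000);
}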
err.message : String(err); + this.state.errors.push(errorMsg); + this.onLog('ERROR', 'GitHub sync failed', { error: errorMsg }); + + if (this.state.errors.length > 10) { + this.state.errors = this.state.errors.slice(-10); + } + } + } + + private isGitHubConfigured(): boolean { + try { + return existsSync(join(homedir(), '.config', 'gh', 'hosts.yml')); + } catch { + return false; + } + } + + private async getProjectRoots(): Promise { + const roots = new Set(); + const activeProjectPaths = + await canonicalStateStore.listActiveProjectPaths(); + + for (const projectPath of activeProjectPaths) { + if (existsSync(join(projectPath, '.git'))) { + roots.add(projectPath); + } + } + + const cwd = process.cwd(); + if (existsSync(join(cwd, '.git'))) { + roots.add(cwd); + } + + return Array.from(roots).sort(); + } +} diff --git a/src/daemon/unified-daemon.ts b/src/daemon/unified-daemon.ts index 08935dff..0bf7ffd4 100644 --- a/src/daemon/unified-daemon.ts +++ b/src/daemon/unified-daemon.ts @@ -26,6 +26,7 @@ import { } from './daemon-config.js'; import { DaemonContextService } from './services/context-service.js'; import { DaemonLinearService } from './services/linear-service.js'; +import { DaemonGitHubService } from './services/github-service.js'; import { DaemonMaintenanceService } from './services/maintenance-service.js'; import { DaemonMemoryService } from './services/memory-service.js'; @@ -42,6 +43,7 @@ export class UnifiedDaemon { private paths: ReturnType; private contextService: DaemonContextService; private linearService: DaemonLinearService; + private githubService: DaemonGitHubService; private maintenanceService: DaemonMaintenanceService; private memoryService: DaemonMemoryService; private heartbeatInterval?: NodeJS.Timeout; @@ -63,6 +65,11 @@ export class UnifiedDaemon { (level, msg, data) => this.log(level, 'linear', msg, data) ); + this.githubService = new DaemonGitHubService( + this.config.github, + (level, msg, data) => this.log(level, 'github', msg, data) + ); + this.maintenanceService = new DaemonMaintenanceService( this.config.maintenance, (level, msg, data) => this.log(level, 'maintenance', msg, data) @@ -142,6 +149,7 @@ export class UnifiedDaemon { private updateStatus(): void { const maintenanceState = this.maintenanceService.getState(); const memoryState = this.memoryService.getState(); + const githubState = this.githubService.getState(); const status: DaemonStatus = { running: true, pid: process.pid, @@ -158,6 +166,12 @@ export class UnifiedDaemon { lastRun: this.linearService.getState().lastSyncTime || undefined, syncCount: this.linearService.getState().syncCount, }, + github: { + enabled: this.config.github.enabled, + lastRun: githubState.lastSyncTime || undefined, + syncCount: githubState.syncCount, + lastProjectionState: githubState.lastProjectionState, + }, maintenance: { enabled: this.config.maintenance.enabled, lastRun: maintenanceState.lastRunTime || undefined, @@ -180,6 +194,7 @@ export class UnifiedDaemon { errors: [ ...this.contextService.getState().errors.slice(-5), ...this.linearService.getState().errors.slice(-5), + ...githubState.errors.slice(-5), ...maintenanceState.errors.slice(-5), ...memoryState.errors.slice(-5), ], @@ -240,6 +255,12 @@ export class UnifiedDaemon { enabled: false, syncCount: this.linearService.getState().syncCount, }, + github: { + enabled: false, + syncCount: this.githubService.getState().syncCount, + lastProjectionState: + this.githubService.getState().lastProjectionState, + }, maintenance: { enabled: false, staleFramesCleaned: @@ -266,6 
+287,7 @@ export class UnifiedDaemon { uptime: Date.now() - this.startTime, contextSaves: this.contextService.getState().saveCount, linearSyncs: this.linearService.getState().syncCount, + githubSyncs: this.githubService.getState().syncCount, maintenanceRuns: this.maintenanceService.getState().ftsRebuilds, memoryTriggers: this.memoryService.getState().triggerCount, }); @@ -279,6 +301,7 @@ export class UnifiedDaemon { // Stop services this.contextService.stop(); this.linearService.stop(); + this.githubService.stop(); this.maintenanceService.stop(); this.memoryService.stop(); @@ -312,6 +335,7 @@ export class UnifiedDaemon { config: { context: this.config.context.enabled, linear: this.config.linear.enabled, + github: this.config.github.enabled, maintenance: this.config.maintenance.enabled, memory: this.config.memory.enabled, fileWatch: this.config.fileWatch.enabled, @@ -321,6 +345,7 @@ export class UnifiedDaemon { // Start services this.contextService.start(); await this.linearService.start(); + await this.githubService.start(); this.maintenanceService.start(); this.memoryService.start(); @@ -336,6 +361,7 @@ export class UnifiedDaemon { getStatus(): DaemonStatus { const maintenanceState = this.maintenanceService.getState(); const memoryState = this.memoryService.getState(); + const githubState = this.githubService.getState(); return { running: !this.isShuttingDown, pid: process.pid, @@ -352,6 +378,12 @@ export class UnifiedDaemon { lastRun: this.linearService.getState().lastSyncTime || undefined, syncCount: this.linearService.getState().syncCount, }, + github: { + enabled: this.config.github.enabled, + lastRun: githubState.lastSyncTime || undefined, + syncCount: githubState.syncCount, + lastProjectionState: githubState.lastProjectionState, + }, maintenance: { enabled: this.config.maintenance.enabled, lastRun: maintenanceState.lastRunTime || undefined, diff --git a/src/features/sweep/pty-wrapper.ts b/src/features/sweep/pty-wrapper.ts index 7b6cf5c8..57ecf744 100644 --- a/src/features/sweep/pty-wrapper.ts +++ b/src/features/sweep/pty-wrapper.ts @@ -33,6 +33,8 @@ export interface PtyWrapperConfig { claudeArgs?: string[]; stateFile?: string; initialInput?: string; + onExit?: (exitCode: number) => Promise | void; + onSignal?: (signal: 'SIGINT' | 'SIGTERM') => Promise | void; } // Minimal interface for node-pty process to avoid compile-time dep @@ -59,6 +61,8 @@ export class PtyWrapper { claudeArgs: config.claudeArgs || [], stateFile: config.stateFile || getSweepPath('sweep-state.json'), initialInput: config.initialInput || '', + onExit: config.onExit || (() => undefined), + onSignal: config.onSignal || (() => undefined), }; this.stateWatcher = new SweepStateWatcher(this.config.stateFile); @@ -174,8 +178,9 @@ export class PtyWrapper { }); // Handle PTY exit - this.ptyProcess.onExit(({ exitCode }) => { + this.ptyProcess.onExit(async ({ exitCode }) => { this.cleanup(); + await this.config.onExit(exitCode); // Sync Linear on exit if configured if (process.env['LINEAR_API_KEY']) { try { @@ -191,12 +196,17 @@ export class PtyWrapper { }); // Handle signals - const onSignal = () => { + const onSignal = async (signal: 'SIGINT' | 'SIGTERM') => { this.cleanup(); + await this.config.onSignal(signal); process.exit(0); }; - process.on('SIGINT', onSignal); - process.on('SIGTERM', onSignal); + process.on('SIGINT', () => { + void onSignal('SIGINT'); + }); + process.on('SIGTERM', () => { + void onSignal('SIGTERM'); + }); } private acceptPrediction(): void { diff --git a/src/integrations/github/pr-state.ts 
b/src/integrations/github/pr-state.ts new file mode 100644 index 00000000..cbde49b0 --- /dev/null +++ b/src/integrations/github/pr-state.ts @@ -0,0 +1,209 @@ +import { execFileSync } from 'child_process'; +import { + canonicalStateStore, + type GitHubPullRequestProjection, +} from '../../core/shared-state/canonical-store.js'; +import { projectIdFromIdentifier } from '../../core/shared-state/canonical-store.js'; + +export interface CurrentRepoGitHubInfo { + repo: string; + branch: string; + projectPath: string; + projectId: string; +} + +interface GhPrViewResult { + number: number; + title: string; + state: 'OPEN' | 'CLOSED' | 'MERGED'; + isDraft: boolean; + url: string; + baseRefName: string; + headRefName: string; + headRefOid?: string; + mergedAt?: string | null; + updatedAt: string; + reviewDecision?: string | null; + statusCheckRollup?: Array<{ + __typename?: string; + conclusion?: string | null; + status?: string | null; + state?: string | null; + }> | null; +} + +function runGit(args: string[], cwd: string): string { + return execFileSync('git', args, { + cwd, + encoding: 'utf8', + stdio: ['ignore', 'pipe', 'pipe'], + }).trim(); +} + +function runGh(args: string[], cwd: string): string { + return execFileSync('gh', args, { + cwd, + encoding: 'utf8', + stdio: ['ignore', 'pipe', 'pipe'], + }).trim(); +} + +function normalizeRemoteToRepo(remote: string): string { + const cleaned = remote.replace(/\.git$/, '').trim(); + + if (cleaned.startsWith('git@github.com:')) { + return cleaned.replace('git@github.com:', ''); + } + + if (cleaned.startsWith('https://github.com/')) { + return cleaned.replace('https://github.com/', ''); + } + + if (cleaned.startsWith('http://github.com/')) { + return cleaned.replace('http://github.com/', ''); + } + + throw new Error(`Unsupported GitHub remote: ${remote}`); +} + +function summarizeStatusCheckRollup( + rollup: GhPrViewResult['statusCheckRollup'] +): string | undefined { + if (!rollup || rollup.length === 0) { + return undefined; + } + + const states = rollup + .map((item) => item.conclusion || item.status || item.state) + .filter(Boolean) as string[]; + + if (states.length === 0) { + return undefined; + } + + if (states.every((state) => state === 'SUCCESS')) { + return 'SUCCESS'; + } + if (states.some((state) => state === 'FAILURE' || state === 'ERROR')) { + return 'FAILURE'; + } + if ( + states.some( + (state) => + state === 'PENDING' || state === 'IN_PROGRESS' || state === 'EXPECTED' + ) + ) { + return 'PENDING'; + } + + return states[0]; +} + +export function getCurrentRepoGitHubInfo( + cwd: string = process.cwd() +): CurrentRepoGitHubInfo | null { + try { + const projectPath = runGit(['rev-parse', '--show-toplevel'], cwd); + const branch = runGit(['rev-parse', '--abbrev-ref', 'HEAD'], projectPath); + const remote = runGit( + ['config', '--get', 'remote.origin.url'], + projectPath + ); + const repo = normalizeRemoteToRepo(remote); + + return { + repo, + branch, + projectPath, + projectId: projectIdFromIdentifier(remote), + }; + } catch { + return null; + } +} + +export async function refreshCurrentRepoPullRequestState( + cwd: string = process.cwd() +): Promise { + const info = getCurrentRepoGitHubInfo(cwd); + if (!info) { + return null; + } + + try { + const output = runGh( + [ + 'pr', + 'view', + '--repo', + info.repo, + '--json', + [ + 'number', + 'title', + 'state', + 'isDraft', + 'url', + 'baseRefName', + 'headRefName', + 'headRefOid', + 'mergedAt', + 'updatedAt', + 'reviewDecision', + 'statusCheckRollup', + ].join(','), + ], + info.projectPath + 
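// Illustrative sketch (worked examples): both common remote forms normalize to
// the 'owner/repo' slug that keys cached projections, and a mixed check rollup
// summarizes pessimistically (FAILURE beats PENDING beats SUCCESS):
// 'git@github.com:acme/widgets.git'  -> 'acme/widgets'
// 'https://github.com/acme/widgets'  -> 'acme/widgets'
// ['SUCCESS', 'SUCCESS']             -> 'SUCCESS'
// ['SUCCESS', 'PENDING']             -> 'PENDING'
// ['FAILURE', 'PENDING']             -> 'FAILURE'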
); + + const parsed = JSON.parse(output) as GhPrViewResult; + const projection: GitHubPullRequestProjection = { + repo: info.repo, + branch: info.branch, + projectId: info.projectId, + projectPath: info.projectPath, + prNumber: parsed.number, + title: parsed.title, + state: + parsed.mergedAt && parsed.state === 'MERGED' ? 'MERGED' : parsed.state, + isDraft: parsed.isDraft, + url: parsed.url, + baseRefName: parsed.baseRefName, + headRefName: parsed.headRefName, + headRefOid: parsed.headRefOid, + mergedAt: parsed.mergedAt || undefined, + updatedAt: parsed.updatedAt, + reviewDecision: parsed.reviewDecision || undefined, + statusCheckRollup: summarizeStatusCheckRollup(parsed.statusCheckRollup), + lastSyncedAt: Date.now(), + }; + + await canonicalStateStore.saveGitHubPullRequest(projection); + if (projection.state === 'MERGED' || projection.state === 'CLOSED') { + await canonicalStateStore.releaseClaims({ + projectId: info.projectId, + projectPath: info.projectPath, + branch: info.branch, + reason: `github_pr_${projection.state.toLowerCase()}`, + }); + } + await canonicalStateStore.appendEvent({ + type: 'github_pr_refreshed', + tool: 'stackmemory', + projectId: info.projectId, + projectPath: info.projectPath, + branch: info.branch, + payload: { + repo: info.repo, + prNumber: projection.prNumber, + state: projection.state, + reviewDecision: projection.reviewDecision, + statusCheckRollup: projection.statusCheckRollup, + }, + }); + + return projection; + } catch { + return null; + } +} From 10db0938be17c1fd2f90ea7a7381d69b3b196754 Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Tue, 14 Apr 2026 17:43:37 -0400 Subject: [PATCH 06/18] feat: add deterministic harness smoke tooling --- .husky/pre-commit | 53 +-- package.json | 5 + scripts/determinism-pre-commit.sh | 67 ++++ src/cli/claude-sm.ts | 92 ++++-- src/cli/codex-sm.ts | 75 +++-- src/cli/commands/bench.ts | 261 ++++++++++++++- src/cli/commands/handoff.ts | 52 ++- src/cli/gemini-sm.ts | 53 ++- src/cli/index.ts | 17 + src/cli/opencode-sm.ts | 72 ++-- src/cli/utils/determinism-watcher.ts | 90 +++++ src/cli/utils/real-cli-bin.ts | 66 ++++ src/core/monitoring/logger.ts | 3 +- .../session/__tests__/project-handoff.test.ts | 64 ++++ src/core/session/project-handoff.ts | 85 +++++ .../multimodal/__tests__/determinism.test.ts | 103 ++++++ src/orchestrators/multimodal/determinism.ts | 309 ++++++++++++++++++ src/orchestrators/multimodal/harness.ts | 250 ++++++++------ src/orchestrators/multimodal/types.ts | 2 + 19 files changed, 1467 insertions(+), 252 deletions(-) create mode 100755 scripts/determinism-pre-commit.sh create mode 100644 src/cli/utils/determinism-watcher.ts create mode 100644 src/cli/utils/real-cli-bin.ts create mode 100644 src/core/session/__tests__/project-handoff.test.ts create mode 100644 src/core/session/project-handoff.ts create mode 100644 src/orchestrators/multimodal/__tests__/determinism.test.ts create mode 100644 src/orchestrators/multimodal/determinism.ts diff --git a/.husky/pre-commit b/.husky/pre-commit index 869a3294..b0e2844d 100755 --- a/.husky/pre-commit +++ b/.husky/pre-commit @@ -1,43 +1,14 @@ -#!/bin/sh -# GITBUTLER_MANAGED_HOOK_V1 -# This hook is managed by GitButler to prevent accidental commits on the workspace branch. -# Your original pre-commit hook has been preserved as 'pre-commit-user'. - -HOOKS_DIR=$(dirname "$0") - -# Run user's hook first if it exists - if it fails, stop here -if [ -x "$HOOKS_DIR/pre-commit-user" ]; then - "$HOOKS_DIR/pre-commit-user" "$@" || exit $? 
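// Illustrative sketch (hypothetical values): when a refreshed PR comes back
// MERGED or CLOSED, the branch's claims are released with a matching reason so
// the next agent can take the branch over; the equivalent direct call:
async function releaseMergedBranch(): Promise<number> {
  return canonicalStateStore.releaseClaims({
    projectPath: '/tmp/demo',
    branch: 'feature/x',
    reason: 'github_pr_merged',
  });
}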
-fi - -# Get the current branch name -BRANCH=$(git symbolic-ref --short HEAD 2>/dev/null) - -if [ "$BRANCH" = "gitbutler/workspace" ]; then - echo "" - echo "GITBUTLER_ERROR: Cannot commit directly to gitbutler/workspace branch." - echo "" - echo "GitButler manages commits on this branch. Please use GitButler to commit your changes:" - echo " - Use the GitButler app to create commits" - echo " - Or run 'but commit' from the command line" - echo "" - echo "If you want to exit GitButler mode and use normal git:" - echo " - Run 'but teardown' to switch to a regular branch" - echo " - Or directly checkout another branch: git checkout " - echo "" - echo "If you no longer have the GitButler CLI installed, you can simply remove this hook and checkout another branch:" - printf ' rm "%s/pre-commit"\n' "$HOOKS_DIR" - echo "" - exit 1 -fi - -# Not on workspace branch - run user's original hook if it exists -if [ -x "$HOOKS_DIR/pre-commit-user" ]; then - echo "" - echo "WARNING: GitButler's pre-commit hook is still installed but you're not on gitbutler/workspace." - echo "If you're no longer using GitButler, you can restore your original hook:" - printf ' mv "%s/pre-commit-user" "%s/pre-commit"\n' "$HOOKS_DIR" "$HOOKS_DIR" - echo "" +# Use Node version from .nvmrc +export NVM_DIR="$HOME/.nvm" +if [ -s "$NVM_DIR/nvm.sh" ]; then + . "$NVM_DIR/nvm.sh" + nvm use 2>/dev/null +elif [ -d "$HOME/.nvm/versions/node" ]; then + NODE_VER=$(cat "$(git rev-parse --show-toplevel)/.nvmrc" 2>/dev/null || echo "20") + NODE_PATH=$(ls -d "$HOME/.nvm/versions/node/v${NODE_VER}"* 2>/dev/null | head -1) + [ -n "$NODE_PATH" ] && export PATH="$NODE_PATH/bin:$PATH" fi -exit 0 +npx lint-staged +bash scripts/determinism-pre-commit.sh +npm run build diff --git a/package.json b/package.json index 52284729..75a89d9b 100644 --- a/package.json +++ b/package.json @@ -114,6 +114,11 @@ "test:run": "vitest run", "test:pre-publish": "./scripts/test-pre-publish-quick.sh", "test:pre-commit": "vitest related --run --reporter=dot --silent --bail=1", + "determinism:smoke": "node --import tsx src/cli/index.ts bench determinism --task \"Determinism probe\" --runs 5", + "determinism:watch": "node --import tsx src/cli/index.ts bench determinism --task \"Determinism probe\" --runs 3 --watch", + "determinism:latest": "node --import tsx src/cli/index.ts bench determinism --latest --json", + "determinism:test": "npx vitest run src/orchestrators/multimodal/__tests__/determinism.test.ts --reporter=dot", + "determinism:pre-commit": "bash scripts/determinism-pre-commit.sh", "prepublishOnly": "npm run build && npm run verify:dist && npm run test:pre-publish", "quality": "npm run lint && npm run test:run && npm run build", "dev": "tsx watch src/integrations/mcp/server.ts", diff --git a/scripts/determinism-pre-commit.sh b/scripts/determinism-pre-commit.sh new file mode 100755 index 00000000..6ff8fb72 --- /dev/null +++ b/scripts/determinism-pre-commit.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +set -euo pipefail + +PROJECT_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || pwd)" +cd "$PROJECT_ROOT" + +BLUE='\033[0;34m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log_info() { + echo -e "${BLUE}[determinism]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[determinism]${NC} $1" +} + +log_skip() { + echo -e "${YELLOW}[determinism]${NC} $1" +} + +if [ "${STACKMEMORY_DETERMINISM_SKIP:-0}" = "1" ]; then + log_skip "Skipping because STACKMEMORY_DETERMINISM_SKIP=1" + exit 0 +fi + +CHANGED_FILES="${STACKMEMORY_DETERMINISM_FILES:-}" +if [ -z "$CHANGED_FILES" 
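# Illustrative note (not part of the patch): the checks above decide whether the
# guard runs at all (explicit skip flag, then staged-file detection); the steps
# below then filter for determinism-relevant paths and require the bench
# report's `score` field to come back as exactly 100 before the commit proceeds.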
]; then
+  CHANGED_FILES="$(git diff --cached --name-only --diff-filter=ACMR)"
+fi
+
+if [ -z "$CHANGED_FILES" ]; then
+  log_skip "No staged files detected"
+  exit 0
+fi
+
+RELEVANT_PATTERN='^(src/orchestrators/multimodal/|src/cli/commands/bench\.ts$|src/cli/index\.ts$|src/core/monitoring/logger\.ts$)'
+RELEVANT_FILES="$(printf '%s\n' "$CHANGED_FILES" | rg "$RELEVANT_PATTERN" || true)"
+
+if [ -z "$RELEVANT_FILES" ]; then
+  log_skip "No harness determinism files staged"
+  exit 0
+fi
+
+RUNS="${STACKMEMORY_DETERMINISM_RUNS:-3}"
+TASK="${STACKMEMORY_DETERMINISM_TASK:-Determinism pre-commit}"
+
+log_info "Running deterministic smoke check for staged harness files"
+printf '%s\n' "$RELEVANT_FILES" | sed 's/^/  - /'
+
+REPORT_JSON="$(node --import tsx src/cli/index.ts bench determinism --task "$TASK" --runs "$RUNS" --json)"
+
+SCORE="$(printf '%s' "$REPORT_JSON" | node -e "let data='';process.stdin.on('data',d=>data+=d);process.stdin.on('end',()=>{const stored=JSON.parse(data);process.stdout.write(String(stored.report?.score ?? stored.score));});")"
+
+if [ "$SCORE" != "100" ] && [ "$SCORE" != "100.00" ]; then
+  log_skip "Determinism smoke failed with score $SCORE/100"
+  printf '%s\n' "$REPORT_JSON"
+  exit 1
+fi
+
+log_info "Running deterministic harness tests"
+npx vitest run src/orchestrators/multimodal/__tests__/determinism.test.ts --reporter=dot
+
+log_success "Determinism guard passed ($SCORE/100)"
diff --git a/src/cli/claude-sm.ts b/src/cli/claude-sm.ts
index 0bc7339b..8bb7459c 100644
--- a/src/cli/claude-sm.ts
+++ b/src/cli/claude-sm.ts
@@ -17,10 +17,17 @@ import { program } from 'commander';
 import { v4 as uuidv4 } from 'uuid';
 import chalk from 'chalk';
 import { initializeTracing, trace } from '../core/trace/index.js';
+import { resolveRealCliBin } from './utils/real-cli-bin.js';
+import {
+  type DeterminismWatcherHandle,
+  startDeterminismWatcher,
+  stopDeterminismWatcher,
+} from './utils/determinism-watcher.js';
 import {
   canonicalStateStore,
   projectIdFromIdentifier,
 } from '../core/shared-state/canonical-store.js';
+import { loadProjectHandoff } from '../core/session/project-handoff.js';
 import {
   getModelRouter,
   loadModelRouterConfig,
@@ -136,6 +143,8 @@ class ClaudeSM {
   private sessionId: string;
   private ownsSession: boolean;
   private sessionEnded: boolean;
+  private determinismWatcher: DeterminismWatcherHandle | null;
+  private skippedHandoffReason: string | null;

   constructor() {
     // Load persistent defaults
@@ -166,6 +175,8 @@ class ClaudeSM {
     this.sessionId = process.env['STACKMEMORY_SESSION'] || uuidv4();
     this.ownsSession = !process.env['STACKMEMORY_SESSION'];
     this.sessionEnded = false;
+    this.determinismWatcher = null;
+    this.skippedHandoffReason = null;

     // Ensure config directory exists
     if (!fs.existsSync(this.claudeConfigDir)) {
@@ -275,19 +286,16 @@ class ClaudeSM {
   }

   private resolveClaudeBin(): string | null {
-    // 1) CLI-specified
-    if (this.config.claudeBin && this.config.claudeBin.trim()) {
-      return this.config.claudeBin.trim();
-    }
-    // 2) Env override
-    const envBin = process.env['CLAUDE_BIN'];
-    if (envBin && envBin.trim()) return envBin.trim();
-    // 3) PATH detection
-    try {
-      execSync('which claude', { stdio: 'ignore' });
-      return 'claude';
-    } catch {}
-    return null;
+    return resolveRealCliBin({
+      explicitBin: this.config.claudeBin,
+      envBin: process.env['CLAUDE_BIN'],
+      preferredPaths: [
+        path.join(os.homedir(), '.local', 'bin', 'claude'),
+        '/usr/local/bin/claude',
+        '/opt/homebrew/bin/claude',
+      ],
+      pathCommands: ['claude'],
+    });
   }

   private gepaProcesses: ReturnType<typeof spawn>[] = [];
@@ -373,6
+381,30 @@ class ClaudeSM { this.gepaProcesses = []; } + private startDeterminismWatcher(): void { + this.determinismWatcher = startDeterminismWatcher({ + stackmemoryBin: this.stackmemoryPath, + cwd: process.cwd(), + task: this.config.task, + instanceId: this.config.instanceId, + sessionId: this.sessionId, + tool: 'claude', + }); + + if (this.determinismWatcher) { + const modeLabel = + this.determinismWatcher.mode === 'targeted' + ? 'targeted' + : 'repo-root fallback'; + console.log(chalk.gray(` Determinism: ${modeLabel}`)); + } + } + + private stopDeterminismWatcher(): void { + stopDeterminismWatcher(this.determinismWatcher); + this.determinismWatcher = null; + } + private setupWorktree(): string | null { if (!this.config.useWorktree || !this.isGitRepo()) { return null; @@ -489,19 +521,25 @@ class ClaudeSM { if (!this.config.contextEnabled) return null; try { - const handoffPath = path.join( + const handoff = loadProjectHandoff( process.cwd(), - '.stackmemory', - 'last-handoff.md' + this.isGitRepo() ? this.getCurrentBranch() : undefined ); - if (fs.existsSync(handoffPath)) { - const content = fs.readFileSync(handoffPath, 'utf8').trim(); - if (content.length > 0) { - // Cap at 8000 chars to avoid excessively long system prompts - return content.length > 8000 - ? content.substring(0, 8000) + '\n\n[...truncated]' - : content; - } + if (!handoff) { + this.skippedHandoffReason = null; + return null; + } + if (!handoff.compatible) { + this.skippedHandoffReason = handoff.mismatchReason || 'stale handoff'; + return null; + } + this.skippedHandoffReason = null; + const content = handoff.content.trim(); + if (content.length > 0) { + // Cap at 8000 chars to avoid excessively long system prompts + return content.length > 8000 + ? content.substring(0, 8000) + '\n\n[...truncated]' + : content; } } catch { // Silently continue - handoff loading is optional @@ -759,6 +797,7 @@ class ClaudeSM { payload: Record = {} ): Promise { this.stopGEPAWatcher(); + this.stopDeterminismWatcher(); this.saveContext( eventType === 'session_end' @@ -1045,6 +1084,7 @@ class ClaudeSM { } await this.publishSessionStart(); + this.startDeterminismWatcher(); console.log(chalk.gray(`πŸ€– Instance ID: ${this.config.instanceId}`)); console.log(chalk.gray(`🧠 Session ID: ${this.sessionId.slice(0, 8)}`)); console.log(chalk.gray(`πŸ“ Working in: ${process.cwd()}`)); @@ -1133,6 +1173,10 @@ class ClaudeSM { if (handoffContent) { initialInput = handoffContent; console.log(chalk.gray(' Handoff context ready')); + } else if (this.skippedHandoffReason) { + console.log( + chalk.gray(` Handoff skipped: ${this.skippedHandoffReason}`) + ); } const theoryContent = this.getTheoryContent(); diff --git a/src/cli/codex-sm.ts b/src/cli/codex-sm.ts index 2a3a8d84..2718a992 100644 --- a/src/cli/codex-sm.ts +++ b/src/cli/codex-sm.ts @@ -13,6 +13,12 @@ import { program } from 'commander'; import { v4 as uuidv4 } from 'uuid'; import chalk from 'chalk'; import { initializeTracing, trace } from '../core/trace/index.js'; +import { resolveRealCliBin } from './utils/real-cli-bin.js'; +import { + type DeterminismWatcherHandle, + startDeterminismWatcher, + stopDeterminismWatcher, +} from './utils/determinism-watcher.js'; import { canonicalStateStore, projectIdFromIdentifier, @@ -36,6 +42,7 @@ class CodexSM { private sessionId: string; private ownsSession: boolean; private sessionEnded: boolean; + private determinismWatcher: DeterminismWatcherHandle | null; constructor() { this.config = { @@ -50,6 +57,7 @@ class CodexSM { this.sessionId = 
process.env['STACKMEMORY_SESSION'] || uuidv4(); this.ownsSession = !process.env['STACKMEMORY_SESSION']; this.sessionEnded = false; + this.determinismWatcher = null; } private getRepoRoot(): string | null { @@ -151,25 +159,26 @@ class CodexSM { } private resolveCodexBin(): string | null { - // 1) CLI option - if (this.config.codexBin && this.config.codexBin.trim()) { - return this.config.codexBin.trim(); - } - // 2) Environment override - const envBin = process.env['CODEX_BIN']; - if (envBin && envBin.trim()) { - return envBin.trim(); - } - // 3) Detect on PATH - try { - execSync('which codex', { stdio: 'ignore' }); - return 'codex'; - } catch {} - try { - execSync('which codex-cli', { stdio: 'ignore' }); - return 'codex-cli'; - } catch {} - return null; + return resolveRealCliBin({ + explicitBin: this.config.codexBin, + envBin: process.env['CODEX_BIN'], + preferredPaths: [ + path.join( + os.homedir(), + '.nvm', + 'versions', + 'node', + 'v22.22.0', + 'bin', + 'codex' + ), + '/usr/local/bin/codex', + '/opt/homebrew/bin/codex', + '/usr/local/bin/codex-cli', + '/opt/homebrew/bin/codex-cli', + ], + pathCommands: ['codex', 'codex-cli'], + }); } private setupWorktree(): string | null { @@ -379,6 +388,30 @@ class CodexSM { } } + private startDeterminismWatcher(): void { + this.determinismWatcher = startDeterminismWatcher({ + stackmemoryBin: this.stackmemoryPath, + cwd: process.cwd(), + task: this.config.task, + instanceId: this.config.instanceId, + sessionId: this.sessionId, + tool: 'codex', + }); + + if (this.determinismWatcher) { + const modeLabel = + this.determinismWatcher.mode === 'targeted' + ? 'targeted' + : 'repo-root fallback'; + console.log(chalk.gray(`πŸ§ͺ Determinism: ${modeLabel}`)); + } + } + + private stopDeterminismWatcher(): void { + stopDeterminismWatcher(this.determinismWatcher); + this.determinismWatcher = null; + } + public async run(args: string[]): Promise { const codexArgs: string[] = []; let i = 0; @@ -486,6 +519,7 @@ class CodexSM { if (this.config.worktreePath) process.env['CODEX_WORKTREE_PATH'] = this.config.worktreePath; await this.publishSessionStart(); + this.startDeterminismWatcher(); console.log(chalk.gray(`πŸ€– Instance ID: ${this.config.instanceId}`)); console.log(chalk.gray(`🧠 Session ID: ${this.sessionId.slice(0, 8)}`)); @@ -537,6 +571,7 @@ class CodexSM { }); child.on('exit', async (code) => { + this.stopDeterminismWatcher(); this.saveContext('Codex session ended', { action: 'session_end', exitCode: code, @@ -573,6 +608,7 @@ class CodexSM { }); process.on('SIGINT', async () => { + this.stopDeterminismWatcher(); this.saveContext('Codex session interrupted', { action: 'session_interrupt', }); @@ -581,6 +617,7 @@ class CodexSM { }); process.on('SIGTERM', async () => { + this.stopDeterminismWatcher(); this.saveContext('Codex session terminated', { action: 'session_terminate', }); diff --git a/src/cli/commands/bench.ts b/src/cli/commands/bench.ts index 4e6538bc..8991121d 100644 --- a/src/cli/commands/bench.ts +++ b/src/cli/commands/bench.ts @@ -6,6 +6,7 @@ */ import { Command } from 'commander'; +import chalk from 'chalk'; import { existsSync, readFileSync, readdirSync } from 'fs'; import { join } from 'path'; import { @@ -18,6 +19,16 @@ import { feedbackLoops, _DEFAULT_CONFIG, } from '../../core/monitoring/feedback-loops.js'; +import { + DETERMINISM_WATCH_IGNORE, + DETERMINISM_WATCH_PATTERNS, + getDeterminismWatchTargets, + persistDeterminismReport, + readLatestDeterminismReport, + runDeterminismSmoke, + type DeterminismReport, + type StoredDeterminismReport, +} 
from '../../orchestrators/multimodal/determinism.js';

 function loadRunMetrics(projectRoot: string): HarnessRunMetrics[] {
   const metricsFile = join(
@@ -66,6 +77,66 @@ function loadSpikeAudits(
     .filter(Boolean) as Array<{ file: string; data: any }>;
 }

+function printDeterminismReport(
+  task: string,
+  requestedRuns: number,
+  report: DeterminismReport
+): void {
+  console.log('\nHarness Determinism Smoke');
+  console.log('═'.repeat(60));
+  console.log(`Task: ${task}`);
+  console.log(`Runs: ${report.runs}`);
+  console.log(`Determinism score: ${report.score.toFixed(2)}/100`);
+
+  console.log('\nDimension Scores:');
+  for (const dimension of report.dimensions) {
+    console.log(
+      `  ${dimension.name.padEnd(14)} ${dimension.score.toFixed(2).padStart(6)}/100 ${dimension.details}`
+    );
+  }
+
+  if (report.recommendations.length > 0) {
+    console.log('\nRecommended Tightening:');
+    for (const recommendation of report.recommendations) {
+      console.log(`  - ${recommendation}`);
+    }
+  } else {
+    console.log('\nNo drift detected in deterministic fixture mode.');
+  }
+
+  const sample = report.snapshots[0];
+  if (sample) {
+    console.log('\nReference Snapshot:');
+    console.log(`  resultHash: ${sample.resultHash.slice(0, 16)}`);
+    console.log(`  planHash: ${sample.planHash.slice(0, 16)}`);
+    console.log(`  critiqueHash: ${sample.critiqueHash.slice(0, 16)}`);
+    console.log(`  commandsHash: ${sample.commandsHash.slice(0, 16)}`);
+    console.log(`  iterations: ${sample.iterations}`);
+    console.log(`  contextTokens: ${sample.contextTokens}`);
+  }
+
+  if (report.runs !== requestedRuns) {
+    console.log(
+      `\nNote: requested ${requestedRuns} runs, completed ${report.runs}.`
+    );
+  }
+
+  console.log('');
+}
+
+function printStoredDeterminismReport(stored: StoredDeterminismReport): void {
+  console.log('\nCached Determinism Result');
+  console.log('═'.repeat(60));
+  console.log(`Task: ${stored.task}`);
+  console.log(`Trigger: ${stored.trigger}`);
+  console.log(`Timestamp: ${stored.timestamp}`);
+  console.log(`Determinism score: ${stored.report.score.toFixed(2)}/100`);
+  if (stored.changedPaths.length > 0) {
+    console.log(`Changed paths: ${stored.changedPaths.join(', ')}`);
+  }
+  console.log('');
+}
+
 export function createBenchCommand(): Command {
   const bench = new Command('bench')
     .description(
@@ -243,17 +314,203 @@ export function createBenchCommand(): Command {
       console.log('');
     });

+  // Sub-command: bench determinism
+  bench
+    .command('determinism')
+    .description(
+      'Run deterministic fixture smoke checks for the multimodal harness'
+    )
+    .option(
+      '-t, --task <task>',
+      'Task description to run through the harness',
+      'Add a small auth guard'
+    )
+    .option('--runs <n>', 'Number of repeated runs', '5')
+    .option(
+      '--planner-model <model>',
+      'Planner model label to include in the run config',
+      'claude-sonnet-4-20250514'
+    )
+    .option(
+      '--reviewer-model <model>',
+      'Reviewer model label to include in the run config',
+      'claude-sonnet-4-20250514'
+    )
+    .option('--implementer <tool>', 'codex|claude', 'codex')
+    .option('--max-iters <n>', 'Retry loop iterations', '2')
+    .option(
+      '--watch',
+      'Watch harness-critical files and rerun on changes',
+      false
+    )
+    .option(
+      '--debounce-ms <ms>',
+      'Debounce window for write completion in watch mode',
+      '3000'
+    )
+    .option('--latest', 'Show the latest cached determinism result', false)
+    .option('--json', 'Output as JSON', false)
+    .action(async function () {
+      const command = this as Command;
+      const options = command.opts();
+      const json = Boolean(options.json || command.parent?.opts().json);
+      const projectRoot
= process.cwd();
+      const runs = Math.max(1, parseInt(options.runs, 10) || 5);
+      const debounceMs = Math.max(
+        250,
+        parseInt(options.debounceMs, 10) || 3000
+      );
+
+      if (options.latest) {
+        const stored = readLatestDeterminismReport(projectRoot);
+        if (!stored) {
+          console.error('No cached determinism result found.');
+          process.exitCode = 1;
+          return;
+        }
+
+        if (json) {
+          console.log(JSON.stringify(stored, null, 2));
+          return;
+        }
+
+        printStoredDeterminismReport(stored);
+        return;
+      }
+
+      const runCheck = async (
+        trigger: string,
+        changedPaths: string[] = []
+      ): Promise<StoredDeterminismReport> => {
+        const report = await runDeterminismSmoke(
+          {
+            task: options.task,
+            repoPath: projectRoot,
+          },
+          {
+            runs,
+            plannerModel: options.plannerModel,
+            reviewerModel: options.reviewerModel,
+            implementer: options.implementer,
+            maxIters: parseInt(options.maxIters, 10) || 2,
+          }
+        );
+
+        const stored = persistDeterminismReport(projectRoot, report, {
+          task: options.task,
+          trigger,
+          changedPaths,
+        });
+
+        if (json) {
+          console.log(JSON.stringify(stored, null, 2));
+        } else {
+          printDeterminismReport(options.task, runs, report);
+        }
+
+        return stored;
+      };
+
+      if (options.watch) {
+        const chokidar = await import('chokidar');
+        const watchTargets = getDeterminismWatchTargets(projectRoot);
+        const watchPatterns = watchTargets.map((pattern) =>
+          join(projectRoot, pattern)
+        );
+        const watcher = chokidar.watch(watchPatterns, {
+          ignoreInitial: true,
+          ignored: DETERMINISM_WATCH_IGNORE.map((pattern) =>
+            join(projectRoot, pattern)
+          ),
+          awaitWriteFinish: {
+            stabilityThreshold: debounceMs,
+            pollInterval: 100,
+          },
+        });
+
+        let running = false;
+        let rerunRequested = false;
+        const pendingPaths = new Set<string>();
+
+        const maybeRun = async (trigger: string) => {
+          if (running) {
+            rerunRequested = true;
+            return;
+          }
+
+          running = true;
+          const changedPaths = Array.from(pendingPaths).sort();
+          pendingPaths.clear();
+
+          try {
+            await runCheck(trigger, changedPaths);
+          } finally {
+            running = false;
+            if (rerunRequested) {
+              rerunRequested = false;
+              await maybeRun('watch:debounced-rerun');
+            }
+          }
+        };
+
+        const onFileEvent = async (trigger: string, filePath: string) => {
+          const relativePath = filePath.startsWith(projectRoot)
+            ?
filePath.slice(projectRoot.length + 1)
+            : filePath;
+          pendingPaths.add(relativePath);
+          if (!json) {
+            console.log(
+              chalk.gray(`determinism watcher: ${trigger} ${relativePath}`)
+            );
+          }
+          await maybeRun(`watch:${trigger}`);
+        };
+
+        watcher.on('all', async (eventName: string, filePath: string) => {
+          if (eventName !== 'add' && eventName !== 'change') {
+            return;
+          }
+          await onFileEvent(eventName, filePath);
+        });
+
+        if (!json) {
+          console.log('\nHarness Determinism Watch');
+          console.log('═'.repeat(60));
+          console.log(`Task: ${options.task}`);
+          console.log(`Watching: ${watchTargets.join(', ')}`);
+          console.log(`Debounce: ${debounceMs}ms`);
+          console.log(chalk.gray('Press Ctrl+C to stop.\n'));
+        }
+
+        await runCheck('watch:initial');
+        await new Promise<void>((resolve) => {
+          const stop = () => {
+            void watcher.close();
+            resolve();
+          };
+          process.once('SIGINT', stop);
+          process.once('SIGTERM', stop);
+        });
+        return;
+      }
+
+      await runCheck('manual');
+    });
+
   // Sub-command: bench loops
   bench
     .command('loops')
     .description('Show feedback loop configuration, status, and recent events')
     .option('--json', 'Output as JSON', false)
-    .action((options) => {
+    .action(function () {
+      const command = this as Command;
+      const options = command.opts();
+      const json = Boolean(options.json || command.parent?.opts().json);
       const config = feedbackLoops.getConfig();
       const stats = feedbackLoops.getStats();
       const history = feedbackLoops.getHistory(undefined, 20);

-      if (options.json) {
+      if (json) {
         console.log(JSON.stringify({ config, stats, history }, null, 2));
         return;
       }
diff --git a/src/cli/commands/handoff.ts b/src/cli/commands/handoff.ts
index 3dc1eb85..959e8b95 100644
--- a/src/cli/commands/handoff.ts
+++ b/src/cli/commands/handoff.ts
@@ -20,6 +20,10 @@ import { FrameManager } from '../../core/context/index.js';
 import { LinearTaskManager } from '../../features/tasks/linear-task-manager.js';
 import { logger } from '../../core/monitoring/logger.js';
 import { EnhancedHandoffGenerator } from '../../core/session/handoff.js';
+import {
+  getProjectHandoffPaths,
+  loadProjectHandoff,
+} from '../../core/session/project-handoff.js';

 // Simple token estimation (avg 3.5 chars per token for English)
 const countTokens = (text: string): number => Math.ceil(text.length / 3.5);
@@ -360,7 +364,8 @@ Generated by stackmemory capture at ${timestamp}
         if (!existsSync(stackmemoryDir)) {
           mkdirSync(stackmemoryDir, { recursive: true });
         }
-        const handoffPath = join(stackmemoryDir, 'last-handoff.md');
+        const { handoffPath, metadataPath } =
+          getProjectHandoffPaths(projectRoot);
         writeFileSync(handoffPath, handoffPrompt);

         // Save versioned copy
@@ -379,6 +384,18 @@
           branch,
           handoffPrompt
         );
+        writeFileSync(
+          metadataPath,
+          JSON.stringify(
+            {
+              branch,
+              capturedAt: new Date().toISOString(),
+              projectRoot,
+            },
+            null,
+            2
+          )
+        );
         console.log(
           `Versioned: ${versionedPath.split('/').slice(-2).join('/')}`
         );
@@ -433,14 +450,10 @@ export function createRestoreCommand(): Command {
   cmd
     .description('Restore context from last handoff')
     .option('--no-copy', 'Do not copy prompt to clipboard')
+    .option('--force', 'Restore even if the handoff branch does not match')
     .action(async (options) => {
       try {
         const projectRoot = process.cwd();
-        const handoffPath = join(
-          projectRoot,
-          '.stackmemory',
-          'last-handoff.md'
-        );
         const metaPath = join(
           process.env['HOME'] || '~',
           '.stackmemory',
           'state',
           'last-handoff-meta.json'
         );

-      
if (!existsSync(handoffPath)) { + const currentBranch = (() => { + try { + return execSync('git rev-parse --abbrev-ref HEAD', { + encoding: 'utf-8', + cwd: projectRoot, + stdio: ['pipe', 'pipe', 'pipe'], + }).trim(); + } catch { + return undefined; + } + })(); + + const handoff = loadProjectHandoff(projectRoot, currentBranch); + if (!handoff) { console.log('❌ No handoff found in this project'); console.log('πŸ’‘ Run "stackmemory capture" to create one'); return; } - // Read handoff prompt - const handoffPrompt = readFileSync(handoffPath, 'utf-8'); + if (!handoff.compatible && !options.force) { + console.log('⚠️ Skipping stale handoff'); + console.log(` ${handoff.mismatchReason}`); + console.log( + ' Run "stackmemory restore --force" to inspect it anyway' + ); + return; + } + + const handoffPrompt = handoff.content; // Display the prompt console.log('\n' + '='.repeat(60)); diff --git a/src/cli/gemini-sm.ts b/src/cli/gemini-sm.ts index b0bbf9c6..cea9d63b 100644 --- a/src/cli/gemini-sm.ts +++ b/src/cli/gemini-sm.ts @@ -16,6 +16,7 @@ import { program } from 'commander'; import { v4 as uuidv4 } from 'uuid'; import chalk from 'chalk'; import { initializeTracing, trace } from '../core/trace/index.js'; +import { resolveRealCliBin } from './utils/real-cli-bin.js'; interface GeminiSMConfig { defaultWorktree: boolean; @@ -167,40 +168,24 @@ class GeminiSM { } private resolveGeminiBin(): string | null { - if (this.config.geminiBin && this.config.geminiBin.trim()) { - return this.config.geminiBin.trim(); - } - const envBin = process.env['GEMINI_BIN']; - if (envBin && envBin.trim()) return envBin.trim(); - - const possiblePaths = [ - path.join( - os.homedir(), - '.nvm', - 'versions', - 'node', - 'v22.22.0', - 'bin', - 'gemini' - ), - '/usr/local/bin/gemini', - '/opt/homebrew/bin/gemini', - ]; - - for (const binPath of possiblePaths) { - if (fs.existsSync(binPath)) { - return binPath; - } - } - - // Try PATH - try { - execSync('which gemini', { stdio: 'ignore' }); - return 'gemini'; - } catch { - // Not found - } - return null; + return resolveRealCliBin({ + explicitBin: this.config.geminiBin, + envBin: process.env['GEMINI_BIN'], + preferredPaths: [ + path.join( + os.homedir(), + '.nvm', + 'versions', + 'node', + 'v22.22.0', + 'bin', + 'gemini' + ), + '/usr/local/bin/gemini', + '/opt/homebrew/bin/gemini', + ], + pathCommands: ['gemini'], + }); } private setupWorktree(): string | null { diff --git a/src/cli/index.ts b/src/cli/index.ts index 7ce66d2c..f6153228 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -7,6 +7,12 @@ // Set environment flag for CLI usage to skip async context bridge process.env['STACKMEMORY_CLI'] = 'true'; +// Machine-readable CLI output should not be prefixed by INFO banners unless +// the caller explicitly opted into a log level. 
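+// For example, scripts/determinism-pre-commit.sh pipes the stdout of
+// `bench determinism --json` straight into JSON.parse; a stray INFO banner
+// would break that guard, so --json runs default to ERROR-level logging.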
+if (!process.env['STACKMEMORY_LOG_LEVEL'] && process.argv.includes('--json')) { + process.env['STACKMEMORY_LOG_LEVEL'] = 'ERROR'; +} + // Load environment variables (quiet mode to suppress logging) import { config as loadDotenv } from 'dotenv'; loadDotenv({ quiet: true }); @@ -878,6 +884,11 @@ program .option('--audit-dir ', 'Persist spike results to directory') .option('--record-frame', 'Record as real frame with anchors', false) .option('--record', 'Record plan & critique into StackMemory context', false) + .option( + '--deterministic-fixture', + 'Use deterministic fixture planner/critic for replayable smoke runs', + false + ) .option('--json', 'Emit single JSON result (UI-friendly)', false) .option('--quiet', 'Minimal output (default)', true) .option('--verbose', 'Verbose sectioned output', false) @@ -904,6 +915,7 @@ program auditDir: opts.auditDir, recordFrame: Boolean(opts.recordFrame), record: Boolean(opts.record), + deterministicFixture: Boolean(opts.deterministicFixture), } ); @@ -978,6 +990,10 @@ program .option('--audit-dir ', 'Persist spike results to directory') .option('--record-frame', 'Record as real frame with anchors') .option('--record', 'Record plan & critique into StackMemory context') + .option( + '--deterministic-fixture', + 'Use deterministic fixture planner/critic for replayable smoke runs' + ) .option('--json', 'Emit single JSON result (UI-friendly)') .option('--quiet', 'Minimal output') .option('--verbose', 'Verbose sectioned output') @@ -1032,6 +1048,7 @@ program auditDir: opts.auditDir, recordFrame: Boolean(opts.recordFrame), record: Boolean(opts.record), + deterministicFixture: Boolean(opts.deterministicFixture), } ); diff --git a/src/cli/opencode-sm.ts b/src/cli/opencode-sm.ts index 6b5f5f61..adf15beb 100644 --- a/src/cli/opencode-sm.ts +++ b/src/cli/opencode-sm.ts @@ -16,6 +16,12 @@ import { program } from 'commander'; import { v4 as uuidv4 } from 'uuid'; import chalk from 'chalk'; import { initializeTracing, trace } from '../core/trace/index.js'; +import { resolveRealCliBin } from './utils/real-cli-bin.js'; +import { + type DeterminismWatcherHandle, + startDeterminismWatcher, + stopDeterminismWatcher, +} from './utils/determinism-watcher.js'; interface OpencodeSMConfig { defaultWorktree: boolean; @@ -70,6 +76,7 @@ class OpencodeSM { private config: OpencodeConfig; private stackmemoryPath: string; private smConfig: OpencodeSMConfig; + private determinismWatcher: DeterminismWatcherHandle | null; constructor() { this.smConfig = loadSMConfig(); @@ -84,6 +91,7 @@ class OpencodeSM { }; this.stackmemoryPath = this.findStackMemory(); + this.determinismWatcher = null; } private getRepoRoot(): string | null { @@ -169,33 +177,16 @@ class OpencodeSM { } private resolveOpencodeBin(): string | null { - if (this.config.opencodeBin && this.config.opencodeBin.trim()) { - return this.config.opencodeBin.trim(); - } - const envBin = process.env['OPENCODE_BIN']; - if (envBin && envBin.trim()) return envBin.trim(); - - // Check common OpenCode locations - const possiblePaths = [ - path.join(os.homedir(), '.opencode', 'bin', 'opencode'), - '/usr/local/bin/opencode', - '/opt/homebrew/bin/opencode', - ]; - - for (const binPath of possiblePaths) { - if (fs.existsSync(binPath)) { - return binPath; - } - } - - // Try PATH - try { - execSync('which opencode', { stdio: 'ignore' }); - return 'opencode'; - } catch { - // Not found - } - return null; + return resolveRealCliBin({ + explicitBin: this.config.opencodeBin, + envBin: process.env['OPENCODE_BIN'], + preferredPaths: [ + 
path.join(os.homedir(), '.opencode', 'bin', 'opencode'),
+        '/usr/local/bin/opencode',
+        '/opt/homebrew/bin/opencode',
+      ],
+      pathCommands: ['opencode'],
+    });
   }

   private setupWorktree(): string | null {
@@ -333,6 +324,29 @@ class OpencodeSM {
     }
   }

+  private startDeterminismWatcher(): void {
+    this.determinismWatcher = startDeterminismWatcher({
+      stackmemoryBin: this.stackmemoryPath,
+      cwd: process.cwd(),
+      task: this.config.task,
+      instanceId: this.config.instanceId,
+      tool: 'opencode',
+    });
+
+    if (this.determinismWatcher) {
+      const modeLabel =
+        this.determinismWatcher.mode === 'targeted'
+          ? 'targeted'
+          : 'repo-root fallback';
+      console.log(chalk.gray(`Determinism: ${modeLabel}`));
+    }
+  }
+
+  private stopDeterminismWatcher(): void {
+    stopDeterminismWatcher(this.determinismWatcher);
+    this.determinismWatcher = null;
+  }
+
   public async run(args: string[]): Promise<void> {
     const opencodeArgs: string[] = [];
     let i = 0;
@@ -455,6 +469,7 @@ class OpencodeSM {
     if (this.config.worktreePath) {
       process.env['OPENCODE_WORKTREE_PATH'] = this.config.worktreePath;
     }
+    this.startDeterminismWatcher();

     console.log(chalk.gray(`Instance: ${this.config.instanceId}`));
     console.log(chalk.gray(`Working in: ${process.cwd()}`));
@@ -501,6 +516,7 @@ class OpencodeSM {
     });

     opencode.on('exit', async (code) => {
+      this.stopDeterminismWatcher();
       this.saveContext('OpenCode session ended', {
         action: 'session_end',
         exitCode: code,
@@ -525,6 +541,7 @@ class OpencodeSM {
     });

     process.on('SIGINT', () => {
+      this.stopDeterminismWatcher();
       this.saveContext('OpenCode session interrupted', {
         action: 'session_interrupt',
       });
@@ -532,6 +549,7 @@ class OpencodeSM {
     process.on('SIGTERM', () => {
+      this.stopDeterminismWatcher();
       this.saveContext('OpenCode session terminated', {
         action: 'session_terminate',
       });
diff --git a/src/cli/utils/determinism-watcher.ts b/src/cli/utils/determinism-watcher.ts
new file mode 100644
index 00000000..8453484f
--- /dev/null
+++ b/src/cli/utils/determinism-watcher.ts
@@ -0,0 +1,90 @@
+import { spawn } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+
+import { getDeterminismWatchTargets } from '../../orchestrators/multimodal/determinism.js';
+
+export interface DeterminismWatcherOptions {
+  stackmemoryBin: string;
+  cwd: string;
+  task?: string;
+  instanceId?: string;
+  sessionId?: string;
+  tool: 'claude' | 'codex' | 'opencode';
+}
+
+export interface DeterminismWatcherHandle {
+  child: ReturnType<typeof spawn>;
+  mode: 'targeted' | 'repo-root';
+  targets: string[];
+}
+
+export function shouldAutoStartDeterminismWatcher(cwd: string): boolean {
+  if (process.env['STACKMEMORY_DETERMINISM_AUTO'] === '0') {
+    return false;
+  }
+
+  return fs.existsSync(path.join(cwd, '.git'));
+}
+
+export function startDeterminismWatcher(
+  options: DeterminismWatcherOptions
+): DeterminismWatcherHandle | null {
+  if (!shouldAutoStartDeterminismWatcher(options.cwd)) {
+    return null;
+  }
+
+  const runs = process.env['STACKMEMORY_DETERMINISM_RUNS'] || '3';
+  const task =
+    options.task ||
+    process.env['STACKMEMORY_DETERMINISM_TASK'] ||
+    'Determinism probe';
+  const targets = getDeterminismWatchTargets(options.cwd);
+  const mode: 'targeted' | 'repo-root' =
+    targets.length === 1 && targets[0] === '.' ?
'repo-root' : 'targeted'; + + const child = spawn( + options.stackmemoryBin, + [ + 'bench', + 'determinism', + '--task', + task, + '--runs', + runs, + '--watch', + '--json', + ], + { + cwd: options.cwd, + stdio: 'ignore', + env: { + ...process.env, + STACKMEMORY_DETERMINISM_PARENT_TOOL: options.tool, + STACKMEMORY_DETERMINISM_PARENT_INSTANCE: options.instanceId || '', + STACKMEMORY_DETERMINISM_PARENT_SESSION: options.sessionId || '', + }, + } + ); + + return { + child, + mode, + targets, + }; +} + +export function stopDeterminismWatcher( + handle: DeterminismWatcherHandle | null +): void { + const child = handle?.child ?? null; + if (!child || child.killed) { + return; + } + + try { + child.kill('SIGTERM'); + } catch { + // Best-effort only. + } +} diff --git a/src/cli/utils/real-cli-bin.ts b/src/cli/utils/real-cli-bin.ts new file mode 100644 index 00000000..c1a0eff3 --- /dev/null +++ b/src/cli/utils/real-cli-bin.ts @@ -0,0 +1,66 @@ +import { execSync } from 'child_process'; +import * as fs from 'fs'; + +const DEFAULT_WRAPPER_PATH_SNIPPETS = [ + '/Applications/cmux.app/Contents/Resources/bin/', +]; + +function isWrapperPath( + candidate: string, + wrapperPathSnippets: string[] +): boolean { + const normalized = candidate.trim(); + return wrapperPathSnippets.some((snippet) => normalized.includes(snippet)); +} + +export interface ResolveRealCliBinOptions { + explicitBin?: string; + envBin?: string; + preferredPaths?: string[]; + pathCommands: string[]; + wrapperPathSnippets?: string[]; +} + +export function resolveRealCliBin( + options: ResolveRealCliBinOptions +): string | null { + if (options.explicitBin?.trim()) { + return options.explicitBin.trim(); + } + if (options.envBin?.trim()) { + return options.envBin.trim(); + } + + const wrapperPathSnippets = + options.wrapperPathSnippets || DEFAULT_WRAPPER_PATH_SNIPPETS; + + for (const candidate of options.preferredPaths || []) { + if ( + fs.existsSync(candidate) && + !isWrapperPath(candidate, wrapperPathSnippets) + ) { + return candidate; + } + } + + for (const command of options.pathCommands) { + try { + const output = execSync(`which -a ${command}`, { + encoding: 'utf8', + stdio: ['ignore', 'pipe', 'ignore'], + }); + const resolved = output + .split('\n') + .map((line) => line.trim()) + .filter(Boolean) + .find((candidate) => !isWrapperPath(candidate, wrapperPathSnippets)); + if (resolved) { + return resolved; + } + } catch { + // Continue searching. + } + } + + return null; +} diff --git a/src/core/monitoring/logger.ts b/src/core/monitoring/logger.ts index 030490dd..be75dc00 100644 --- a/src/core/monitoring/logger.ts +++ b/src/core/monitoring/logger.ts @@ -105,6 +105,7 @@ export class Logger { private constructor() { // Set log level from environment const envLevel = process.env['STACKMEMORY_LOG_LEVEL']?.toUpperCase(); + const jsonCliMode = !envLevel && process.argv.includes('--json'); switch (envLevel) { case 'ERROR': this.logLevel = LogLevel.ERROR; @@ -116,7 +117,7 @@ export class Logger { this.logLevel = LogLevel.DEBUG; break; default: - this.logLevel = LogLevel.INFO; + this.logLevel = jsonCliMode ? 
LogLevel.ERROR : LogLevel.INFO; } // Set up log file if in debug mode or if specified diff --git a/src/core/session/__tests__/project-handoff.test.ts b/src/core/session/__tests__/project-handoff.test.ts new file mode 100644 index 00000000..5e393c68 --- /dev/null +++ b/src/core/session/__tests__/project-handoff.test.ts @@ -0,0 +1,64 @@ +import { afterEach, describe, expect, it } from 'vitest'; +import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; +import { + loadProjectHandoff, + parseBranchFromHandoffContent, +} from '../project-handoff.js'; + +describe('project handoff compatibility', () => { + const tempDirs: string[] = []; + + afterEach(() => { + for (const dir of tempDirs.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } + }); + + function makeProject(): string { + const dir = mkdtempSync(join(tmpdir(), 'stackmemory-handoff-')); + mkdirSync(join(dir, '.stackmemory'), { recursive: true }); + tempDirs.push(dir); + return dir; + } + + it('parses branch from compact handoff content', () => { + expect( + parseBranchFromHandoffContent( + '# Handoff: stackmemory@feature/test-branch\n## Work: test' + ) + ).toBe('feature/test-branch'); + }); + + it('loads a compatible handoff for the current branch', () => { + const project = makeProject(); + writeFileSync( + join(project, '.stackmemory', 'last-handoff.md'), + '# Handoff: stackmemory@feature/test-branch\n## Work: test' + ); + writeFileSync( + join(project, '.stackmemory', 'last-handoff-meta.json'), + JSON.stringify({ branch: 'feature/test-branch' }, null, 2) + ); + + const handoff = loadProjectHandoff(project, 'feature/test-branch'); + expect(handoff).not.toBeNull(); + expect(handoff?.compatible).toBe(true); + expect(handoff?.branch).toBe('feature/test-branch'); + }); + + it('rejects a stale handoff from another branch', () => { + const project = makeProject(); + writeFileSync( + join(project, '.stackmemory', 'last-handoff.md'), + '# Handoff: stackmemory@feature/old-branch\n## Work: test' + ); + + const handoff = loadProjectHandoff(project, 'release/current'); + expect(handoff).not.toBeNull(); + expect(handoff?.compatible).toBe(false); + expect(handoff?.mismatchReason).toContain('feature/old-branch'); + expect(handoff?.mismatchReason).toContain('release/current'); + }); +}); diff --git a/src/core/session/project-handoff.ts b/src/core/session/project-handoff.ts new file mode 100644 index 00000000..38a58f66 --- /dev/null +++ b/src/core/session/project-handoff.ts @@ -0,0 +1,85 @@ +import { existsSync, readFileSync } from 'fs'; +import { join } from 'path'; + +export interface ProjectHandoffMetadata { + branch?: string; + capturedAt?: string; + gitHead?: string; + projectRoot?: string; +} + +export interface LoadedProjectHandoff { + content: string; + branch: string | null; + compatible: boolean; + mismatchReason?: string; +} + +export function getProjectHandoffPaths(projectRoot: string): { + handoffPath: string; + metadataPath: string; +} { + return { + handoffPath: join(projectRoot, '.stackmemory', 'last-handoff.md'), + metadataPath: join(projectRoot, '.stackmemory', 'last-handoff-meta.json'), + }; +} + +export function parseBranchFromHandoffContent(content: string): string | null { + const compactMatch = content.match(/^# Handoff:\s+.+?@([^\n]+)$/m); + if (compactMatch?.[1]) { + return compactMatch[1].trim(); + } + + const verboseMatch = content.match(/^\*\*Branch\*\*:\s+([^\n]+)$/m); + if (verboseMatch?.[1]) { + return verboseMatch[1].trim(); + } + + 
const ultraMatch = content.match(/^\[H\].+?@([^|\n]+)\|/m); + if (ultraMatch?.[1]) { + return ultraMatch[1].trim(); + } + + return null; +} + +export function loadProjectHandoff( + projectRoot: string, + currentBranch?: string +): LoadedProjectHandoff | null { + const { handoffPath, metadataPath } = getProjectHandoffPaths(projectRoot); + if (!existsSync(handoffPath)) { + return null; + } + + const content = readFileSync(handoffPath, 'utf8').trim(); + if (!content) { + return null; + } + + let metadata: ProjectHandoffMetadata | null = null; + if (existsSync(metadataPath)) { + try { + metadata = JSON.parse(readFileSync(metadataPath, 'utf8')); + } catch { + metadata = null; + } + } + + const branch = metadata?.branch || parseBranchFromHandoffContent(content); + if (currentBranch && branch && branch !== currentBranch) { + return { + content, + branch, + compatible: false, + mismatchReason: `handoff is for branch ${branch}, current branch is ${currentBranch}`, + }; + } + + return { + content, + branch: branch || null, + compatible: true, + }; +} diff --git a/src/orchestrators/multimodal/__tests__/determinism.test.ts b/src/orchestrators/multimodal/__tests__/determinism.test.ts new file mode 100644 index 00000000..17d2f1ec --- /dev/null +++ b/src/orchestrators/multimodal/__tests__/determinism.test.ts @@ -0,0 +1,103 @@ +import { afterEach, describe, expect, it } from 'vitest'; +import { existsSync, mkdtempSync, rmSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; + +import { + persistDeterminismReport, + readLatestDeterminismReport, + runDeterminismSmoke, +} from '../determinism.js'; +import { runSpike } from '../harness.js'; + +const tempDirs: string[] = []; + +function makeTempRepo(): string { + const dir = mkdtempSync(join(tmpdir(), 'stackmemory-determinism-')); + tempDirs.push(dir); + return dir; +} + +afterEach(() => { + while (tempDirs.length > 0) { + const dir = tempDirs.pop(); + if (dir) { + rmSync(dir, { recursive: true, force: true }); + } + } +}); + +describe('multimodal determinism', () => { + it('scores deterministic fixture runs at 100 out of 100', async () => { + const repoPath = makeTempRepo(); + + const report = await runDeterminismSmoke( + { + task: 'Add a small auth guard', + repoPath, + }, + { + runs: 5, + implementer: 'codex', + maxIters: 2, + } + ); + + expect(report.runs).toBe(5); + expect(report.score).toBe(100); + expect( + report.dimensions.every((dimension) => dimension.score === 100) + ).toBe(true); + expect(report.recommendations).toEqual([]); + expect( + new Set(report.snapshots.map((snapshot) => snapshot.resultHash)).size + ).toBe(1); + }); + + it('can skip audit persistence for deterministic or replay runs', async () => { + const repoPath = makeTempRepo(); + + await runSpike( + { + task: 'Add a small auth guard', + repoPath, + }, + { + dryRun: true, + deterministicFixture: true, + persistAudit: false, + } + ); + + expect(existsSync(join(repoPath, '.stackmemory', 'build'))).toBe(false); + }); + + it('persists and reloads the latest cached determinism report', async () => { + const repoPath = makeTempRepo(); + const report = await runDeterminismSmoke( + { + task: 'Add a small auth guard', + repoPath, + }, + { + runs: 3, + } + ); + + const stored = persistDeterminismReport(repoPath, report, { + task: 'Add a small auth guard', + trigger: 'test', + changedPaths: ['src/orchestrators/multimodal/harness.ts'], + }); + const reloaded = readLatestDeterminismReport(repoPath); + + expect(reloaded).not.toBeNull(); + expect(reloaded?.task).toBe('Add a 
small auth guard');
+    expect(reloaded?.trigger).toBe('test');
+    expect(reloaded?.changedPaths).toEqual([
+      'src/orchestrators/multimodal/harness.ts',
+    ]);
+    expect(reloaded?.report.score).toBe(100);
+    expect(stored.report.score).toBe(100);
+  });
+});
diff --git a/src/orchestrators/multimodal/determinism.ts b/src/orchestrators/multimodal/determinism.ts
new file mode 100644
index 00000000..3d4163f7
--- /dev/null
+++ b/src/orchestrators/multimodal/determinism.ts
@@ -0,0 +1,309 @@
+import { createHash } from 'crypto';
+import {
+  appendFileSync,
+  existsSync,
+  mkdirSync,
+  readFileSync,
+  writeFileSync,
+} from 'fs';
+import { join } from 'path';
+import type { HarnessOptions, HarnessResult, PlanningInput } from './types.js';
+import { compactPlan } from './utils.js';
+import { runSpike } from './harness.js';
+
+export const DETERMINISM_WATCH_PATTERNS = [
+  'src/orchestrators/multimodal',
+  'src/cli/commands/bench.ts',
+  'src/cli/index.ts',
+  'src/core/monitoring/logger.ts',
+];
+
+export const DETERMINISM_WATCH_IGNORE = [
+  '.git/**',
+  'node_modules/**',
+  'dist/**',
+  'build/**',
+  '.next/**',
+  '.turbo/**',
+  'coverage/**',
+  '.stackmemory/**',
+];
+
+export interface DeterminismSnapshot {
+  index: number;
+  approved: boolean;
+  iterations: number;
+  planHash: string;
+  critiqueHash: string;
+  commandsHash: string;
+  resultHash: string;
+  contextTokens: number;
+}
+
+export interface DeterminismDimensionScore {
+  name: string;
+  score: number;
+  weight: number;
+  details: string;
+}
+
+export interface DeterminismReport {
+  runs: number;
+  score: number;
+  snapshots: DeterminismSnapshot[];
+  dimensions: DeterminismDimensionScore[];
+  recommendations: string[];
+}
+
+export interface StoredDeterminismReport {
+  timestamp: string;
+  task: string;
+  trigger: string;
+  changedPaths: string[];
+  report: DeterminismReport;
+}
+
+function stableStringify(value: unknown): string {
+  return JSON.stringify(canonicalize(value));
+}
+
+function canonicalize(value: unknown): unknown {
+  if (Array.isArray(value)) {
+    return value.map((item) => canonicalize(item));
+  }
+  if (value && typeof value === 'object') {
+    const entries = Object.entries(value as Record<string, unknown>).sort(
+      ([a], [b]) => a.localeCompare(b)
+    );
+    return Object.fromEntries(
+      entries.map(([key, entryValue]) => [key, canonicalize(entryValue)])
+    );
+  }
+  return value;
+}
+
+function hashValue(value: unknown): string {
+  return createHash('sha256').update(stableStringify(value)).digest('hex');
+}
+
+function modeAgreement<T>(values: T[]): number {
+  if (values.length === 0) return 1;
+  const counts = new Map<T, number>();
+  for (const value of values) {
+    counts.set(value, (counts.get(value) || 0) + 1);
+  }
+  const maxCount = Math.max(...counts.values());
+  return maxCount / values.length;
+}
+
+function normalizeResult(result: HarnessResult) {
+  return {
+    plan: compactPlan(result.plan),
+    critique: canonicalize(result.critique),
+    implementation: {
+      success: result.implementation.success,
+      summary: result.implementation.summary,
+      commands: [...(result.implementation.commands || [])],
+    },
+    iterations: (result.iterations || []).map((iteration) => ({
+      command: iteration.command,
+      ok: iteration.ok,
+      critique: canonicalize(iteration.critique),
+      outputPreviewHash: hashValue(iteration.outputPreview),
+    })),
+  };
+}
+
+function estimateContextTokens(result: HarnessResult): number {
+  const normalized = normalizeResult(result);
+  return Math.ceil(stableStringify(normalized).length / 4);
+}
+
+function toSnapshot(result: HarnessResult, index: number):
DeterminismSnapshot {
+  const normalized = normalizeResult(result);
+  return {
+    index,
+    approved: result.critique.approved,
+    iterations: (result.iterations || []).length,
+    planHash: hashValue(compactPlan(result.plan)),
+    critiqueHash: hashValue(canonicalize(result.critique)),
+    commandsHash: hashValue(result.implementation.commands || []),
+    resultHash: hashValue(normalized),
+    contextTokens: estimateContextTokens(result),
+  };
+}
+
+function computeNumericStability(values: number[]): number {
+  if (values.length <= 1) return 1;
+  const min = Math.min(...values);
+  const max = Math.max(...values);
+  if (max === min) return 1;
+  return Math.max(0, 1 - (max - min) / Math.max(max, 1));
+}
+
+function scoreReport(snapshots: DeterminismSnapshot[]): DeterminismReport {
+  const dimensions: DeterminismDimensionScore[] = [
+    {
+      name: 'result',
+      weight: 40,
+      score: modeAgreement(snapshots.map((item) => item.resultHash)) * 100,
+      details: 'Full normalized result hash agreement',
+    },
+    {
+      name: 'plan',
+      weight: 20,
+      score: modeAgreement(snapshots.map((item) => item.planHash)) * 100,
+      details: 'Plan structure hash agreement',
+    },
+    {
+      name: 'critique',
+      weight: 15,
+      score: modeAgreement(snapshots.map((item) => item.critiqueHash)) * 100,
+      details: 'Critique hash agreement',
+    },
+    {
+      name: 'commands',
+      weight: 10,
+      score: modeAgreement(snapshots.map((item) => item.commandsHash)) * 100,
+      details: 'Implementer command sequence agreement',
+    },
+    {
+      name: 'iterations',
+      weight: 10,
+      score: modeAgreement(snapshots.map((item) => item.iterations)) * 100,
+      details: 'Retry-count agreement',
+    },
+    {
+      name: 'context_tokens',
+      weight: 5,
+      score:
+        computeNumericStability(snapshots.map((item) => item.contextTokens)) *
+        100,
+      details: 'Token-footprint stability',
+    },
+  ];
+
+  const weightedScore = dimensions.reduce((sum, dimension) => {
+    return sum + dimension.score * dimension.weight;
+  }, 0);
+  const totalWeight = dimensions.reduce(
+    (sum, dimension) => sum + dimension.weight,
+    0
+  );
+  const score = totalWeight > 0 ? weightedScore / totalWeight : 0;
+
+  const recommendations: string[] = [];
+  if (dimensions[0].score < 100) {
+    recommendations.push(
+      'Pin planner/critic outputs behind deterministic fixtures or replay traces.'
+    );
+  }
+  if (dimensions[1].score < 100) {
+    recommendations.push(
+      'Canonicalize plan generation further and remove any model-dependent fields from smoke checks.'
+    );
+  }
+  if (dimensions[4].score < 100) {
+    recommendations.push(
+      'Tighten retry rules so the same failure mode produces the same iteration count.'
+    );
+  }
+  if (dimensions[5].score < 100) {
+    recommendations.push(
+      'Reduce context assembly drift by sorting symbols and fixing token accounting.'
+    );
+  }
+
+  return {
+    runs: snapshots.length,
+    score: Math.round(score * 100) / 100,
+    snapshots,
+    dimensions,
+    recommendations,
+  };
+}
+
+export async function runDeterminismSmoke(
+  input: PlanningInput,
+  options: HarnessOptions & { runs?: number } = {}
+): Promise<DeterminismReport> {
+  const runs = Math.max(1, options.runs ?? 5);
+  const snapshots: DeterminismSnapshot[] = [];
+
+  for (let index = 0; index < runs; index++) {
+    const result = await runSpike(input, {
+      ...options,
+      dryRun: options.dryRun ?? true,
+      deterministicFixture: options.deterministicFixture ??
true, + persistAudit: false, + record: false, + recordFrame: false, + }); + snapshots.push(toSnapshot(result, index + 1)); + } + + return scoreReport(snapshots); +} + +export function getDeterminismWatchTargets(repoPath: string): string[] { + const existingTargets = DETERMINISM_WATCH_PATTERNS.filter((target) => + existsSync(join(repoPath, target)) + ); + + if (existingTargets.length > 0) { + return existingTargets; + } + + // Fallback: watch the current repo root, but rely on ignore globs so the + // watcher remains contained to the repo without scanning generated/vendor dirs. + return ['.']; +} + +function getDeterminismDir(repoPath: string): string { + return join(repoPath, '.stackmemory', 'determinism'); +} + +export function persistDeterminismReport( + repoPath: string, + report: DeterminismReport, + meta: { + task: string; + trigger: string; + changedPaths?: string[]; + } +): StoredDeterminismReport { + const dir = getDeterminismDir(repoPath); + mkdirSync(dir, { recursive: true }); + + const stored: StoredDeterminismReport = { + timestamp: new Date().toISOString(), + task: meta.task, + trigger: meta.trigger, + changedPaths: meta.changedPaths || [], + report, + }; + + writeFileSync( + join(dir, 'latest.json'), + JSON.stringify(stored, null, 2) + '\n' + ); + appendFileSync(join(dir, 'history.jsonl'), JSON.stringify(stored) + '\n'); + return stored; +} + +export function readLatestDeterminismReport( + repoPath: string +): StoredDeterminismReport | null { + const latestPath = join(getDeterminismDir(repoPath), 'latest.json'); + if (!existsSync(latestPath)) { + return null; + } + + try { + return JSON.parse( + readFileSync(latestPath, 'utf8') + ) as StoredDeterminismReport; + } catch { + return null; + } +} diff --git a/src/orchestrators/multimodal/harness.ts b/src/orchestrators/multimodal/harness.ts index 3259d170..c071f4e2 100644 --- a/src/orchestrators/multimodal/harness.ts +++ b/src/orchestrators/multimodal/harness.ts @@ -58,6 +58,48 @@ function heuristicPlan(input: PlanningInput): ImplementationPlan { }; } +function deterministicCritique(args: { + plan: ImplementationPlan; + ok: boolean; + diff: string; + checks: ReturnType | null; +}): CritiqueResult { + const issues: string[] = []; + const suggestions: string[] = []; + + if (!args.ok) { + issues.push('Implementer command failed'); + suggestions.push('Fix the command invocation before retrying'); + } + + if (args.diff.includes('<<<<<<<') || args.diff.includes('>>>>>>>')) { + issues.push('Merge conflict markers detected in diff'); + suggestions.push('Resolve conflict markers before approval'); + } + + if (args.checks && !args.checks.lintOk) { + issues.push('Lint checks failed'); + suggestions.push('Address lint failures before approval'); + } + + if (args.checks && !args.checks.testsOk) { + issues.push('Tests failed'); + suggestions.push('Fix failing tests before approval'); + } + + if (!args.diff || args.diff.startsWith('(no changes detected)')) { + suggestions.push( + 'No code changes detected; verify the task can be satisfied without edits' + ); + } + + return { + approved: issues.length === 0, + issues, + suggestions, + }; +} + export async function runSpike( input: PlanningInput, options: HarnessOptions = {} @@ -71,24 +113,28 @@ export async function runSpike( const t0 = Date.now(); let plan: ImplementationPlan; - try { - const raw = await callClaude(plannerPrompt, { - model: options.plannerModel, - system: plannerSystem, - }); + if (options.deterministicFixture) { + plan = heuristicPlan(input); + } else { try { - // Strip 
markdown code fences if present - const cleaned = raw - .replace(/^```(?:json)?\s*\n?/i, '') - .replace(/\n?```\s*$/i, '') - .trim(); - plan = JSON.parse(cleaned); + const raw = await callClaude(plannerPrompt, { + model: options.plannerModel, + system: plannerSystem, + }); + try { + // Strip markdown code fences if present + const cleaned = raw + .replace(/^```(?:json)?\s*\n?/i, '') + .replace(/\n?```\s*$/i, '') + .trim(); + plan = JSON.parse(cleaned); + } catch { + // Fall back to heuristic if model returned text + plan = heuristicPlan(input); + } } catch { - // Fall back to heuristic if model returned text plan = heuristicPlan(input); } - } catch { - plan = heuristicPlan(input); } const planLatencyMs = Date.now() - t0; @@ -155,23 +201,32 @@ export async function runSpike( // Critic reviews the diff, not the CLI log const criticSystem = `You are a strict code reviewer. Review the git diff against the plan. Check for: correctness, missing steps, unrelated changes, bugs, security issues. Also review lint and test results if provided. Return raw JSON only (no markdown fences): { "approved": boolean, "issues": ["string"], "suggestions": ["string"] }`; const criticPrompt = `Plan: ${plan.summary}\nAcceptance criteria:\n${plan.steps.map((s) => s.acceptanceCriteria?.join(', ') || s.title).join('\n')}\n\nAttempt ${i + 1}/${maxIters}\nImplementer exit: ${ok ? 'success' : 'failed'}\n\nGit diff:\n${diff}${checksSection}`; - try { - const raw = await callClaude(criticPrompt, { - model: options.reviewerModel, - system: criticSystem, + if (options.deterministicFixture) { + lastCritique = deterministicCritique({ + plan, + ok, + diff, + checks, }); - // Strip markdown code fences if present - const cleaned = raw - .replace(/^```(?:json)?\s*\n?/i, '') - .replace(/\n?```\s*$/i, '') - .trim(); - lastCritique = JSON.parse(cleaned); - } catch { - lastCritique = { - approved: ok, - issues: ok ? [] : ['Critique failed'], - suggestions: [], - }; + } else { + try { + const raw = await callClaude(criticPrompt, { + model: options.reviewerModel, + system: criticSystem, + }); + // Strip markdown code fences if present + const cleaned = raw + .replace(/^```(?:json)?\s*\n?/i, '') + .replace(/\n?```\s*$/i, '') + .trim(); + lastCritique = JSON.parse(cleaned); + } catch { + lastCritique = { + approved: ok, + issues: ok ? [] : ['Critique failed'], + suggestions: [], + }; + } } iterations.push({ @@ -213,75 +268,77 @@ export async function runSpike( contextTokens: Math.ceil(finalDiff.length / 4), }; - // Persist audit + metrics - try { - const dir = - options.auditDir || path.join(input.repoPath, '.stackmemory', 'build'); - fs.mkdirSync(dir, { recursive: true }); - const stamp = new Date().toISOString().replace(/[:.]/g, '-'); - const file = path.join(dir, `spike-${stamp}.json`); - fs.writeFileSync( - file, - JSON.stringify( - { - input, - options: { ...options, auditDir: undefined }, - plan, - iterations, - metrics: runMetrics, - }, - null, - 2 - ) - ); - - // Append to metrics JSONL for time-series analysis - const metricsFile = path.join(dir, 'harness-metrics.jsonl'); - fs.appendFileSync(metricsFile, JSON.stringify(runMetrics) + '\n'); - - // LOOP 5: Harness Regression β€” check rolling window against targets + // Persist audit + metrics unless explicitly disabled for replay/smoke runs. 
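+  // Determinism smoke runs (runDeterminismSmoke) invoke runSpike with
+  // persistAudit: false, so repeated fixture runs leave no spike-*.json or
+  // harness-metrics.jsonl artifacts under .stackmemory/build.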
+ if (options.persistAudit !== false) { try { - const lines = fs - .readFileSync(metricsFile, 'utf-8') - .split('\n') - .filter((l) => l.trim()); - const recent = lines - .slice(-10) - .map((l) => JSON.parse(l) as HarnessRunMetrics); - if (recent.length >= 3) { - const summary = summarizeRuns(recent); - if (summary.approvalRate < HARNESS_TARGETS.firstPassApprovalRate) { - feedbackLoops.fire( - 'harnessRegression', - 'metrics_append', - { - metric: 'approvalRate', - current: summary.approvalRate, - target: HARNESS_TARGETS.firstPassApprovalRate, - window: recent.length, - }, - 'regression_alert' - ); - } - if (summary.p95TotalLatencyMs > HARNESS_TARGETS.totalLatencyP95Ms) { - feedbackLoops.fire( - 'harnessRegression', - 'metrics_append', - { - metric: 'totalLatencyP95', - current: summary.p95TotalLatencyMs, - target: HARNESS_TARGETS.totalLatencyP95Ms, - window: recent.length, - }, - 'regression_alert' - ); + const dir = + options.auditDir || path.join(input.repoPath, '.stackmemory', 'build'); + fs.mkdirSync(dir, { recursive: true }); + const stamp = new Date().toISOString().replace(/[:.]/g, '-'); + const file = path.join(dir, `spike-${stamp}.json`); + fs.writeFileSync( + file, + JSON.stringify( + { + input, + options: { ...options, auditDir: undefined }, + plan, + iterations, + metrics: runMetrics, + }, + null, + 2 + ) + ); + + // Append to metrics JSONL for time-series analysis + const metricsFile = path.join(dir, 'harness-metrics.jsonl'); + fs.appendFileSync(metricsFile, JSON.stringify(runMetrics) + '\n'); + + // LOOP 5: Harness Regression β€” check rolling window against targets + try { + const lines = fs + .readFileSync(metricsFile, 'utf-8') + .split('\n') + .filter((l) => l.trim()); + const recent = lines + .slice(-10) + .map((l) => JSON.parse(l) as HarnessRunMetrics); + if (recent.length >= 3) { + const summary = summarizeRuns(recent); + if (summary.approvalRate < HARNESS_TARGETS.firstPassApprovalRate) { + feedbackLoops.fire( + 'harnessRegression', + 'metrics_append', + { + metric: 'approvalRate', + current: summary.approvalRate, + target: HARNESS_TARGETS.firstPassApprovalRate, + window: recent.length, + }, + 'regression_alert' + ); + } + if (summary.p95TotalLatencyMs > HARNESS_TARGETS.totalLatencyP95Ms) { + feedbackLoops.fire( + 'harnessRegression', + 'metrics_append', + { + metric: 'totalLatencyP95', + current: summary.p95TotalLatencyMs, + target: HARNESS_TARGETS.totalLatencyP95Ms, + window: recent.length, + }, + 'regression_alert' + ); + } } + } catch { + // best-effort } } catch { - // best-effort + // best-effort only } - } catch { - // best-effort only } // Optionally record to local context DB @@ -418,12 +475,15 @@ async function recordAsFrame( // Lightweight planner: returns only the plan without implementation/critique export async function runPlanOnly( input: PlanningInput, - options: { plannerModel?: string } = {} + options: { plannerModel?: string; deterministicFixture?: boolean } = {} ): Promise { const plannerSystem = `You write concise, actionable implementation plans. Output raw JSON only (no markdown code fences). 
Schema: { "summary": "string", "steps": [{ "id": "step-1", "title": "string", "rationale": "string", "acceptanceCriteria": ["string"] }], "risks": ["string"] }`; const contextSummary = getLocalContextSummary(input.repoPath); const plannerPrompt = `Task: ${input.task}\nRepo: ${input.repoPath}\nNotes: ${input.contextNotes || '(none)'}\n${contextSummary}\nConstraints: Keep the plan minimal and implementable in a single PR.`; + if (options.deterministicFixture) { + return heuristicPlan(input); + } try { const raw = await callClaude(plannerPrompt, { model: options.plannerModel, diff --git a/src/orchestrators/multimodal/types.ts b/src/orchestrators/multimodal/types.ts index e6535d74..1eb6ad1d 100644 --- a/src/orchestrators/multimodal/types.ts +++ b/src/orchestrators/multimodal/types.ts @@ -30,8 +30,10 @@ export interface HarnessOptions { implementer?: 'codex' | 'claude'; maxIters?: number; // retry loop for critique β†’ fix cycles auditDir?: string; // where to persist spike results + persistAudit?: boolean; // if false, skip writing audit artifacts/metrics record?: boolean; // store plan/critique in local context DB recordFrame?: boolean; // create a real frame and anchors + deterministicFixture?: boolean; // force deterministic fixture mode for smoke/replay checks } export interface ImplementationResult { From 6bf62e96f2524cfc0f5fb39317f833adcedbca12 Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Tue, 14 Apr 2026 17:49:36 -0400 Subject: [PATCH 07/18] docs: add design principles architecture note --- docs/architecture/DESIGN_PRINCIPLES.md | 91 +++++++++++++++++++++ docs/architecture/TECHNICAL_ARCHITECTURE.md | 2 + 2 files changed, 93 insertions(+) create mode 100644 docs/architecture/DESIGN_PRINCIPLES.md diff --git a/docs/architecture/DESIGN_PRINCIPLES.md b/docs/architecture/DESIGN_PRINCIPLES.md new file mode 100644 index 00000000..f2fb12be --- /dev/null +++ b/docs/architecture/DESIGN_PRINCIPLES.md @@ -0,0 +1,91 @@ +# Design Principles + +## The Three-Layer Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ FAT SKILLS (intelligence) β”‚ +β”‚ Markdown procedures that encode β”‚ +β”‚ judgment, process, domain knowledge. β”‚ +β”‚ This is where 90% of the value lives. β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ THIN HARNESS (routing) β”‚ +β”‚ ~200 lines of code. JSON in, text out. β”‚ +β”‚ Read-only by default. State machine. β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ DETERMINISTIC FOUNDATION (execution) β”‚ +β”‚ QueryDB, ReadDoc, Search, Timeline β”‚ +β”‚ β€” the tools that never fail ambiguouslyβ”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +### The Principle + +**Push intelligence UP into skills. Push execution DOWN into deterministic tooling. 
Keep the harness THIN.** + +When you do this: +- Every model improvement automatically improves every skill +- The deterministic layer stays perfectly reliable +- The harness never accumulates complexity + +### How This Maps to StackMemory + +| Layer | StackMemory Component | Examples | +|-------|----------------------|----------| +| **Fat Skills** | `.claude/skills/`, CLAUDE.md, wiki articles | Context engineering, code conventions, deploy recipes | +| **Thin Harness** | MCP server, CLI, hooks, handoff script | `stackmemory restore`, `stackmemory snap`, frame lifecycle | +| **Deterministic Foundation** | SQLite, file system, git, embeddings | `contexts` table, `.stackmemory/` directory, decision log files | + +### Anti-Patterns + +- **Fat harness**: Logic in the MCP server that should be a skill. If you're writing `if/else` chains in the harness, move it to a skill. +- **Thin skills**: Skills that just call tools. If a skill has no judgment, it's a tool wrapper β€” push it down. +- **Smart foundation**: Database queries that encode business logic. Keep the foundation dumb β€” SELECT/INSERT/UPDATE only. + +## Cross-Agent Memory Strategies + +When multiple agents need shared state, choose the mechanism that matches the bottleneck: + +| Need | Strategy | StackMemory Component | +|------|----------|----------------------| +| Survive session restart | **Persistent context** | `stackmemory restore` / handoff script | +| Share decisions across agents | **Decision log** | `.stackmemory/decisions/` files | +| Transfer orchestrator state to worker | **Text handoff** (current) | `-smd` wrapper, structured notes | +| Transfer latent state without text | **KV cache compaction** (research) | Not yet β€” requires runtime KV access | +| Find relevant prior context | **Semantic search** | Embeddings + vector index | +| Replicate exact prior state | **Snapshot** | `stackmemory snap save/restore` | + +### Current Default: Text Handoff + +The `-smd` wrapper (`stackmemory-auto-handoff.sh`) does text-level handoff: +1. Saves current session state before exit +2. Restores prior context on next session start +3. Injects structured notes (decisions, corrections, task state) + +This is the **"structured notes" strategy** β€” human-readable, auditable, portable across model families. It works with any API (Claude, Codex, local models). + +### Future: Latent Briefing (Research) + +For systems that control the inference runtime (self-hosted models, custom Cloudflare workers), **Latent Briefing** offers a more efficient path: + +- Compact orchestrator KV cache using Attention Matching +- Task-guided scoring retains only positions relevant to the current worker +- Eliminates text serialization overhead + +**Status**: Research reference. Blocked by API access β€” Claude API doesn't expose KV state. Viable for self-hosted models or custom inference runtimes. + +**When to revisit**: When StackMemory supports self-hosted model backends, or when Substrate Cloud ships a custom inference runtime. + +**Reference**: See skill doc `latent-briefing.skill.md` for the full technical treatment, decision framework, and gotchas. + +## Compaction Hierarchy + +When context is too large, apply these strategies in order: + +1. **Observation masking** β€” Hide tool outputs that aren't relevant to the current task (cheapest) +2. **Prefix caching** β€” Reuse identical prompt prefixes across calls (free with API support) +3. **Structured notes** β€” Summarize prior sessions into decision/correction format (current default) +4. 
**Semantic retrieval** β€” Pull only relevant chunks from prior context (needs embeddings) +5. **KV cache compaction** β€” Transfer latent state directly (requires runtime access) + +Each level is more powerful but harder to implement. Start from the top. Only move down when the level above is insufficient. diff --git a/docs/architecture/TECHNICAL_ARCHITECTURE.md b/docs/architecture/TECHNICAL_ARCHITECTURE.md index 85a0efa6..eaba5874 100644 --- a/docs/architecture/TECHNICAL_ARCHITECTURE.md +++ b/docs/architecture/TECHNICAL_ARCHITECTURE.md @@ -47,6 +47,8 @@ The outer system that: > **Harness = runtime. Frames = call stack. Tools = syscalls. Digests = return values.** +**Design principle**: Push intelligence UP into skills. Push execution DOWN into deterministic tooling. Keep the harness THIN. See `DESIGN_PRINCIPLES.md` for the full three-layer architecture and cross-agent memory strategy hierarchy. + --- ## Database Design From b6c3afbe6147abd56fc198268fd6eb2e5dde9843 Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Tue, 14 Apr 2026 18:06:02 -0400 Subject: [PATCH 08/18] chore: update gepa baselines and clean GitButler hooks --- .husky/post-checkout | 72 ------ .husky/pre-commit-user | 13 -- scripts/gepa/.before-optimize.md | 228 ++++-------------- scripts/gepa/generations/gen-000/baseline.md | 230 +++++-------------- scripts/gepa/generations/gen-001/baseline.md | 230 +++++-------------- 5 files changed, 157 insertions(+), 616 deletions(-) delete mode 100755 .husky/post-checkout delete mode 100755 .husky/pre-commit-user diff --git a/.husky/post-checkout b/.husky/post-checkout deleted file mode 100755 index fd875bc5..00000000 --- a/.husky/post-checkout +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/sh -# GITBUTLER_MANAGED_HOOK_V1 -# This hook auto-cleans GitButler hooks when you checkout away from gitbutler/workspace. - -PREV_HEAD=$1 -NEW_HEAD=$2 -BRANCH_CHECKOUT=$3 - -# Only act on branch checkouts (not file checkouts) -if [ "$BRANCH_CHECKOUT" != "1" ]; then - # Run user's hook if it exists - if [ -x "$(dirname "$0")/post-checkout-user" ]; then - exec "$(dirname "$0")/post-checkout-user" "$@" - fi - exit 0 -fi - -# Get the new branch name -NEW_BRANCH=$(git symbolic-ref --short HEAD 2>/dev/null) - -# If we just left gitbutler/workspace (and aren't coming back to it) -PREV_BRANCH=$(git name-rev --name-only "$PREV_HEAD" 2>/dev/null | sed 's|^remotes/||') -if echo "$PREV_BRANCH" | grep -q "gitbutler/workspace"; then - if [ "$NEW_BRANCH" != "gitbutler/workspace" ]; then - echo "" - echo "NOTE: You have left GitButler's managed workspace branch." - echo "Cleaning up GitButler hooks..." 
- - HOOKS_DIR=$(dirname "$0") - - # Restore pre-commit - but only if it's GitButler-managed - if [ -f "$HOOKS_DIR/pre-commit-user" ]; then - mv "$HOOKS_DIR/pre-commit-user" "$HOOKS_DIR/pre-commit" - echo " Restored: pre-commit" - elif [ -f "$HOOKS_DIR/pre-commit" ]; then - # Only remove if it's GitButler-managed (has our signature) - if grep -q "GITBUTLER_MANAGED_HOOK_V1" "$HOOKS_DIR/pre-commit"; then - rm "$HOOKS_DIR/pre-commit" - echo " Removed: pre-commit (GitButler managed)" - else - echo " Warning: pre-commit hook is not GitButler-managed, leaving it untouched" - fi - fi - - # Run user's post-checkout if it exists, then clean up - if [ -x "$HOOKS_DIR/post-checkout-user" ]; then - "$HOOKS_DIR/post-checkout-user" "$@" - mv "$HOOKS_DIR/post-checkout-user" "$HOOKS_DIR/post-checkout" - echo " Restored: post-checkout" - else - # Only remove self if we're GitButler-managed (we should be, but check anyway) - if grep -q "GITBUTLER_MANAGED_HOOK_V1" "$HOOKS_DIR/post-checkout"; then - rm "$HOOKS_DIR/post-checkout" - echo " Removed: post-checkout (GitButler managed)" - else - echo " Warning: post-checkout hook is not GitButler-managed, leaving it untouched" - fi - fi - - echo "" - echo "To return to GitButler mode, run: but setup" - echo "" - exit 0 - fi -fi - -# Run user's hook if it exists -if [ -x "$(dirname "$0")/post-checkout-user" ]; then - exec "$(dirname "$0")/post-checkout-user" "$@" -fi - -exit 0 diff --git a/.husky/pre-commit-user b/.husky/pre-commit-user deleted file mode 100755 index d8089be9..00000000 --- a/.husky/pre-commit-user +++ /dev/null @@ -1,13 +0,0 @@ -# Use Node version from .nvmrc -export NVM_DIR="$HOME/.nvm" -if [ -s "$NVM_DIR/nvm.sh" ]; then - . "$NVM_DIR/nvm.sh" - nvm use 2>/dev/null -elif [ -d "$HOME/.nvm/versions/node" ]; then - NODE_VER=$(cat "$(git rev-parse --show-toplevel)/.nvmrc" 2>/dev/null || echo "20") - NODE_PATH=$(ls -d "$HOME/.nvm/versions/node/v${NODE_VER}"* 2>/dev/null | head -1) - [ -n "$NODE_PATH" ] && export PATH="$NODE_PATH/bin:$PATH" -fi - -npx lint-staged -npm run build diff --git a/scripts/gepa/.before-optimize.md b/scripts/gepa/.before-optimize.md index 4dc0ebb0..2388f26a 100644 --- a/scripts/gepa/.before-optimize.md +++ b/scripts/gepa/.before-optimize.md @@ -1,198 +1,72 @@ -# CLAUDE.md +# croissant.ai β€” Agent Guide -You are a senior full-stack engineer working on **Sol**, the monorepo for Rize β€” an automatic time tracking application. Read the relevant code before making changes. Quote the specific code you're modifying when explaining changes. +Tool-agnostic reference for AI coding agents working in this repository. 
-## Project Overview +## Stack -- **api/** β€” Rails 7.1 GraphQL backend (Ruby 3.3.5) -- **web/** β€” Next.js 14 React web app (Node 22) -- **electron/** β€” Electron desktop app (Node 22) -- **services/** β€” Bun-based TypeScript event consumers/workers -- **voyager/** β€” Marketing website and landing pages (Next.js) -- **scripts/** β€” Automation scripts (categorized by side-effect type) -- **puppet/** β€” Puppeteer server for images/PDFs -- **chrome/** β€” Chrome browser extension -- **docs/** β€” Docusaurus documentation site -- **zapier/** β€” Zapier integration +Node.js / Express / PostgreSQL / Redis +Railway deployment | Stripe / Salesforce / QuickBooks integrations -## Development Commands +## Project Structure -```bash -# Start all services (requires iTerm2 on macOS) -./scripts/run-dev.sh - -# Or individually: -cd api && hivemind Procfile.dev # Rails + AnyCable + Sidekiq + Clockwork -cd web && npm run dev # Next.js dev server -cd electron && npm run dev # Electron with hot reload -cd services && hivemind Procfile.dev # Bun services -cd voyager && npm run dev # Marketing site (port 3003) ``` - -### Docker (start first) -```bash -cd api && docker-compose up -d -# TimescaleDB :15432 | Redis :16379 | Kafka :9092 | MySQL :13306 +src/ + api/ # Route handlers + core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation + features/ # Feature modules + shared/ # Shared utilities + integrations/ # Third-party connectors +docs/ # Documentation +scripts/ # Automation scripts +docker/ # Container configs +prompts/ # Externalized LLM prompt templates ``` -### Testing -```bash -cd api && bundle exec rspec # Full API suite -cd api && bundle exec rspec spec/path/to/file_spec.rb # Single file -cd api && bundle exec rspec spec/path/to/file_spec.rb:42 # Single line -cd electron && npm test # Electron (Jest) -# Web β€” no active tests -``` +## Commands -### Building ```bash -cd api && bundle install && rake db:migrate -cd web && npm run build # gql-gen + tailwind + next build -cd electron && npm run build # Electron Forge make -cd services && bun install +npm run dev # Start dev server +npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) +npm run lint # Lint check +npm run migrate # Run DB migrations +docker-compose up -d # Start local DBs ``` -## Architecture - -### GraphQL API -Two endpoints: `api/v1` (public β€” OAuth, Zapier) and `private/v1` (web, electron). Located at `api/app/graphql/{api,private}/v1/`. - -### Background Processing -- **Sidekiq** for async jobs (`api/config/sidekiq.yml`) β€” use `perform_async`, not `perform_later` (ApplicationJob uses Sidekiq::Worker, not ActiveJob) -- **Clockwork** for scheduled jobs (`api/config/clock.rb`) -- **Kafka** for event streaming (`services/consumers/`) - -### Databases -PostgreSQL (primary) + TimescaleDB (time-series, separate connection) + MySQL (legacy) + Redis (cache, ActionCable, Sidekiq) - -### Real-time -AnyCable WebSocket server for subscriptions. Channels in `api/app/channels/`. - -## Code Patterns - -### Ruby/Rails -- Controllers validate + enqueue async jobs. Jobs handle business logic. Models handle delivery. 
-- Webhook controllers: `skip_before_action :authenticate_user!` + shared secret verification -- `CanonicalEmail.find_by_canonical(email:)` β€” uses `email_address` gem canonicalization; stub in tests -- `Identity#first_name` is a computed method (from `name` via `Nameable::Latin`), not a column -- `generate_hash_authentication_settings_url` calls `update!` internally β€” stub in tests via `allow_any_instance_of(Identity)` -- Test env uses `cache_store: :null_store` β€” swap to `MemoryStore` in `around` block for cache tests -- Postmark emails: all go through `PostmarkClient.deliver_in_batches_with_templates` with required keys: `email_enabled`, `email_bounced`, `message_stream` -- Prefer `be_between(before, after)` for time assertions (no `freeze_time` or `travel_to`) - -### JavaScript/TypeScript -- Use `test()` instead of `it()` in Jest tests -- Use `toBeCalled()` instead of `toHaveBeenCalledWith()` in assertions -- ESM: add `.js` extension to relative imports - -### Error Handling -- Prefer returning undefined over throwing exceptions -- Log and continue rather than crashing β€” filter nulls at boundaries -- Validate inputs at system boundaries (user input, external APIs, webhooks) - -## Scripts (`scripts/`) - -Standalone Node.js `.mjs` automation β€” outreach, content, analytics, CRM sync. Organized by side-effect type: - -- **`scripts/commit/`** β€” Scripts that produce repo artifacts (PRs, committed files). Includes `feedback/` for feedback collection and `profound-briefs/` for AEO pulse output. -- **`scripts/ops/`** β€” Marketing motions with external side effects (CRM sync, outreach, social content). -- **`scripts/diag/`** β€” Read-only diagnostics (pipeline health checks, demo scorecards). -- **`scripts/data/`** β€” Committed data artifacts (ICP keywords, pipeline config, profound learnings/snapshots). -- **`scripts/lib/`** β€” Shared utilities (Attio, Claude, Fathom, Slack, dates, prompts). - -Scheduled via GitHub Actions cron. All scheduled workflows support `workflow_dispatch` for manual runs. - -**GitHub Actions limit:** `workflow_dispatch` allows max 25 `inputs`. `weekly-start.yml` has 22/25 inputs. Feedback is consolidated into a single JSON `feedback` input: `{"social":"...","aeo":"...","blog":"...","snitcher":"..."}`. - -### Slack `/run` command -When adding or renaming GitHub Actions workflows that should be triggerable via Slack, update the `WORKFLOWS` hash in `api/app/jobs/trigger_github_workflow_job.rb`. When deleting a workflow, remove it from the hash. The Slack `/run` command reads this mapping to dispatch workflows. 
- -### Workflow β†’ Script mapping - -| Workflow | Script path | Category | -|---|---|---| -| `weekly-start.yml` | `voyager/scripts/content-brief.mjs` + `voyager/scripts/content-audit.mjs` + `ops/fathom-social-content.mjs` + `ops/fathom-testimonial-scan.mjs` + `ops/perplexity-citation-audit.mjs` + `commit/profound-aeo-pulse.mjs` + `voyager/scripts/generate-blog-scaffold.mjs` + `ops/ahrefs-firehose-digest.mjs` + `ops/export-dripify.mjs` + `commit/prospect-discovery.mjs` + `ops/repush-clay-leads.mjs` + `ops/snitcher-outreach.mjs` | GHA cron (Mon) | -| `weekly-end.yml` | `diag/fathom-demo-scorecard.mjs` + `commit/feedback/collect-*.mjs` | GHA cron (Fri) | -| `anneal-keywords.yml` | `commit/anneal-keywords.mjs` | GHA cron (Sun) | -| `g2-review-monitor.yml` | `ops/g2-to-senja.mjs` | GHA cron (Daily) | -| `testimonial-pipeline.yml` | `commit/testimonial-pipeline.mjs` | Manual | -| `video-pipeline.yml` | `ops/video-clips.mjs` | Manual | -| `pagespeed-audit.yml` | `diag/pagespeed-audit.mjs` + `commit/pagespeed-improvements.mjs` | GHA cron (1st of month) | -| `daily-ops.yml` | `ops/slack-digest.mjs` + `ops/fathom-meeting-digest.mjs` + `ops/ops-daily-briefing.mjs` | GHA cron (weekdays) | -| `indexnow-submit.yml` | (inline curl) | Push to master (voyager) / Manual | - -## GitHub Actions (`.github/workflows/`) - -### CI/CD (PR-triggered) -- `test-api.yml` β€” RSpec on PR to `api/` -- `review-voyager-seo.yml` β€” SEO/AEO/GEO review on PR to `voyager/` -- `main.yml` β€” Deploy API/Web/Services/Docs/Voyager to staging on merge to master -- `deploy-production.yml` β€” Manual sequential prod deploy (API β†’ Services β†’ Web) - -### GitHub Actions gotcha -In `actions/github-script@v7`, `github.rest.issues.createComment` posts plain issue comments on PRs (PRs are issues in GitHub's API). For inline code suggestions on specific files/lines, use `github.rest.pulls.createReview` or `github.rest.pulls.createReviewComment` instead. 
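For illustration, a minimal sketch of the two calls inside `actions/github-script@v7` β€” `github` and `context` are the globals the action injects, and the `path`/`line` values are placeholders, not references to real files:

```ts
// Plain PR-level comment: a PR is an issue, so issue_number is the PR number.
await github.rest.issues.createComment({
  owner: context.repo.owner,
  repo: context.repo.repo,
  issue_number: context.issue.number,
  body: 'General feedback on the PR as a whole.',
});

// Inline review comment pinned to a file and line of the diff.
await github.rest.pulls.createReviewComment({
  owner: context.repo.owner,
  repo: context.repo.repo,
  pull_number: context.issue.number,
  commit_id: context.payload.pull_request.head.sha,
  path: 'src/example.ts', // placeholder path
  line: 42, // placeholder line on the new side of the diff
  side: 'RIGHT',
  body: 'Inline suggestion tied to this exact line.',
});
```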
- -### Scheduled (cron) -- `weekly-start.yml` β€” Mon 9am ET (content review, social content, testimonial scan, Perplexity audit, AEO pulse β†’ blog scaffold, Ahrefs digest, Dripify export, prospect discovery β†’ snitcher outreach) -- `weekly-end.yml` β€” Fri 9am ET (demo scorecard + pipeline health) -- `anneal-keywords.yml` β€” Sun 11am ET (keyword annealing + kill pattern updates) -- `g2-review-monitor.yml` β€” Daily 10am ET -- `pagespeed-audit.yml` β€” 1st of month 9am ET (PSI audit β†’ Claude recommendations β†’ PR) -- `daily-ops.yml` β€” Weekdays 10am ET (signal monitor, G2 reviews, review intercept, Slack digest β†’ meeting digest β†’ daily briefing) -- `indexnow-submit.yml` β€” On push to master (voyager pages) + manual (`/run indexnow urls=...`) +## Git Conventions -## Deployments +- Branch prefixes: `feature/`, `fix/`, `chore/` +- Commit format: `type(scope): message` +- Do NOT add `Co-Authored-By` lines to commits +- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots -### Staging (auto on merge to master) -- **API, Web, Services** β€” GCP Cloud Run via Docker (Artifact Registry) -- **Voyager** β€” GCP Cloud Run -- **Docs** β€” Heroku +## Testing Rules -### Production (manual `workflow_dispatch` only) -- Sequential: API β†’ 5min wait β†’ Services β†’ 5min wait β†’ Web -- `gh workflow run deploy-production.yml --ref master` +- **Framework**: Jest + SWC +- **DB mocking**: Use dependency injection (DI), not global mocks +- **Supertest**: Pass `app` (NOT `server`) to supertest +- **Global jest**: src/ tests use global `jest` β€” do NOT import from `@jest/globals` (causes redeclaration errors) +- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` β€” always re-set mocks in `beforeEach` +- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline -## Voyager Content +## ESLint Rules -Blog posts in `voyager/src/content/blog/*.mdx`. See `voyager/CLAUDE.md` for tone of voice, banned words, and content rules. - -Key patterns: -- Blog JSON-LD (BlogPosting) in `voyager/src/modules/blogJsonLd.js` -- FAQ structured data via `faqs` frontmatter array in blog MDX files -- Sitemap auto-includes all posts via `voyager/src/app/sitemap.js` -- Blog scaffold: `voyager/scripts/generate-blog-scaffold.mjs` (or `npm run content:scaffold`) -- Analytics events: `voyager/src/modules/analytics.js` -- Route paths: `voyager/src/utils/locations.js` - -## Style - -### Commits -- Plain imperative sentences, no conventional commit prefixes -- Short and direct β€” describe what, not why - -### Code -- Read before writing. Edit over rewrite. No docs unless asked. -- KISS / YAGNI / SOLID. Under 20 lines per function. -- Comments only for complex logic. No emojis in code. -- When blocked, try an alternative approach before asking. Explain what you tried and why it failed. -- Review your changes against the task requirements before reporting completion. - -## Knowledge Skills (.claude/skills/knowledge/) +- Use `catch {}` not `catch (_err) {}` β€” underscore prefix not in the allowed pattern +- CJS format for JS files in `src/` -Project-specific knowledge skills load automatically when prompts match `activates_on` keywords. They provide current API patterns, SDK versions, and gotchas that prevent hallucination. - -**When to suggest a new skill:** If you encounter a repeatable workflow where you got something wrong (wrong API shape, deprecated pattern, incorrect filter field), suggest creating a knowledge skill for it. 
Format: "This would be a good candidate for a `.claude/skills/knowledge/.skill.md` β€” want me to create one?" +## Key Patterns -Current skills: `postmark-email`, `nextjs-app-router`, `profound-mcp`, `greptile-review`, `tailwind-v4-design`, `rails-graphql-mutations`, `rails-sidekiq-clockwork`, `rails-billing-identity`, `electron-store-ipc`, `chrome-extension`, `blog-hero-images` +- Provenance tracking: every data point includes source, timestamp, lineage +- Multi-tenant container isolation +- DI route factories for testability +- Error handling: return undefined over throwing; log and continue over crashing +- Add `.js` extension to relative ESM imports -## Key Files +## StackMemory Context Rule -- `api/config/database.yml` β€” DB connections (primary + timescale) -- `api/config/sidekiq.yml` β€” Job queues and concurrency -- `api/config/clock.rb` β€” Scheduled jobs (Clockwork) -- `api/Procfile.dev` β€” Dev processes -- `api/app/services/postmark_client.rb` β€” Email delivery (all Postmark goes through here) -- `api/app/services/drip_campaign_config.rb` β€” Drip email templates + required keys -- `voyager/CLAUDE.md` β€” Blog tone, banned words, content rules -- `sol.code-workspace` β€” VS Code workspace -- Each project requires its own `.env` file (not in repo) +- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. +- Prefer the MCP shape: + - `org_id` + - `conversation_id` + - `task_query` + - `recover_on_low_signal: true` +- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. diff --git a/scripts/gepa/generations/gen-000/baseline.md b/scripts/gepa/generations/gen-000/baseline.md index 5fd37e77..0c86cace 100644 --- a/scripts/gepa/generations/gen-000/baseline.md +++ b/scripts/gepa/generations/gen-000/baseline.md @@ -1,198 +1,74 @@ -# CLAUDE.md +# croissant.ai β€” Agent Guide -You are a senior full-stack engineer working on **Sol**, the monorepo for Rize β€” an automatic time tracking application. Read the relevant code before making changes. Quote the specific code you're modifying when explaining changes. +Tool-agnostic reference for AI coding agents working in this repository. 
-## Project Overview +## Stack -- **api/** β€” Rails 7.1 GraphQL backend (Ruby 3.3.5) -- **web/** β€” Next.js 14 React web app (Node 22) -- **electron/** β€” Electron desktop app (Node 22) -- **services/** β€” Bun-based TypeScript event consumers/workers -- **voyager/** β€” Marketing website and landing pages (Next.js) -- **scripts/** β€” Automation scripts (categorized by side-effect type) -- **puppet/** β€” Puppeteer server for images/PDFs -- **chrome/** β€” Chrome browser extension -- **docs/** β€” Docusaurus documentation site -- **zapier/** β€” Zapier integration +Node.js / Express / PostgreSQL / Redis +Railway deployment | Stripe / Salesforce / QuickBooks integrations -## Development Commands +## Project Structure -```bash -# Start all services (requires iTerm2 on macOS) -./scripts/run-dev.sh - -# Or individually: -cd api && hivemind Procfile.dev # Rails + AnyCable + Sidekiq + Clockwork -cd web && npm run dev # Next.js dev server -cd electron && npm run dev # Electron with hot reload -cd services && hivemind Procfile.dev # Bun services -cd voyager && npm run dev # Marketing site (port 3003) ``` - -### Docker (start first) -```bash -cd api && docker-compose up -d -# TimescaleDB :15432 | Redis :16379 | Kafka :9092 | MySQL :13306 +src/ + api/ # Route handlers + core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation + features/ # Feature modules + shared/ # Shared utilities + integrations/ # Third-party connectors +docs/ # Documentation +scripts/ # Automation scripts +docker/ # Container configs +prompts/ # Externalized LLM prompt templates ``` -### Testing -```bash -cd api && bundle exec rspec # Full API suite -cd api && bundle exec rspec spec/path/to/file_spec.rb # Single file -cd api && bundle exec rspec spec/path/to/file_spec.rb:42 # Single line -cd electron && npm test # Electron (Jest) -# Web β€” no active tests -``` +## Commands -### Building ```bash -cd api && bundle install && rake db:migrate -cd web && npm run build # gql-gen + tailwind + next build -cd electron && npm run build # Electron Forge make -cd services && bun install +npm run dev # Start dev server +npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) +npm run lint # Lint check +npm run migrate # Run DB migrations +docker-compose up -d # Start local DBs ``` -## Architecture - -### GraphQL API -Two endpoints: `api/v1` (public β€” OAuth, Zapier) and `private/v1` (web, electron). Located at `api/app/graphql/{api,private}/v1/`. - -### Background Processing -- **Sidekiq** for async jobs (`api/config/sidekiq.yml`) β€” use `perform_async`, not `perform_later` (ApplicationJob uses Sidekiq::Worker, not ActiveJob) -- **Clockwork** for scheduled jobs (`api/config/clock.rb`) -- **Kafka** for event streaming (`services/consumers/`) - -### Databases -PostgreSQL (primary) + TimescaleDB (time-series, separate connection) + MySQL (legacy) + Redis (cache, ActionCable, Sidekiq) - -### Real-time -AnyCable WebSocket server for subscriptions. Channels in `api/app/channels/`. - -## Code Patterns - -### Ruby/Rails -- Controllers validate + enqueue async jobs. Jobs handle business logic. Models handle delivery. 
-- Webhook controllers: `skip_before_action :authenticate_user!` + shared secret verification -- `CanonicalEmail.find_by_canonical(email:)` β€” uses `email_address` gem canonicalization; stub in tests -- `Identity#first_name` is a computed method (from `name` via `Nameable::Latin`), not a column -- `generate_hash_authentication_settings_url` calls `update!` internally β€” stub in tests via `allow_any_instance_of(Identity)` -- Test env uses `cache_store: :null_store` β€” swap to `MemoryStore` in `around` block for cache tests -- Postmark emails: all go through `PostmarkClient.deliver_in_batches_with_templates` with required keys: `email_enabled`, `email_bounced`, `message_stream` -- Prefer `be_between(before, after)` for time assertions (no `freeze_time` or `travel_to`) - -### JavaScript/TypeScript -- Use `test()` instead of `it()` in Jest tests -- Use `toBeCalled()` instead of `toHaveBeenCalledWith()` in assertions -- ESM: add `.js` extension to relative imports - -### Error Handling -- Prefer returning undefined over throwing exceptions -- Log and continue rather than crashing β€” filter nulls at boundaries -- Validate inputs at system boundaries (user input, external APIs, webhooks) - -## Scripts (`scripts/`) - -Standalone Node.js `.mjs` automation β€” outreach, content, analytics, CRM sync. Organized by side-effect type: - -- **`scripts/commit/`** β€” Scripts that produce repo artifacts (PRs, committed files). Includes `feedback/` for feedback collection and `profound-briefs/` for AEO pulse output. -- **`scripts/ops/`** β€” Marketing motions with external side effects (CRM sync, outreach, social content). -- **`scripts/diag/`** β€” Read-only diagnostics (pipeline health checks, demo scorecards). -- **`scripts/data/`** β€” Committed data artifacts (ICP keywords, pipeline config, profound learnings/snapshots). -- **`scripts/lib/`** β€” Shared utilities (Attio, Claude, Fathom, Slack, dates, prompts). - -Scheduled via GitHub Actions cron. All scheduled workflows support `workflow_dispatch` for manual runs. - -**GitHub Actions limit:** `workflow_dispatch` allows max 25 `inputs`. `weekly-start.yml` has 22/25 inputs. Feedback is consolidated into a single JSON `feedback` input: `{"social":"...","aeo":"...","blog":"...","snitcher":"..."}`. - -### Slack `/run` command -When adding or renaming GitHub Actions workflows that should be triggerable via Slack, update the `WORKFLOWS` hash in `api/app/jobs/trigger_github_workflow_job.rb`. When deleting a workflow, remove it from the hash. The Slack `/run` command reads this mapping to dispatch workflows. 
- -### Workflow β†’ Script mapping - -| Workflow | Script path | Category | -|---|---|---| -| `weekly-start.yml` | `voyager/scripts/content-brief.mjs` + `voyager/scripts/content-audit.mjs` + `ops/fathom-social-content.mjs` + `ops/fathom-testimonial-scan.mjs` + `ops/perplexity-citation-audit.mjs` + `commit/profound-aeo-pulse.mjs` + `voyager/scripts/generate-blog-scaffold.mjs` + `ops/ahrefs-firehose-digest.mjs` + `ops/export-dripify.mjs` + `commit/prospect-discovery.mjs` + `ops/repush-clay-leads.mjs` + `ops/snitcher-outreach.mjs` | GHA cron (Mon) | -| `weekly-end.yml` | `diag/fathom-demo-scorecard.mjs` + `commit/feedback/collect-*.mjs` + `commit/feedback/collect-ops-feedback.mjs` + `diag/weekly-retro.mjs` | GHA cron (Fri) | -| `anneal-keywords.yml` | `commit/anneal-keywords.mjs` | GHA cron (Sun) | -| `g2-review-monitor.yml` | `ops/g2-to-senja.mjs` | GHA cron (Daily) | -| `testimonial-pipeline.yml` | `commit/testimonial-pipeline.mjs` | Manual | -| `video-pipeline.yml` | `ops/video-clips.mjs` | Manual | -| `pagespeed-audit.yml` | `diag/pagespeed-audit.mjs` + `commit/pagespeed-improvements.mjs` | GHA cron (1st of month) | -| `daily-ops.yml` | `ops/slack-digest.mjs` + `ops/fathom-meeting-digest.mjs` + `ops/ops-daily-briefing.mjs` | GHA cron (weekdays) | -| `indexnow-submit.yml` | (inline curl) | Push to master (voyager) / Manual | - -## GitHub Actions (`.github/workflows/`) - -### CI/CD (PR-triggered) -- `test-api.yml` β€” RSpec on PR to `api/` -- `review-voyager-seo.yml` β€” SEO/AEO/GEO review on PR to `voyager/` -- `main.yml` β€” Deploy API/Web/Services/Docs/Voyager to staging on merge to master -- `deploy-production.yml` β€” Manual sequential prod deploy (API β†’ Services β†’ Web) - -### GitHub Actions gotcha -In `actions/github-script@v7`, `github.rest.issues.createComment` posts plain issue comments on PRs (PRs are issues in GitHub's API). For inline code suggestions on specific files/lines, use `github.rest.pulls.createReview` or `github.rest.pulls.createReviewComment` instead. 
- -### Scheduled (cron) -- `weekly-start.yml` β€” Mon 9am ET (content review, social content, testimonial scan, Perplexity audit, AEO pulse β†’ blog scaffold, Ahrefs digest, Dripify export, prospect discovery β†’ snitcher outreach) -- `weekly-end.yml` β€” Fri 9am ET (demo scorecard + pipeline health) -- `anneal-keywords.yml` β€” Sun 11am ET (keyword annealing + kill pattern updates) -- `g2-review-monitor.yml` β€” Daily 10am ET -- `pagespeed-audit.yml` β€” 1st of month 9am ET (PSI audit β†’ Claude recommendations β†’ PR) -- `daily-ops.yml` β€” Weekdays 10am ET (signal monitor, G2 reviews, review intercept, Slack digest β†’ meeting digest β†’ daily briefing) -- `indexnow-submit.yml` β€” On push to master (voyager pages) + manual (`/run indexnow urls=...`) +## Git Conventions -## Deployments +- Branch prefixes: `feature/`, `fix/`, `chore/` +- Commit format: `type(scope): message` +- Do NOT add `Co-Authored-By` lines to commits +- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots -### Staging (auto on merge to master) -- **API, Web, Services** β€” GCP Cloud Run via Docker (Artifact Registry) -- **Voyager** β€” GCP Cloud Run -- **Docs** β€” Heroku +## Testing Rules -### Production (manual `workflow_dispatch` only) -- Sequential: API β†’ 5min wait β†’ Services β†’ 5min wait β†’ Web -- `gh workflow run deploy-production.yml --ref master` +- **Framework**: Jest + SWC +- **DB mocking**: Use dependency injection (DI), not global mocks +- **Supertest**: Pass `app` (NOT `server`) to supertest +- **Global jest**: src/ tests use global `jest` β€” do NOT import from `@jest/globals` (causes redeclaration errors) +- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` β€” always re-set mocks in `beforeEach` +- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline -## Voyager Content +## ESLint Rules -Blog posts in `voyager/src/content/blog/*.mdx`. See `voyager/CLAUDE.md` for tone of voice, banned words, and content rules. - -Key patterns: -- Blog JSON-LD (BlogPosting) in `voyager/src/modules/blogJsonLd.js` -- FAQ structured data via `faqs` frontmatter array in blog MDX files -- Sitemap auto-includes all posts via `voyager/src/app/sitemap.js` -- Blog scaffold: `voyager/scripts/generate-blog-scaffold.mjs` (or `npm run content:scaffold`) -- Analytics events: `voyager/src/modules/analytics.js` -- Route paths: `voyager/src/utils/locations.js` - -## Style - -### Commits -- Plain imperative sentences, no conventional commit prefixes -- Short and direct β€” describe what, not why - -### Code -- Read before writing. Edit over rewrite. No docs unless asked. -- KISS / YAGNI / SOLID. Under 20 lines per function. -- Comments only for complex logic. No emojis in code. -- When blocked, try an alternative approach before asking. Explain what you tried and why it failed. -- Review your changes against the task requirements before reporting completion. - -## Knowledge Skills (.claude/skills/knowledge/) +- Use `catch {}` not `catch (_err) {}` β€” underscore prefix not in the allowed pattern +- CJS format for JS files in `src/` -Project-specific knowledge skills load automatically when prompts match `activates_on` keywords. They provide current API patterns, SDK versions, and gotchas that prevent hallucination. - -**When to suggest a new skill:** If you encounter a repeatable workflow where you got something wrong (wrong API shape, deprecated pattern, incorrect filter field), suggest creating a knowledge skill for it. 
Format: "This would be a good candidate for a `.claude/skills/knowledge/.skill.md` β€” want me to create one?" +## Key Patterns -Current skills: `postmark-email`, `nextjs-app-router`, `profound-mcp`, `greptile-review`, `tailwind-v4-design`, `rails-graphql-mutations`, `rails-sidekiq-clockwork`, `rails-billing-identity`, `electron-store-ipc`, `chrome-extension`, `blog-hero-images` +- Provenance tracking: every data point includes source, timestamp, lineage +- Multi-tenant container isolation +- DI route factories for testability +- Error handling: return undefined over throwing; log and continue over crashing +- Add `.js` extension to relative ESM imports -## Key Files +## StackMemory Context Rule -- `api/config/database.yml` β€” DB connections (primary + timescale) -- `api/config/sidekiq.yml` β€” Job queues and concurrency -- `api/config/clock.rb` β€” Scheduled jobs (Clockwork) -- `api/Procfile.dev` β€” Dev processes -- `api/app/services/postmark_client.rb` β€” Email delivery (all Postmark goes through here) -- `api/app/services/drip_campaign_config.rb` β€” Drip email templates + required keys -- `voyager/CLAUDE.md` β€” Blog tone, banned words, content rules -- `sol.code-workspace` β€” VS Code workspace -- Each project requires its own `.env` file (not in repo) +- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. +- Prefer the MCP shape: + - `org_id` + - `conversation_id` + - `worker_mode: true` + - `task_query` + - `recover_on_low_signal: true` +- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. +- The current assignment is persisted under `.stackmemory/worker-context/current-assignment.json` so wrappers and hooks can auto-fill or enforce `task_query`. diff --git a/scripts/gepa/generations/gen-001/baseline.md b/scripts/gepa/generations/gen-001/baseline.md index 5fd37e77..0c86cace 100644 --- a/scripts/gepa/generations/gen-001/baseline.md +++ b/scripts/gepa/generations/gen-001/baseline.md @@ -1,198 +1,74 @@ -# CLAUDE.md +# croissant.ai β€” Agent Guide -You are a senior full-stack engineer working on **Sol**, the monorepo for Rize β€” an automatic time tracking application. Read the relevant code before making changes. Quote the specific code you're modifying when explaining changes. +Tool-agnostic reference for AI coding agents working in this repository. 
-## Project Overview +## Stack -- **api/** β€” Rails 7.1 GraphQL backend (Ruby 3.3.5) -- **web/** β€” Next.js 14 React web app (Node 22) -- **electron/** β€” Electron desktop app (Node 22) -- **services/** β€” Bun-based TypeScript event consumers/workers -- **voyager/** β€” Marketing website and landing pages (Next.js) -- **scripts/** β€” Automation scripts (categorized by side-effect type) -- **puppet/** β€” Puppeteer server for images/PDFs -- **chrome/** β€” Chrome browser extension -- **docs/** β€” Docusaurus documentation site -- **zapier/** β€” Zapier integration +Node.js / Express / PostgreSQL / Redis +Railway deployment | Stripe / Salesforce / QuickBooks integrations -## Development Commands +## Project Structure -```bash -# Start all services (requires iTerm2 on macOS) -./scripts/run-dev.sh - -# Or individually: -cd api && hivemind Procfile.dev # Rails + AnyCable + Sidekiq + Clockwork -cd web && npm run dev # Next.js dev server -cd electron && npm run dev # Electron with hot reload -cd services && hivemind Procfile.dev # Bun services -cd voyager && npm run dev # Marketing site (port 3003) ``` - -### Docker (start first) -```bash -cd api && docker-compose up -d -# TimescaleDB :15432 | Redis :16379 | Kafka :9092 | MySQL :13306 +src/ + api/ # Route handlers + core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation + features/ # Feature modules + shared/ # Shared utilities + integrations/ # Third-party connectors +docs/ # Documentation +scripts/ # Automation scripts +docker/ # Container configs +prompts/ # Externalized LLM prompt templates ``` -### Testing -```bash -cd api && bundle exec rspec # Full API suite -cd api && bundle exec rspec spec/path/to/file_spec.rb # Single file -cd api && bundle exec rspec spec/path/to/file_spec.rb:42 # Single line -cd electron && npm test # Electron (Jest) -# Web β€” no active tests -``` +## Commands -### Building ```bash -cd api && bundle install && rake db:migrate -cd web && npm run build # gql-gen + tailwind + next build -cd electron && npm run build # Electron Forge make -cd services && bun install +npm run dev # Start dev server +npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) +npm run lint # Lint check +npm run migrate # Run DB migrations +docker-compose up -d # Start local DBs ``` -## Architecture - -### GraphQL API -Two endpoints: `api/v1` (public β€” OAuth, Zapier) and `private/v1` (web, electron). Located at `api/app/graphql/{api,private}/v1/`. - -### Background Processing -- **Sidekiq** for async jobs (`api/config/sidekiq.yml`) β€” use `perform_async`, not `perform_later` (ApplicationJob uses Sidekiq::Worker, not ActiveJob) -- **Clockwork** for scheduled jobs (`api/config/clock.rb`) -- **Kafka** for event streaming (`services/consumers/`) - -### Databases -PostgreSQL (primary) + TimescaleDB (time-series, separate connection) + MySQL (legacy) + Redis (cache, ActionCable, Sidekiq) - -### Real-time -AnyCable WebSocket server for subscriptions. Channels in `api/app/channels/`. - -## Code Patterns - -### Ruby/Rails -- Controllers validate + enqueue async jobs. Jobs handle business logic. Models handle delivery. 
-- Webhook controllers: `skip_before_action :authenticate_user!` + shared secret verification -- `CanonicalEmail.find_by_canonical(email:)` β€” uses `email_address` gem canonicalization; stub in tests -- `Identity#first_name` is a computed method (from `name` via `Nameable::Latin`), not a column -- `generate_hash_authentication_settings_url` calls `update!` internally β€” stub in tests via `allow_any_instance_of(Identity)` -- Test env uses `cache_store: :null_store` β€” swap to `MemoryStore` in `around` block for cache tests -- Postmark emails: all go through `PostmarkClient.deliver_in_batches_with_templates` with required keys: `email_enabled`, `email_bounced`, `message_stream` -- Prefer `be_between(before, after)` for time assertions (no `freeze_time` or `travel_to`) - -### JavaScript/TypeScript -- Use `test()` instead of `it()` in Jest tests -- Use `toBeCalled()` instead of `toHaveBeenCalledWith()` in assertions -- ESM: add `.js` extension to relative imports - -### Error Handling -- Prefer returning undefined over throwing exceptions -- Log and continue rather than crashing β€” filter nulls at boundaries -- Validate inputs at system boundaries (user input, external APIs, webhooks) - -## Scripts (`scripts/`) - -Standalone Node.js `.mjs` automation β€” outreach, content, analytics, CRM sync. Organized by side-effect type: - -- **`scripts/commit/`** β€” Scripts that produce repo artifacts (PRs, committed files). Includes `feedback/` for feedback collection and `profound-briefs/` for AEO pulse output. -- **`scripts/ops/`** β€” Marketing motions with external side effects (CRM sync, outreach, social content). -- **`scripts/diag/`** β€” Read-only diagnostics (pipeline health checks, demo scorecards). -- **`scripts/data/`** β€” Committed data artifacts (ICP keywords, pipeline config, profound learnings/snapshots). -- **`scripts/lib/`** β€” Shared utilities (Attio, Claude, Fathom, Slack, dates, prompts). - -Scheduled via GitHub Actions cron. All scheduled workflows support `workflow_dispatch` for manual runs. - -**GitHub Actions limit:** `workflow_dispatch` allows max 25 `inputs`. `weekly-start.yml` has 22/25 inputs. Feedback is consolidated into a single JSON `feedback` input: `{"social":"...","aeo":"...","blog":"...","snitcher":"..."}`. - -### Slack `/run` command -When adding or renaming GitHub Actions workflows that should be triggerable via Slack, update the `WORKFLOWS` hash in `api/app/jobs/trigger_github_workflow_job.rb`. When deleting a workflow, remove it from the hash. The Slack `/run` command reads this mapping to dispatch workflows. 
- -### Workflow β†’ Script mapping - -| Workflow | Script path | Category | -|---|---|---| -| `weekly-start.yml` | `voyager/scripts/content-brief.mjs` + `voyager/scripts/content-audit.mjs` + `ops/fathom-social-content.mjs` + `ops/fathom-testimonial-scan.mjs` + `ops/perplexity-citation-audit.mjs` + `commit/profound-aeo-pulse.mjs` + `voyager/scripts/generate-blog-scaffold.mjs` + `ops/ahrefs-firehose-digest.mjs` + `ops/export-dripify.mjs` + `commit/prospect-discovery.mjs` + `ops/repush-clay-leads.mjs` + `ops/snitcher-outreach.mjs` | GHA cron (Mon) | -| `weekly-end.yml` | `diag/fathom-demo-scorecard.mjs` + `commit/feedback/collect-*.mjs` + `commit/feedback/collect-ops-feedback.mjs` + `diag/weekly-retro.mjs` | GHA cron (Fri) | -| `anneal-keywords.yml` | `commit/anneal-keywords.mjs` | GHA cron (Sun) | -| `g2-review-monitor.yml` | `ops/g2-to-senja.mjs` | GHA cron (Daily) | -| `testimonial-pipeline.yml` | `commit/testimonial-pipeline.mjs` | Manual | -| `video-pipeline.yml` | `ops/video-clips.mjs` | Manual | -| `pagespeed-audit.yml` | `diag/pagespeed-audit.mjs` + `commit/pagespeed-improvements.mjs` | GHA cron (1st of month) | -| `daily-ops.yml` | `ops/slack-digest.mjs` + `ops/fathom-meeting-digest.mjs` + `ops/ops-daily-briefing.mjs` | GHA cron (weekdays) | -| `indexnow-submit.yml` | (inline curl) | Push to master (voyager) / Manual | - -## GitHub Actions (`.github/workflows/`) - -### CI/CD (PR-triggered) -- `test-api.yml` β€” RSpec on PR to `api/` -- `review-voyager-seo.yml` β€” SEO/AEO/GEO review on PR to `voyager/` -- `main.yml` β€” Deploy API/Web/Services/Docs/Voyager to staging on merge to master -- `deploy-production.yml` β€” Manual sequential prod deploy (API β†’ Services β†’ Web) - -### GitHub Actions gotcha -In `actions/github-script@v7`, `github.rest.issues.createComment` posts plain issue comments on PRs (PRs are issues in GitHub's API). For inline code suggestions on specific files/lines, use `github.rest.pulls.createReview` or `github.rest.pulls.createReviewComment` instead. 
- -### Scheduled (cron) -- `weekly-start.yml` β€” Mon 9am ET (content review, social content, testimonial scan, Perplexity audit, AEO pulse β†’ blog scaffold, Ahrefs digest, Dripify export, prospect discovery β†’ snitcher outreach) -- `weekly-end.yml` β€” Fri 9am ET (demo scorecard + pipeline health) -- `anneal-keywords.yml` β€” Sun 11am ET (keyword annealing + kill pattern updates) -- `g2-review-monitor.yml` β€” Daily 10am ET -- `pagespeed-audit.yml` β€” 1st of month 9am ET (PSI audit β†’ Claude recommendations β†’ PR) -- `daily-ops.yml` β€” Weekdays 10am ET (signal monitor, G2 reviews, review intercept, Slack digest β†’ meeting digest β†’ daily briefing) -- `indexnow-submit.yml` β€” On push to master (voyager pages) + manual (`/run indexnow urls=...`) +## Git Conventions -## Deployments +- Branch prefixes: `feature/`, `fix/`, `chore/` +- Commit format: `type(scope): message` +- Do NOT add `Co-Authored-By` lines to commits +- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots -### Staging (auto on merge to master) -- **API, Web, Services** β€” GCP Cloud Run via Docker (Artifact Registry) -- **Voyager** β€” GCP Cloud Run -- **Docs** β€” Heroku +## Testing Rules -### Production (manual `workflow_dispatch` only) -- Sequential: API β†’ 5min wait β†’ Services β†’ 5min wait β†’ Web -- `gh workflow run deploy-production.yml --ref master` +- **Framework**: Jest + SWC +- **DB mocking**: Use dependency injection (DI), not global mocks +- **Supertest**: Pass `app` (NOT `server`) to supertest +- **Global jest**: src/ tests use global `jest` β€” do NOT import from `@jest/globals` (causes redeclaration errors) +- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` β€” always re-set mocks in `beforeEach` +- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline -## Voyager Content +## ESLint Rules -Blog posts in `voyager/src/content/blog/*.mdx`. See `voyager/CLAUDE.md` for tone of voice, banned words, and content rules. - -Key patterns: -- Blog JSON-LD (BlogPosting) in `voyager/src/modules/blogJsonLd.js` -- FAQ structured data via `faqs` frontmatter array in blog MDX files -- Sitemap auto-includes all posts via `voyager/src/app/sitemap.js` -- Blog scaffold: `voyager/scripts/generate-blog-scaffold.mjs` (or `npm run content:scaffold`) -- Analytics events: `voyager/src/modules/analytics.js` -- Route paths: `voyager/src/utils/locations.js` - -## Style - -### Commits -- Plain imperative sentences, no conventional commit prefixes -- Short and direct β€” describe what, not why - -### Code -- Read before writing. Edit over rewrite. No docs unless asked. -- KISS / YAGNI / SOLID. Under 20 lines per function. -- Comments only for complex logic. No emojis in code. -- When blocked, try an alternative approach before asking. Explain what you tried and why it failed. -- Review your changes against the task requirements before reporting completion. - -## Knowledge Skills (.claude/skills/knowledge/) +- Use `catch {}` not `catch (_err) {}` β€” underscore prefix not in the allowed pattern +- CJS format for JS files in `src/` -Project-specific knowledge skills load automatically when prompts match `activates_on` keywords. They provide current API patterns, SDK versions, and gotchas that prevent hallucination. - -**When to suggest a new skill:** If you encounter a repeatable workflow where you got something wrong (wrong API shape, deprecated pattern, incorrect filter field), suggest creating a knowledge skill for it. 
Format: "This would be a good candidate for a `.claude/skills/knowledge/.skill.md` β€” want me to create one?" +## Key Patterns -Current skills: `postmark-email`, `nextjs-app-router`, `profound-mcp`, `greptile-review`, `tailwind-v4-design`, `rails-graphql-mutations`, `rails-sidekiq-clockwork`, `rails-billing-identity`, `electron-store-ipc`, `chrome-extension`, `blog-hero-images` +- Provenance tracking: every data point includes source, timestamp, lineage +- Multi-tenant container isolation +- DI route factories for testability +- Error handling: return undefined over throwing; log and continue over crashing +- Add `.js` extension to relative ESM imports -## Key Files +## StackMemory Context Rule -- `api/config/database.yml` β€” DB connections (primary + timescale) -- `api/config/sidekiq.yml` β€” Job queues and concurrency -- `api/config/clock.rb` β€” Scheduled jobs (Clockwork) -- `api/Procfile.dev` β€” Dev processes -- `api/app/services/postmark_client.rb` β€” Email delivery (all Postmark goes through here) -- `api/app/services/drip_campaign_config.rb` β€” Drip email templates + required keys -- `voyager/CLAUDE.md` β€” Blog tone, banned words, content rules -- `sol.code-workspace` β€” VS Code workspace -- Each project requires its own `.env` file (not in repo) +- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. +- Prefer the MCP shape: + - `org_id` + - `conversation_id` + - `worker_mode: true` + - `task_query` + - `recover_on_low_signal: true` +- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. +- The current assignment is persisted under `.stackmemory/worker-context/current-assignment.json` so wrappers and hooks can auto-fill or enforce `task_query`. 
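As a minimal TypeScript sketch of the preferred worker-context shape above β€” the helper itself, the assumed `{ "task_query": "..." }` layout of `current-assignment.json`, and the error message are illustrative; only the field names (`org_id`, `conversation_id`, `worker_mode`, `task_query`, `recover_on_low_signal`) come from the rule itself:

```ts
import { readFileSync } from 'fs';
import { join } from 'path';

// Field names follow the rule above; everything else here is illustrative.
interface WorkerContextRequest {
  org_id: string;
  conversation_id: string;
  worker_mode: true;
  task_query: string;
  recover_on_low_signal: true;
}

// Auto-fill task_query from the persisted assignment rather than letting a
// worker fetch raw conversation context with no query.
export function buildWorkerContextRequest(
  repoRoot: string,
  orgId: string,
  conversationId: string
): WorkerContextRequest {
  const assignmentPath = join(
    repoRoot,
    '.stackmemory',
    'worker-context',
    'current-assignment.json'
  );
  // Assumed file shape: { "task_query": "..." } β€” the real schema may differ.
  const assignment = JSON.parse(readFileSync(assignmentPath, 'utf-8')) as {
    task_query?: string;
  };
  if (!assignment.task_query) {
    throw new Error(
      'current-assignment.json is missing task_query β€” refusing to fall back to raw get_conversation context'
    );
  }
  return {
    org_id: orgId,
    conversation_id: conversationId,
    worker_mode: true,
    task_query: assignment.task_query,
    recover_on_low_signal: true,
  };
}
```

Failing loudly when `task_query` is missing mirrors the rule's intent: a worker should never silently fall back to fetching the full transcript.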
From 2f8ed5f67a4fdf250d39ca724ae847877e01914f Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Thu, 16 Apr 2026 15:15:08 -0400 Subject: [PATCH 09/18] fix(conductor): harden lane mode cleanup --- .../commands/__tests__/conductor-lane.test.ts | 198 ++++++++++++ src/cli/commands/orchestrate.ts | 301 +++++++++++++++++- src/cli/commands/orchestrator.ts | 39 ++- 3 files changed, 533 insertions(+), 5 deletions(-) create mode 100644 src/cli/commands/__tests__/conductor-lane.test.ts diff --git a/src/cli/commands/__tests__/conductor-lane.test.ts b/src/cli/commands/__tests__/conductor-lane.test.ts new file mode 100644 index 00000000..6ac7ab7f --- /dev/null +++ b/src/cli/commands/__tests__/conductor-lane.test.ts @@ -0,0 +1,198 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { Command } from 'commander'; +import { mkdtempSync, mkdirSync, rmSync, writeFileSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; + +describe('conductor lane mode', () => { + let tempDir: string; + let sigintBefore: Function[]; + let sigtermBefore: Function[]; + + beforeEach(() => { + tempDir = mkdtempSync(join(tmpdir(), 'sm-conductor-lane-')); + sigintBefore = process.listeners('SIGINT'); + sigtermBefore = process.listeners('SIGTERM'); + vi.resetModules(); + }); + + afterEach(() => { + for (const listener of process.listeners('SIGINT')) { + if (!sigintBefore.includes(listener)) { + process.removeListener('SIGINT', listener); + } + } + for (const listener of process.listeners('SIGTERM')) { + if (!sigtermBefore.includes(listener)) { + process.removeListener('SIGTERM', listener); + } + } + vi.restoreAllMocks(); + vi.resetModules(); + rmSync(tempDir, { recursive: true, force: true }); + }); + + it('skips lane cleanup when worktree cleanliness cannot be verified', async () => { + const execSync = vi.fn((cmd: string) => { + if (cmd === 'git branch --show-current') return 'lane/main\n'; + if (cmd === `git branch --list 'worktree-agent-*'`) { + return ' worktree-agent-123\n'; + } + if ( + cmd === 'git merge-base --is-ancestor "worktree-agent-123" "lane/main"' + ) { + throw Object.assign(new Error('not ancestor'), { status: 1 }); + } + if (cmd === 'git cherry "lane/main" "worktree-agent-123"') return ''; + if (cmd === 'git worktree list --porcelain') { + return [ + 'worktree /tmp/worktree-agent-123', + 'branch refs/heads/worktree-agent-123', + '', + ].join('\n'); + } + if (cmd === 'git -C "/tmp/worktree-agent-123" status --short') { + throw new Error('status unavailable'); + } + throw new Error(`Unexpected execSync: ${cmd}`); + }); + + vi.doMock('child_process', async () => { + const actual = + await vi.importActual('child_process'); + return { ...actual, execSync }; + }); + + const consoleLog = vi.spyOn(console, 'log').mockImplementation(() => {}); + const { createConductorCommands } = await import('../orchestrate.js'); + + const program = new Command(); + program.exitOverride(); + program.addCommand(createConductorCommands()); + + await program.parseAsync([ + 'node', + 'stackmemory', + 'conductor', + 'lane', + 'cleanup', + '--repo', + tempDir, + ]); + + const output = consoleLog.mock.calls + .map((call) => String(call[0])) + .join('\n'); + expect(output).toContain('unknown'); + expect(output).toContain('could not verify clean state'); + expect( + execSync.mock.calls.some(([cmd]) => + String(cmd).includes('git worktree remove') + ) + ).toBe(false); + }); + + it('uses worktrees in auto mode when lane mode is enabled', async () => { + const repoRoot = 
join(tempDir, 'repo');
+    const workspaceRoot = join(tempDir, 'workspaces');
+    const appServerPath = join(tempDir, 'claude-app-server.cjs');
+
+    mkdirSync(join(repoRoot, '.git', 'gitbutler'), { recursive: true });
+    writeFileSync(appServerPath, 'module.exports = {};');
+
+    const execSync = vi.fn((cmd: string) => {
+      if (cmd === 'but --version') return 'gitbutler 1.0.0\n';
+      throw new Error(`Unexpected execSync: ${cmd}`);
+    });
+
+    vi.doMock('child_process', async () => {
+      const actual =
+        await vi.importActual('child_process');
+      return { ...actual, execSync };
+    });
+
+    const { Conductor } = await import('../orchestrator.js');
+    const conductor = new Conductor({
+      activeStates: ['Todo'],
+      terminalStates: ['Done', 'Cancelled'],
+      inProgressState: 'In Progress',
+      inReviewState: 'In Review',
+      pollIntervalMs: 1,
+      maxConcurrent: 1,
+      workspaceRoot,
+      repoRoot,
+      baseBranch: 'main',
+      appServerPath,
+      turnTimeoutMs: 1,
+      maxRetries: 0,
+      hookTimeoutMs: 1,
+      agentMode: 'cli',
+      workspaceMode: 'auto',
+      laneBranch: 'lane/main',
+    });
+
+    (conductor as unknown as Record<string, unknown>).createLinearClient = vi
+      .fn()
+      .mockResolvedValue(null);
+    (conductor as unknown as Record<string, unknown>).cacheWorkflowStates = vi
+      .fn()
+      .mockResolvedValue(undefined);
+    (conductor as unknown as Record<string, unknown>).writeStatusFile = vi.fn();
+    (conductor as unknown as Record<string, unknown>).poll = vi
+      .fn()
+      .mockResolvedValue(undefined);
+    (conductor as unknown as Record<string, unknown>).schedulePoll = vi
+      .fn()
+      .mockResolvedValue(undefined);
+
+    await conductor.start();
+
+    expect(
+      execSync.mock.calls.some(([cmd]) => String(cmd) === 'but --version')
+    ).toBe(false);
+    expect(
+      (conductor as unknown as { useGitButler: boolean }).useGitButler
+    ).toBe(false);
+  });
+
+  it('rejects explicit gitbutler mode when lane mode is enabled', async () => {
+    const repoRoot = join(tempDir, 'repo');
+    const workspaceRoot = join(tempDir, 'workspaces');
+    const appServerPath = join(tempDir, 'claude-app-server.cjs');
+
+    mkdirSync(repoRoot, { recursive: true });
+    writeFileSync(appServerPath, 'module.exports = {};');
+
+    const execSync = vi.fn();
+
+    vi.doMock('child_process', async () => {
+      const actual =
+        await vi.importActual('child_process');
+      return { ...actual, execSync };
+    });
+
+    const { Conductor } = await import('../orchestrator.js');
+    const conductor = new Conductor({
+      activeStates: ['Todo'],
+      terminalStates: ['Done', 'Cancelled'],
+      inProgressState: 'In Progress',
+      inReviewState: 'In Review',
+      pollIntervalMs: 1,
+      maxConcurrent: 1,
+      workspaceRoot,
+      repoRoot,
+      baseBranch: 'main',
+      appServerPath,
+      turnTimeoutMs: 1,
+      maxRetries: 0,
+      hookTimeoutMs: 1,
+      agentMode: 'cli',
+      workspaceMode: 'gitbutler',
+      laneBranch: 'lane/main',
+    });
+
+    await expect(conductor.start()).rejects.toThrow(
+      '--lane is only supported with git worktrees'
+    );
+  });
+});
diff --git a/src/cli/commands/orchestrate.ts b/src/cli/commands/orchestrate.ts
index 405c41d4..af8a34c1 100644
--- a/src/cli/commands/orchestrate.ts
+++ b/src/cli/commands/orchestrate.ts
@@ -1421,6 +1421,300 @@ export function createConductorCommands(): Command {
     }
   });
 
+  // --- lane ---
+  // Inspect / clean up disposable worktree-agent-* branches that have been
+  // merged back into a human-curated lane branch.
+
+  // Check whether <branch> is effectively merged into <lane>.
+  // Two-phase:
+  //  1) Fast: `git merge-base --is-ancestor` β€” catches regular merges + ff.
+  //  2) Fallback: `git cherry` β€” patch-id matching catches squash-merged
+  //     branches, which BOTH `git branch --merged` and `--is-ancestor` miss.
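+  //     (Example: after a squash merge, worktree-agent-123 is not an
+  //     ancestor of lane/main, yet every line of
+  //     `git cherry lane/main worktree-agent-123` starts with `-`,
+  //     so phase 2 still reports it as merged.)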
+ // Conductor's auto-PR flow often ends in a squash, so this matters. + const isMerged = (repo: string, branch: string, lane: string): boolean => { + try { + execSync(`git merge-base --is-ancestor "${branch}" "${lane}"`, { + cwd: repo, + stdio: 'pipe', + timeout: 5000, + }); + return true; + } catch (err) { + const status = (err as { status?: number }).status; + if (status !== 1) throw err; // genuine error, not "false" + // not a direct ancestor β€” fall through to patch-id check + } + try { + const out = execSync(`git cherry "${lane}" "${branch}"`, { + cwd: repo, + encoding: 'utf-8', + timeout: 10000, + }).trim(); + if (!out) return true; // no unique commits (rare non-ancestor case) + const lines = out.split('\n').filter(Boolean); + // `-` = commit's patch is already in lane; `+` = not in lane. + return lines.every((l) => l.startsWith('-')); + } catch { + return false; + } + }; + + // List all worktree-agent-* branches and bucket by ancestry vs lane. + const bucketLaneBranches = ( + repo: string, + lane: string + ): { merged: string[]; unmerged: string[] } => { + let all: string[] = []; + try { + all = execSync(`git branch --list 'worktree-agent-*'`, { + cwd: repo, + encoding: 'utf-8', + timeout: 10000, + }) + .split('\n') + .map((s) => s.trim().replace(/^[*+]\s*/, '')) + .filter(Boolean); + } catch { + // non-fatal β€” no branches found + } + const merged: string[] = []; + const unmerged: string[] = []; + for (const b of all) { + (isMerged(repo, b, lane) ? merged : unmerged).push(b); + } + return { merged, unmerged }; + }; + + // Return worktree cleanliness so cleanup can refuse removal when git status + // cannot prove the tree is clean. + const getWorktreeCleanliness = ( + repo: string, + wtPath: string + ): 'clean' | 'dirty' | 'unknown' => { + try { + const out = execSync(`git -C "${wtPath}" status --short`, { + cwd: repo, + encoding: 'utf-8', + timeout: 5000, + }); + return out.trim().length > 0 ? 
'dirty' : 'clean';
+    } catch {
+      return 'unknown';
+    }
+  };
+
+  const laneCmd = cmd
+    .command('lane')
+    .description(
+      'Inspect or clean up worktree-agent-* branches against a lane'
+    );
+
+  laneCmd
+    .command('status')
+    .description(
+      'List worktree-agent-* branches and their merge state vs the lane'
+    )
+    .option(
+      '--lane <branch>',
+      'Lane branch to compare against (default: current branch)'
+    )
+    .option('--repo <path>', 'Git repo root', process.cwd())
+    .action((options) => {
+      const repo: string = options.repo;
+      const lane: string =
+        options.lane ||
+        execSync('git branch --show-current', {
+          cwd: repo,
+          encoding: 'utf-8',
+        }).trim();
+
+      if (!lane) {
+        console.error(
+          `${c.red}Could not resolve lane branch.${c.r} Pass --lane.`
+        );
+        process.exit(1);
+      }
+
+      const { merged, unmerged } = bucketLaneBranches(repo, lane);
+
+      console.log(`\n  ${c.b}Lane:${c.r} ${c.cyan}${lane}${c.r}\n`);
+      if (merged.length) {
+        console.log(`  ${c.green}merged (${merged.length}):${c.r}`);
+        for (const b of merged) console.log(`    ${c.gray}✓${c.r} ${b}`);
+      }
+      if (unmerged.length) {
+        console.log(`\n  ${c.orange}unmerged (${unmerged.length}):${c.r}`);
+        for (const b of unmerged) console.log(`    ${c.orange}•${c.r} ${b}`);
+      }
+      if (!merged.length && !unmerged.length) {
+        console.log(`  ${c.gray}No worktree-agent-* branches found.${c.r}`);
+      }
+    });
+
+  laneCmd
+    .command('cleanup')
+    .description(
+      'Remove worktree-agent-* branches and worktrees already merged into the lane'
+    )
+    .option(
+      '--lane <branch>',
+      'Lane branch to compare against (default: current branch)'
+    )
+    .option('--repo <path>', 'Git repo root', process.cwd())
+    .option('--dry-run', 'Show what would be removed without doing it', false)
+    .option(
+      '--force',
+      'Remove worktrees even if they have uncommitted changes',
+      false
+    )
+    .action((options) => {
+      const repo: string = options.repo;
+      const lane: string =
+        options.lane ||
+        execSync('git branch --show-current', {
+          cwd: repo,
+          encoding: 'utf-8',
+        }).trim();
+
+      if (!lane) {
+        console.error(
+          `${c.red}Could not resolve lane branch.${c.r} Pass --lane.`
+        );
+        process.exit(1);
+      }
+
+      const { merged } = bucketLaneBranches(repo, lane);
+
+      if (merged.length === 0) {
+        console.log(
+          `${c.green}Nothing to clean up for lane ${c.cyan}${lane}${c.r}.`
+        );
+        return;
+      }
+
+      // Build map of branch → worktree path (if any)
+      const worktreeMap = new Map<string, string>();
+      try {
+        const wt = execSync('git worktree list --porcelain', {
+          cwd: repo,
+          encoding: 'utf-8',
+          timeout: 10000,
+        });
+        let currentPath = '';
+        for (const line of wt.split('\n')) {
+          if (line.startsWith('worktree ')) {
+            currentPath = line.slice('worktree '.length).trim();
+          } else if (line.startsWith('branch ')) {
+            const ref = line.slice('branch '.length).trim();
+            const b = ref.replace(/^refs\/heads\//, '');
+            if (currentPath) worktreeMap.set(b, currentPath);
+          }
+        }
+      } catch {
+        // non-fatal; fall back to branch-only cleanup
+      }
+
+      console.log(
+        `\n  ${c.b}Lane:${c.r} ${c.cyan}${lane}${c.r} ${c.d}(${merged.length} merged branches)${c.r}\n`
+      );
+
+      let skippedDirty = 0;
+      let skippedUnknown = 0;
+      for (const branch of merged) {
+        const wtPath = worktreeMap.get(branch);
+        const cleanliness = wtPath
+          ? getWorktreeCleanliness(repo, wtPath)
+          : 'unknown';
+        const dirty = cleanliness === 'dirty';
+
+        // Refuse to destroy dirty worktrees without --force; this prevents
+        // racing with a still-working agent and losing uncommitted work.
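+        // (A worktree counts as dirty when `git status --short` prints
+        // anything at all, e.g. " M src/foo.ts" or "?? scratch/"; untracked
+        // files block cleanup too.)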
+        if (dirty && !options.force) {
+          skippedDirty++;
+          console.log(
+            `  ${c.orange}⚠${c.r} ${c.orange}dirty${c.r} ${wtPath} ${c.d}(skipping ${branch}; pass --force to remove)${c.r}`
+          );
+          continue;
+        }
+
+        // If git status could not prove the worktree is clean, refuse cleanup
+        // unless the operator explicitly opts into forceful removal.
+        if (wtPath && cleanliness === 'unknown' && !options.force) {
+          skippedUnknown++;
+          console.log(
+            `  ${c.orange}⚠${c.r} ${c.orange}unknown${c.r} ${wtPath} ${c.d}(skipping ${branch}: could not verify clean state; pass --force to remove)${c.r}`
+          );
+          continue;
+        }
+
+        const dirtyTag = dirty ? ` ${c.orange}[dirty]${c.r}` : '';
+        const unknownTag =
+          cleanliness === 'unknown' ? ` ${c.orange}[unknown]${c.r}` : '';
+        const action = wtPath
+          ? `${c.gray}worktree remove${c.r} ${wtPath}${dirtyTag}${unknownTag}
+     ${c.gray}branch -D${c.r} ${branch}`
+          : `${c.gray}branch -D${c.r} ${branch}`;
+        console.log(`  ${c.green}✓${c.r} ${action}`);
+
+        if (options.dryRun) continue;
+
+        if (wtPath) {
+          // --force on `git worktree remove` is always passed because we
+          // already decided above whether dirty state is tolerable.
+          try {
+            execSync(`git worktree remove "${wtPath}" --force`, {
+              cwd: repo,
+              stdio: 'pipe',
+              timeout: 15000,
+            });
+          } catch (err) {
+            console.log(
+              `    ${c.red}worktree remove failed:${c.r} ${(err as Error).message}`
+            );
+          }
+        }
+        try {
+          execSync(`git branch -D "${branch}"`, {
+            cwd: repo,
+            stdio: 'pipe',
+            timeout: 5000,
+          });
+        } catch (err) {
+          console.log(
+            `    ${c.red}branch -D failed:${c.r} ${(err as Error).message}`
+          );
+        }
+      }
+
+      if (!options.dryRun) {
+        try {
+          execSync('git worktree prune', {
+            cwd: repo,
+            stdio: 'pipe',
+            timeout: 10000,
+          });
+        } catch {
+          // non-fatal
+        }
+        const skipNote = skippedDirty
+          ? ` ${c.orange}(${skippedDirty} dirty skipped; pass --force)${c.r}`
+          : '';
+        const unknownNote = skippedUnknown
+          ? ` ${c.orange}(${skippedUnknown} unknown skipped; pass --force)${c.r}`
+          : '';
+        console.log(`\n  ${c.green}Done.${c.r}${skipNote}${unknownNote}`);
+      } else {
+        const skipNote = skippedDirty
+          ? ` ${c.orange}(${skippedDirty} dirty would be skipped; pass --force to include)${c.r}`
+          : '';
+        const unknownNote = skippedUnknown
+          ? ` ${c.orange}(${skippedUnknown} unknown would be skipped; pass --force to include)${c.r}`
+          : '';
+        console.log(
+          `\n  ${c.d}Dry run: no changes made. Remove --dry-run to execute.${c.r}${skipNote}${unknownNote}`
+        );
+      }
+    });
+
   // --- logs ---
   cmd
     .command('logs')
@@ -2050,6 +2344,10 @@
         'Workspace mode: "auto" (detect GitButler), "gitbutler", or "worktree"',
         'auto'
       )
+      .option(
+        '--lane <branch>',
+        'Optional lane branch. When set, conductor uses git worktrees rooted at the lane (worktree-agent-* branches) and suppresses PRs (lane is human-curated).'
+      )
       .action(async (options) => {
         // Ensure default prompt template exists on first start
         ensureDefaultPromptTemplate();
@@ -2069,8 +2367,9 @@
         turnTimeoutMs: parseInt(options.turnTimeout, 10),
         agentMode: options.mode === 'adapter' ? 'adapter' : 'cli',
         model: options.model,
-        autoPR: options.pr,
+        autoPR: options.lane ? 
false : options.pr,
         workspaceMode: options.workspaceMode,
+        laneBranch: options.lane,
       });
 
       await conductor.start();
diff --git a/src/cli/commands/orchestrator.ts b/src/cli/commands/orchestrator.ts
index 396b2711..cf1c1ab3 100644
--- a/src/cli/commands/orchestrator.ts
+++ b/src/cli/commands/orchestrator.ts
@@ -88,6 +88,13 @@ export interface ConductorConfig {
   autoPR?: boolean;
   /** Workspace mode: 'auto' (detect GitButler), 'gitbutler', or 'worktree' (default: 'auto') */
   workspaceMode?: 'auto' | 'gitbutler' | 'worktree';
+  /**
+   * Optional lane branch (e.g. "feature/STA-500-retrieval"). When set:
+   *  - agent worktrees branch from laneBranch (not baseBranch)
+   *  - branch naming uses "worktree-agent-*" (not "conductor/*")
+   *  - autoPR is suppressed; merge target is the lane, curated by a human
+   */
+  laneBranch?: string;
 }
 
 export interface RunningIssue {
@@ -758,9 +765,18 @@
     }
   }
 
-    // Detect workspace mode: GitButler virtual branches or git worktrees
     const wsMode = this.config.workspaceMode || 'auto';
-    if (wsMode === 'gitbutler' || wsMode === 'auto') {
+    const laneMode = Boolean(this.config.laneBranch);
+    if (laneMode && wsMode === 'gitbutler') {
+      throw new Error(
+        '--lane is only supported with git worktrees. Use --workspace-mode worktree or omit --workspace-mode.'
+      );
+    }
+
+    // Detect workspace mode: GitButler virtual branches or git worktrees.
+    // Lane mode always uses worktrees because branches are created from the
+    // lane itself and later inspected/cleaned via `conductor lane`.
+    if (!laneMode && (wsMode === 'gitbutler' || wsMode === 'auto')) {
       try {
         const butVersion = execSync('but --version', {
           cwd: this.config.repoRoot,
@@ -785,6 +801,10 @@
         }
         // auto mode: fall through to worktrees
       }
+    } else if (laneMode) {
+      logger.info('Lane mode enabled; using git worktrees', {
+        laneBranch: this.config.laneBranch,
+      });
     }
 
     // Ensure workspace root exists (only needed for worktree mode)
@@ -1747,6 +1767,12 @@
   }
 
   private createGitButlerBranch(issue: LinearIssue, wsKey: string): string {
+    if (this.config.laneBranch) {
+      throw new Error(
+        'Lane mode requires git worktrees; GitButler virtual branches are not supported.'
+      );
+    }
+
     const branchName = `conductor/${wsKey}`;
 
     try {
@@ -1795,7 +1821,11 @@
       return wsPath;
     }
 
-    const branchName = `conductor/${wsKey}`;
+    // Lane mode: subagents get disposable worktree-agent-* branches
+    // rooted at the lane branch. Base mode: conductor/* branches off baseBranch.
+    const lane = this.config.laneBranch;
+    const branchName = lane ? `worktree-agent-${wsKey}` : `conductor/${wsKey}`;
+    const startPoint = lane ? lane : `origin/${this.config.baseBranch}`;
 
     try {
       execSync('git fetch origin', {
@@ -1805,7 +1835,7 @@
       });
 
       execSync(
-        `git worktree add "${wsPath}" -b "${branchName}" "origin/${this.config.baseBranch}"`,
+        `git worktree add "${wsPath}" -b "${branchName}" "${startPoint}"`,
         {
           cwd: this.config.repoRoot,
           stdio: 'pipe',
@@ -1817,6 +1847,7 @@
         identifier: issue.identifier,
         path: wsPath,
         branch: branchName,
+        lane: lane || null,
       });
     } catch (err) {
       try {

From 079b395fcba5280b122c9c20087f9b0fad890b0f Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Thu, 16 Apr 2026 19:40:11 -0400
Subject: [PATCH 10/18] chore: reorganize root for clarity

Consolidate duplicate docs, relocate stray files, and tighten .gitignore
for agent scratch dirs.
- Move SPEC.md, RELEASE_NOTES.md, tomorrow.md, vision.md to docs/
  (replacing stale docs/ copies with the up-to-date root versions)
- Move mcp_review_config.json to config/
- Untrack .lint-fix-log.json (ephemeral lint artifact)
- Delete stale .tsbuildinfo-* and .lint-errors.log
- Ignore agent scratch dirs (.ralph/, .swarm/, .bjarne/, .entire/,
  .opencode/, .git.backup/) and local trees (archive/, site/, voyager/,
  plugins/)
- Update README.md Vision link to docs/vision.md
---
 .gitignore                       |   19 +
 .lint-fix-log.json               |   22 -
 README.md                        |    2 +-
 RELEASE_NOTES.md                 |   80 --
 SPEC.md                          | 1247 -----------------
 .../mcp_review_config.json       |    0
 docs/RELEASE_NOTES.md            |    2 +-
 docs/SPEC.md                     |  528 ++++++-
 tomorrow.md => docs/tomorrow.md  |    0
 vision.md => docs/vision.md      |    0
 10 files changed, 519 insertions(+), 1381 deletions(-)
 delete mode 100644 .lint-fix-log.json
 delete mode 100644 RELEASE_NOTES.md
 delete mode 100644 SPEC.md
 rename mcp_review_config.json => config/mcp_review_config.json (100%)
 rename tomorrow.md => docs/tomorrow.md (100%)
 rename vision.md => docs/vision.md (100%)

diff --git a/.gitignore b/.gitignore
index 85b0c6f6..9b63e497 100644
--- a/.gitignore
+++ b/.gitignore
@@ -135,3 +135,22 @@ scripts/gepa/results/scores.jsonl
 scripts/gepa/state.json
 scripts/gepa/results/
 scripts/gepa/generations/
+
+# Agent tool working dirs (untracked, per-tool scratch)
+.ralph/
+.swarm/
+.bjarne/
+.entire/
+.opencode/
+
+# Local backups and lint artifacts
+.git.backup/
+.lint-errors.log
+.lint-fix-log.json
+.lint-fix-log.*.json
+
+# Local scratch / generated trees
+archive/
+site/
+voyager/
+plugins/
diff --git a/.lint-fix-log.json b/.lint-fix-log.json
deleted file mode 100644
index a227e230..00000000
--- a/.lint-fix-log.json
+++ /dev/null
@@ -1,22 +0,0 @@
-[
-  {
-    "timestamp": "2026-01-05T19:42:12.135Z",
-    "level": "info",
-    "message": "🔧 Starting auto-fix loop..."
-  },
-  {
-    "timestamp": "2026-01-05T19:42:12.138Z",
-    "level": "info",
-    "message": "📝 Auto-fix attempt 1/3"
-  },
-  {
-    "timestamp": "2026-01-05T19:42:15.745Z",
-    "level": "info",
-    "message": "Running ESLint auto-fix..."
-  },
-  {
-    "timestamp": "2026-01-05T19:42:21.910Z",
-    "level": "success",
-    "message": "✅ All fixable lint errors resolved! (warnings are ok for commits)"
-  }
-]
\ No newline at end of file
diff --git a/README.md b/README.md
index eca2b11c..74a1676e 100644
--- a/README.md
+++ b/README.md
@@ -423,7 +423,7 @@ Options: `--until`, `--until-not`, `--until-empty`, `--until-non-empty`, `--unti
 - [Development Guide](./docs/DEVELOPMENT.md) - Contributing and development
 - [Architecture](./docs/architecture.md) - System design
 - [API Reference](./docs/API_REFERENCE.md) - API documentation
-- [Vision](./vision.md) - Product vision and principles
+- [Vision](./docs/vision.md) - Product vision and principles
 - [Status](./docs/status.md) - Current project status
 - [Roadmap](./docs/roadmap.md) - Future plans

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
deleted file mode 100644
index bba8bb97..00000000
--- a/RELEASE_NOTES.md
+++ /dev/null
@@ -1,80 +0,0 @@
-# Release Notes - v0.2.8
-
-## LLM-Driven Context Retrieval System (STA-95)
-
-This release introduces intelligent context retrieval that uses LLM analysis to select the most relevant frames for any query.
- -### New Features - -#### Smart Context Retrieval (`smart_context` MCP tool) - -- **Natural language queries**: Ask for context in plain English -- **LLM-driven analysis**: Intelligently selects relevant frames based on query semantics -- **Token budget management**: Stays within specified token limits -- **Auditable reasoning**: Every retrieval decision is explained -- **Heuristic fallback**: Works even without LLM provider - -#### Compressed Memory Summary (`get_summary` MCP tool) - -- **Recent session summary**: Frames, operations, files touched, errors -- **Historical patterns**: Topic counts, key decisions, recurring issues -- **Queryable indices**: By error, time, contributor, topic, file -- **Summary statistics**: Frame counts, event counts, anchor totals - -### Architecture - -``` -context_retrieval: - compressed_summary: - recent_session: frames, operations, files, errors - historical_patterns: topic counts, key decisions, recurring issues - queryable_indices: by error, timeframe, contributor - - llm_analysis: - inputs: current_query, compressed_summary, token_budget - output: reasoning (auditable), frames_to_retrieve, confidence_score -``` - -### New MCP Tools - -| Tool | Description | -| --------------- | -------------------------------------------------------- | -| `smart_context` | LLM-driven context retrieval with natural language query | -| `get_summary` | Compressed summary of project memory | - -### Other Changes - -- **Trace Detection**: Improved persistence and bundling -- **Model-Aware Compaction**: Handlers for context window management -- **Linear Sync**: Enhanced sync manager for Linear integration -- **Query Parser**: Extended natural language query parsing - -### Files Added - -- `src/core/retrieval/` - Complete retrieval system - - `types.ts` - Type definitions - - `summary-generator.ts` - Compressed summary generation - - `llm-context-retrieval.ts` - Main retrieval orchestrator - - `index.ts` - Module exports -- `src/core/context/compaction-handler.ts` - Autocompaction detection -- `src/core/context/model-aware-compaction.ts` - Model-specific handling -- `src/core/trace/trace-store.ts` - Trace persistence -- `src/integrations/linear/sync-manager.ts` - Enhanced Linear sync - -## Installation - -```bash -npm install -g @stackmemoryai/stackmemory@0.2.8 -``` - -## Usage - -```bash -# In Claude Desktop or MCP client: -smart_context "What did we work on related to authentication?" -get_summary -``` - ---- - -_Built with LLM-driven context retrieval_ diff --git a/SPEC.md b/SPEC.md deleted file mode 100644 index 41796aaf..00000000 --- a/SPEC.md +++ /dev/null @@ -1,1247 +0,0 @@ -# StackMemory Specification v1.0 - -## Executive Summary - -StackMemory is a **lossless, project-scoped memory runtime** for AI coding and writing tools that preserves full project context across sessions using a call stack metaphor instead of linear chat logs. It organizes memory as nested frames with smart retrieval, enabling AI tools to maintain context across thread resets, model switches, and long-running projects. - -## Core Architecture - -### 1. 
Memory Model - -#### 1.1 Frame Stack Structure -```yaml -memory_model: - structure: "call_stack" # Not linear chat log - max_depth: 10000 # Maximum frames in stack - retention: 30_days # Local retention window - storage: - local: "SQLite" # Fast local storage - remote: "TimeSeries DB + S3" # Infinite remote storage -``` - -#### 1.2 Frame Composition -```yaml -frame: - metadata: - id: "uuid" - title: "descriptive_name" - type: "task|debug|feature|architecture" - owner: "user_id" - created: "timestamp" - - contents: - events: [] # Tool calls, messages, observations - anchors: [] # Decisions, constraints, interfaces - digest: {} # 60% deterministic, 40% AI-generated summary - score: 0.0-1.0 # Importance score for retention -``` - -### 2. Storage Architecture - -#### 2.1 Two-Tier Storage System -```yaml -local_storage: - young: - age: "< 1 day" - retention: "complete" # Full events, all tool calls - memory_strategy: "hot" # RAM for instant access - compression: "none" - - mature: - age: "1-7 days" - retention: "selective" # Digests + anchors + high-score events - memory_strategy: "warm" # SQLite with memory cache - compression: "lz4" - score_threshold: 0.4 - - old: - age: "7-30 days" - retention: "critical" # Anchors + decisions only - memory_strategy: "cold" # SQLite, no cache - compression: "zstd" - score_threshold: 0.7 - - max_size: 2GB - overflow_strategy: "promote_to_remote" - -remote_storage: - retention: "infinite" - indexing: - primary: "timeseries" # ClickHouse/TimescaleDB - secondary: "inverted" # Elasticsearch - graph: "relationships" # Neo4j for frame deps - - retrieval: - cache_layer: "redis" - p50_latency: 50ms - p99_latency: 500ms - prefetch: true - - cost_model: - storage: "$0.02/GB/month" - retrieval: "$0.0004/1000_reads" - - tiers: - hot: "< 7 days" # S3 Standard - warm: "7-90 days" # S3 Standard-IA - cold: "> 90 days" # S3 Glacier - archive: "> 1 year" # Glacier Deep Archive -``` - -#### 2.2 Migration Strategy -```yaml -local_to_remote_migration: - triggers: - age_based: - schedule: "0 */6 * * *" # Every 6 hours - migrate_after: 24h - - size_pressure: - soft_limit: 75% # Start migration - hard_limit: 90% # Force migration - strategy: "lowest_score_first" - - importance_based: - score_thresholds: - "< 0.3": 2h # Low importance - "< 0.5": 12h # Medium - "< 0.7": 24h # High - ">= 0.7": 7d # Critical - - upload_strategy: - mode: "hybrid" - continuous_streaming: - for_events: ["decision", "constraint", "api_change"] - latency: "< 1 minute" - - batch_upload: - for_events: ["tool_call", "observation", "message"] - batch_size: 100 - interval: 300s - compression: true - - smart_batching: - group_by: "frame" - wait_for_frame_close: true - max_wait: 1h -``` - -### 3. 
Importance Scoring System - -#### 3.1 Tool Call Scoring (Deterministic) -```yaml -tool_scores: - # Discovery & Intelligence (0.8-1.0) - search: 0.95 # Finding context/code - task_creation: 0.90 # Planning work - decision_recording: 0.90 # Architectural choices - context_retrieval: 0.85 # Loading memory - - # Structural Changes (0.6-0.8) - write_new_file: 0.75 - major_refactor: 0.70 - api_change: 0.70 - - # Modifications (0.3-0.6) - edit: 0.50 - test: 0.45 - bash_execution: 0.40 - - # Simple Reads (0.1-0.3) - read: 0.25 - ls: 0.20 - grep: 0.15 # Simple pattern matching -``` - -#### 3.2 Scoring Formula -```yaml -scoring: - formula: | - score = (base_score * weights.base) + - (impact_multiplier * weights.impact) + - (persistence_bonus * weights.persistence) + - (reference_count * weights.reference) - - weights: - configurable: true # Per-project tuning - defaults: - base: 0.4 - impact: 0.3 - persistence: 0.2 - reference: 0.1 - - profiles: - security_focused: - impact: 0.5 # Changes matter more - exploration_heavy: - reference: 0.5 # Discovery paths matter - production_system: - persistence: 0.3 # Permanent changes critical -``` - -### 4. Smart Context Retrieval - -#### 4.1 LLM-Driven Retrieval -```yaml -context_retrieval: - compressed_summary: - # Provided to LLM for analysis - recent_session: - frames: 15 - dominant_operations: [] - files_touched: [] - errors_encountered: [] - - historical_patterns: - topic_frame_counts: {} - key_decisions: [] - recurring_issues: [] - - queryable_indices: - by_error_type: {} - by_timeframe: {} - by_contributor: {} - - llm_analysis: - inputs: - - current_query - - compressed_summary - - token_budget - - output: - reasoning: "visible/auditable" - frames_to_retrieve: [] - confidence_score: 0.0-1.0 - - generation: - when: "on_demand" # Not pre-computed - visibility: "settings/on_request" # Auditable -``` - -#### 4.2 Query Language - -##### 4.2.1 Natural Language Queries -```yaml -nlp_queries: - time_based: - - "provide context from the last day" - - "show me what happened yesterday" - - "get all work from December 15-20" - - "what did Alice work on last week" - - topic_based: - - "find all authentication work" - - "show database migration frames" - - "get frames about the login bug" - - "what decisions were made about caching" - - combined: - - "show Alice's auth work from last week" - - "get high-priority bug fixes from yesterday" - - "find security decisions in the last month" -``` - -##### 4.2.2 Structured Query Format -```typescript -interface StackMemoryQuery { - // Time filters - time?: { - last?: string; // "1d", "3h", "1w", "2m" - since?: Date; // ISO timestamp - until?: Date; - between?: [Date, Date]; - specific?: Date; // Exact date - }; - - // Content filters - content?: { - topic?: string[]; // ["auth", "database"] - files?: string[]; // ["src/*.ts", "tests/*"] - errors?: string[]; // ["timeout", "null pointer"] - tools?: string[]; // ["search", "edit", "test"] - }; - - // Frame filters - frame?: { - type?: FrameType[]; // ["bug", "feature", "refactor"] - status?: Status[]; // ["open", "closed", "stalled"] - score?: { - min?: number; // 0.0-1.0 - max?: number; - }; - depth?: { - min?: number; // Stack depth - max?: number; - }; - }; - - // People filters - people?: { - owner?: string[]; // ["alice", "bob"] - contributors?: string[]; - team?: string; // "backend-team" - }; - - // Output control - output?: { - limit?: number; // Max frames to return - sort?: SortBy; // "time" | "score" | "relevance" - include?: string[]; // ["digests", "events", 
"anchors"] - format?: Format; // "full" | "summary" | "ids" - }; -} -``` - -##### 4.2.3 Query Examples -```typescript -// Last day's context -{ - time: { last: "1d" }, - output: { format: "summary" } -} - -// High-importance auth work -{ - content: { topic: ["auth", "oauth"] }, - frame: { score: { min: 0.7 } }, - output: { sort: "score", limit: 20 } -} - -// Team's recent critical work -{ - time: { last: "3d" }, - people: { team: "backend-team" }, - frame: { score: { min: 0.8 } }, - output: { sort: "time" } -} -``` - -##### 4.2.4 Hybrid Query Syntax -```bash -# Command-line style -stackmemory query "auth work" --since="2024-12-20" --owner=alice - -# Inline modifiers -"show auth work @alice #high-priority since:yesterday depth:10" - -# Template style -"context from {time.last=1d} about {topic=authentication}" -``` - -##### 4.2.5 Query Shortcuts -```yaml -shortcuts: - # Time shortcuts - "today": { time: { last: "24h" } } - "yesterday": { time: { between: ["yesterday 00:00", "yesterday 23:59"] } } - "this week": { time: { last: "7d" } } - - # Topic shortcuts - "bugs": { frame: { type: ["bug", "error", "fix"] } } - "features": { frame: { type: ["feature", "enhancement"] } } - "critical": { frame: { score: { min: 0.8 } } } - - # Workflow shortcuts - "my work": { people: { owner: ["$current_user"] } } - "team work": { people: { team: "$current_team" } } - "recent": { time: { last: "4h" } } -``` - -##### 4.2.6 Query Response Format -```typescript -interface QueryResponse { - query: { - original: string; // User's input - interpreted: Query; // Parsed query - expanded: Query; // After expansion - }; - - results: { - frames: Frame[]; // Matching frames - count: number; // Total matches - score: number; // Query confidence - }; - - metadata: { - execution_time: number; // ms - tokens_used: number; - cache_hit: boolean; - }; - - suggestions: { - refine: string[]; // "Try adding time filter" - related: string[]; // "See also: auth decisions" - }; -} -``` - -#### 4.3 Trace Bundling -```yaml -trace_detection: - definition: "Chain of related tool calls" - - boundaries: - time_proximity: 30s # Tools within 30 seconds - same_target: true # Same file/directory - causal_relationship: true # Error β†’ fix β†’ test - - compression: - strategy: "single_trace" # Bundle as one unit - scoring: "max(all_tools)" # Use highest score - - example: - raw: "Search β†’ Read(10) β†’ Edit(3) β†’ Test β†’ Fix β†’ Test" - compressed: "Fixed auth bug via search-driven refactor [0.95]" -``` - -### 5. Garbage Collection - -#### 5.1 Incremental GC Strategy -```yaml -garbage_collection: - type: "incremental" # Avoid stop-the-world - - process: - frames_per_cycle: 100 # Process in chunks - cycle_interval: 60s # Every minute - - generational: - young: "< 1 day" - mature: "1-7 days" - old: "7-30 days" - - priorities: - protect: - - current_session - - pinned_frames - - unsynced_changes - - high_score_frames - - evict_first: - - low_score_frames - - orphaned_frames - - duplicate_traces -``` - -### 6. Digest Generation - -#### 6.1 Hybrid Approach (60/40) -```yaml -digest_generation: - deterministic: 60% # Reliable extraction - ai_generated: 40% # AI-generated summary - - deterministic_fields: - - files_modified - - tests_run - - errors_encountered - - tool_call_count - - duration - - exit_status - - ai_generated_fields: - - summary # 1-2 sentences - - key_decisions - - learned_insights - - next_steps - - processing: - when: "batch_during_idle" # Not immediate - max_tokens: 200 - fallback: "deterministic_only" -``` - -### 7. 
Team Collaboration - -#### 7.1 Dual Stack Architecture -```yaml -stack_types: - individual: - owner: "single_user" - visibility: "private" - can_promote: true - - shared: - team: "team_id" - visibility: "team" - participants: [] - handoff_enabled: true - - interaction: - promote: "individual β†’ shared" - fork: "shared β†’ individual" - merge: "individual β†’ shared" - handoff: "alice β†’ bob" -``` - -#### 7.2 Frame Ownership -```yaml -frame_ownership: - creator: "original_author" - contributors: [] - last_active: "current_user" - - permissions: - read: "team" - continue: "team" - close: "owner_or_admin" - delete: "owner_only" - - handoff: - explicit: "transfer_command" - implicit: "continue_working" - timeout: "idle_24h" -``` - -### 8. Configuration System - -#### 8.1 Configuration File -```yaml -# .stackmemory/config.yaml -version: 1.0 - -scoring: - weights: - base: 0.4 - impact: 0.3 - persistence: 0.2 - reference: 0.1 - - tool_scores: - # Custom overrides - custom_tool: 0.75 - -retention: - local: - young: 1d - mature: 7d - old: 30d - max_size: 2GB - - remote: - enabled: true - retention: infinite - -performance: - max_stack_depth: 10000 - retrieval_timeout_ms: 500 - -profiles: - environment: "production" -``` - -#### 8.2 Configuration Validation -```bash -$ stackmemory config validate - -validation_checks: - - syntax_validation - - semantic_validation - - performance_analysis - - compatibility_check - - environment_verification - -output: - errors: [] - warnings: [] - suggestions: [] - auto_fix_available: true -``` - -### 9. MCP Integration - -#### 9.1 Available Tools -```yaml -mcp_tools: - # Context Management - - get_context # Smart retrieval with LLM - - add_decision # Record decisions - - start_frame # Begin new frame - - close_frame # Close with digest - - # Task Management - - create_task - - update_task_status - - get_active_tasks - - get_task_metrics - - # Linear Integration - - linear_sync - - linear_update_task - - linear_get_tasks - - # Analytics - - get_metrics - - get_frame_history - - search_frames -``` - -#### 9.2 Context Bundle Format -```json -{ - "compressed_summary": { - "recent_activity": {}, - "historical_patterns": {}, - "statistics": {} - }, - "hot_frames": [], - "relevant_anchors": [], - "query_endpoints": { - "deep_search": "endpoint", - "replay_session": "endpoint", - "get_specific_frames": "endpoint" - } -} -``` - -### 10. Security & Privacy - -#### 10.1 Secret Detection -```yaml -secret_detection: - patterns: - - api_keys: "regex_patterns" - - passwords: "regex_patterns" - - tokens: "regex_patterns" - - custom: "user_defined" - - action: - detection: "real_time" - handling: "redact" # Not block - notification: "warn_user" - - storage: - hashed: true - reversible: false -``` - -#### 10.2 Privacy Controls -```yaml -privacy: - data_residency: "configurable" - encryption: - at_rest: "AES-256" - in_transit: "TLS 1.3" - - retention: - deletion_on_request: true - audit_trail: "maintained" - - sharing: - default: "private" - team_opt_in: true - org_visibility: "admin_only" -``` - -### 11. Performance Targets - -#### 11.1 SLAs -```yaml -performance_slas: - retrieval: - p50: 50ms - p95: 200ms - p99: 500ms - - storage: - write_throughput: "10K events/sec" - batch_upload: "100MB/min" - - availability: - uptime: "99.9%" - data_durability: "99.999999999%" # 11 nines - - scale: - max_frames: 10000 - max_events_per_frame: 5000 - max_storage_per_project: "unlimited" -``` - -### 12. 
Advanced Memory Patterns - -#### 12.1 Episodic Memory System -```yaml -episodic_memory: - definition: "Capture and reuse past agent experiences" - - episode_structure: - trigger: "significant_event" # Decision, error, breakthrough - context_snapshot: - - pre_state # State before episode - - action_sequence # Tools and decisions - - outcome # Result and impact - - learned_pattern # Extracted insight - - retrieval_strategy: - similarity_matching: - current_context: true - embedding_distance: "cosine" - threshold: 0.85 - - temporal_relevance: - recent_weight: 0.7 - historical_weight: 0.3 - - injection_mechanism: - when: "similar_context_detected" - format: "Past episode: {summary} led to {outcome}" - max_episodes: 3 -``` - -#### 12.2 Memory Synthesis from Execution Logs -```yaml -log_synthesis: - pattern_extraction: - frequency_analysis: - - common_error_sequences - - repeated_tool_patterns - - decision_reversals - - causality_detection: - error_to_fix_chains: true - search_to_discovery: true - test_to_refactor: true - - synthesis_output: - workflow_patterns: - - "Search β†’ Read β†’ Edit β†’ Test β†’ Fix" - - "Error β†’ Analyze β†’ Search β†’ Solution" - - anti_patterns: - - "Repeated failed attempts" - - "Circular dependencies" - - optimization_opportunities: - - "Batch similar operations" - - "Cache frequent queries" -``` - -### 13. Feedback Loop Architecture - -#### 13.1 Reflection Loop Pattern -```yaml -reflection_loop: - trigger_conditions: - - frame_completion - - significant_error - - milestone_reached - - context_switch - - reflection_process: - analyze: - - what_worked: "successful patterns" - - what_failed: "error patterns" - - alternative_approaches: "unexplored paths" - - synthesize: - key_insights: [] - patterns_identified: [] - improvements_suggested: [] - - persist: - to_anchors: true # Save as decisions - to_digest: true # Include in summary - score_boost: 0.2 # Important for learning -``` - -#### 13.2 Self-Critique Evaluation System -```yaml -self_critique: - evaluation_dimensions: - code_quality: - - correctness: "Does it work?" - - efficiency: "Is it optimal?" - - maintainability: "Is it clean?" - - decision_quality: - - rationale: "Was reasoning sound?" - - alternatives: "Were options considered?" - - evidence: "Was it data-driven?" - - process_quality: - - methodology: "Was approach systematic?" - - tool_usage: "Were tools used effectively?" - - time_management: "Was effort proportional?" - - critique_storage: - attach_to_frame: true - influence_scoring: true - guide_future_retrieval: true - - continuous_improvement: - track_critique_patterns: true - adjust_weights_based_on_outcomes: true - share_learnings_across_team: true -``` - -#### 13.3 Rich Feedback Integration -```yaml -feedback_sources: - automated: - - test_results - - linting_output - - performance_metrics - - security_scans - - human: - - code_review_comments - - user_satisfaction - - explicit_feedback - - environmental: - - build_success_rate - - deployment_outcomes - - production_incidents - - integration: - collection: "multi_channel" - correlation: "cross_reference" - weight_by_reliability: true - - feedback_to_memory: - positive: "boost_frame_score" - negative: "annotate_with_lessons" - neutral: "record_for_pattern" -``` - -### 14. 
Context Optimization Strategies - -#### 14.1 Context Minimization Pattern -```yaml -context_minimization: - strategies: - intelligent_filtering: - remove_redundant: true - compress_similar: true - prioritize_relevant: true - - hierarchical_summarization: - detail_levels: - - full: "complete events" - - medium: "key operations" - - summary: "outcomes only" - - dynamic_windowing: - expand_on: "high_relevance" - contract_on: "low_relevance" - adaptive_sizing: true - - benefits: - reduced_token_usage: "40-60%" - faster_processing: true - clearer_focus: true -``` - -#### 14.2 Dynamic Context Injection -```yaml -dynamic_injection: - triggers: - - context_switch_detected - - new_error_type - - unfamiliar_codebase_area - - performance_degradation - - injection_sources: - - relevant_documentation - - similar_past_solutions - - team_knowledge_base - - external_references - - injection_timing: - just_in_time: true # Right before needed - predictive: true # Anticipate needs - on_demand: true # User requested - - injection_format: - inline_hints: "minimal disruption" - sidebar_context: "additional detail" - full_frame: "comprehensive context" -``` - -#### 14.3 Context Window Anxiety Management -```yaml -anxiety_management: - monitoring: - track_usage: "continuous" - alert_threshold: 70% - critical_threshold: 90% - - mitigation_strategies: - progressive_compression: - - summarize_old_frames - - drop_low_score_events - - archive_to_retrieval - - selective_loading: - - load_only_relevant - - defer_deep_history - - use_pointers_not_content - - smart_truncation: - preserve: "decisions_and_outcomes" - truncate: "intermediate_steps" - compress: "repetitive_patterns" -``` - -### 15. Tool Orchestration Patterns - -#### 15.1 Progressive Tool Discovery -```yaml -tool_discovery: - learning_progression: - basic: ["read", "write", "search"] - intermediate: ["edit", "test", "analyze"] - advanced: ["refactor", "optimize", "architect"] - - discovery_mechanism: - observation: "watch_usage_patterns" - suggestion: "recommend_when_relevant" - education: "explain_tool_benefits" - - tool_introduction: - gradual: true - context_appropriate: true - with_examples: true -``` - -#### 15.2 Conditional Parallel Execution -```yaml -parallel_execution: - conditions: - can_parallelize: - - independent_files - - different_subsystems - - non_conflicting_operations - - must_serialize: - - dependent_changes - - shared_resources - - ordered_operations - - orchestration: - plan: "identify_parallelizable" - execute: "batch_similar_operations" - synchronize: "merge_results" - handle_conflicts: "retry_or_serialize" - - benefits: - speed: "3-5x improvement" - efficiency: "reduced_overhead" - atomicity: "group_related_changes" -``` - -### 16. 
Multi-Agent Coordination - -#### 16.1 Sub-Agent Spawning Pattern -```yaml -sub_agent_spawning: - spawn_triggers: - - complex_subtask - - specialized_domain - - parallel_workstream - - exploratory_analysis - - agent_types: - analyzer: "deep_investigation" - builder: "implementation" - reviewer: "quality_check" - documenter: "knowledge_capture" - - coordination: - handoff: "clear_context_transfer" - results: "structured_return" - state: "shared_memory_access" - - lifecycle: - spawn: "with_specific_context" - execute: "autonomous_operation" - report: "structured_findings" - terminate: "clean_resource_release" -``` - -#### 16.2 Multi-Agent Debate Pattern -```yaml -debate_pattern: - participants: - proposer: "suggests_solution" - critic: "identifies_issues" - synthesizer: "merges_perspectives" - - debate_process: - rounds: 3 - convergence_required: true - consensus_threshold: 0.8 - - decision_recording: - all_perspectives: true - final_consensus: true - dissenting_opinions: true - - benefits: - better_decisions: "multiple viewpoints" - error_reduction: "critical analysis" - learning: "exposed reasoning" -``` - -### 17. Evaluation and Scoring Evolution - -#### 17.1 Anti-Reward-Hacking Design -```yaml -anti_reward_hacking: - diverse_metrics: - - outcome_based: "actual_results" - - process_based: "methodology_quality" - - efficiency_based: "resource_usage" - - learning_based: "knowledge_gained" - - dynamic_weights: - adjust_based_on: - - gaming_detection - - metric_reliability - - context_importance - - validation: - cross_check_metrics: true - human_spot_checks: true - anomaly_detection: true -``` - -#### 17.2 Continuous Calibration -```yaml -calibration: - feedback_loop: - collect: "outcome_data" - analyze: "prediction_vs_actual" - adjust: "scoring_weights" - - calibration_frequency: - minor: "daily" - major: "weekly" - reset: "monthly" - - drift_detection: - monitor: "score_distributions" - alert: "significant_changes" - auto_adjust: "within_bounds" -``` - -### 18. 
Future Extensibility - -#### 18.1 Roadmap Features (Enhanced) -```yaml -planned_features: - # Original features - - cross_repository_memory - - team_memory_spaces - - background_project_compilers - - fine_grained_retention_policies - - ml_based_importance_scoring - - predictive_context_loading - - ide_frame_boundary_visualization - - # New pattern-based features - - episodic_memory_retrieval - - reflection_loop_automation - - multi_agent_orchestration - - context_anxiety_management - - progressive_tool_discovery - - debate_based_decision_making - - continuous_self_improvement -``` - -#### 18.2 Integration Points -```yaml -integrations: - current: - - claude_code - - linear - - github - - planned: - - vscode - - cursor - - jetbrains - - gitlab - - jira - - slack - - pattern_integrations: - - langchain: "memory_patterns" - - autogen: "multi_agent" - - guidance: "structured_generation" - - dspy: "optimization_loops" -``` - -## Implementation Priorities - -### Phase 1: Core Runtime (Current) -- [x] Frame stack management -- [x] Local SQLite storage -- [x] MCP server -- [x] Basic scoring -- [x] Claude Code integration - -### Phase 2: Intelligence Layer -- [ ] LLM-driven retrieval -- [ ] Hybrid digest generation -- [ ] Smart trace detection -- [ ] Configurable scoring - -### Phase 3: Collaboration -- [ ] Shared team stacks -- [ ] Frame handoff -- [ ] Merge conflict resolution -- [ ] Team analytics - -### Phase 4: Scale -- [ ] Remote infinite storage -- [ ] Incremental GC -- [ ] Performance optimization -- [ ] Enterprise features - -## Success Metrics - -```yaml -adoption: - - daily_active_projects: 10000 - - frames_created_per_day: 1M - - context_retrievals_per_day: 10M - -quality: - - retrieval_relevance: "> 90%" - - digest_accuracy: "> 85%" - - user_satisfaction: "> 4.5/5" - -performance: - - retrieval_latency: "< 100ms p50" - - zero_context_loss: true - - uptime: "> 99.9%" -``` - -## Configuration Examples - -### Example 1: Security-Focused Project -```yaml -scoring: - weights: - impact: 0.5 - persistence: 0.3 - tool_scores: - security_scan: 0.95 - -retention: - local: - old: 90d # Keep security decisions longer -``` - -### Example 2: Exploration-Heavy Project -```yaml -scoring: - weights: - reference: 0.5 - base: 0.2 - tool_scores: - search: 0.99 - -performance: - retrieval_timeout_ms: 1000 # Allow deeper searches -``` - -## Implementation Guidance - -### Pattern Implementation Priority Matrix -```yaml -high_impact_easy: - # Implement first - quick wins - - context_minimization # 40-60% token savings - - reflection_loop # Improves decision quality - - parallel_tool_execution # 3-5x speed improvement - - episodic_memory # Reuse past solutions - -high_impact_complex: - # Phase 2 - significant value - - self_critique_system # Continuous improvement - - multi_agent_debate # Better decisions - - dynamic_context_injection # Just-in-time context - - log_synthesis # Learn from patterns - -moderate_impact: - # Phase 3 - refinements - - progressive_tool_discovery # Gradual capability - - anti_reward_hacking # Robust metrics - - sub_agent_spawning # Task delegation - - context_anxiety_mgmt # Proactive optimization -``` - -### Key Design Principles from Patterns -```yaml -principles: - 1_externalize_state: - rationale: "Enable persistence across sessions" - implementation: "Filesystem + database hybrid" - - 2_minimize_context: - rationale: "Maximize efficiency and clarity" - implementation: "Hierarchical summarization" - - 3_learn_continuously: - rationale: "Improve over time" - implementation: 
"Reflection loops + pattern extraction" - - 4_orchestrate_intelligently: - rationale: "Use right tool for task" - implementation: "Progressive discovery + conditional execution" - - 5_critique_systematically: - rationale: "Ensure quality" - implementation: "Multi-dimensional evaluation" -``` - -### Practical Implementation Steps -```yaml -step_1_baseline: - - implement_frame_stack - - add_basic_scoring - - create_sqlite_storage - - build_mcp_interface - -step_2_memory_patterns: - - add_episodic_retrieval - - implement_log_synthesis - - create_reflection_loops - - build_pattern_detection - -step_3_optimization: - - add_context_minimization - - implement_dynamic_injection - - create_parallel_execution - - optimize_retrieval_speed - -step_4_intelligence: - - add_self_critique - - implement_debate_patterns - - create_continuous_calibration - - build_learning_system - -step_5_scale: - - add_multi_agent_coordination - - implement_distributed_memory - - create_team_collaboration - - optimize_for_production -``` - -## Conclusion - -StackMemory provides a revolutionary approach to AI tool memory management through: -- **Lossless storage** with smart retrieval -- **Frame-based organization** replacing linear chat logs -- **Two-tier storage** balancing performance and capacity -- **LLM-driven context selection** for optimal relevance -- **Team collaboration** through shared and individual stacks -- **Configurable scoring** adapting to project needs -- **Advanced patterns** from agentic AI research -- **Continuous learning** through reflection and synthesis -- **Intelligent orchestration** of tools and agents -- **Context optimization** for efficiency at scale - -The system ensures AI tools never lose context while maintaining performance at scale, incorporating state-of-the-art patterns from the agentic AI community. \ No newline at end of file diff --git a/mcp_review_config.json b/config/mcp_review_config.json similarity index 100% rename from mcp_review_config.json rename to config/mcp_review_config.json diff --git a/docs/RELEASE_NOTES.md b/docs/RELEASE_NOTES.md index ff14ab8f..bba8bb97 100644 --- a/docs/RELEASE_NOTES.md +++ b/docs/RELEASE_NOTES.md @@ -77,4 +77,4 @@ get_summary --- -_Built with LLM-driven intelligent context retrieval_ +_Built with LLM-driven context retrieval_ diff --git a/docs/SPEC.md b/docs/SPEC.md index 1941fb48..41796aaf 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -2,7 +2,7 @@ ## Executive Summary -StackMemory is a **lossless, project-scoped memory runtime** for development tools that preserves full project context across sessions using a call stack metaphor instead of linear chat logs. It organizes memory as nested frames with smart retrieval, maintaining context across thread resets, model switches, and long-running projects. +StackMemory is a **lossless, project-scoped memory runtime** for AI coding and writing tools that preserves full project context across sessions using a call stack metaphor instead of linear chat logs. It organizes memory as nested frames with smart retrieval, enabling AI tools to maintain context across thread resets, model switches, and long-running projects. ## Core Architecture @@ -120,7 +120,7 @@ local_to_remote_migration: interval: 300s compression: true - intelligent_batching: + smart_batching: group_by: "frame" wait_for_frame_close: true max_wait: 1h @@ -179,7 +179,7 @@ scoring: persistence: 0.3 # Permanent changes critical ``` -### 4. Intelligent Context Retrieval +### 4. 
Smart Context Retrieval #### 4.1 LLM-Driven Retrieval ```yaml @@ -429,7 +429,7 @@ garbage_collection: ```yaml digest_generation: deterministic: 60% # Reliable extraction - ai_generated: 40% # Intelligent summary + ai_generated: 40% # AI-generated summary deterministic_fields: - files_modified @@ -657,11 +657,380 @@ performance_slas: max_storage_per_project: "unlimited" ``` -### 12. Future Extensibility +### 12. Advanced Memory Patterns -#### 12.1 Roadmap Features +#### 12.1 Episodic Memory System +```yaml +episodic_memory: + definition: "Capture and reuse past agent experiences" + + episode_structure: + trigger: "significant_event" # Decision, error, breakthrough + context_snapshot: + - pre_state # State before episode + - action_sequence # Tools and decisions + - outcome # Result and impact + - learned_pattern # Extracted insight + + retrieval_strategy: + similarity_matching: + current_context: true + embedding_distance: "cosine" + threshold: 0.85 + + temporal_relevance: + recent_weight: 0.7 + historical_weight: 0.3 + + injection_mechanism: + when: "similar_context_detected" + format: "Past episode: {summary} led to {outcome}" + max_episodes: 3 +``` + +#### 12.2 Memory Synthesis from Execution Logs +```yaml +log_synthesis: + pattern_extraction: + frequency_analysis: + - common_error_sequences + - repeated_tool_patterns + - decision_reversals + + causality_detection: + error_to_fix_chains: true + search_to_discovery: true + test_to_refactor: true + + synthesis_output: + workflow_patterns: + - "Search β†’ Read β†’ Edit β†’ Test β†’ Fix" + - "Error β†’ Analyze β†’ Search β†’ Solution" + + anti_patterns: + - "Repeated failed attempts" + - "Circular dependencies" + + optimization_opportunities: + - "Batch similar operations" + - "Cache frequent queries" +``` + +### 13. Feedback Loop Architecture + +#### 13.1 Reflection Loop Pattern +```yaml +reflection_loop: + trigger_conditions: + - frame_completion + - significant_error + - milestone_reached + - context_switch + + reflection_process: + analyze: + - what_worked: "successful patterns" + - what_failed: "error patterns" + - alternative_approaches: "unexplored paths" + + synthesize: + key_insights: [] + patterns_identified: [] + improvements_suggested: [] + + persist: + to_anchors: true # Save as decisions + to_digest: true # Include in summary + score_boost: 0.2 # Important for learning +``` + +#### 13.2 Self-Critique Evaluation System +```yaml +self_critique: + evaluation_dimensions: + code_quality: + - correctness: "Does it work?" + - efficiency: "Is it optimal?" + - maintainability: "Is it clean?" + + decision_quality: + - rationale: "Was reasoning sound?" + - alternatives: "Were options considered?" + - evidence: "Was it data-driven?" + + process_quality: + - methodology: "Was approach systematic?" + - tool_usage: "Were tools used effectively?" + - time_management: "Was effort proportional?" 
+ + critique_storage: + attach_to_frame: true + influence_scoring: true + guide_future_retrieval: true + + continuous_improvement: + track_critique_patterns: true + adjust_weights_based_on_outcomes: true + share_learnings_across_team: true +``` + +#### 13.3 Rich Feedback Integration +```yaml +feedback_sources: + automated: + - test_results + - linting_output + - performance_metrics + - security_scans + + human: + - code_review_comments + - user_satisfaction + - explicit_feedback + + environmental: + - build_success_rate + - deployment_outcomes + - production_incidents + + integration: + collection: "multi_channel" + correlation: "cross_reference" + weight_by_reliability: true + + feedback_to_memory: + positive: "boost_frame_score" + negative: "annotate_with_lessons" + neutral: "record_for_pattern" +``` + +### 14. Context Optimization Strategies + +#### 14.1 Context Minimization Pattern +```yaml +context_minimization: + strategies: + intelligent_filtering: + remove_redundant: true + compress_similar: true + prioritize_relevant: true + + hierarchical_summarization: + detail_levels: + - full: "complete events" + - medium: "key operations" + - summary: "outcomes only" + + dynamic_windowing: + expand_on: "high_relevance" + contract_on: "low_relevance" + adaptive_sizing: true + + benefits: + reduced_token_usage: "40-60%" + faster_processing: true + clearer_focus: true +``` + +#### 14.2 Dynamic Context Injection +```yaml +dynamic_injection: + triggers: + - context_switch_detected + - new_error_type + - unfamiliar_codebase_area + - performance_degradation + + injection_sources: + - relevant_documentation + - similar_past_solutions + - team_knowledge_base + - external_references + + injection_timing: + just_in_time: true # Right before needed + predictive: true # Anticipate needs + on_demand: true # User requested + + injection_format: + inline_hints: "minimal disruption" + sidebar_context: "additional detail" + full_frame: "comprehensive context" +``` + +#### 14.3 Context Window Anxiety Management +```yaml +anxiety_management: + monitoring: + track_usage: "continuous" + alert_threshold: 70% + critical_threshold: 90% + + mitigation_strategies: + progressive_compression: + - summarize_old_frames + - drop_low_score_events + - archive_to_retrieval + + selective_loading: + - load_only_relevant + - defer_deep_history + - use_pointers_not_content + + smart_truncation: + preserve: "decisions_and_outcomes" + truncate: "intermediate_steps" + compress: "repetitive_patterns" +``` + +### 15. 
Tool Orchestration Patterns + +#### 15.1 Progressive Tool Discovery +```yaml +tool_discovery: + learning_progression: + basic: ["read", "write", "search"] + intermediate: ["edit", "test", "analyze"] + advanced: ["refactor", "optimize", "architect"] + + discovery_mechanism: + observation: "watch_usage_patterns" + suggestion: "recommend_when_relevant" + education: "explain_tool_benefits" + + tool_introduction: + gradual: true + context_appropriate: true + with_examples: true +``` + +#### 15.2 Conditional Parallel Execution +```yaml +parallel_execution: + conditions: + can_parallelize: + - independent_files + - different_subsystems + - non_conflicting_operations + + must_serialize: + - dependent_changes + - shared_resources + - ordered_operations + + orchestration: + plan: "identify_parallelizable" + execute: "batch_similar_operations" + synchronize: "merge_results" + handle_conflicts: "retry_or_serialize" + + benefits: + speed: "3-5x improvement" + efficiency: "reduced_overhead" + atomicity: "group_related_changes" +``` + +### 16. Multi-Agent Coordination + +#### 16.1 Sub-Agent Spawning Pattern +```yaml +sub_agent_spawning: + spawn_triggers: + - complex_subtask + - specialized_domain + - parallel_workstream + - exploratory_analysis + + agent_types: + analyzer: "deep_investigation" + builder: "implementation" + reviewer: "quality_check" + documenter: "knowledge_capture" + + coordination: + handoff: "clear_context_transfer" + results: "structured_return" + state: "shared_memory_access" + + lifecycle: + spawn: "with_specific_context" + execute: "autonomous_operation" + report: "structured_findings" + terminate: "clean_resource_release" +``` + +#### 16.2 Multi-Agent Debate Pattern +```yaml +debate_pattern: + participants: + proposer: "suggests_solution" + critic: "identifies_issues" + synthesizer: "merges_perspectives" + + debate_process: + rounds: 3 + convergence_required: true + consensus_threshold: 0.8 + + decision_recording: + all_perspectives: true + final_consensus: true + dissenting_opinions: true + + benefits: + better_decisions: "multiple viewpoints" + error_reduction: "critical analysis" + learning: "exposed reasoning" +``` + +### 17. Evaluation and Scoring Evolution + +#### 17.1 Anti-Reward-Hacking Design +```yaml +anti_reward_hacking: + diverse_metrics: + - outcome_based: "actual_results" + - process_based: "methodology_quality" + - efficiency_based: "resource_usage" + - learning_based: "knowledge_gained" + + dynamic_weights: + adjust_based_on: + - gaming_detection + - metric_reliability + - context_importance + + validation: + cross_check_metrics: true + human_spot_checks: true + anomaly_detection: true +``` + +#### 17.2 Continuous Calibration +```yaml +calibration: + feedback_loop: + collect: "outcome_data" + analyze: "prediction_vs_actual" + adjust: "scoring_weights" + + calibration_frequency: + minor: "daily" + major: "weekly" + reset: "monthly" + + drift_detection: + monitor: "score_distributions" + alert: "significant_changes" + auto_adjust: "within_bounds" +``` + +### 18. 
Future Extensibility + +#### 18.1 Roadmap Features (Enhanced) ```yaml planned_features: + # Original features - cross_repository_memory - team_memory_spaces - background_project_compilers @@ -669,9 +1038,18 @@ planned_features: - ml_based_importance_scoring - predictive_context_loading - ide_frame_boundary_visualization + + # New pattern-based features + - episodic_memory_retrieval + - reflection_loop_automation + - multi_agent_orchestration + - context_anxiety_management + - progressive_tool_discovery + - debate_based_decision_making + - continuous_self_improvement ``` -#### 12.2 Integration Points +#### 18.2 Integration Points ```yaml integrations: current: @@ -686,37 +1064,40 @@ integrations: - gitlab - jira - slack + + pattern_integrations: + - langchain: "memory_patterns" + - autogen: "multi_agent" + - guidance: "structured_generation" + - dspy: "optimization_loops" ``` ## Implementation Priorities -### Phase 1: Core Runtime βœ… COMPLETE (v0.1.x) +### Phase 1: Core Runtime (Current) - [x] Frame stack management - [x] Local SQLite storage - [x] MCP server - [x] Basic scoring - [x] Claude Code integration -### Phase 2: Intelligence Layer βœ… COMPLETE (v0.2.x) -- [x] LLM-driven retrieval -- [x] Hybrid digest generation (60/40 deterministic/AI) -- [x] Smart trace detection and bundling -- [x] Configurable scoring with weight profiles -- [x] Railway storage optimization (3-tier) - -### Phase 3: Collaboration βœ… COMPLETE (v0.3.x) -- [x] Shared team stacks (dual stack architecture) -- [x] Frame handoff workflows (v0.3.4) -- [x] Context bridge for cross-session sync -- [x] Linear integration with bidirectional sync -- [x] Claude Skills for workflow automation - -### Phase 4: Scale (Next - v0.4.x) -- [ ] Remote infinite storage (S3/GCS) -- [ ] Incremental garbage collection -- [ ] Performance optimization (<100ms p50) -- [ ] Enterprise features (SSO, audit logs) -- [ ] Multi-repository support +### Phase 2: Intelligence Layer +- [ ] LLM-driven retrieval +- [ ] Hybrid digest generation +- [ ] Smart trace detection +- [ ] Configurable scoring + +### Phase 3: Collaboration +- [ ] Shared team stacks +- [ ] Frame handoff +- [ ] Merge conflict resolution +- [ ] Team analytics + +### Phase 4: Scale +- [ ] Remote infinite storage +- [ ] Incremental GC +- [ ] Performance optimization +- [ ] Enterprise features ## Success Metrics @@ -766,14 +1147,101 @@ performance: retrieval_timeout_ms: 1000 # Allow deeper searches ``` +## Implementation Guidance + +### Pattern Implementation Priority Matrix +```yaml +high_impact_easy: + # Implement first - quick wins + - context_minimization # 40-60% token savings + - reflection_loop # Improves decision quality + - parallel_tool_execution # 3-5x speed improvement + - episodic_memory # Reuse past solutions + +high_impact_complex: + # Phase 2 - significant value + - self_critique_system # Continuous improvement + - multi_agent_debate # Better decisions + - dynamic_context_injection # Just-in-time context + - log_synthesis # Learn from patterns + +moderate_impact: + # Phase 3 - refinements + - progressive_tool_discovery # Gradual capability + - anti_reward_hacking # Robust metrics + - sub_agent_spawning # Task delegation + - context_anxiety_mgmt # Proactive optimization +``` + +### Key Design Principles from Patterns +```yaml +principles: + 1_externalize_state: + rationale: "Enable persistence across sessions" + implementation: "Filesystem + database hybrid" + + 2_minimize_context: + rationale: "Maximize efficiency and clarity" + implementation: "Hierarchical 
summarization"
+
+  3_learn_continuously:
+    rationale: "Improve over time"
+    implementation: "Reflection loops + pattern extraction"
+
+  4_orchestrate_intelligently:
+    rationale: "Use right tool for task"
+    implementation: "Progressive discovery + conditional execution"
+
+  5_critique_systematically:
+    rationale: "Ensure quality"
+    implementation: "Multi-dimensional evaluation"
+```
+
+### Practical Implementation Steps
+```yaml
+step_1_baseline:
+  - implement_frame_stack
+  - add_basic_scoring
+  - create_sqlite_storage
+  - build_mcp_interface
+
+step_2_memory_patterns:
+  - add_episodic_retrieval
+  - implement_log_synthesis
+  - create_reflection_loops
+  - build_pattern_detection
+
+step_3_optimization:
+  - add_context_minimization
+  - implement_dynamic_injection
+  - create_parallel_execution
+  - optimize_retrieval_speed
+
+step_4_intelligence:
+  - add_self_critique
+  - implement_debate_patterns
+  - create_continuous_calibration
+  - build_learning_system
+
+step_5_scale:
+  - add_multi_agent_coordination
+  - implement_distributed_memory
+  - create_team_collaboration
+  - optimize_for_production
+```
+
 ## Conclusion
 
 StackMemory provides a revolutionary approach to AI tool memory management through:
-- **Lossless storage** with intelligent retrieval
+- **Lossless storage** with smart retrieval
 - **Frame-based organization** replacing linear chat logs
 - **Two-tier storage** balancing performance and capacity
 - **LLM-driven context selection** for optimal relevance
 - **Team collaboration** through shared and individual stacks
 - **Configurable scoring** adapting to project needs
+- **Advanced patterns** from agentic AI research
+- **Continuous learning** through reflection and synthesis
+- **Intelligent orchestration** of tools and agents
+- **Context optimization** for efficiency at scale
 
-The system ensures AI tools never lose context while maintaining performance at scale.
\ No newline at end of file
+The system ensures AI tools never lose context while maintaining performance at scale, incorporating state-of-the-art patterns from the agentic AI community.
\ No newline at end of file
diff --git a/tomorrow.md b/docs/tomorrow.md
similarity index 100%
rename from tomorrow.md
rename to docs/tomorrow.md
diff --git a/vision.md b/docs/vision.md
similarity index 100%
rename from vision.md
rename to docs/vision.md

From dbe9856bcd6593b2c042ca023f7c649e40158485 Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Fri, 17 Apr 2026 08:47:53 -0400
Subject: [PATCH 11/18] fix(test): mock canonicalStateStore in session tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Session tests mocked fs/promises but not the canonical-store module. The
canonicalStateStore singleton inherited the mocked fs, causing pathExists
to return true while readFile returned undefined, crashing JSON.parse.
Mock the entire canonical-store module with stubs for upsertSession,
appendEvent, and endSession.
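For illustration, the crash reduces to this minimal sketch (statePath and
the load sequence are paraphrased here, not exact store code):

    // fs/promises is mocked, so access() resolves => pathExists() true,
    // while readFile() is a bare vi.fn() that resolves to undefined.
    const raw = await readFile(statePath, 'utf-8'); // undefined
    JSON.parse(raw); // SyntaxError: "undefined" is not valid JSON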
--- src/core/session/__tests__/session-manager.test.ts | 9 +++++++++ src/core/session/__tests__/session.test.ts | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/src/core/session/__tests__/session-manager.test.ts b/src/core/session/__tests__/session-manager.test.ts index 2d24eee2..93ef884c 100644 --- a/src/core/session/__tests__/session-manager.test.ts +++ b/src/core/session/__tests__/session-manager.test.ts @@ -21,6 +21,15 @@ vi.mock('child_process', () => ({ execSync: vi.fn().mockReturnValue('main\n'), })); +vi.mock('../../shared-state/canonical-store.js', () => ({ + canonicalStateStore: { + upsertSession: vi.fn().mockResolvedValue({}), + appendEvent: vi.fn().mockResolvedValue(undefined), + endSession: vi.fn().mockResolvedValue(undefined), + initialize: vi.fn().mockResolvedValue(undefined), + }, +})); + describe('SessionManager', () => { let manager: SessionManager; diff --git a/src/core/session/__tests__/session.test.ts b/src/core/session/__tests__/session.test.ts index 0fb3d35f..7c502887 100644 --- a/src/core/session/__tests__/session.test.ts +++ b/src/core/session/__tests__/session.test.ts @@ -26,6 +26,15 @@ vi.mock('fs/promises', () => ({ access: vi.fn(), })); +vi.mock('../../shared-state/canonical-store.js', () => ({ + canonicalStateStore: { + upsertSession: vi.fn().mockResolvedValue({}), + appendEvent: vi.fn().mockResolvedValue(undefined), + endSession: vi.fn().mockResolvedValue(undefined), + initialize: vi.fn().mockResolvedValue(undefined), + }, +})); + vi.mock('child_process', () => ({ execSync: vi.fn().mockReturnValue('main\n'), })); From b1ca885d717616b62b2034bade7297cb74dd8ca0 Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Fri, 17 Apr 2026 15:13:40 -0400 Subject: [PATCH 12/18] chore: handoff checkpoint on chore/root-reorg --- docs/prds/substrate-enterprise-brain.md | 632 ++++++++++++++++++++++++ 1 file changed, 632 insertions(+) create mode 100644 docs/prds/substrate-enterprise-brain.md diff --git a/docs/prds/substrate-enterprise-brain.md b/docs/prds/substrate-enterprise-brain.md new file mode 100644 index 00000000..d2424a21 --- /dev/null +++ b/docs/prds/substrate-enterprise-brain.md @@ -0,0 +1,632 @@ +# PRD: Substrate β€” Enterprise Knowledge Brain + +**Status:** Draft +**Author:** Jonathan Wu +**Date:** 2026-04-17 +**Codename:** Croissant +**Version:** v1.0 + +--- + +## 1. Problem & Evidence + +### The problem + +AI coding tools have solved context for engineers because all context lives in a git repo. For knowledge workers β€” product managers, marketers, sales teams, executives β€” context is fragmented across 5-15 SaaS tools. There is no "git repo for knowledge." + +Today's landscape: +- **Transcripts** live in Granola/Otter. **Documents** in Notion/Confluence. **Customer data** in HubSpot. **Tasks** in Linear/Jira. **Conversations** in Slack. **Code decisions** in GitHub. +- No system connects dots across these sources automatically. +- AI tools (Glean, Notion AI) sit on top of individual silos β€” they search within a tool, not across the organization's full knowledge surface. +- When an AI agent needs organizational context to complete a task, it doesn't exist in a queryable, structured form. + +### Evidence + +- Engineering teams using StackMemory's conductor report 70%+ of agent failures stem from missing organizational context (what was decided, why, by whom). 
+- Provenant's decision-tracking prototype (packages/provenant/) demonstrated that cross-source ingestion + confidence scoring produces actionable knowledge β€” but it's scoped to "decisions" only and lacks a user-facing product. +- The enterprise "AI readiness" conversation has shifted from "do we have data?" to "can AI access and reason over our data?" β€” this is the gap. + +### Why now + +- MCP protocol standardizes adapter interfaces β€” 7/8 target data sources have official MCP servers (per THEORY.MD: "standardize the intersection, expose the union"). +- Cloudflare Agents SDK + D1 provides zero-ops distributed SQLite β€” no Postgres migration needed (per THEORY.MD: "SQLite over Postgres for local"). +- StackMemory's conductor, scoring pipeline, and wiki compiler prove the core technical approaches work at production quality. + +--- + +## 2. Goals / Non-Goals + +### Goals + +| # | Goal | Measurable target | +|---|------|-------------------| +| G1 | Time to value under 5 minutes | Install β†’ connect 2 sources β†’ cross-source query < 5 min | +| G2 | Cross-source knowledge retrieval | β‰₯ 30% of queries cite 2+ sources within first week | +| G3 | Daily active use | Day-7 return rate β‰₯ 40% | +| G4 | Team adoption | Second user on same team within 14 days | +| G5 | Revenue | First paying Cloud Team customer within 60 days of launch | + +### Non-goals (v1) + +- OAuth connector flows (v1.5 β€” paid tier differentiator) +- Cloudflare-hosted Brain instances (v2) +- Federated team access / org-level rollup (v2) +- Autonomous agent execution using the Brain (v3) +- Stripe metering / billing infrastructure (v2) +- GDPR compliance / data residency controls +- Mobile or web-only client +- Multi-language support + +--- + +## 3. Users & Jobs-to-Be-Done + +### Primary persona: Engineering Team Lead + +**Context:** Manages 3-8 engineers. Uses Linear for tasks, GitHub for code, Slack for communication. Makes 10-20 decisions per week that are never captured in a queryable form. + +**Jobs:** +- "When I start my day, I want to know what happened overnight across all my tools without checking each one." +- "When a new engineer asks 'why did we build it this way?', I want to point them at the Brain instead of spending 30 minutes in Slack search." +- "When planning a sprint, I want to see what's blocked, what's decided, and what's still open β€” across Linear, GitHub, and Slack β€” in one view." + +### Secondary persona (v2+): Product Manager + +**Context:** Uses Notion for specs, Linear for tracking, Slack for stakeholder comms, HubSpot for customer feedback. + +**Jobs:** +- "When writing a PRD, I want the Brain to surface related past decisions, customer feedback, and technical constraints." +- "When asked 'why did we prioritize X?', I want a cited answer, not my memory." + +### Excluded (v1): C-suite, sales, marketing, non-technical operators + +--- + +## 4. Solution Overview + +### Product: Substrate + +An Electron desktop app that auto-indexes enterprise knowledge from connected sources into a queryable Brain. Users connect data sources, the Brain ingests and organizes knowledge, and a chat interface provides instant, cited answers. 
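+
+To make that contract concrete, a client-side answer might deserialize into
+something like the sketch below (illustrative only; type and field names are
+not final):
+
+```typescript
+// Hypothetical shape for a cited Cortex answer (names assumed, not an API)
+interface Citation {
+  nodeId: string; // UUID of the cited knowledge node
+  sourceSystem: string; // 'linear' | 'github' | 'slack' | ...
+  label: string; // e.g. "STA-412: Auth middleware rewrite"
+  confidence: number; // 0..1, from the scoring pipeline
+}
+
+interface CortexAnswer {
+  text: string; // streamed synthesis with inline [n] citation markers
+  citations: Citation[]; // resolves [n] markers to source nodes
+  partial: boolean; // true while background deep analysis is still running
+}
+```
+
+The citation markers, confidence values, and background analysis here mirror
+the progressive query flow detailed in section 6.4.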
+ +### Three components + +``` +Provenance (connectors) --> Cortex (brain) --> Substrate (app) + adapters/fetch/dedup graph/score/query Electron/UI/control +``` + +| Component | Package | License | Purpose | +|-----------|---------|---------|---------| +| **Cortex** | `@stackmemoryai/cortex` | BSL | Knowledge graph, confidence scoring, query engine, compaction | +| **Provenance** | `@stackmemoryai/provenant` | BSL | Connector adapters, MCP orchestration, delta sync, dedup | +| **Substrate** | `@stackmemoryai/substrate` | Private | Electron app, CF runtime, billing, team management | +| **Types** | `@stackmemoryai/types` | BSL | Shared interfaces between packages | + +### Why this decomposition + +Provenant was a monolith handling ingest + score + store + query + resolve. For a product: +- **Connectors are commodity** (every iPaaS does this) β€” keep them in Provenance +- **The graph + scoring + query + compaction is the moat** β€” that's Cortex +- Teams can add custom adapters without touching Brain internals +- CF architecture maps cleanly: adapters = Workers, Brain = Durable Object + +> Per THEORY.MD: "Standardize the intersection, expose the union" β€” MCP is the standardized intersection; Cortex's scoring/compaction is the exposed union. + +--- + +## 5. Architecture & Data Model + +### 5.1 Multi-repo structure + +``` +stackmemoryai/cortex OSS (BSL) Knowledge graph + query engine +stackmemoryai/provenant OSS (BSL) Connector adapters + MCP orchestration +stackmemoryai/substrate Private Electron app + CF runtime +stackmemoryai/types OSS (BSL) Shared TypeScript interfaces +stackmemoryai/stackmemory OSS (BSL) Existing CLI (depends on cortex + provenant) +``` + +**Why multi-repo over monorepo:** +- Forced clean interfaces (no leaking shared state) +- Independent deploy cycles (ship Cortex without touching Provenance) +- CF Wrangler expects its own repo root +- Clear open-source boundary (public repos vs private) +- Parallel contributors without PR queue bottleneck + +### 5.2 Cortex schema (v1, reviewed 2026-04-17) + +Adapted from Provenant's 9-table schema. Two critical review passes applied. + +**Design decisions:** +- `INTEGER PRIMARY KEY` (rowid alias) for internal references β€” TEXT UUIDs cause B-tree fragmentation at scale +- UUID kept as `id TEXT UNIQUE` for API/external use +- FTS5 external content table with explicit triggers β€” no silent desyncs +- Append-only versioning with `is_latest` partial index for fast current-version lookups +- `dependency_index` dropped β€” use recursive CTE at query time (O(n^2) pre-computation doesn't scale) +- Top queryable fields (`priority`, `state`, `labels`, `assignee`) as real columns, not buried in JSON +- `workspace_id` deferred to v2 migration β€” YAGNI, avoids false confidence from unfiltered column + +```sql +CREATE TABLE schema_version (version INTEGER PRIMARY KEY); +INSERT INTO schema_version VALUES (1); + +CREATE TABLE knowledge ( + rowid INTEGER PRIMARY KEY, + id TEXT NOT NULL UNIQUE, -- UUID for API/external reference + type TEXT NOT NULL, -- free-form: 'decision' | 'document' | 'conversation' | 'ticket' | ... 
+ content TEXT NOT NULL, + summary TEXT, -- LLM-generated for long content + actor TEXT, + confidence REAL DEFAULT 0.5, + source_system TEXT NOT NULL, + source_id TEXT, + source_hash TEXT, -- dedup / change detection + raw_payload TEXT, -- archival, never queried directly + priority INTEGER, -- 0-4, standardized across sources + state TEXT, -- 'open' | 'closed' | 'merged' | 'resolved' + labels TEXT, -- JSON array: ["auth", "backend"] + assignee TEXT, + metadata TEXT DEFAULT '{}', -- truly dynamic fields only + embedding BLOB, + embedding_model TEXT, -- 'voyage-3' | 'text-embedding-3-small' | null + version INTEGER DEFAULT 1, + is_latest INTEGER DEFAULT 1, -- 1 = current, 0 = historical + thread_id TEXT, -- flat thread grouping + parent_id INTEGER, -- direct parent (conversations, doc sections) + created_at INTEGER NOT NULL, + updated_at INTEGER NOT NULL, + ingested_at INTEGER NOT NULL, + FOREIGN KEY (parent_id) REFERENCES knowledge(rowid) +); + +CREATE INDEX idx_knowledge_source ON knowledge(source_system, source_id); +CREATE INDEX idx_knowledge_latest ON knowledge(source_system, source_id) WHERE is_latest = 1; +CREATE UNIQUE INDEX idx_knowledge_source_version ON knowledge(source_system, source_id, version); +CREATE INDEX idx_knowledge_thread ON knowledge(thread_id); +CREATE INDEX idx_knowledge_type ON knowledge(type); +CREATE INDEX idx_knowledge_state ON knowledge(state); +CREATE INDEX idx_knowledge_created ON knowledge(created_at); + +CREATE VIRTUAL TABLE knowledge_fts USING fts5( + content, summary, actor, + content=knowledge, content_rowid=rowid +); + +CREATE TRIGGER knowledge_ai AFTER INSERT ON knowledge BEGIN + INSERT INTO knowledge_fts(rowid, content, summary, actor) + VALUES (new.rowid, new.content, new.summary, new.actor); +END; +CREATE TRIGGER knowledge_ad AFTER DELETE ON knowledge BEGIN + INSERT INTO knowledge_fts(knowledge_fts, rowid, content, summary, actor) + VALUES ('delete', old.rowid, old.content, old.summary, old.actor); +END; +CREATE TRIGGER knowledge_au AFTER UPDATE ON knowledge BEGIN + INSERT INTO knowledge_fts(knowledge_fts, rowid, content, summary, actor) + VALUES ('delete', old.rowid, old.content, old.summary, old.actor); + INSERT INTO knowledge_fts(rowid, content, summary, actor) + VALUES (new.rowid, new.content, new.summary, new.actor); +END; + +CREATE TABLE edges ( + rowid INTEGER PRIMARY KEY, + id TEXT NOT NULL UNIQUE, + from_id INTEGER NOT NULL, + to_id INTEGER NOT NULL, + rel_type TEXT NOT NULL, + confidence REAL DEFAULT 0.5, + version INTEGER DEFAULT 1, + created_at INTEGER NOT NULL, + FOREIGN KEY (from_id) REFERENCES knowledge(rowid), + FOREIGN KEY (to_id) REFERENCES knowledge(rowid) +); + +CREATE INDEX idx_edges_from_rel ON edges(from_id, rel_type); +CREATE INDEX idx_edges_to_rel ON edges(to_id, rel_type); + +CREATE TABLE sources ( + id TEXT PRIMARY KEY, + system TEXT NOT NULL UNIQUE, + auth_type TEXT NOT NULL, + config TEXT, + sync_cursor TEXT, -- opaque, adapter-owned + sync_config TEXT, -- JSON: which repos/channels/etc to sync + last_sync_at INTEGER, + last_sync_status TEXT, + last_sync_error TEXT, + node_count INTEGER DEFAULT 0, + created_at INTEGER NOT NULL +); + +CREATE TABLE rejection_log ( + id TEXT PRIMARY KEY, + knowledge_id INTEGER NOT NULL, + reason TEXT, + actor TEXT, + created_at INTEGER NOT NULL, + FOREIGN KEY (knowledge_id) REFERENCES knowledge(rowid) +); + +-- Retained from Provenant +CREATE TABLE review_queue (...); -- low-confidence items pending human review +CREATE TABLE contradictions (...); -- conflicting knowledge nodes +CREATE 
TABLE stale_flags (...);       -- nodes whose source data changed
+CREATE TABLE dependency_index (...);  -- retained from Provenant; superseded by recursive CTEs (see design decisions above)
+```
+
+**Key differences from Provenant:**
+- `nodes` → `knowledge` (general, not decision-scoped)
+- Added `parent_id` for conversation threading / document hierarchy
+- Added `summary` for long-content compression
+- Added `source_system` + `source_id` directly on knowledge (denormalized for query speed)
+- Added `sources` table for connection management
+- `rejection_log` kept in the schema; the human review UI that consumes it lands in v2
+- Append-only model: updates create new versions, old versions retained
+
+> Per THEORY.MD: "SQLite over Postgres for local: zero-config, file-based, FTS5 built-in."
+
+### 5.3 Connector strategy
+
+**v1: API key connectors (OSS)**
+- User pastes API key in Provenance settings tab
+- Credentials encrypted via Electron `safeStorage` (OS keychain)
+- Keys never leave the machine
+- Supported: Linear (API key), GitHub (PAT)
+
+**v1.5: OAuth connectors (paid)**
+- Nango frontend SDK triggers OAuth popup in Electron `BrowserWindow`
+- Nango cloud manages token storage, refresh, revocation
+- Upsell trigger: "Want to connect Slack/Notion/Google? Upgrade."
+- Supported: Slack, GitHub (full OAuth), Notion, Google Drive, HubSpot, Confluence
+
+**Adapter interface: MCP protocol**
+- 7/8 target sources have official MCP servers
+- Provenance spawns MCP servers with credentials injected as env vars
+- Calls MCP tools to fetch data, normalizes responses into Cortex schema
+- Delta sync via `since` timestamps, hash-based dedup
+
+```typescript
+// @stackmemoryai/types — adapter contract
+interface ConnectorAdapter {
+  system: string; // 'linear' | 'slack' | ...
+  authType: 'api_key' | 'oauth';
+  fetch(since: Date): AsyncIterable<RawRecord>; // delta sync
+  normalize(record: RawRecord): KnowledgeNode; // → Cortex schema
+  healthCheck(): Promise<HealthStatus>; // sync status per P4 (type name assumed)
+}
+
+interface RawRecord {
+  id: string;
+  system: string;
+  type: string;
+  content: string;
+  actor?: string;
+  timestamp: number;
+  raw: unknown; // original payload
+  hash: string; // for dedup
+}
+```
+
+### 5.4 Cloud architecture (v2)
+
+```
+CF Agent (Durable Object)  ← Brain: always-on, SQLite/D1, WebSocket
+  |-- CF Worker (V8 isolate)  ← Fast: queries, API calls, routing
+  |-- CF Container (Docker)   ← Heavy: git clone, builds, agent runs
+  '-- CF Sandbox              ← Untrusted: user code, shell (v3)
+```
+
+- Uses CF Agents SDK (`agents` npm) — native DO persistence, hibernation (zero idle cost), MCP support, built-in metering
+- Each team's Brain = a Durable Object with D1 SQLite
+- Workers handle lightweight adapter fetches and query routing
+- Containers for heavy compute (agent execution in v3)
+
+> Per THEORY.MD: "Hooks over daemons for capture" — adapters fire on schedule or webhook, not as long-running polling daemons.
+
+---
+
+## 6. Detailed Requirements
+
+### 6.1 Cortex core
+
+| ID | Requirement | Priority | Notes |
+|----|-------------|----------|-------|
+| C1 | Ingest normalized records from Provenance adapters | P0 | Hash-based dedup, append-only versioning |
+| C2 | Confidence scoring pipeline | P0 | Pluggable signal model per source type. Thresholds: auto-accept ≥0.7, review 0.4-0.69, discard <0.4 |
+| C3 | Keyword search (FTS5 BM25) | P0 | Full-text search on content + summary fields |
+| C4 | LLM query synthesis with streaming | P0 | SSE streaming, Claude API, cite source nodes |
+| C5 | Progressive query response | P0 | Instant: indexed results. Stream: LLM synthesis.
Background: deep analysis as task | +| C6 | Edge creation (auto-detected relationships) | P1 | Derive edges from shared entities, temporal proximity, content similarity | +| C7 | Stale flag propagation | P1 | When source hash changes, mark downstream nodes | +| C8 | Contradiction detection | P1 | Flag when two nodes make conflicting claims | +| C9 | Embedding-based semantic search | P2 | Optional, behind feature flag. Voyage AI or OpenAI embeddings | +| C10 | Temporal queries ("as of March 1st") | P2 | Query knowledge state at a point in time | +| C11 | Compaction / decay | P2 | Merge duplicate nodes, decay stale knowledge over time | + +### 6.2 Provenance connectors + +| ID | Requirement | Priority | Notes | +|----|-------------|----------|-------| +| P1 | Linear adapter (API key) | P0 | Issues, comments, labels, assignees. Delta sync. | +| P2 | GitHub adapter (PAT) | P0 | PRs, issues, commits, reviews. Delta sync. | +| P3 | MCP server spawning | P0 | Spawn official MCP servers with credential env vars | +| P4 | Adapter health check | P0 | Report sync status, last sync time, error count | +| P5 | Independent failure resilience | P0 | Each adapter fails/retries independently. Others continue. | +| P6 | Slack adapter (OAuth) | P1 | v1.5, paid tier. Channels, threads, reactions. | +| P7 | Notion adapter (OAuth) | P2 | v1.5, paid tier. Pages, databases, blocks. | +| P8 | Google Drive adapter (OAuth) | P2 | v1.5, paid tier. Docs, sheets, slides. | + +### 6.3 Electron app (Substrate) + +| ID | Requirement | Priority | Notes | +|----|-------------|----------|-------| +| S1 | Cortex chat panel (left tab) | P0 | HexStyleChat base + SSE streaming. Branded "Cortex." | +| S2 | Provenance settings (tab) | P0 | API key input, connector status, sync controls | +| S3 | Onboarding flow | P0 | First-launch: connect source β†’ ingest β†’ first query | +| S4 | Suggestion pills (empty state) | P0 | "What's the team working on?", "Recent decisions", etc. | +| S5 | Task panel (right side) | P0 | Background deep analysis tasks with status | +| S6 | Agent control mode (existing) | P0 | Keep existing tmux agent management, terminal, Linear | +| S7 | Knowledge health dashboard | P1 | Node counts, staleness, source distribution | +| S8 | Cross-source citation display | P0 | Show which sources contributed to each answer | +| S9 | Credential storage via safeStorage | P0 | OS keychain, encrypted at rest | +| S10 | Auto-update via electron-updater | P1 | DMG distribution, GitHub Releases | + +### 6.4 Progressive query flow + +``` +User asks: "What's blocking the auth refactor?" + +[0ms] Cortex searches FTS5 index + β†’ Returns matching knowledge nodes instantly + β†’ Display in chat as "Sources found: 3 Linear issues, 2 GitHub PRs" + +[500ms] Cortex streams LLM synthesis + β†’ Claude reads top-k nodes + edges + β†’ Streams answer with inline citations: "The auth refactor [1] is blocked by..." + β†’ Citations link to source nodes with confidence scores + +[2-5s] Answer complete. Citations panel shows: + β†’ [1] Linear STA-412: "Auth middleware rewrite" (confidence: 0.89) + β†’ [2] GitHub PR #847: "Remove legacy session handler" (confidence: 0.76) + β†’ [3] Slack #eng-backend: "Legal flagged token storage" (confidence: 0.65) + +[background] If query is complex, spawn deep analysis task: + β†’ Task appears in side panel: "Deep analysis: auth refactor blockers" + β†’ Traverses knowledge graph (2+ hops from initial results) + β†’ Updates answer with additional context when complete +``` + +--- + +## 7. 
UX Flows + +### 7.1 Onboarding (< 5 minutes to value) + +``` +Step 1: Install (30s) + Electron app opens β†’ Substrate branding β†’ empty state + "Welcome to Substrate. Connect your first source to get started." + +Step 2: Connect first source (2 min) + Click "Add Source" β†’ select Linear β†’ paste API key β†’ "Connect" + Progress bar: "Indexing 47 issues, 123 comments..." + Real-time count: nodes rising as ingestion runs + +Step 3: First query (30s after ingestion) + Suggestion pill: "What's the team working on?" + Cortex answers with cited Linear issues + AHA MOMENT: "It already knows this." + +Step 4: Connect second source (2 min) + Click "Add Source" β†’ select GitHub β†’ paste PAT β†’ "Connect" + Progress: "Indexing 12 repos, 89 PRs, 234 issues..." + Cross-referencing happens automatically (shared entity detection) + +Step 5: Cross-source query (the magic moment) + "What's blocking the auth refactor?" + Brain pulls Linear ticket + GitHub PR + commit messages + HOLY SHIT MOMENT: "It connected dots I didn't." +``` + +### 7.2 Cortex chat panel + +``` ++--------------------------------------------------+ +| Cortex [Search] [+] | +| | +| (empty state β€” centered) | +| | +| Ask your Brain anything | +| | +| [What's the team working on?] | +| [Recent decisions] | +| [What's blocked?] | +| [Summarize last week] | +| | +| ____________________________________________ | +| | | | +| | Ask Cortex... [Send] | | +| |__________________________________________| | ++--------------------------------------------------+ +``` + +Active state with task panel: + +``` ++-------------------------------+-------------------+ +| Cortex | Tasks | +| | | +| You: What's blocking auth? | [~] Deep analysis| +| | auth blockers| +| Cortex: The auth refactor | 3 sources... | +| is blocked by two items: | | +| | [v] Linear sync | +| 1. Legal compliance [1] | 47 nodes | +| 2. PR review pending [2] | | +| | [v] GitHub sync | +| Sources: | 89 nodes | +| [1] STA-412 (0.89) | | +| [2] PR #847 (0.76) | | +| [3] #eng-backend (0.65) | | +| | | +| ___________________________ | | +| | Ask Cortex... [Send] | | | +| |_________________________| | | ++-------------------------------+-------------------+ +``` + +### 7.3 Provenance settings + +``` ++--------------------------------------------------+ +| Provenance β€” Connectors | +| | +| Connected Sources | +| | +| [check] Linear API Key Sync: 2m ago [...] | +| [check] GitHub PAT Sync: 5m ago [...] | +| [ ] Slack OAuth Not connected [Connect] | +| [ ] Notion OAuth Not connected [Connect] | +| | +| [+ Add Source] | +| | +| Sync Schedule | +| [v] Auto-sync every [15 min v] | +| [ ] Sync on app launch | +| | +| Brain Health | +| Total nodes: 1,247 | +| Sources: Linear (623), GitHub (624) | +| Stale nodes: 12 (0.9%) | +| Last full sync: 2 minutes ago | ++--------------------------------------------------+ +``` + +--- + +## 8. 
Pricing & Packaging + +| | OSS Self-Hosted | Cloud Free | Cloud Team | Cloud Enterprise | +|---|---|---|---|---| +| **Seats** | unlimited | up to 3 | up to 5 | unlimited | +| **Price** | free | free | $99/mo + metered | custom | +| **Auth** | API keys only | API keys only | OAuth (Nango) | SSO + OAuth | +| **Storage** | local SQLite | cloud D1 | cloud D1 | cloud D1 | +| **Brain instances** | 1 (local) | 1 (hosted) | 1 (hosted) | federated (multi-team) | +| **Query** | CLI + MCP | Cortex chat | Cortex chat + API | + org rollup | +| **Support** | community | community | email | dedicated | + +**Metering (Cloud Team+):** +- LLM inference: pass-through at 2-3x Anthropic cost +- Tracked as tokens in + tokens out across indexing and queries +- Stripe Metering API for usage billing with margin targets +- Storage: generous free tier (1GB included), then $/GB/mo + +**Upsell triggers:** +- OSS β†’ Cloud: "Sync across devices", "Team sharing" +- Cloud Free β†’ Team: "Connect Slack/Notion" (OAuth), "More than 3 seats" +- Team β†’ Enterprise: "Federated access", "SSO", "Org rollup" + +--- + +## 9. Rollout Plan + +### v1 β€” Local Brain (2 weeks) + +Ship: +- [ ] `@stackmemoryai/types` repo β€” shared interfaces +- [ ] `@stackmemoryai/cortex` repo β€” knowledge graph, FTS5 search, streaming LLM query +- [ ] `@stackmemoryai/provenant` repo β€” extracted from packages/provenant/, adapter interface + Linear + GitHub +- [ ] Substrate Electron app β€” Cortex chat panel + Provenance settings + onboarding +- [ ] API key connectors (Linear, GitHub PAT) +- [ ] Progressive query (instant β†’ stream β†’ background task) +- [ ] DMG distribution + +Cleanup: +- [ ] Remove `tools/agent-viewer/` from stackmemory repo +- [ ] Extract desktop control-plane from provenantai worktree into substrate repo + +### v1.5 β€” OAuth + Paid Tier (~4 weeks after v1) + +- [ ] Nango integration for OAuth flows +- [ ] Slack, Notion, Google Drive adapters +- [ ] Cloud Free tier (hosted D1 Brain) +- [ ] Stripe metering integration +- [ ] Basic telemetry + log shipping (opt-in) + +### v2 β€” Cloud + Teams (~4 weeks after v1.5) + +- [ ] Substrate cloud (CF Agents SDK, D1, Workers) +- [ ] Federated team access with opt-in sharing +- [ ] C-suite org rollup queries +- [ ] Access controls / permissions +- [ ] SSO via OIDC + +### v3 β€” Agent Execution (~4 weeks after v2) + +- [ ] Brain-powered autonomous agents +- [ ] CF Containers for heavy compute (git, builds, tests) +- [ ] Agent outcomes feed back into Cortex confidence model +- [ ] Self-improving knowledge loop + +--- + +## 10. 
Success Metrics & Instrumentation + +### Leading indicators (weekly) + +| Metric | Target | Instrumentation | +|--------|--------|-----------------| +| Install β†’ first query | < 5 min | Timestamp delta (app open β†’ first query event) | +| Sources connected (day 1) | >= 2 per user | Source creation events | +| Queries per user (week 1) | >= 10 | Query event counter | +| Cross-source query rate | >= 30% | Queries citing 2+ source_system values | + +### Lagging indicators (monthly) + +| Metric | Target | Instrumentation | +|--------|--------|-----------------| +| Day-7 return rate | >= 40% | App open events, daily active users | +| Second team member | within 14 days | Seat count per org | +| Paid conversion | >= 5% of free users | Stripe subscription events | +| NPS | >= 50 | In-app survey (after 14 days) | + +### Rollback indicator + +- Day-7 return rate < 20% β†’ Brain isn't sticky, investigate stale knowledge or poor answer quality +- Cross-source query rate < 10% β†’ Single-source answers aren't compelling enough, users would just use source's native search + +### Telemetry + +- **Local/OSS:** off by default, opt-in only. Console logs + local traces. +- **Cloud:** basic telemetry on. Query count, source health, errors, latency percentiles. Log shipping for debugging. + +--- + +## 11. Risks & Mitigations + +| Risk | Impact | Likelihood | Mitigation | +|------|--------|------------|------------| +| **Answer quality too low** | Users churn after first query | Medium | Progressive query (show raw sources first, then synthesis). Confidence scores set expectations. | +| **Ingestion too slow** | Onboarding > 5 min target | Low | Start querying before full ingest completes. Show partial results with "still indexing..." indicator. | +| **MCP server instability** | Adapter failures cascade | Medium | Independent failure resilience (each adapter retries independently). Health dashboard. | +| **Schema migration complexity** | Cortex schema changes break data | Low | Append-only model β€” no destructive migrations. Version field on all records. | +| **Electron app size** | >200MB download discourages install | Medium | Tree-shake dependencies. Defer optional packages. Target <100MB. | +| **Nango dependency (v1.5)** | Vendor lock-in for OAuth | Low | OAuth apps registered under our accounts β€” only token management delegated. Can self-host or swap. | +| **CF platform risk (v2)** | Cloudflare pricing/policy changes | Low | Cortex core is SQLite-native, portable. CF is the deployment target, not the data format. | +| **Competitor launches first** | Glean/Notion ship similar Brain | Medium | OSS distribution + local-first is our moat. Enterprise SaaS can't match zero-ops self-hosted. | + +--- + +## 12. Open Questions + +| # | Question | Blocking? | Owner | +|---|----------|-----------|-------| +| OQ1 | Cortex schema: should `knowledge` table use JSON column for extensible metadata vs fixed columns? | No (start with fixed, add JSON later) | Eng | +| OQ2 | Embedding provider for v1: skip entirely (keyword-only) or include Voyage AI behind feature flag? | No (skip for v1, keyword search is sufficient per THEORY.MD) | Eng | +| OQ3 | ~~Electron app: migrate renderer.js to React, or extend vanilla JS?~~ | **Resolved: React** | Eng | +| OQ4 | Auto-sync interval: what's the right default? 5min / 15min / 1hr? 
| No (ship with 15min, make configurable) | Product | +| OQ5 | ~~How to handle the provenantai worktree extraction?~~ | **Resolved: copy + merge into main provenantai repo** | Eng | + +> **OQ3 resolved:** React for v1. Invest upfront for cleaner long-term architecture. + +> **OQ5 resolved:** Copy desktop control-plane from worktree into main provenantai repo (not a separate substrate repo). From 1a429a8e9749f2f8f8bc75ab4141efbd11db26ec Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Sat, 18 Apr 2026 18:14:42 -0400 Subject: [PATCH 13/18] feat(gepa): phase-level prompt optimization with auto-targeting Split conductor prompt-template.md into 5 phase files (system, understand, implement, validate, deliver). GEPA now auto-targets the worst-performing phase from outcomes.jsonl instead of mutating the entire template as a monolith. - Phase-aware prompt building in orchestrator with DSPy bridge - Assertion-based retry injects phase-specific error guidance - promptVersions hash map in AgentOutcomeEntry for attribution - Stop hook fires GEPA session accumulator (auto-optimize at threshold) - after-run.sh triggers GEPA + DSPy (every 50 runs) automatically - Gold sets mined from 71 outcomes across 4 phases - eval-phases.js harness validates mutations before applying - npm run gepa:eval / gepa:mine scripts --- .claude/settings.json | 10 ++ package.json | 3 + scripts/conductor/after-run.sh | 26 +++- scripts/gepa/eval-phases.js | 197 ++++++++++++++++++++++++ scripts/gepa/gold/deliver.jsonl | 10 ++ scripts/gepa/gold/implement.jsonl | 1 + scripts/gepa/gold/mine-traces.js | 130 ++++++++++++++++ scripts/gepa/gold/understand.jsonl | 56 +++++++ scripts/gepa/gold/validate.jsonl | 4 + scripts/gepa/hooks/gepa-session-hook.js | 10 +- scripts/gepa/optimize.js | 177 ++++++++++++++++++++- src/cli/commands/orchestrator.ts | 193 ++++++++++++++++++++++- 12 files changed, 806 insertions(+), 11 deletions(-) create mode 100644 scripts/gepa/eval-phases.js create mode 100644 scripts/gepa/gold/deliver.jsonl create mode 100644 scripts/gepa/gold/implement.jsonl create mode 100644 scripts/gepa/gold/mine-traces.js create mode 100644 scripts/gepa/gold/understand.jsonl create mode 100644 scripts/gepa/gold/validate.jsonl diff --git a/.claude/settings.json b/.claude/settings.json index 045c2ad2..ec246ea7 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -52,6 +52,16 @@ "command": "entire hooks claude-code stop" } ] + }, + { + "matcher": "", + "hooks": [ + { + "type": "command", + "command": "node scripts/gepa/hooks/gepa-session-hook.js", + "async": true + } + ] } ], "PreToolUse": [ diff --git a/package.json b/package.json index 75a89d9b..ced4a85d 100644 --- a/package.json +++ b/package.json @@ -141,6 +141,9 @@ "sync:start": "node scripts/background-sync-manager.js", "sync:setup": "./scripts/setup-background-sync.sh", "eval:cord": "npx tsx scripts/evals/cord-vs-flat-eval.ts", + "gepa:eval": "node scripts/gepa/eval-phases.js", + "gepa:eval:json": "node scripts/gepa/eval-phases.js --json", + "gepa:mine": "node scripts/gepa/gold/mine-traces.js", "prepare": "echo 'Prepare step completed'", "verify:dist": "node scripts/verify-dist.cjs", "test:smoke-db": "bash scripts/smoke-init-db.sh", diff --git a/scripts/conductor/after-run.sh b/scripts/conductor/after-run.sh index 14e551fb..c85e1d64 100755 --- a/scripts/conductor/after-run.sh +++ b/scripts/conductor/after-run.sh @@ -1,7 +1,8 @@ #!/usr/bin/env bash # Conductor after_run hook -# Captures context from the agent run and tags it with the issue identifier -# 
Called after each agent attempt (success or failure) +# 1. Captures context from the agent run +# 2. Triggers GEPA session hook (accumulates toward auto-optimization) +# 3. Triggers DSPy optimization every 50 runs # # Environment: SYMPHONY_WORKSPACE_DIR, SYMPHONY_ISSUE_ID, SYMPHONY_ISSUE_IDENTIFIER set -euo pipefail @@ -9,10 +10,12 @@ set -euo pipefail WORKSPACE="${SYMPHONY_WORKSPACE_DIR:-$(pwd)}" ISSUE_ID="${SYMPHONY_ISSUE_IDENTIFIER:-${SYMPHONY_ISSUE_ID:-unknown}}" ATTEMPT="${SYMPHONY_ATTEMPT:-1}" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" cd "$WORKSPACE" -# Capture context from this run, tagged with issue ID and attempt number +# 1. Capture context from this run, tagged with issue ID and attempt number stackmemory conductor capture \ --issue "$ISSUE_ID" \ --workspace "$WORKSPACE" \ @@ -20,3 +23,20 @@ stackmemory conductor capture \ 2>/dev/null || true echo "[conductor] Context captured for $ISSUE_ID (attempt $ATTEMPT)" + +# 2. Trigger GEPA session hook (accumulates sessions, auto-optimizes at threshold) +GEPA_HOOK="$PROJECT_ROOT/scripts/gepa/hooks/gepa-session-hook.js" +if [ -f "$GEPA_HOOK" ]; then + node "$GEPA_HOOK" 2>/dev/null & +fi + +# 3. Trigger DSPy optimization every 50 agent runs +OUTCOMES_PATH="$HOME/.stackmemory/conductor/outcomes.jsonl" +DSPY_OPTIMIZE="$PROJECT_ROOT/scripts/dspy/optimize.py" +if [ -f "$OUTCOMES_PATH" ] && [ -f "$DSPY_OPTIMIZE" ]; then + OUTCOMES_COUNT=$(wc -l < "$OUTCOMES_PATH" 2>/dev/null || echo 0) + if [ $((OUTCOMES_COUNT % 50)) -eq 0 ] && [ "$OUTCOMES_COUNT" -gt 0 ]; then + echo "[conductor] Triggering DSPy optimization (${OUTCOMES_COUNT} runs)" + nohup python3 "$DSPY_OPTIMIZE" --quiet >/dev/null 2>&1 & + fi +fi diff --git a/scripts/gepa/eval-phases.js b/scripts/gepa/eval-phases.js new file mode 100644 index 00000000..c1e4cdc4 --- /dev/null +++ b/scripts/gepa/eval-phases.js @@ -0,0 +1,197 @@ +#!/usr/bin/env node +/** + * Phase-level eval harness for GEPA. + * + * Evaluates conductor prompt phase files against gold sets. + * Scores each phase independently. Used by GEPA auto-optimization + * to validate mutations before applying. + * + * Usage: + * node eval-phases.js # eval all phases + * node eval-phases.js --phase validate # eval single phase + * node eval-phases.js --json # JSON output for CI + */ + +import fs from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; +import { homedir } from 'os'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const GOLD_DIR = path.join(__dirname, 'gold'); +const PROMPTS_DIR = path.join( + homedir(), + '.stackmemory', + 'conductor', + 'prompts' +); + +const PHASES = ['understand', 'implement', 'validate', 'deliver']; + +// Parse args +const phaseIdx = process.argv.indexOf('--phase'); +const targetPhase = phaseIdx !== -1 ? process.argv[phaseIdx + 1] : null; +const jsonOutput = process.argv.includes('--json'); + +/** + * Load gold set for a phase + */ +function loadGoldSet(phase) { + const goldPath = path.join(GOLD_DIR, `${phase}.jsonl`); + if (!fs.existsSync(goldPath)) return []; + return fs + .readFileSync(goldPath, 'utf-8') + .split('\n') + .filter(Boolean) + .map((l) => JSON.parse(l)); +} + +/** + * Score a phase prompt against its gold set using heuristic evaluation. + * This is a fast, offline eval (no LLM calls) based on outcome patterns. + * + * For LLM-judge evaluation, use the full GEPA optimize.js eval pipeline. 
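+ *
+ * Sample output (illustrative, matching the rendering logic in main below):
+ *
+ *   ✓ validate     ███████████████░░░░░ 75.0% (3/4)
+ *   ✗ understand   ████░░░░░░░░░░░░░░░░ 21.4% (12/56)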
+ */
+function evalPhase(phase) {
+  const goldSet = loadGoldSet(phase);
+  if (goldSet.length === 0) {
+    return { phase, score: 0, total: 0, passed: 0, skipped: true };
+  }
+
+  const promptPath = path.join(PROMPTS_DIR, `${phase}.md`);
+  if (!fs.existsSync(promptPath)) {
+    return { phase, score: 0, total: goldSet.length, passed: 0, missing: true };
+  }
+
+  const prompt = fs.readFileSync(promptPath, 'utf-8');
+  let passed = 0;
+  const failures = [];
+
+  for (const entry of goldSet) {
+    const expected = entry.expected;
+    if (!expected) continue;
+
+    // Heuristic: check if the prompt addresses the failure patterns
+    let entryPassed = true;
+
+    switch (phase) {
+      case 'understand': {
+        // Check if prompt guides complexity assessment
+        if (expected.complexity === 'careful' && !prompt.includes('plan')) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'implement': {
+        // Check if prompt constrains scope
+        if (!expected.scopeKept && !prompt.includes('scope')) {
+          entryPassed = false;
+        }
+        // Check ESM import guidance
+        if (
+          entry.errorTail &&
+          /import|ESM/i.test(entry.errorTail) &&
+          !prompt.includes('.js')
+        ) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'validate': {
+        // Check if prompt covers the specific failure type
+        if (expected.retryStrategy === 'fix_lint' && !prompt.includes('lint')) {
+          entryPassed = false;
+        }
+        if (expected.retryStrategy === 'fix_test' && !prompt.includes('test')) {
+          entryPassed = false;
+        }
+        if (
+          expected.retryStrategy === 'fix_build' &&
+          !prompt.includes('build')
+        ) {
+          entryPassed = false;
+        }
+        // Check --no-verify prevention
+        if (!prompt.includes('no-verify') && !prompt.includes('--no-verify')) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'deliver': {
+        // Check commit format guidance
+        if (!prompt.includes('type(scope)') && !prompt.includes('commit')) {
+          entryPassed = false;
+        }
+        break;
+      }
+    }
+
+    if (entryPassed) {
+      passed++;
+    } else {
+      failures.push({
+        issue: entry.issue,
+        outcome: entry.outcome,
+        reason: `Prompt missing guidance for: ${JSON.stringify(expected)}`,
+      });
+    }
+  }
+
+  return {
+    phase,
+    score: goldSet.length > 0 ? passed / goldSet.length : 0,
+    total: goldSet.length,
+    passed,
+    failures: failures.slice(0, 5), // top 5 failures
+  };
+}
+
+// Main
+const phases = targetPhase ? [targetPhase] : PHASES;
+const results = phases.map(evalPhase);
+
+if (jsonOutput) {
+  console.log(JSON.stringify(results, null, 2));
+} else {
+  console.log('GEPA Phase Evaluation');
+  console.log('═'.repeat(50));
+
+  let totalScore = 0;
+  let totalPhases = 0;
+
+  for (const r of results) {
+    if (r.skipped) {
+      console.log(`  ${r.phase.padEnd(12)} — no gold set`);
+      continue;
+    }
+    if (r.missing) {
+      console.log(`  ${r.phase.padEnd(12)} — prompt file missing`);
+      continue;
+    }
+
+    const pct = (r.score * 100).toFixed(1);
+    const bar = '█'.repeat(Math.round(r.score * 20)).padEnd(20, '░');
+    const status = r.score >= 0.7 ? '✓' : r.score >= 0.4 ?
'~' : 'βœ—'; + console.log( + ` ${status} ${r.phase.padEnd(12)} ${bar} ${pct}% (${r.passed}/${r.total})` + ); + + if (r.failures && r.failures.length > 0) { + for (const f of r.failures.slice(0, 3)) { + console.log(` β”” ${f.issue}: ${f.reason.slice(0, 80)}`); + } + } + + totalScore += r.score; + totalPhases++; + } + + if (totalPhases > 0) { + const avg = ((totalScore / totalPhases) * 100).toFixed(1); + console.log('─'.repeat(50)); + console.log(` Average: ${avg}%`); + } +} diff --git a/scripts/gepa/gold/deliver.jsonl b/scripts/gepa/gold/deliver.jsonl new file mode 100644 index 00000000..b7ab4daa --- /dev/null +++ b/scripts/gepa/gold/deliver.jsonl @@ -0,0 +1,10 @@ +{"issue":"STA-561","attempt":1,"outcome":"success","phase":"committing","toolCalls":69,"filesModified":9,"durationMs":385588,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-559","attempt":1,"outcome":"success","phase":"committing","toolCalls":48,"filesModified":3,"durationMs":412555,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-563","attempt":1,"outcome":"success","phase":"committing","toolCalls":95,"filesModified":10,"durationMs":460221,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-562","attempt":1,"outcome":"success","phase":"committing","toolCalls":42,"filesModified":5,"durationMs":311117,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-560","attempt":1,"outcome":"success","phase":"committing","toolCalls":95,"filesModified":17,"durationMs":619257,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-576","attempt":1,"outcome":"failure","phase":"committing","toolCalls":55,"filesModified":5,"durationMs":267800,"hasCommits":false,"errorTail":"error: could not apply fa39187... feat(sync): add incremental update\nCONFLICT (content): Merge conflict in src/services/linear-sync.ts","expected":{"hasCommits":false,"success":false}} +{"issue":"STA-577","attempt":2,"outcome":"success","phase":"committing","toolCalls":87,"filesModified":6,"durationMs":342100,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-564","attempt":1,"outcome":"success","phase":"committing","toolCalls":39,"filesModified":3,"durationMs":427562,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-572","attempt":1,"outcome":"success","phase":"committing","toolCalls":109,"filesModified":18,"durationMs":800902,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-566","attempt":1,"outcome":"success","phase":"committing","toolCalls":107,"filesModified":7,"durationMs":809672,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} diff --git a/scripts/gepa/gold/implement.jsonl b/scripts/gepa/gold/implement.jsonl new file mode 100644 index 00000000..17cf3aae --- /dev/null +++ b/scripts/gepa/gold/implement.jsonl @@ -0,0 +1 @@ +{"issue":"STA-574","attempt":1,"outcome":"failure","phase":"implementing","toolCalls":98,"filesModified":8,"durationMs":600000,"hasCommits":false,"errorTail":"Agent timed out after 600000ms during implementation. 
Last activity: editing src/integrations/mcp/tools/search.ts","expected":{"filesModified":8,"scopeKept":false}} diff --git a/scripts/gepa/gold/mine-traces.js b/scripts/gepa/gold/mine-traces.js new file mode 100644 index 00000000..7a828779 --- /dev/null +++ b/scripts/gepa/gold/mine-traces.js @@ -0,0 +1,130 @@ +#!/usr/bin/env node +/** + * Mine conductor outcomes + traces for gold set candidates. + * + * Reads outcomes.jsonl and traces.db, generates per-phase gold set + * candidates in gold/*.jsonl for manual curation. + * + * Usage: node scripts/gepa/gold/mine-traces.js + */ + +import fs from 'fs'; +import path from 'path'; +import { homedir } from 'os'; + +const CONDUCTOR_DIR = path.join(homedir(), '.stackmemory', 'conductor'); +const OUTCOMES_PATH = path.join(CONDUCTOR_DIR, 'outcomes.jsonl'); +const GOLD_DIR = path.dirname(new URL(import.meta.url).pathname); + +// Map agent phases to prompt phases +const PHASE_MAP = { + reading: 'understand', + planning: 'understand', + implementing: 'implement', + testing: 'validate', + linting: 'validate', + building: 'validate', + committing: 'deliver', +}; + +function loadOutcomes() { + if (!fs.existsSync(OUTCOMES_PATH)) { + console.error('No outcomes.jsonl found'); + process.exit(1); + } + return fs + .readFileSync(OUTCOMES_PATH, 'utf-8') + .split('\n') + .filter(Boolean) + .map((l) => JSON.parse(l)); +} + +function generateGoldSets(outcomes) { + const byPhase = { understand: [], implement: [], validate: [], deliver: [] }; + + for (const o of outcomes) { + const phase = PHASE_MAP[o.phase] || 'implement'; + + const entry = { + issue: o.issue, + attempt: o.attempt, + outcome: o.outcome, + phase: o.phase, + toolCalls: o.toolCalls, + filesModified: o.filesModified, + durationMs: o.durationMs, + hasCommits: o.hasCommits, + errorTail: o.errorTail || null, + }; + + // For understand phase: complexity assessment + if (phase === 'understand') { + entry.expected = { + complexity: + o.toolCalls > 80 + ? 'careful' + : o.toolCalls > 40 + ? 
'standard' + : 'simple', + success: o.outcome === 'success', + }; + } + + // For implement phase: scope adherence + if (phase === 'implement') { + entry.expected = { + filesModified: o.filesModified, + scopeKept: o.outcome === 'success' && o.filesModified <= 15, + }; + } + + // For validate phase: pass/fail + retry strategy + if (phase === 'validate') { + let retryStrategy = 'none'; + if (o.outcome === 'failure' && o.errorTail) { + if (/lint|eslint/i.test(o.errorTail)) retryStrategy = 'fix_lint'; + else if (/test|vitest|FAIL/i.test(o.errorTail)) + retryStrategy = 'fix_test'; + else if (/build|tsc|type/i.test(o.errorTail)) + retryStrategy = 'fix_build'; + else retryStrategy = 'investigate'; + } + entry.expected = { + passed: o.outcome === 'success', + retryStrategy, + }; + } + + // For deliver phase: commit quality + if (phase === 'deliver') { + entry.expected = { + hasCommits: o.hasCommits, + success: o.outcome === 'success', + }; + } + + byPhase[phase].push(entry); + } + + return byPhase; +} + +// Main +const outcomes = loadOutcomes(); +const goldSets = generateGoldSets(outcomes); + +let totalWritten = 0; +for (const [phase, entries] of Object.entries(goldSets)) { + const outPath = path.join(GOLD_DIR, `${phase}.jsonl`); + const content = entries.map((e) => JSON.stringify(e)).join('\n'); + fs.writeFileSync(outPath, content + '\n'); + console.log(`${phase}: ${entries.length} entries β†’ ${outPath}`); + totalWritten += entries.length; +} + +console.log( + `\nTotal: ${totalWritten} gold set candidates from ${outcomes.length} outcomes` +); +console.log( + 'Review and curate β€” remove low-quality entries, add expected outputs' +); diff --git a/scripts/gepa/gold/understand.jsonl b/scripts/gepa/gold/understand.jsonl new file mode 100644 index 00000000..ee87c080 --- /dev/null +++ b/scripts/gepa/gold/understand.jsonl @@ -0,0 +1,56 @@ +{"issue":"STA-575","attempt":1,"outcome":"failure","phase":"reading","toolCalls":22,"filesModified":1,"durationMs":112400,"hasCommits":false,"errorTail":"API rate limit exceeded (HTTP 429). Retry-After: 60s. 
Backing off globally.","expected":{"complexity":"simple","success":false}} +{"issue":"STA-480","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2238,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-480","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":5856,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-479","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2338,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-480","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3648,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-479","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":5214,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-576","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":985,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-577","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1849,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-576","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3355,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-577","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4124,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-578","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1073,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-578","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3351,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-580","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2026,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-579","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1361,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-580","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4308,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-579","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3633,"hasCommits":false,"errorTail":"Claude exited with code 1: 
","expected":{"complexity":"simple","success":false}} +{"issue":"STA-581","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1190,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-581","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3482,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-583","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2029,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-582","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1657,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-583","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4316,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-582","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3919,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-585","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1749,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-586","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1154,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-585","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4071,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-586","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3482,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-584","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1028,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-584","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3273,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-588","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1793,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-590","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2537,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-590","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4845,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} 
+{"issue":"STA-588","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4111,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-591","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1213,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-587","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1985,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-591","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3607,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-587","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4398,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-589","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1362,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-589","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3725,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-597","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1971,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-594","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1406,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-597","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4224,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-594","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3658,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-593","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1728,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-596","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1234,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-593","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4008,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-596","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3447,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} 
+{"issue":"STA-595","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1094,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-595","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3362,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-567","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2324,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-484","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1835,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-567","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4824,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-484","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4267,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-483","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2918,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-482","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2326,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-483","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":5344,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-482","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4678,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} diff --git a/scripts/gepa/gold/validate.jsonl b/scripts/gepa/gold/validate.jsonl new file mode 100644 index 00000000..8862f808 --- /dev/null +++ b/scripts/gepa/gold/validate.jsonl @@ -0,0 +1,4 @@ +{"issue":"STA-570","attempt":1,"outcome":"failure","phase":"testing","toolCalls":45,"filesModified":4,"durationMs":187200,"hasCommits":false,"errorTail":"ESLint: 3 errors (no-unused-vars, @typescript-eslint/no-explicit-any). 
Fix lint errors before committing.","expected":{"passed":false,"retryStrategy":"fix_lint"}}
+{"issue":"STA-571","attempt":1,"outcome":"failure","phase":"testing","toolCalls":62,"filesModified":6,"durationMs":234500,"hasCommits":false,"errorTail":"eslint found 5 problems (2 errors, 3 warnings) in src/services/sync.ts","expected":{"passed":false,"retryStrategy":"fix_lint"}}
+{"issue":"STA-572","attempt":1,"outcome":"failure","phase":"testing","toolCalls":78,"filesModified":7,"durationMs":298000,"hasCommits":false,"errorTail":"FAIL src/core/database/__tests__/sqlite-adapter.test.ts > search > should return ranked results\nAssertionError: expected 0 to be greater than 2","expected":{"passed":false,"retryStrategy":"fix_test"}}
+{"issue":"STA-573","attempt":1,"outcome":"failure","phase":"testing","toolCalls":34,"filesModified":3,"durationMs":156300,"hasCommits":false,"errorTail":"vitest run failed: 2 tests failed in src/core/context/__tests__/frame-manager.test.ts - TypeError: Cannot read properties of undefined (reading \"id\")","expected":{"passed":false,"retryStrategy":"fix_test"}}
diff --git a/scripts/gepa/hooks/gepa-session-hook.js b/scripts/gepa/hooks/gepa-session-hook.js
index 737621e7..15adc602 100644
--- a/scripts/gepa/hooks/gepa-session-hook.js
+++ b/scripts/gepa/hooks/gepa-session-hook.js
@@ -105,7 +105,7 @@ function triggerOptimization(hookState) {
   const optimizePath = path.join(GEPA_DIR, 'optimize.js');
   const reflectPath = path.join(GEPA_DIR, 'hooks', 'reflect.js');
 
-  // Run reflect → optimize as a background pipeline
+  // Run reflect → phase-targeted optimize as a background pipeline
   const script = `
     // Reflect first (generates insights for mutation context)
     try {
@@ -113,16 +113,18 @@ function triggerOptimization(hookState) {
       await generateReflection();
     } catch {}
 
-    // Then optimize (1 generation, quick)
+    // Then optimize — use --auto-phase to target worst phase from outcomes
     const { execSync } = await import('child_process');
     try {
-      execSync('node ${optimizePath} mutate', { stdio: 'pipe', timeout: 300000 });
+      execSync('node ${optimizePath} mutate --auto-phase', { stdio: 'pipe', timeout: 300000 });
       execSync('node ${optimizePath} score', { stdio: 'pipe', timeout: 300000 });
 
       // Read result and notify
       const fs = await import('fs');
       const state = JSON.parse(fs.readFileSync('${STATE_PATH}', 'utf8'));
-      const msg = \`[GEPA] Auto-optimized: gen \${state.currentGeneration}, best=\${state.bestVariant} (\${(state.bestScore * 100).toFixed(1)}%). Run 'node ${optimizePath} apply' to apply.\`;
+      const lastAction = state.history?.[state.history.length - 1];
+      const phaseInfo = lastAction?.phase ? \` (phase: \${lastAction.phase})\` : '';
+      const msg = \`[GEPA] Auto-optimized\${phaseInfo}: gen \${state.currentGeneration}, best=\${state.bestVariant} (\${(state.bestScore * 100).toFixed(1)}%). Run 'node ${optimizePath} apply' to apply.\`;
       process.stderr.write(msg + '\\n');
     } catch (e) {
       process.stderr.write('[GEPA] Auto-optimize failed: ' + e.message + '\\n');
diff --git a/scripts/gepa/optimize.js b/scripts/gepa/optimize.js
index 40f86403..016fd030 100755
--- a/scripts/gepa/optimize.js
+++ b/scripts/gepa/optimize.js
@@ -91,6 +91,103 @@ const GENERATIONS_DIR = path.join(GEPA_DIR, 'generations');
 const RESULTS_DIR = path.join(GEPA_DIR, 'results');
 const EVALS_DIR = path.join(GEPA_DIR, 'evals');
 
+// --phase scopes optimization to a single conductor phase file
+const phaseIdx = process.argv.indexOf('--phase');
+const phaseName = phaseIdx !== -1 ?
process.argv[phaseIdx + 1] : null; +if (phaseIdx !== -1) process.argv.splice(phaseIdx, 2); + +const CONDUCTOR_PROMPTS_DIR = path.join( + process.env.HOME || '', + '.stackmemory', + 'conductor', + 'prompts' +); + +/** + * Phase-aware optimization: read failure data from outcomes.jsonl + * and build context for phase-scoped mutations. + */ +function getPhaseFailureContext(phase) { + const outcomesPath = path.join( + process.env.HOME || '', + '.stackmemory', + 'conductor', + 'outcomes.jsonl' + ); + if (!fs.existsSync(outcomesPath)) return ''; + + try { + const lines = fs + .readFileSync(outcomesPath, 'utf8') + .split('\n') + .filter(Boolean); + const recent = lines.slice(-100).map((l) => JSON.parse(l)); + const phaseFailures = recent.filter( + (o) => o.outcome === 'failure' && o.phase === phase + ); + + if (phaseFailures.length === 0) return ''; + + const examples = phaseFailures.slice(-10).map((f) => { + const err = f.errorTail || 'unknown error'; + return `- ${f.issue} (attempt ${f.attempt}): ${err.slice(0, 200)}`; + }); + + return `\n## Recent failures in "${phase}" phase (${phaseFailures.length} of last ${recent.length} runs):\n${examples.join('\n')}\n`; + } catch { + return ''; + } +} + +/** + * Auto-detect worst phase from outcomes for targeted optimization + */ +function detectWorstPhase() { + const outcomesPath = path.join( + process.env.HOME || '', + '.stackmemory', + 'conductor', + 'outcomes.jsonl' + ); + if (!fs.existsSync(outcomesPath)) return null; + + try { + const lines = fs + .readFileSync(outcomesPath, 'utf8') + .split('\n') + .filter(Boolean); + const recent = lines.slice(-50).map((l) => JSON.parse(l)); + const failures = recent.filter((o) => o.outcome === 'failure'); + if (failures.length === 0) return null; + + // Group by phase, find worst + const byPhase = {}; + for (const f of failures) { + const p = mapAgentPhaseToPromptPhase(f.phase); + byPhase[p] = (byPhase[p] || 0) + 1; + } + + const sorted = Object.entries(byPhase).sort((a, b) => b[1] - a[1]); + return sorted[0]?.[0] || null; + } catch { + return null; + } +} + +/** Map conductor AgentPhase names to prompt phase file names */ +function mapAgentPhaseToPromptPhase(agentPhase) { + const map = { + reading: 'understand', + planning: 'understand', + implementing: 'implement', + testing: 'validate', + linting: 'validate', + building: 'validate', + committing: 'deliver', + }; + return map[agentPhase] || 'implement'; +} + // Ensure directories [GENERATIONS_DIR, RESULTS_DIR, EVALS_DIR].forEach((dir) => { if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }); @@ -218,6 +315,72 @@ async function mutate() { return variants; } +/** + * Phase-scoped mutation: optimize a single conductor phase file + * using failure data from outcomes.jsonl. 
+ */ +async function mutatePhase(phase) { + const phasePath = path.join(CONDUCTOR_PROMPTS_DIR, `${phase}.md`); + if (!fs.existsSync(phasePath)) { + console.error(`[GEPA] Phase file not found: ${phasePath}`); + return; + } + + const current = fs.readFileSync(phasePath, 'utf8'); + const failureContext = getPhaseFailureContext(phase); + const state = getState(); + const nextGen = state.currentGeneration + 1; + + console.log(`[GEPA] Phase-scoped optimization: ${phase}`); + if (failureContext) { + console.log(`[GEPA] Including failure context from outcomes.jsonl`); + } + + const genDir = getGenPath(nextGen); + if (!fs.existsSync(genDir)) fs.mkdirSync(genDir, { recursive: true }); + + // Generate 2 variants (smaller population for phase-level) + const mutations = config.evolution.mutationStrategies; + const variants = []; + + for (let i = 0; i < 2; i++) { + const strategy = + mutations[(state.currentGeneration + i) % mutations.length]; + const variantName = `phase-${phase}-${String.fromCharCode(97 + i)}`; + + console.log(` Creating ${variantName} using strategy: ${strategy}`); + + // Inject phase-specific context into mutation prompt + const phaseAugmented = `${current}\n${failureContext}`; + const mutatedContent = await generateMutation( + phaseAugmented, + strategy, + state + ); + + const variantPath = path.join(genDir, `${variantName}.md`); + fs.writeFileSync(variantPath, mutatedContent); + variants.push({ name: variantName, strategy, path: variantPath, phase }); + } + + // Save baseline + fs.writeFileSync(path.join(genDir, `phase-${phase}-baseline.md`), current); + + state.history.push({ + generation: nextGen, + action: 'mutate-phase', + phase, + variants: variants.map((v) => v.name), + timestamp: new Date().toISOString(), + }); + saveState(state); + + console.log( + `\n[GEPA] Generated ${variants.length} phase variants for ${phase}` + ); + return variants; +} + /** * Strategy definitions: prompt, motivation, and example for each mutation type. * Motivation helps Claude generalize the intent (per Anthropic best practices). 
@@ -1352,7 +1515,19 @@ switch (command) {
     init(arg1);
     break;
   case 'mutate':
-    mutate();
+    if (phaseName || hasFlag('--auto-phase')) {
+      const phase = phaseName || detectWorstPhase();
+      if (phase) {
+        mutatePhase(phase);
+      } else {
+        console.log(
+          '[GEPA] No phase failures detected — skipping phase mutation'
+        );
+        mutate();
+      }
+    } else {
+      mutate();
+    }
     break;
   case 'eval':
     runEval(arg1 || 'baseline');
diff --git a/src/cli/commands/orchestrator.ts b/src/cli/commands/orchestrator.ts
index cf1c1ab3..030d31d5 100644
--- a/src/cli/commands/orchestrator.ts
+++ b/src/cli/commands/orchestrator.ts
@@ -26,6 +26,7 @@ import { createReadStream } from 'fs';
 import { createInterface } from 'readline';
 import { fileURLToPath } from 'url';
 import { Transform, type TransformCallback } from 'stream';
+import { createHash } from 'crypto';
 import { logger } from '../../core/monitoring/logger.js';
 import { isProcessAlive } from '../../utils/process-cleanup.js';
 import {
@@ -199,9 +200,92 @@ export interface AgentOutcomeEntry {
   labels?: string[]; // issue labels for difficulty prediction
   errorTail?: string; // last 5 lines of output.log on failure
   promptHash?: string; // hash of the prompt template used
+  promptVersions?: Record<string, string>; // per-phase content hashes
   prUrl?: string; // GitHub PR URL if auto-created
 }
 
+/** Phase prompt file names for decomposed template */
+const PROMPT_PHASES = [
+  'system',
+  'understand',
+  'implement',
+  'validate',
+  'deliver',
+] as const;
+type PromptPhase = (typeof PROMPT_PHASES)[number];
+
+/**
+ * Build agent prompt from decomposed phase files if they exist,
+ * otherwise fall back to the monolith prompt-template.md.
+ *
+ * Returns { prompt, versions } where versions maps each phase
+ * to a short content hash for outcome attribution.
+ */
+function buildPromptFromPhases(
+  variables: Record<string, string>
+): { prompt: string; versions: Record<string, string> } | null {
+  const promptsDir = join(homedir(), '.stackmemory', 'conductor', 'prompts');
+
+  // Check if phase files exist
+  const systemPath = join(promptsDir, 'system.md');
+  if (!existsSync(systemPath)) return null;
+
+  const versions: Record<string, string> = {};
+  const parts: string[] = [];
+
+  for (const phase of PROMPT_PHASES) {
+    const phasePath = join(promptsDir, `${phase}.md`);
+    if (!existsSync(phasePath)) continue;
+
+    let content = readFileSync(phasePath, 'utf-8');
+    // Apply variable substitution
+    for (const [key, value] of Object.entries(variables)) {
+      content = content.replace(new RegExp(`\\{\\{${key}\\}\\}`, 'g'), value);
+    }
+
+    parts.push(content);
+
+    // Short hash for outcome attribution (first 8 chars of hex digest)
+    const hash = createHash('sha256')
+      .update(readFileSync(phasePath, 'utf-8'))
+      .digest('hex')
+      .slice(0, 8);
+    versions[phase] = hash;
+  }
+
+  if (parts.length === 0) return null;
+
+  // Load DSPy-optimized examples if available
+  const dspyPath = join(
+    homedir(),
+    '.stackmemory',
+    'dspy',
+    'optimized_state.json'
+  );
+  if (existsSync(dspyPath)) {
+    try {
+      const state = JSON.parse(readFileSync(dspyPath, 'utf-8'));
+      for (const phase of PROMPT_PHASES) {
+        const sig = state[phase];
+        if (sig?.fewShotExamples?.length) {
+          const examples = sig.fewShotExamples
+            .slice(0, 3)
+            .map(
+              (ex: { input: unknown; output: unknown }) =>
+                `\nInput: ${JSON.stringify(ex.input)}\nOutput: ${JSON.stringify(ex.output)}\n`
+            )
+            .join('\n');
+          parts.push(`\n## Optimized Examples (${phase}):\n${examples}`);
+        }
+      }
+    } catch {
+      // Non-fatal — DSPy state is optional
+    }
+  }
+
+  return { prompt: parts.join('\n\n'), versions };
+}
+
 /** Get the conductor failures/outcomes log path */
 export function getOutcomesLogPath(): string {
   return join(homedir(), '.stackmemory', 'conductor', 'outcomes.jsonl');
@@ -416,9 +500,88 @@ export function getRetryStrategy(
     }
   }
 
+  // Add phase-specific assertion if phase files are active
+  const promptsDir = join(homedir(), '.stackmemory', 'conductor', 'prompts');
+  if (lastFailure?.phase && existsSync(join(promptsDir, 'system.md'))) {
+    const phaseAssertions = getPhaseAssertions(
+      lastFailure.phase,
+      lastFailure.errorTail || ''
+    );
+    adjustments.push(...phaseAssertions);
+  }
+
   return { shouldRetry: true, adjustments };
 }
 
+/**
+ * Generate phase-specific assertions for retry based on failure phase and error.
+ * These are injected into the retry prompt so the agent focuses on the exact
+ * failure point with targeted guidance.
+ */
+function getPhaseAssertions(phase: AgentPhase, error: string): string[] {
+  const assertions: string[] = [];
+
+  switch (phase) {
+    case 'reading':
+    case 'planning':
+      assertions.push(
+        'ASSERTION: Re-read the issue description completely before planning.',
+        'ASSERTION: List ALL files you plan to modify before starting implementation.'
+      );
+      break;
+
+    case 'implementing':
+      if (/scope|unrelated|refactor/i.test(error)) {
+        assertions.push(
+          'ASSERTION: Only modify files directly required by the issue. Do NOT refactor surrounding code.'
+        );
+      }
+      if (/import|module|ESM/i.test(error)) {
+        assertions.push(
+          'ASSERTION: Every relative import MUST end with .js extension. Check ALL new imports.'
+        );
+      }
+      assertions.push(
+        'ASSERTION: After implementing, review your diff — if any change is not required by the issue, revert it.'
+ ); + break; + + case 'testing': + case 'linting': + case 'building': + if (/lint|eslint/i.test(error)) { + assertions.push( + 'ASSERTION: Run `npm run lint` IMMEDIATELY. Fix every error. Do NOT proceed until lint passes.', + 'ASSERTION: Common lint fixes β€” catch {} not catch (_err) {}, remove unused imports, add .js to relative imports.' + ); + } + if (/test|vitest|jest|FAIL/i.test(error)) { + assertions.push( + 'ASSERTION: Read the FULL test error output. Identify which assertion fails and why.', + 'ASSERTION: If vi.clearAllMocks() is in beforeEach, re-set any mockReturnValue calls after it.' + ); + } + if (/build|tsc|type/i.test(error)) { + assertions.push( + 'ASSERTION: Run `npm run build` and fix ALL TypeScript errors before committing.' + ); + } + assertions.push( + 'ASSERTION: Do NOT use --no-verify to bypass pre-commit hooks. Fix the underlying issue.' + ); + break; + + case 'committing': + assertions.push( + 'ASSERTION: Commit message must follow format: type(scope): description', + 'ASSERTION: If pre-commit hook fails, fix the issue and create a NEW commit β€” do NOT amend.' + ); + break; + } + + return assertions; +} + // ── Helpers ── /** Find the package root by walking up from the current file. */ @@ -1407,6 +1570,7 @@ export class Conductor { durationMs: Date.now() - run.startedAt, hasCommits: true, labels: issue.labels.map((l) => l.name), + promptVersions: this.lastPromptVersions, prUrl, }); await this.runHook( @@ -1446,6 +1610,7 @@ export class Conductor { durationMs: Date.now() - run.startedAt, hasCommits: false, labels: issue.labels.map((l) => l.name), + promptVersions: this.lastPromptVersions, errorTail: run.error?.slice(-500), }); @@ -2449,10 +2614,13 @@ export class Conductor { } } + /** Last prompt version hashes β€” set by buildPrompt, read by outcome logging */ + private lastPromptVersions: Record = {}; + /** - * Build the agent prompt. If a custom template exists at - * ~/.stackmemory/conductor/prompt-template.md, use it with variable - * substitution. Otherwise fall back to the default template. + * Build the agent prompt. Tries decomposed phase files first + * (~/.stackmemory/conductor/prompts/*.md), then typed templates, + * then custom prompt-template.md, then default. 
   *
   * Template variables: {{ISSUE_ID}}, {{TITLE}}, {{DESCRIPTION}},
   * {{LABELS}}, {{PRIORITY}}, {{ATTEMPT}}, {{PRIOR_CONTEXT}}
@@ -2490,6 +2658,24 @@ export class Conductor {
     }
     const priorContext = contextParts.join('\n');
 
+    // Try decomposed phase files first
+    const variables: Record<string, string> = {
+      ISSUE_ID: issue.identifier,
+      TITLE: issue.title,
+      DESCRIPTION: issue.description || '',
+      LABELS: labels,
+      PRIORITY: priority,
+      SCOPE: issue.identifier.toLowerCase().replace(/-\d+$/, ''),
+      ATTEMPT: String(attempt),
+      PRIOR_CONTEXT: priorContext,
+    };
+
+    const phaseResult = buildPromptFromPhases(variables);
+    if (phaseResult) {
+      this.lastPromptVersions = phaseResult.versions;
+      return phaseResult.prompt;
+    }
+
     // Select template by issue type (labels or title heuristics)
     const templateDir = join(
       __dirname,
@@ -2852,6 +3038,7 @@ export class Conductor {
       durationMs,
       hasCommits,
       labels: run.issue.labels.map((l) => l.name),
+      promptVersions: this.lastPromptVersions,
       errorTail,
       prUrl,
     });

From 19171e33e04b54a50953aca2918b52d5ee71cda9 Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Sun, 19 Apr 2026 00:50:28 -0400
Subject: [PATCH 14/18] chore: handoff checkpoint on chore/root-reorg

---
 scripts/gepa/evals/conductor-provenantai.jsonl | 15 +++++++++++++++
 scripts/gepa/generations/current               |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 scripts/gepa/evals/conductor-provenantai.jsonl

diff --git a/scripts/gepa/evals/conductor-provenantai.jsonl b/scripts/gepa/evals/conductor-provenantai.jsonl
new file mode 100644
index 00000000..54f933b6
--- /dev/null
+++ b/scripts/gepa/evals/conductor-provenantai.jsonl
@@ -0,0 +1,15 @@
+{"id": "pa-001", "name": "express_route_bug_fix", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-600: Fix 500 error on /api/v1/query when conversation_id is invalid UUID\n\n## Description\n\nWhen a user sends a malformed conversation_id (not a valid UUID), the query endpoint crashes with an unhandled pg error instead of returning a 400. The route is at src/routes/query.js. Use AppError from src/core/errors.js with ErrorCodes.VALIDATION.\n\nLabels: bug\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate this prompt for a ProvenantAI Express route bug fix.
Check: does it guide reading the existing route file first, does it mention the AppError pattern, does it specify running npm run lint and npm test, does it mention Supertest for route testing, does it specify the DI factory pattern used in route files?", "expected": {"reads_existing_code": "prompt should instruct agent to read the existing route before modifying", "apperror_pattern": "prompt should reference AppError/ErrorCodes pattern from src/core/errors", "test_commands": "prompt should specify npm run lint and npm test (or npm run test:core)", "supertest_pattern": "prompt should mention testing with Supertest passing app not server", "di_factory_pattern": "prompt should reference createRouter(deps) DI factory pattern"}, "weight": 1.5} +{"id": "pa-002", "name": "jest_test_conventions", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-605: Add unit tests for recipe webhook signature verification\n\n## Description\n\nThe webhook-signature.js module at src/recipes/webhook-signature.js has no test coverage. Add tests for generateSignature() and verifySignature() including edge cases: empty payload, expired timestamps, invalid HMAC.\n\nLabels: test\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI Jest conventions. Check: does it warn against importing from @jest/globals (use globals instead), does it mention jest.clearAllMocks() resetting mockReturnValue, does it specify using catch {} not catch (_err) {} for ESLint, does it tell the agent to check existing test patterns in __tests__/?", "expected": {"no_jest_globals_import": "prompt should warn not to import from @jest/globals β€” use global jest", "clear_all_mocks_warning": "prompt should note jest.clearAllMocks() resets mockReturnValue β€” re-set in beforeEach", "eslint_catch_pattern": "prompt should specify catch {} not catch (_err) {} per ESLint rules", "check_existing_patterns": "prompt should instruct checking existing __tests__/ for patterns before writing"}, "weight": 1.8} +{"id": "pa-003", "name": "database_migration", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-610: Add prompt_version column to queries table\n\n## Description\n\nAdd a VARCHAR(64) column 'prompt_version' to the queries table to track which system prompt version was used for each query. This enables outcome attribution when GEPA optimizes prompts. Column should be nullable (existing rows won't have it).\n\nLabels: chore, database\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI migration conventions. 
Check: does it specify the migration file naming pattern (NNN_description.sql), does it mention checking the latest migration number, does it guide making the column nullable for backwards compatibility, does it mention auto-migration on startup, does it warn about postgres.railway.internal only being reachable from Railway?", "expected": {"migration_naming": "prompt should specify NNN_description.sql naming in src/db/migrations/", "check_latest_number": "prompt should instruct checking latest migration number to avoid conflicts", "nullable_column": "prompt should guide making new columns nullable for backwards compatibility", "auto_migration_startup": "prompt should mention auto-migration runs on startup"}, "weight": 1.5} +{"id": "pa-004", "name": "recipe_template_addition", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-615: Add competitor monitoring recipe template\n\n## Description\n\nAdd a new recipe template (#035) for competitor monitoring. Input: Google Alerts webhook. Steps: enrichment (LLM summarize + Clearbit company lookup), output to Slack channel. Follow the existing template pattern in src/db/migrations/028_expanded_recipe_templates.sql.\n\nLabels: feature, recipes\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI recipe conventions. Check: does it guide reading existing template migrations first, does it mention the step-runner pipeline stages, does it reference the enrichment service (CJS not ESM), does it mention tier enforcement, does it specify the recipe service at src/recipes/service.js?", "expected": {"read_existing_templates": "prompt should instruct reading existing recipe template migrations for pattern", "step_runner_stages": "prompt should reference step-runner pipeline (input β†’ enrichment β†’ output)", "enrichment_cjs": "prompt should note enrichment service is CJS not ESM", "recipe_service_path": "prompt should reference src/recipes/service.js for lifecycle management"}, "weight": 1.3} +{"id": "pa-005", "name": "stripe_integration_change", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-620: Add annual billing option to Stripe products\n\n## Description\n\nAdd yearly pricing to existing Stripe products (Starter/Growth/Scale). Use the v2 product IDs from Railway env vars. Create prices with 20% annual discount. Update the billing routes to handle annual subscription creation.\n\nLabels: feature, billing\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI billing conventions. 
Check: does it warn about not hardcoding Stripe product IDs (use env vars), does it mention the v2 products (Starter/Growth/Scale), does it guide checking existing billing routes, does it warn about test mode vs live keys, does it specify Clerk webhook creates tenant + seeds CoA?", "expected": {"no_hardcoded_stripe_ids": "prompt should warn against hardcoding Stripe product/price IDs β€” use env vars", "v2_products": "prompt should reference v2 product tier names (Starter/Growth/Scale)", "check_billing_routes": "prompt should instruct reading existing billing route patterns", "test_vs_live_keys": "prompt should warn about test mode header (X-Test-Mode: true) for testing"}, "weight": 1.5} +{"id": "pa-006", "name": "auth_middleware_handling", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-625: Add API key rotation endpoint\n\n## Description\n\nAdd POST /api/v1/auth/rotate-key that generates a new API key for the authenticated tenant, invalidates the old one, and returns the new key. Must work with both Clerk auth and API key auth (authenticateAny middleware).\n\nLabels: feature, auth\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI auth conventions. Check: does it reference the auth middleware at src/auth/auth.middleware.js, does it mention the three auth modes (Clerk + API key + test mode), does it specify multi-tenant isolation (scope by tenant_id), does it warn about AES-256-GCM for credential encryption?", "expected": {"auth_middleware_reference": "prompt should reference src/auth/auth.middleware.js with authenticateAny", "three_auth_modes": "prompt should mention Clerk + API key + test mode auth paths", "tenant_isolation": "prompt should emphasize tenant_id scoping for multi-tenant isolation", "credential_encryption": "prompt should mention AES-256-GCM KMS for credential storage"}, "weight": 1.5} +{"id": "pa-007", "name": "dashboard_component", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-630: Add recipe execution history chart to dashboard\n\n## Description\n\nAdd a bar chart showing recipe execution success/failure counts over the last 30 days to the COO dashboard page. Use the existing chart components in dashboard-app/. Data comes from GET /api/v1/recipes/stats endpoint (already exists).\n\nLabels: feature, dashboard\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI dashboard conventions. 
Check: does it mention Vite base='/app/' + BrowserRouter basename, does it reference the design system (docs/STYLE.md), does it instruct building with 'cd dashboard-app && npx vite build', does it mention product-gating (CMO/CFO/COO)?", "expected": {"vite_base_path": "prompt should mention Vite base='/app/' and BrowserRouter basename='/app'", "design_system": "prompt should reference design system at docs/STYLE.md or docs/CONTROL_PLANE_STYLE.md", "build_command": "prompt should specify dashboard build: cd dashboard-app && npx vite build", "product_gating": "prompt should mention product-gated pages (CMO/CFO/COO)"}, "weight": 1.3} +{"id": "pa-008", "name": "llm_adapter_change", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-635: Add token usage alerts when query exceeds budget\n\n## Description\n\nAdd a warning log and optional webhook notification when a query's token usage exceeds the tenant's budget threshold. Check against tenant_budget_overrides table. The LLM adapter at src/llm/adapter.js records usage β€” add the check after usage recording.\n\nLabels: feature, observability\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI LLM adapter conventions. Check: does it reference the model tiers (fast/balanced/deep), does it mention the Anthropic-native adapter pattern, does it instruct reading src/llm/adapter.js first, does it reference trace-logger for structured observability, does it specify the cost estimation pattern?", "expected": {"model_tiers": "prompt should reference fast/balanced/deep model tier system", "anthropic_native": "prompt should mention Anthropic-native adapter (not multi-provider)", "read_adapter_first": "prompt should instruct reading src/llm/adapter.js before modifying", "trace_logger": "prompt should reference trace-logger for structured span observability"}, "weight": 1.3} +{"id": "pa-009", "name": "integration_connector", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-640: Add HubSpot connector for recipe input\n\n## Description\n\nAdd HubSpot as a connector input source for recipes. Follow the pattern of existing connectors (Stripe, Salesforce, QuickBooks). Use Pipedream Connect for OAuth (src/integrations/pipedream/pipedream-connect.js). Add the connector type to the recipe input schema.\n\nLabels: feature, integrations\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI integration conventions. 
Check: does it reference Pipedream Connect for managed OAuth, does it instruct reading existing connector patterns (Stripe/SF/QB), does it mention the fixture pattern at src/__fixtures__/, does it specify the integration test path (src/integrations/)?", "expected": {"pipedream_connect": "prompt should reference Pipedream Connect for managed OAuth", "existing_connector_patterns": "prompt should instruct reading existing connector implementations", "fixture_pattern": "prompt should mention test fixtures at src/__fixtures__/ for mock data", "integration_test_path": "prompt should specify test path src/integrations/ for integration tests"}, "weight": 1.3} +{"id": "pa-010", "name": "webhook_handler", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-645: Add idempotency check to recipe webhook handler\n\n## Description\n\nThe webhook handler at src/routes/webhook-recipes.js should check the webhook_idempotency table before processing duplicate events. Use the existing idempotency migration (066). Verify HMAC-SHA256 signature before the idempotency check.\n\nLabels: feature, security\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI webhook conventions. Check: does it reference HMAC-SHA256 signature verification, does it mention the webhook-signature.js module, does it instruct reading the idempotency migration, does it specify parameterized queries for DB access, does it mention the service registry singleton pattern?", "expected": {"hmac_verification": "prompt should reference HMAC-SHA256 signature verification before processing", "webhook_signature_module": "prompt should reference src/recipes/webhook-signature.js", "parameterized_queries": "prompt should specify parameterized queries ($1, $2) β€” no string interpolation", "service_registry": "prompt should mention service registry singleton for route access"}, "weight": 1.5} +{"id": "pa-011", "name": "context_graph_feature", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-650: Add contradiction detection to context graph queries\n\n## Description\n\nThe context graph query service (src/ctx/graph-query.js) has a getContradictions() stub. Implement it to find ctx_nodes with conflicting content for the same entity. Return nodes with a contradiction_score based on semantic similarity of conflicting claims.\n\nLabels: feature, context-graph\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI context graph conventions. 
Check: does it reference the ctx_nodes/ctx_edges schema, does it mention tenant_id scoping, does it instruct reading the existing graph-query.js service, does it reference the GraphService wrapper at src/graph/service.ts?", "expected": {"ctx_schema": "prompt should reference ctx_nodes and ctx_edges table schema", "tenant_scoping": "prompt should emphasize tenant_id scoping for all graph queries", "existing_service": "prompt should instruct reading src/ctx/graph-query.js first", "graph_service_wrapper": "prompt should reference GraphService at src/graph/service.ts"}, "weight": 1.3} +{"id": "pa-012", "name": "e2e_test_addition", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-655: Add E2E test for recipe CRUD flow\n\n## Description\n\nAdd a Playwright E2E spec testing the full recipe lifecycle: create β†’ list β†’ get β†’ update β†’ delete. Use the existing E2E fixtures pattern at e2e/fixtures/test-fixtures.ts. Test against the API project (not browser).\n\nLabels: test, e2e\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI E2E conventions. Check: does it reference Playwright config with project names (api, business, chromium), does it instruct reading existing E2E specs, does it mention the X-Test-Mode header for test auth, does it specify npm run e2e:api for API tests?", "expected": {"playwright_projects": "prompt should reference Playwright projects (api, business, chromium)", "existing_e2e_patterns": "prompt should instruct reading existing e2e/ specs for patterns", "test_mode_header": "prompt should mention X-Test-Mode: true header for test authentication", "e2e_api_command": "prompt should specify npm run e2e:api for API-only E2E tests"}, "weight": 1.3} +{"id": "pa-013", "name": "observability_addition", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-660: Add trace spans to recipe step execution\n\n## Description\n\nThe step-runner at src/core/step-runner.js executes recipe pipeline steps but has no tracing. Add trace-logger spans around each step execution with step name, duration, success/failure, and input/output token counts for LLM steps.\n\nLabels: feature, observability\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI observability conventions. 
Check: does it reference trace-logger at src/utils/trace-logger.js, does it show the traceSpan pattern, does it instruct not logging PII, does it mention structured spans with metadata?", "expected": {"trace_logger_reference": "prompt should reference src/utils/trace-logger.js", "trace_span_pattern": "prompt should demonstrate traceLogger.traceSpan(name, metadata) pattern", "no_pii_logging": "prompt should warn against logging PII (emails, tokens, card numbers)", "structured_metadata": "prompt should specify structured span metadata (not free-form strings)"}, "weight": 1.3} +{"id": "pa-014", "name": "no_description_provenantai", "prompt": "You are an AI coding agent given this prompt template output for an issue with minimal context:\n\n---\nYou are working on Linear issue STA-670: Fix flaky scheduler test\n\nLabels: bug\nPriority: Low\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate how this prompt handles a ProvenantAI issue with no description. Check: does it guide searching for the test file (test/unit/scheduler.test.js is known flaky), does it instruct reading the test to understand what's flaky (timer-dependent), does it mention jest.useFakeTimers() as a common fix, does it degrade gracefully without a description?", "expected": {"guides_file_search": "prompt should guide searching codebase for the relevant test file", "understand_flakiness": "prompt should instruct understanding the root cause of flakiness", "graceful_degradation": "prompt should still be useful without a description", "codebase_context": "prompt should tell agent to gather context from the codebase when description is missing"}, "weight": 1.8} +{"id": "pa-015", "name": "security_sensitive_change", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-675: Add PII scrubbing to LLM context documents\n\n## Description\n\nBefore sending context_documents to the LLM adapter, scrub PII (emails, phone numbers, credit card numbers) from the text field. Add a scrubPII() utility. This is a security requirement β€” see src/auth/auth.middleware.js for tenant context.\n\nLabels: security\nPriority: Urgent\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI security conventions. 
Check: does it reference the security rules (.claude/rules/security.md), does it emphasize input validation at route boundary, does it mention parameterized queries, does it warn against committing secrets, does it specify AES-256-GCM for credentials?", "expected": {"security_rules_reference": "prompt should reference security rules or conventions", "input_validation": "prompt should emphasize input validation at system boundaries", "no_secrets_in_commits": "prompt should warn against committing secrets (.env, API keys, tokens)", "pii_scrubbing_guidance": "prompt should provide guidance on PII patterns to scrub (emails, phones, cards)"}, "weight": 2.0} diff --git a/scripts/gepa/generations/current b/scripts/gepa/generations/current index c4dbed3d..21436e00 120000 --- a/scripts/gepa/generations/current +++ b/scripts/gepa/generations/current @@ -1 +1 @@ -/Users/jwu/Dev/stackmemory/scripts/gepa/generations/gen-001/baseline.md \ No newline at end of file +/Users/jwu/Dev/stackmemory/scripts/gepa/generations/gen-002/baseline.md \ No newline at end of file From 9c3cb9e93bc284517d18f80a0b57b8d59bbced14 Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Sun, 19 Apr 2026 01:53:50 -0400 Subject: [PATCH 15/18] feat(gepa): skill .md optimization with audit hook Add GEPA support for optimizing Claude Code slash command .md files: - skill-audit.js hook logs Skill tool calls to skill-audit.jsonl - 5 skill targets in config (start, stop, learn, next, summary) - skill-tasks.jsonl with 8 eval tasks for skill quality - skill-stats and run-skills CLI commands - getSkillAuditContext() feeds usage data into mutation prompts --- CLAUDE.md | 6 + scripts/gepa/config.json | 50 +++++++ scripts/gepa/evals/skill-tasks.jsonl | 8 ++ scripts/gepa/optimize.js | 186 ++++++++++++++++++++++++++- 4 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 scripts/gepa/evals/skill-tasks.jsonl diff --git a/CLAUDE.md b/CLAUDE.md index b517d706..42ef6c44 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -258,12 +258,18 @@ For AUTOMATE and STANDARD tiers: make only the requested changes. 
Don't refactor
 - Prioritizes: unfinished work > flagged issues > queued tasks > continuations
 - Trigger: session start, "what's next", "whats next", between tasks
 
+**`/learn`** — Run at session end to capture learnings:
+- Reviews session work, then audits memory, CLAUDE.md, skills, scripts, and wiki
+- Proposes creates/updates/deletes with confirmation before applying
+- Trigger: end of session, after significant work, "what should I update"
+
 **When to use which:**
 - Starting a session or between tasks → `/next` (pick what to work on)
 - Session producing wrong results → `/recover` (diagnose + fix now)
 - Routine maintenance, nothing broken → `/update-docs` (proactive gardening)
 - After publishing a new version → `/update-docs` (catch version/path drift)
 - After conductor failures → `/recover last` (learn from agent traces)
+- End of session → `/learn` (capture what changed, update artifacts)
 
 ## Workflow
 
diff --git a/scripts/gepa/config.json b/scripts/gepa/config.json
index 52570508..35f23df5 100644
--- a/scripts/gepa/config.json
+++ b/scripts/gepa/config.json
@@ -32,6 +32,36 @@
       "file": "CLAUDE.md",
       "evals": ["stackmemory-tasks.jsonl"],
       "description": "StackMemory project prompt"
+    },
+    {
+      "name": "skill:start",
+      "file": "~/.claude/commands/start.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session boot skill"
+    },
+    {
+      "name": "skill:stop",
+      "file": "~/.claude/commands/stop.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session close skill"
+    },
+    {
+      "name": "skill:learn",
+      "file": "~/.claude/commands/learn.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session review + artifact update skill"
+    },
+    {
+      "name": "skill:next",
+      "file": "~/.claude/commands/next.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Next action recommendation skill"
+    },
+    {
+      "name": "skill:summary",
+      "file": "~/.claude/commands/summary.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session summary skill"
     }
   ],
 
@@ -144,6 +174,26 @@
     "evals": {
       "files": ["conductor-tasks.jsonl"]
     }
+  },
+  "skills": {
+    "target": {
+      "file": "~/.claude/commands/start.md",
+      "scope": "user",
+      "backup": true
+    },
+    "evolution": {
+      "mutationStrategies": [
+        "simplify",
+        "add_examples",
+        "rephrase",
+        "add_constraints",
+        "reduce_overengineering",
+        "add_self_check"
+      ]
+    },
+    "evals": {
+      "files": ["skill-tasks.jsonl"]
+    }
+  }
   }
 }
diff --git a/scripts/gepa/evals/skill-tasks.jsonl b/scripts/gepa/evals/skill-tasks.jsonl
new file mode 100644
index 00000000..62668111
--- /dev/null
+++ b/scripts/gepa/evals/skill-tasks.jsonl
@@ -0,0 +1,8 @@
+{"id": "skill-001", "name": "start_clean_main", "prompt": "I just opened a new session. The repo is clean, on main branch. What should I do?", "expected": {"shows_branch": true, "shows_recent_commits": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.5}
+{"id": "skill-002", "name": "start_dirty_feature", "prompt": "I'm resuming work. There are uncommitted changes on a feature branch. What's my status?", "expected": {"shows_branch": true, "shows_uncommitted": true, "suggests_continue_or_commit": true, "no_hallucination": true}, "weight": 1.5}
+{"id": "skill-003", "name": "stop_with_work", "prompt": "I'm done for today but have more work tomorrow. Save my progress.", "expected": {"captures_context": true, "suggests_commit": true, "no_overengineering": true}, "weight": 1.2}
+{"id": "skill-004", "name": "stop_done", "prompt": "I'm completely done with this task.
Clean up and close.", "expected": {"captures_context": true, "suggests_clear": true, "concise_output": true}, "weight": 1.2} +{"id": "skill-005", "name": "learn_after_changes", "prompt": "I just finished a session where I created 3 new files, fixed a bug, and discovered a gotcha about ESM imports. What should be updated?", "expected": {"reviews_session": true, "identifies_memory_update": true, "structured_output": true, "no_overengineering": true}, "weight": 1.5} +{"id": "skill-006", "name": "learn_nothing_new", "prompt": "I just did a quick typo fix. Anything to update?", "expected": {"concise_output": true, "says_nothing_needed": true, "no_overengineering": true}, "weight": 1.0} +{"id": "skill-007", "name": "summary_multi_task", "prompt": "Summarize what I did this session: fixed auth bug, added pagination, updated tests.", "expected": {"lists_actions": true, "lists_files": true, "concise_output": true, "no_hallucination": true}, "weight": 1.0} +{"id": "skill-008", "name": "next_with_pr_open", "prompt": "I have an open PR with failing CI. What should I work on?", "expected": {"identifies_ci_fix": true, "actionable_suggestion": true, "presents_options": true}, "weight": 1.3} diff --git a/scripts/gepa/optimize.js b/scripts/gepa/optimize.js index 016fd030..63fe6b8e 100755 --- a/scripts/gepa/optimize.js +++ b/scripts/gepa/optimize.js @@ -103,6 +103,64 @@ const CONDUCTOR_PROMPTS_DIR = path.join( 'prompts' ); +/** + * Skill-aware optimization: read usage data from skill-audit.jsonl + * and build context for skill-scoped mutations. + */ +function getSkillAuditContext(skillName) { + const auditPath = path.join( + process.env.HOME || '', + '.stackmemory', + 'skill-audit.jsonl' + ); + if (!fs.existsSync(auditPath)) return ''; + + try { + const lines = fs + .readFileSync(auditPath, 'utf8') + .split('\n') + .filter(Boolean); + const entries = lines.map((l) => JSON.parse(l)); + + // Filter to this skill + const skillEntries = entries.filter((e) => e.skill === skillName); + if (skillEntries.length === 0) return ''; + + const total = skillEntries.length; + const errors = skillEntries.filter((e) => e.error).length; + const errorRate = ((errors / total) * 100).toFixed(1); + + // Common args patterns + const argCounts = {}; + for (const e of skillEntries) { + const arg = e.args || '(none)'; + argCounts[arg] = (argCounts[arg] || 0) + 1; + } + const topArgs = Object.entries(argCounts) + .sort((a, b) => b[1] - a[1]) + .slice(0, 5) + .map(([arg, count]) => ` - "${arg}": ${count}x`) + .join('\n'); + + // Recent errors + const recentErrors = skillEntries + .filter((e) => e.error) + .slice(-5) + .map((e) => ` - ${e.ts}: args="${e.args}"`) + .join('\n'); + + let ctx = `\n## Skill usage data for "${skillName}" (${total} invocations, ${errorRate}% error rate):\n`; + ctx += `\nMost common args:\n${topArgs}\n`; + if (recentErrors) { + ctx += `\nRecent errors:\n${recentErrors}\n`; + } + + return ctx; + } catch { + return ''; + } +} + /** * Phase-aware optimization: read failure data from outcomes.jsonl * and build context for phase-scoped mutations. @@ -538,7 +596,17 @@ async function generateMutation(content, strategy, state) { return generateMutation(content, 'rephrase', state); } - const prompt = `You are an expert prompt engineer optimizing a CLAUDE.md system prompt for an AI coding agent (Claude Opus 4.6). + // Detect if optimizing a skill .md file + const isSkillTarget = targetName && targetName.startsWith('skill:'); + const skillAuditCtx = isSkillTarget + ? 
getSkillAuditContext(targetName.replace('skill:', ''))
+    : '';
+
+  const targetDescription = isSkillTarget
+    ? 'a Claude Code slash command (skill) .md file that instructs an AI coding agent what to do when the user invokes the command'
+    : 'a CLAUDE.md system prompt for an AI coding agent (Claude Opus 4.6)';
+
+  const prompt = `You are an expert prompt engineer optimizing ${targetDescription}.
 
 
 ${content}
 
 
@@ -565,6 +633,7 @@ ${getRecentFeedback(state)}
 
 REFLECTION INSIGHTS (from failure pattern analysis):
 ${getReflectionInsights()}
+${skillAuditCtx}
 
 
 
@@ -1504,6 +1573,110 @@ async function runAll(generations = 3) {
   console.log('═'.repeat(60));
 }
 
+/**
+ * Show skill audit statistics from skill-audit.jsonl
+ */
+function showSkillStats() {
+  const auditPath = path.join(
+    process.env.HOME || '',
+    '.stackmemory',
+    'skill-audit.jsonl'
+  );
+
+  if (!fs.existsSync(auditPath)) {
+    console.log('No skill audit data yet. Use skills to generate data.');
+    return;
+  }
+
+  const lines = fs.readFileSync(auditPath, 'utf8').split('\n').filter(Boolean);
+  const entries = lines.map((l) => JSON.parse(l));
+
+  // Group by skill
+  const bySkill = {};
+  for (const e of entries) {
+    if (!bySkill[e.skill]) bySkill[e.skill] = { total: 0, errors: 0, args: {} };
+    bySkill[e.skill].total++;
+    if (e.error) bySkill[e.skill].errors++;
+    const arg = e.args || '(none)';
+    bySkill[e.skill].args[arg] = (bySkill[e.skill].args[arg] || 0) + 1;
+  }
+
+  console.log(`Skill Audit Stats (${entries.length} total invocations)\n`);
+  console.log(
+    `${'Skill'.padEnd(20)} ${'Count'.padStart(6)} ${'Errors'.padStart(7)} ${'Rate'.padStart(6)}`
+  );
+  console.log('-'.repeat(42));
+
+  const sorted = Object.entries(bySkill).sort(
+    (a, b) => b[1].total - a[1].total
+  );
+  for (const [skill, stats] of sorted) {
+    const rate = ((stats.errors / stats.total) * 100).toFixed(0);
+    console.log(
+      `${skill.padEnd(20)} ${String(stats.total).padStart(6)} ${String(stats.errors).padStart(7)} ${(rate + '%').padStart(6)}`
+    );
+  }
+
+  // Show skill targets available for optimization
+  const skillTargets = (config.targets || []).filter((t) =>
+    t.name.startsWith('skill:')
+  );
+  if (skillTargets.length) {
+    console.log(`\nConfigured skill targets:`);
+    for (const t of skillTargets) {
+      const hasData = bySkill[t.name.replace('skill:', '')];
+      const marker = hasData ? '✓' : '○';
+      console.log(`  ${marker} ${t.name.padEnd(20)} ${t.file}`);
+    }
+  }
+}
+
+/**
+ * Run optimization on all skill targets
+ */
+async function runSkills(generations = 3) {
+  const skillTargets = (config.targets || []).filter((t) =>
+    t.name.startsWith('skill:')
+  );
+
+  if (!skillTargets.length) {
+    console.log('No skill targets configured in config.json.');
+    return;
+  }
+
+  console.log(
+    `Running GEPA on ${skillTargets.length} skill targets (${generations} generations each)\n`
+  );
+
+  for (const target of skillTargets) {
+    const resolved = target.file.startsWith('~') ?
path.join(process.env.HOME, target.file.slice(1))
+      : path.resolve(target.file);
+
+    if (!fs.existsSync(resolved)) {
+      console.log(`Skipping ${target.name}: ${resolved} not found\n`);
+      continue;
+    }
+
+    console.log(`\n${'═'.repeat(60)}`);
+    console.log(`SKILL: ${target.name} (${target.file})`);
+    console.log(`${'═'.repeat(60)}\n`);
+
+    // Override config for this target
+    config.target.file = target.file;
+    if (target.evals) config.evals.files = target.evals;
+
+    await init(resolved);
+    await run(generations);
+
+    console.log(`\nCompleted ${target.name}\n`);
+  }
+
+  console.log('\n' + '═'.repeat(60));
+  console.log('ALL SKILL TARGETS COMPLETE');
+  console.log('═'.repeat(60));
+}
+
 // CLI
 const command = process.argv[2];
 const arg1 = process.argv[3];
@@ -1554,6 +1727,12 @@ switch (command) {
   case 'run-all':
     runAll(parseInt(arg1) || 3);
     break;
+  case 'skill-stats':
+    showSkillStats();
+    break;
+  case 'run-skills':
+    runSkills(parseInt(arg1) || 3);
+    break;
   default:
     console.log(`
 GEPA - Genetic Eval-driven Prompt Algorithm
@@ -1572,6 +1751,11 @@ Usage:
   node optimize.js targets                 List available targets
   node optimize.js run-all [generations]   Run optimization on ALL targets
 
+Skill optimization:
+  node optimize.js skill-stats              Show skill audit statistics
+  node optimize.js run-skills [gens]        Run optimization on all skill targets
+  node optimize.js run --target skill:start  Optimize a specific skill
+
 Options:
   --target <name>   Select target from targets[] config
                     Available: ${(config.targets || []).map((t) => t.name).join(', ')}

From acc477e18af90b5ff73ce560f0e9f189add98efd Mon Sep 17 00:00:00 2001
From: "StackMemory Bot (CLI)"
Date: Sun, 19 Apr 2026 09:52:36 -0400
Subject: [PATCH 16/18] chore(gepa): update baseline generations with current CLAUDE.md

---
 scripts/gepa/.before-optimize.md             | 311 +++++++++++++++---
 scripts/gepa/generations/current             |   2 +-
 scripts/gepa/generations/gen-000/baseline.md | 313 +++++++++++++++----
 scripts/gepa/generations/gen-001/baseline.md | 313 +++++++++++++++----
 4 files changed, 781 insertions(+), 158 deletions(-)

diff --git a/scripts/gepa/.before-optimize.md b/scripts/gepa/.before-optimize.md
index 2388f26a..42ef6c44 100644
--- a/scripts/gepa/.before-optimize.md
+++ b/scripts/gepa/.before-optimize.md
@@ -1,72 +1,281 @@
-# croissant.ai — Agent Guide
-
-Tool-agnostic reference for AI coding agents working in this repository.
- -## Stack - -Node.js / Express / PostgreSQL / Redis -Railway deployment | Stripe / Salesforce / QuickBooks integrations +# StackMemory - Project Configuration ## Project Structure ``` src/ - api/ # Route handlers - core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation - features/ # Feature modules - shared/ # Shared utilities - integrations/ # Third-party connectors -docs/ # Documentation -scripts/ # Automation scripts -docker/ # Container configs -prompts/ # Externalized LLM prompt templates + cli/ # CLI commands and entry point + core/ # Core business logic + config/ # Config types and manager + context/ # Frame management, enrichment, rehydration + database/ # SQLite adapter, migrations, query cache + digest/ # Digest generation (hybrid, chronological) + errors/ # Error types and recovery + merge/ # Stack merge and conflict resolution + models/ # Model routing, complexity scoring + monitoring/ # Logging, metrics, session monitor + performance/ # Caching, profiling, benchmarks + query/ # Query parsing and routing + retrieval/ # Context retrieval, LLM provider + session/ # Handoff, session management + skills/ # Skill storage and types + storage/ # Tiered storage, remote sync + trace/ # Debug tracing, trace detection + integrations/ # External integrations + claude-code/ # Agent bridge, post-task hooks + linear/ # Linear sync, webhooks, OAuth + mcp/ # MCP server, 56 tool handlers + ralph/ # Multi-agent swarm orchestration + daemon/ # Unified daemon, session daemon + features/ # Analytics, browser, sweep, TUI + hooks/ # Claude Code hook handlers + skills/ # Built-in skill implementations + utils/ # Shared utilities +scripts/ # Build and utility scripts +docs/ # Documentation ``` +## Key Files + +- Entry: src/cli/index.ts +- MCP Server: src/integrations/mcp/server.ts +- Frame Manager: src/core/context/frame-manager.ts +- Database: src/core/database/sqlite-adapter.ts +- Snapshot: src/core/worktree/capture.ts +- Preflight: src/core/worktree/preflight.ts +- Conductor: src/cli/commands/orchestrator.ts (core) + orchestrate.ts (CLI) +- Conductor Traces: src/cli/commands/conductor-traces.ts +- Frame Enrichment: src/core/context/frame-enrichment.ts +- Process Utils: src/utils/process-cleanup.ts +- Shared Utils: src/core/utils/{git,text,fs}.ts + +## Detailed Guides + +Quick reference (agent_docs/): +- linear_integration.md - Linear sync +- mcp_server.md - MCP tools +- database_storage.md - Storage +- claude_hooks.md - Hooks + +Full documentation (docs/): +- principles.md - Agent programming paradigm +- architecture.md - Extension model and browser sandbox +- SPEC.md - Technical specification +- API_REFERENCE.md - API docs +- DEVELOPMENT.md - Dev guide +- SETUP.md - Installation + ## Commands ```bash -npm run dev # Start dev server -npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) -npm run lint # Lint check -npm run migrate # Run DB migrations -docker-compose up -d # Start local DBs +npm run build # Compile TypeScript (esbuild) +npm run lint # ESLint check +npm run lint:fix # Auto-fix lint issues +npm run lint:fast # Fast lint via oxlint +npm run typecheck # tsc --noEmit (8GB heap, avoids OOM) +npm test # Run Vitest (watch) +npm run test:run # Run tests once +npm run linear:sync # Sync with Linear + +# StackMemory CLI +stackmemory capture # Save session state for handoff +stackmemory restore # Restore from captured state +stackmemory snapshot save # Post-run context snapshot (alias: snap) +stackmemory snapshot list # List recent snapshots 
+stackmemory preflight # File overlap check for parallel tasks (alias: pf) +stackmemory conductor start # Autonomous Linearβ†’worktreeβ†’agent orchestrator +stackmemory conductor learn # Analyze agent outcomes (success rate, failure phases, error patterns) +stackmemory conductor learn --evolve # Auto-mutate prompt template from failure data (GEPA) +stackmemory conductor status # Live agent status dashboard +stackmemory conductor monitor # Real-time TUI with phase tracking +stackmemory conductor finalize # Clean up dead/stale agents +stackmemory conductor traces # View conversation traces for an agent run +stackmemory conductor replay # Replay full agent conversation from traces +stackmemory conductor trace-stats # Aggregate trace statistics +stackmemory loop "" --until "" # Poll until condition met (alias: watch) +``` + +## Working Directory + +- PRIMARY: /Users/jwu/Dev/stackmemory +- ALLOWED: All subdirectories +- TEMP: /tmp for temporary operations + +## Validation + +Verify each step after code changes β€” pre-commit hooks catch 80% of CI failures locally: +1. `npm run lint` - fix any errors AND warnings +2. `npm run test:run` - verify no regressions +3. `npm run build` - ensure compilation +4. Run code to verify it works + +Test coverage: +- New features require tests in `src/**/__tests__/` +- Maintain or improve coverage (no untested code paths) +- Critical paths: context management, handoff, Linear sync + +Testing rules: +- Run `npm run test:run` via subagent or background task β€” never inline (blocks context) +- ESLint: use `catch {}` not `catch (_err) {}` (lint rule) +- `vi.clearAllMocks()` resets `mockReturnValue` β€” re-set mocks in `beforeEach` +- Pre-commit hook runs: lint + parallel vitest + build β€” fix issues before commit, never skip + +## Git Rules + +The pre-commit hook enforces lint + test + build. Fix the underlying issue rather than bypassing it. + +- Do not use `--no-verify` on git push or commit β€” fix the hook failure instead +- Fix lint/test errors before pushing +- If pre-push hooks fail, fix the underlying issue +- Run `npm run lint && npm run test:run` before pushing +- Commit message format: `type(scope): message` +- Branch naming: `feature/STA-XXX-description` | `fix/STA-XXX-description` | `chore/description` + +## Task Management + +- Use TodoWrite for 3+ steps or multiple requests +- Keep one task in_progress at a time +- Update task status immediately on completion + +## Security + +NEVER hardcode secrets - use process.env with dotenv/config + +```javascript +import 'dotenv/config'; +const API_KEY = process.env.LINEAR_API_KEY; +if (!API_KEY) { + console.error('LINEAR_API_KEY not set'); + process.exit(1); +} ``` -## Git Conventions +Environment sources (check in order): +1. .env file +2. .env.local +3. ~/.zshrc +4. 
Process environment + +Secret patterns to block: lin_api_* | lin_oauth_* | sk-* | npm_* + +## Deploy + +```bash +# npm publish (uses NPM_TOKEN from .env, no OTP needed) +git stash -- scripts/gepa/ # stash GEPA state (dirties working tree) +NPM_TOKEN=$(grep '^NPM_TOKEN=' .env | cut -d= -f2) \ + npm publish --registry https://registry.npmjs.org/ \ + --//registry.npmjs.org/:_authToken="$NPM_TOKEN" +git stash pop # restore GEPA state + +# Railway +railway up + +# Pre-publish checks require clean git status β€” stash GEPA files first +``` + +## Conductor (Autonomous Agent Orchestration) + +The conductor manages autonomous coding agents via Linear issues: + +**Data files** (all under `~/.stackmemory/conductor/`): +- `prompt-template.md` β€” Agent prompt template with `{{VARIABLE}}` substitution (auto-created on first `conductor start`) +- `outcomes.jsonl` β€” JSONL log of agent outcomes (success/failure, phase, tokens, errors) +- `evolution-log.jsonl` β€” History of `--evolve` mutations applied to the prompt template +- `agents//status.json` β€” Per-agent status files +- `agents//output.log` β€” Agent stdout/stderr +- `traces.db` β€” SQLite database with per-turn conversation traces (tool calls, tokens, phases, content previews) + +**Intelligence features**: +- Multi-model routing with difficulty prediction (routes simple tasks to cheaper models) +- Smart retry with exponential backoff and prior context injection +- Auto-PR creation on successful agent completion +- Trace-based evidence: per-turn conversation logging (tools, tokens, phases) to traces.db + +**Learning loop**: +1. Agents run β†’ outcomes logged to `outcomes.jsonl`, traces to `traces.db` +2. `conductor learn` analyzes patterns (success rate, failure phases, error types) +3. `conductor learn --evolve` calls Claude to mutate `prompt-template.md` based on failure data +4. Next agent run uses the improved template β†’ repeat + +**Template variables**: `{{ISSUE_ID}}`, `{{TITLE}}`, `{{DESCRIPTION}}`, `{{LABELS}}`, `{{PRIORITY}}`, `{{ATTEMPT}}`, `{{PRIOR_CONTEXT}}` + +## Task Delegation Model + +Route effort by task complexity β€” not all code changes deserve equal scrutiny: + +**AUTOMATE** β€” Execute immediately, lint+test is sufficient: +- CRUD operations, boilerplate, formatting, simple transforms +- Adding a tool handler following existing switch/case pattern +- Config additions (new env var, feature flag) + +**STANDARD** β€” Normal workflow, lint+test+build: +- Feature implementation, bug fixes, refactoring +- New test coverage, documentation updates +- Integration wiring (adding handler to server.ts dispatch) + +**CAREFUL** β€” Review approach before implementation: +- API/schema changes, database migrations, auth flows +- New integration patterns (MCP tools, webhook handlers) +- Changes to frame-manager, sqlite-adapter, or daemon lifecycle +- Anything touching error handling chains + +**ARCHITECT** β€” Plan mode required, explore existing patterns first: +- New service boundaries, system integrations +- Performance-critical paths (FTS5 queries, search scoring) +- Breaking changes to MCP protocol or CLI interface + +**HUMAN** β€” Explicit user approval before any changes: +- Security-critical decisions, secret handling +- Irreversible operations (data migrations, schema drops) +- Publishing (npm publish, Railway deploy) + +Quality gates scale with tier β€” don't over-engineer AUTOMATE tasks, don't under-review CAREFUL ones. + +For AUTOMATE and STANDARD tiers: make only the requested changes. 
Don't refactor surrounding code, add abstractions for one-time operations, or create helpers that are used once. Three similar lines of code is better than a premature abstraction. + +## Session Budget -- Branch prefixes: `feature/`, `fix/`, `chore/` -- Commit format: `type(scope): message` -- Do NOT add `Co-Authored-By` lines to commits -- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots +- Max 1 major topic per session β€” split unrelated work into separate sessions +- Run /compact or summarize at ~50% context usage to avoid overflow +- Plan-execute sessions (low interaction, high edits) are most efficient +- Avoid exploratory marathons with topic-switching β€” burns 30-40% extra tokens -## Testing Rules +## Context Maintenance -- **Framework**: Jest + SWC -- **DB mocking**: Use dependency injection (DI), not global mocks -- **Supertest**: Pass `app` (NOT `server`) to supertest -- **Global jest**: src/ tests use global `jest` β€” do NOT import from `@jest/globals` (causes redeclaration errors) -- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` β€” always re-set mocks in `beforeEach` -- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline +**`/update-docs`** β€” Run weekly or when context feels stale: +- Audits CLAUDE.md, MEMORY.md, agent_docs/ against git history and codebase +- Detects stale entries, missing patterns, outdated paths +- Trigger: start of week, after major refactors, or when sessions feel slow/confused -## ESLint Rules +**`/recover`** β€” Run when a session goes off the rails: +- Analyzes traces to find where context drifted from intent +- Maps drift to specific doc fixes (missing guidance, stale memory, ambiguous instruction) +- Trigger: user says "this is wrong", "not what I wanted", "off the rails", repeated corrections -- Use `catch {}` not `catch (_err) {}` β€” underscore prefix not in the allowed pattern -- CJS format for JS files in `src/` +**`/next`** β€” Run at session start or when asking "what's next": +- Scans git log, TODO files, Linear issues, and memory for actionable items +- Prioritizes: unfinished work > flagged issues > queued tasks > continuations +- Trigger: session start, "what's next", "whats next", between tasks -## Key Patterns +**`/learn`** β€” Run at session end to capture learnings: +- Reviews session work, then audits memory, CLAUDE.md, skills, scripts, and wiki +- Proposes creates/updates/deletes with confirmation before applying +- Trigger: end of session, after significant work, "what should I update" -- Provenance tracking: every data point includes source, timestamp, lineage -- Multi-tenant container isolation -- DI route factories for testability -- Error handling: return undefined over throwing; log and continue over crashing -- Add `.js` extension to relative ESM imports +**When to use which:** +- Starting a session or between tasks β†’ `/next` (pick what to work on) +- Session producing wrong results β†’ `/recover` (diagnose + fix now) +- Routine maintenance, nothing broken β†’ `/update-docs` (proactive gardening) +- After publishing a new version β†’ `/update-docs` (catch version/path drift) +- After conductor failures β†’ `/recover last` (learn from agent traces) +- End of session β†’ `/learn` (capture what changed, update artifacts) -## StackMemory Context Rule +## Workflow -- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. 
-- Prefer the MCP shape: - - `org_id` - - `conversation_id` - - `task_query` - - `recover_on_low_signal: true` -- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. +- Check .env for API keys before asking +- Run npm run linear:sync after task completion +- Use browser MCP for visual testing +- Review recent commits and stackmemory.json on session start +- Use subagents for multi-step tasks +- Ask 1-3 clarifying questions for complex commands (one at a time) diff --git a/scripts/gepa/generations/current b/scripts/gepa/generations/current index 21436e00..c4dbed3d 120000 --- a/scripts/gepa/generations/current +++ b/scripts/gepa/generations/current @@ -1 +1 @@ -/Users/jwu/Dev/stackmemory/scripts/gepa/generations/gen-002/baseline.md \ No newline at end of file +/Users/jwu/Dev/stackmemory/scripts/gepa/generations/gen-001/baseline.md \ No newline at end of file diff --git a/scripts/gepa/generations/gen-000/baseline.md b/scripts/gepa/generations/gen-000/baseline.md index 0c86cace..42ef6c44 100644 --- a/scripts/gepa/generations/gen-000/baseline.md +++ b/scripts/gepa/generations/gen-000/baseline.md @@ -1,74 +1,281 @@ -# croissant.ai β€” Agent Guide - -Tool-agnostic reference for AI coding agents working in this repository. - -## Stack - -Node.js / Express / PostgreSQL / Redis -Railway deployment | Stripe / Salesforce / QuickBooks integrations +# StackMemory - Project Configuration ## Project Structure ``` src/ - api/ # Route handlers - core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation - features/ # Feature modules - shared/ # Shared utilities - integrations/ # Third-party connectors -docs/ # Documentation -scripts/ # Automation scripts -docker/ # Container configs -prompts/ # Externalized LLM prompt templates + cli/ # CLI commands and entry point + core/ # Core business logic + config/ # Config types and manager + context/ # Frame management, enrichment, rehydration + database/ # SQLite adapter, migrations, query cache + digest/ # Digest generation (hybrid, chronological) + errors/ # Error types and recovery + merge/ # Stack merge and conflict resolution + models/ # Model routing, complexity scoring + monitoring/ # Logging, metrics, session monitor + performance/ # Caching, profiling, benchmarks + query/ # Query parsing and routing + retrieval/ # Context retrieval, LLM provider + session/ # Handoff, session management + skills/ # Skill storage and types + storage/ # Tiered storage, remote sync + trace/ # Debug tracing, trace detection + integrations/ # External integrations + claude-code/ # Agent bridge, post-task hooks + linear/ # Linear sync, webhooks, OAuth + mcp/ # MCP server, 56 tool handlers + ralph/ # Multi-agent swarm orchestration + daemon/ # Unified daemon, session daemon + features/ # Analytics, browser, sweep, TUI + hooks/ # Claude Code hook handlers + skills/ # Built-in skill implementations + utils/ # Shared utilities +scripts/ # Build and utility scripts +docs/ # Documentation ``` +## Key Files + +- Entry: src/cli/index.ts +- MCP Server: src/integrations/mcp/server.ts +- Frame Manager: src/core/context/frame-manager.ts +- Database: src/core/database/sqlite-adapter.ts +- Snapshot: src/core/worktree/capture.ts +- Preflight: src/core/worktree/preflight.ts +- Conductor: src/cli/commands/orchestrator.ts (core) + orchestrate.ts (CLI) +- Conductor Traces: src/cli/commands/conductor-traces.ts +- Frame Enrichment: src/core/context/frame-enrichment.ts +- Process Utils: 
src/utils/process-cleanup.ts +- Shared Utils: src/core/utils/{git,text,fs}.ts + +## Detailed Guides + +Quick reference (agent_docs/): +- linear_integration.md - Linear sync +- mcp_server.md - MCP tools +- database_storage.md - Storage +- claude_hooks.md - Hooks + +Full documentation (docs/): +- principles.md - Agent programming paradigm +- architecture.md - Extension model and browser sandbox +- SPEC.md - Technical specification +- API_REFERENCE.md - API docs +- DEVELOPMENT.md - Dev guide +- SETUP.md - Installation + ## Commands ```bash -npm run dev # Start dev server -npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) -npm run lint # Lint check -npm run migrate # Run DB migrations -docker-compose up -d # Start local DBs +npm run build # Compile TypeScript (esbuild) +npm run lint # ESLint check +npm run lint:fix # Auto-fix lint issues +npm run lint:fast # Fast lint via oxlint +npm run typecheck # tsc --noEmit (8GB heap, avoids OOM) +npm test # Run Vitest (watch) +npm run test:run # Run tests once +npm run linear:sync # Sync with Linear + +# StackMemory CLI +stackmemory capture # Save session state for handoff +stackmemory restore # Restore from captured state +stackmemory snapshot save # Post-run context snapshot (alias: snap) +stackmemory snapshot list # List recent snapshots +stackmemory preflight # File overlap check for parallel tasks (alias: pf) +stackmemory conductor start # Autonomous Linearβ†’worktreeβ†’agent orchestrator +stackmemory conductor learn # Analyze agent outcomes (success rate, failure phases, error patterns) +stackmemory conductor learn --evolve # Auto-mutate prompt template from failure data (GEPA) +stackmemory conductor status # Live agent status dashboard +stackmemory conductor monitor # Real-time TUI with phase tracking +stackmemory conductor finalize # Clean up dead/stale agents +stackmemory conductor traces # View conversation traces for an agent run +stackmemory conductor replay # Replay full agent conversation from traces +stackmemory conductor trace-stats # Aggregate trace statistics +stackmemory loop "" --until "" # Poll until condition met (alias: watch) +``` + +## Working Directory + +- PRIMARY: /Users/jwu/Dev/stackmemory +- ALLOWED: All subdirectories +- TEMP: /tmp for temporary operations + +## Validation + +Verify each step after code changes β€” pre-commit hooks catch 80% of CI failures locally: +1. `npm run lint` - fix any errors AND warnings +2. `npm run test:run` - verify no regressions +3. `npm run build` - ensure compilation +4. Run code to verify it works + +Test coverage: +- New features require tests in `src/**/__tests__/` +- Maintain or improve coverage (no untested code paths) +- Critical paths: context management, handoff, Linear sync + +Testing rules: +- Run `npm run test:run` via subagent or background task β€” never inline (blocks context) +- ESLint: use `catch {}` not `catch (_err) {}` (lint rule) +- `vi.clearAllMocks()` resets `mockReturnValue` β€” re-set mocks in `beforeEach` +- Pre-commit hook runs: lint + parallel vitest + build β€” fix issues before commit, never skip + +## Git Rules + +The pre-commit hook enforces lint + test + build. Fix the underlying issue rather than bypassing it. 
+ +- Do not use `--no-verify` on git push or commit β€” fix the hook failure instead +- Fix lint/test errors before pushing +- If pre-push hooks fail, fix the underlying issue +- Run `npm run lint && npm run test:run` before pushing +- Commit message format: `type(scope): message` +- Branch naming: `feature/STA-XXX-description` | `fix/STA-XXX-description` | `chore/description` + +## Task Management + +- Use TodoWrite for 3+ steps or multiple requests +- Keep one task in_progress at a time +- Update task status immediately on completion + +## Security + +NEVER hardcode secrets - use process.env with dotenv/config + +```javascript +import 'dotenv/config'; +const API_KEY = process.env.LINEAR_API_KEY; +if (!API_KEY) { + console.error('LINEAR_API_KEY not set'); + process.exit(1); +} ``` -## Git Conventions +Environment sources (check in order): +1. .env file +2. .env.local +3. ~/.zshrc +4. Process environment + +Secret patterns to block: lin_api_* | lin_oauth_* | sk-* | npm_* + +## Deploy + +```bash +# npm publish (uses NPM_TOKEN from .env, no OTP needed) +git stash -- scripts/gepa/ # stash GEPA state (dirties working tree) +NPM_TOKEN=$(grep '^NPM_TOKEN=' .env | cut -d= -f2) \ + npm publish --registry https://registry.npmjs.org/ \ + --//registry.npmjs.org/:_authToken="$NPM_TOKEN" +git stash pop # restore GEPA state + +# Railway +railway up + +# Pre-publish checks require clean git status β€” stash GEPA files first +``` + +## Conductor (Autonomous Agent Orchestration) + +The conductor manages autonomous coding agents via Linear issues: + +**Data files** (all under `~/.stackmemory/conductor/`): +- `prompt-template.md` β€” Agent prompt template with `{{VARIABLE}}` substitution (auto-created on first `conductor start`) +- `outcomes.jsonl` β€” JSONL log of agent outcomes (success/failure, phase, tokens, errors) +- `evolution-log.jsonl` β€” History of `--evolve` mutations applied to the prompt template +- `agents//status.json` β€” Per-agent status files +- `agents//output.log` β€” Agent stdout/stderr +- `traces.db` β€” SQLite database with per-turn conversation traces (tool calls, tokens, phases, content previews) + +**Intelligence features**: +- Multi-model routing with difficulty prediction (routes simple tasks to cheaper models) +- Smart retry with exponential backoff and prior context injection +- Auto-PR creation on successful agent completion +- Trace-based evidence: per-turn conversation logging (tools, tokens, phases) to traces.db + +**Learning loop**: +1. Agents run β†’ outcomes logged to `outcomes.jsonl`, traces to `traces.db` +2. `conductor learn` analyzes patterns (success rate, failure phases, error types) +3. `conductor learn --evolve` calls Claude to mutate `prompt-template.md` based on failure data +4. 
Next agent run uses the improved template β†’ repeat + +**Template variables**: `{{ISSUE_ID}}`, `{{TITLE}}`, `{{DESCRIPTION}}`, `{{LABELS}}`, `{{PRIORITY}}`, `{{ATTEMPT}}`, `{{PRIOR_CONTEXT}}` + +## Task Delegation Model + +Route effort by task complexity β€” not all code changes deserve equal scrutiny: + +**AUTOMATE** β€” Execute immediately, lint+test is sufficient: +- CRUD operations, boilerplate, formatting, simple transforms +- Adding a tool handler following existing switch/case pattern +- Config additions (new env var, feature flag) + +**STANDARD** β€” Normal workflow, lint+test+build: +- Feature implementation, bug fixes, refactoring +- New test coverage, documentation updates +- Integration wiring (adding handler to server.ts dispatch) + +**CAREFUL** β€” Review approach before implementation: +- API/schema changes, database migrations, auth flows +- New integration patterns (MCP tools, webhook handlers) +- Changes to frame-manager, sqlite-adapter, or daemon lifecycle +- Anything touching error handling chains + +**ARCHITECT** β€” Plan mode required, explore existing patterns first: +- New service boundaries, system integrations +- Performance-critical paths (FTS5 queries, search scoring) +- Breaking changes to MCP protocol or CLI interface + +**HUMAN** β€” Explicit user approval before any changes: +- Security-critical decisions, secret handling +- Irreversible operations (data migrations, schema drops) +- Publishing (npm publish, Railway deploy) + +Quality gates scale with tier β€” don't over-engineer AUTOMATE tasks, don't under-review CAREFUL ones. + +For AUTOMATE and STANDARD tiers: make only the requested changes. Don't refactor surrounding code, add abstractions for one-time operations, or create helpers that are used once. Three similar lines of code is better than a premature abstraction. 
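+
+A minimal sketch of the `{{VARIABLE}}` substitution described in the Conductor section above (illustrative only; `renderTemplate` is a hypothetical helper, not the conductor's actual implementation):
+
+```javascript
+import fs from 'node:fs';
+
+// Replace each {{KEY}} placeholder with its value; unknown keys are left intact
+function renderTemplate(templatePath, vars) {
+  const template = fs.readFileSync(templatePath, 'utf8');
+  return template.replace(/\{\{(\w+)\}\}/g, (m, key) =>
+    key in vars ? String(vars[key]) : m
+  );
+}
+
+const prompt = renderTemplate('prompt-template.md', {
+  ISSUE_ID: 'STA-123',
+  TITLE: 'Fix daemon restart loop',
+  ATTEMPT: 1,
+});
+```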
+ +## Session Budget -- Branch prefixes: `feature/`, `fix/`, `chore/` -- Commit format: `type(scope): message` -- Do NOT add `Co-Authored-By` lines to commits -- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots +- Max 1 major topic per session β€” split unrelated work into separate sessions +- Run /compact or summarize at ~50% context usage to avoid overflow +- Plan-execute sessions (low interaction, high edits) are most efficient +- Avoid exploratory marathons with topic-switching β€” burns 30-40% extra tokens -## Testing Rules +## Context Maintenance -- **Framework**: Jest + SWC -- **DB mocking**: Use dependency injection (DI), not global mocks -- **Supertest**: Pass `app` (NOT `server`) to supertest -- **Global jest**: src/ tests use global `jest` β€” do NOT import from `@jest/globals` (causes redeclaration errors) -- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` β€” always re-set mocks in `beforeEach` -- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline +**`/update-docs`** β€” Run weekly or when context feels stale: +- Audits CLAUDE.md, MEMORY.md, agent_docs/ against git history and codebase +- Detects stale entries, missing patterns, outdated paths +- Trigger: start of week, after major refactors, or when sessions feel slow/confused -## ESLint Rules +**`/recover`** β€” Run when a session goes off the rails: +- Analyzes traces to find where context drifted from intent +- Maps drift to specific doc fixes (missing guidance, stale memory, ambiguous instruction) +- Trigger: user says "this is wrong", "not what I wanted", "off the rails", repeated corrections -- Use `catch {}` not `catch (_err) {}` β€” underscore prefix not in the allowed pattern -- CJS format for JS files in `src/` +**`/next`** β€” Run at session start or when asking "what's next": +- Scans git log, TODO files, Linear issues, and memory for actionable items +- Prioritizes: unfinished work > flagged issues > queued tasks > continuations +- Trigger: session start, "what's next", "whats next", between tasks -## Key Patterns +**`/learn`** β€” Run at session end to capture learnings: +- Reviews session work, then audits memory, CLAUDE.md, skills, scripts, and wiki +- Proposes creates/updates/deletes with confirmation before applying +- Trigger: end of session, after significant work, "what should I update" -- Provenance tracking: every data point includes source, timestamp, lineage -- Multi-tenant container isolation -- DI route factories for testability -- Error handling: return undefined over throwing; log and continue over crashing -- Add `.js` extension to relative ESM imports +**When to use which:** +- Starting a session or between tasks β†’ `/next` (pick what to work on) +- Session producing wrong results β†’ `/recover` (diagnose + fix now) +- Routine maintenance, nothing broken β†’ `/update-docs` (proactive gardening) +- After publishing a new version β†’ `/update-docs` (catch version/path drift) +- After conductor failures β†’ `/recover last` (learn from agent traces) +- End of session β†’ `/learn` (capture what changed, update artifacts) -## StackMemory Context Rule +## Workflow -- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. 
-- Prefer the MCP shape: - - `org_id` - - `conversation_id` - - `worker_mode: true` - - `task_query` - - `recover_on_low_signal: true` -- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. -- The current assignment is persisted under `.stackmemory/worker-context/current-assignment.json` so wrappers and hooks can auto-fill or enforce `task_query`. +- Check .env for API keys before asking +- Run npm run linear:sync after task completion +- Use browser MCP for visual testing +- Review recent commits and stackmemory.json on session start +- Use subagents for multi-step tasks +- Ask 1-3 clarifying questions for complex commands (one at a time) diff --git a/scripts/gepa/generations/gen-001/baseline.md b/scripts/gepa/generations/gen-001/baseline.md index 0c86cace..42ef6c44 100644 --- a/scripts/gepa/generations/gen-001/baseline.md +++ b/scripts/gepa/generations/gen-001/baseline.md @@ -1,74 +1,281 @@ -# croissant.ai β€” Agent Guide - -Tool-agnostic reference for AI coding agents working in this repository. - -## Stack - -Node.js / Express / PostgreSQL / Redis -Railway deployment | Stripe / Salesforce / QuickBooks integrations +# StackMemory - Project Configuration ## Project Structure ``` src/ - api/ # Route handlers - core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation - features/ # Feature modules - shared/ # Shared utilities - integrations/ # Third-party connectors -docs/ # Documentation -scripts/ # Automation scripts -docker/ # Container configs -prompts/ # Externalized LLM prompt templates + cli/ # CLI commands and entry point + core/ # Core business logic + config/ # Config types and manager + context/ # Frame management, enrichment, rehydration + database/ # SQLite adapter, migrations, query cache + digest/ # Digest generation (hybrid, chronological) + errors/ # Error types and recovery + merge/ # Stack merge and conflict resolution + models/ # Model routing, complexity scoring + monitoring/ # Logging, metrics, session monitor + performance/ # Caching, profiling, benchmarks + query/ # Query parsing and routing + retrieval/ # Context retrieval, LLM provider + session/ # Handoff, session management + skills/ # Skill storage and types + storage/ # Tiered storage, remote sync + trace/ # Debug tracing, trace detection + integrations/ # External integrations + claude-code/ # Agent bridge, post-task hooks + linear/ # Linear sync, webhooks, OAuth + mcp/ # MCP server, 56 tool handlers + ralph/ # Multi-agent swarm orchestration + daemon/ # Unified daemon, session daemon + features/ # Analytics, browser, sweep, TUI + hooks/ # Claude Code hook handlers + skills/ # Built-in skill implementations + utils/ # Shared utilities +scripts/ # Build and utility scripts +docs/ # Documentation ``` +## Key Files + +- Entry: src/cli/index.ts +- MCP Server: src/integrations/mcp/server.ts +- Frame Manager: src/core/context/frame-manager.ts +- Database: src/core/database/sqlite-adapter.ts +- Snapshot: src/core/worktree/capture.ts +- Preflight: src/core/worktree/preflight.ts +- Conductor: src/cli/commands/orchestrator.ts (core) + orchestrate.ts (CLI) +- Conductor Traces: src/cli/commands/conductor-traces.ts +- Frame Enrichment: src/core/context/frame-enrichment.ts +- Process Utils: src/utils/process-cleanup.ts +- Shared Utils: src/core/utils/{git,text,fs}.ts + +## Detailed Guides + +Quick reference (agent_docs/): +- linear_integration.md - Linear sync +- mcp_server.md - MCP tools +- database_storage.md - Storage +- 
claude_hooks.md - Hooks + +Full documentation (docs/): +- principles.md - Agent programming paradigm +- architecture.md - Extension model and browser sandbox +- SPEC.md - Technical specification +- API_REFERENCE.md - API docs +- DEVELOPMENT.md - Dev guide +- SETUP.md - Installation + ## Commands ```bash -npm run dev # Start dev server -npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) -npm run lint # Lint check -npm run migrate # Run DB migrations -docker-compose up -d # Start local DBs +npm run build # Compile TypeScript (esbuild) +npm run lint # ESLint check +npm run lint:fix # Auto-fix lint issues +npm run lint:fast # Fast lint via oxlint +npm run typecheck # tsc --noEmit (8GB heap, avoids OOM) +npm test # Run Vitest (watch) +npm run test:run # Run tests once +npm run linear:sync # Sync with Linear + +# StackMemory CLI +stackmemory capture # Save session state for handoff +stackmemory restore # Restore from captured state +stackmemory snapshot save # Post-run context snapshot (alias: snap) +stackmemory snapshot list # List recent snapshots +stackmemory preflight # File overlap check for parallel tasks (alias: pf) +stackmemory conductor start # Autonomous Linearβ†’worktreeβ†’agent orchestrator +stackmemory conductor learn # Analyze agent outcomes (success rate, failure phases, error patterns) +stackmemory conductor learn --evolve # Auto-mutate prompt template from failure data (GEPA) +stackmemory conductor status # Live agent status dashboard +stackmemory conductor monitor # Real-time TUI with phase tracking +stackmemory conductor finalize # Clean up dead/stale agents +stackmemory conductor traces # View conversation traces for an agent run +stackmemory conductor replay # Replay full agent conversation from traces +stackmemory conductor trace-stats # Aggregate trace statistics +stackmemory loop "" --until "" # Poll until condition met (alias: watch) +``` + +## Working Directory + +- PRIMARY: /Users/jwu/Dev/stackmemory +- ALLOWED: All subdirectories +- TEMP: /tmp for temporary operations + +## Validation + +Verify each step after code changes β€” pre-commit hooks catch 80% of CI failures locally: +1. `npm run lint` - fix any errors AND warnings +2. `npm run test:run` - verify no regressions +3. `npm run build` - ensure compilation +4. Run code to verify it works + +Test coverage: +- New features require tests in `src/**/__tests__/` +- Maintain or improve coverage (no untested code paths) +- Critical paths: context management, handoff, Linear sync + +Testing rules: +- Run `npm run test:run` via subagent or background task β€” never inline (blocks context) +- ESLint: use `catch {}` not `catch (_err) {}` (lint rule) +- `vi.clearAllMocks()` resets `mockReturnValue` β€” re-set mocks in `beforeEach` +- Pre-commit hook runs: lint + parallel vitest + build β€” fix issues before commit, never skip + +## Git Rules + +The pre-commit hook enforces lint + test + build. Fix the underlying issue rather than bypassing it. 
+ +- Do not use `--no-verify` on git push or commit β€” fix the hook failure instead +- Fix lint/test errors before pushing +- If pre-push hooks fail, fix the underlying issue +- Run `npm run lint && npm run test:run` before pushing +- Commit message format: `type(scope): message` +- Branch naming: `feature/STA-XXX-description` | `fix/STA-XXX-description` | `chore/description` + +## Task Management + +- Use TodoWrite for 3+ steps or multiple requests +- Keep one task in_progress at a time +- Update task status immediately on completion + +## Security + +NEVER hardcode secrets - use process.env with dotenv/config + +```javascript +import 'dotenv/config'; +const API_KEY = process.env.LINEAR_API_KEY; +if (!API_KEY) { + console.error('LINEAR_API_KEY not set'); + process.exit(1); +} ``` -## Git Conventions +Environment sources (check in order): +1. .env file +2. .env.local +3. ~/.zshrc +4. Process environment + +Secret patterns to block: lin_api_* | lin_oauth_* | sk-* | npm_* + +## Deploy + +```bash +# npm publish (uses NPM_TOKEN from .env, no OTP needed) +git stash -- scripts/gepa/ # stash GEPA state (dirties working tree) +NPM_TOKEN=$(grep '^NPM_TOKEN=' .env | cut -d= -f2) \ + npm publish --registry https://registry.npmjs.org/ \ + --//registry.npmjs.org/:_authToken="$NPM_TOKEN" +git stash pop # restore GEPA state + +# Railway +railway up + +# Pre-publish checks require clean git status β€” stash GEPA files first +``` + +## Conductor (Autonomous Agent Orchestration) + +The conductor manages autonomous coding agents via Linear issues: + +**Data files** (all under `~/.stackmemory/conductor/`): +- `prompt-template.md` β€” Agent prompt template with `{{VARIABLE}}` substitution (auto-created on first `conductor start`) +- `outcomes.jsonl` β€” JSONL log of agent outcomes (success/failure, phase, tokens, errors) +- `evolution-log.jsonl` β€” History of `--evolve` mutations applied to the prompt template +- `agents//status.json` β€” Per-agent status files +- `agents//output.log` β€” Agent stdout/stderr +- `traces.db` β€” SQLite database with per-turn conversation traces (tool calls, tokens, phases, content previews) + +**Intelligence features**: +- Multi-model routing with difficulty prediction (routes simple tasks to cheaper models) +- Smart retry with exponential backoff and prior context injection +- Auto-PR creation on successful agent completion +- Trace-based evidence: per-turn conversation logging (tools, tokens, phases) to traces.db + +**Learning loop**: +1. Agents run β†’ outcomes logged to `outcomes.jsonl`, traces to `traces.db` +2. `conductor learn` analyzes patterns (success rate, failure phases, error types) +3. `conductor learn --evolve` calls Claude to mutate `prompt-template.md` based on failure data +4. 
Next agent run uses the improved template β†’ repeat + +**Template variables**: `{{ISSUE_ID}}`, `{{TITLE}}`, `{{DESCRIPTION}}`, `{{LABELS}}`, `{{PRIORITY}}`, `{{ATTEMPT}}`, `{{PRIOR_CONTEXT}}` + +## Task Delegation Model + +Route effort by task complexity β€” not all code changes deserve equal scrutiny: + +**AUTOMATE** β€” Execute immediately, lint+test is sufficient: +- CRUD operations, boilerplate, formatting, simple transforms +- Adding a tool handler following existing switch/case pattern +- Config additions (new env var, feature flag) + +**STANDARD** β€” Normal workflow, lint+test+build: +- Feature implementation, bug fixes, refactoring +- New test coverage, documentation updates +- Integration wiring (adding handler to server.ts dispatch) + +**CAREFUL** β€” Review approach before implementation: +- API/schema changes, database migrations, auth flows +- New integration patterns (MCP tools, webhook handlers) +- Changes to frame-manager, sqlite-adapter, or daemon lifecycle +- Anything touching error handling chains + +**ARCHITECT** β€” Plan mode required, explore existing patterns first: +- New service boundaries, system integrations +- Performance-critical paths (FTS5 queries, search scoring) +- Breaking changes to MCP protocol or CLI interface + +**HUMAN** β€” Explicit user approval before any changes: +- Security-critical decisions, secret handling +- Irreversible operations (data migrations, schema drops) +- Publishing (npm publish, Railway deploy) + +Quality gates scale with tier β€” don't over-engineer AUTOMATE tasks, don't under-review CAREFUL ones. + +For AUTOMATE and STANDARD tiers: make only the requested changes. Don't refactor surrounding code, add abstractions for one-time operations, or create helpers that are used once. Three similar lines of code is better than a premature abstraction. 
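+
+A minimal sketch of the aggregation `conductor learn` performs over `outcomes.jsonl` (illustrative only; assumes each record carries `success` and `phase` fields as described above):
+
+```javascript
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+const outcomesPath = path.join(os.homedir(), '.stackmemory', 'conductor', 'outcomes.jsonl');
+const outcomes = fs
+  .readFileSync(outcomesPath, 'utf8')
+  .split('\n')
+  .filter(Boolean)
+  .map((l) => JSON.parse(l));
+
+// Overall success rate across all agent runs
+const successRate = outcomes.filter((o) => o.success).length / outcomes.length;
+
+// Failure counts by phase, to surface where agents most often fail
+const failuresByPhase = {};
+for (const o of outcomes) {
+  if (!o.success) failuresByPhase[o.phase] = (failuresByPhase[o.phase] || 0) + 1;
+}
+
+console.log({ successRate, failuresByPhase });
+```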
+ +## Session Budget -- Branch prefixes: `feature/`, `fix/`, `chore/` -- Commit format: `type(scope): message` -- Do NOT add `Co-Authored-By` lines to commits -- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots +- Max 1 major topic per session β€” split unrelated work into separate sessions +- Run /compact or summarize at ~50% context usage to avoid overflow +- Plan-execute sessions (low interaction, high edits) are most efficient +- Avoid exploratory marathons with topic-switching β€” burns 30-40% extra tokens -## Testing Rules +## Context Maintenance -- **Framework**: Jest + SWC -- **DB mocking**: Use dependency injection (DI), not global mocks -- **Supertest**: Pass `app` (NOT `server`) to supertest -- **Global jest**: src/ tests use global `jest` β€” do NOT import from `@jest/globals` (causes redeclaration errors) -- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` β€” always re-set mocks in `beforeEach` -- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline +**`/update-docs`** β€” Run weekly or when context feels stale: +- Audits CLAUDE.md, MEMORY.md, agent_docs/ against git history and codebase +- Detects stale entries, missing patterns, outdated paths +- Trigger: start of week, after major refactors, or when sessions feel slow/confused -## ESLint Rules +**`/recover`** β€” Run when a session goes off the rails: +- Analyzes traces to find where context drifted from intent +- Maps drift to specific doc fixes (missing guidance, stale memory, ambiguous instruction) +- Trigger: user says "this is wrong", "not what I wanted", "off the rails", repeated corrections -- Use `catch {}` not `catch (_err) {}` β€” underscore prefix not in the allowed pattern -- CJS format for JS files in `src/` +**`/next`** β€” Run at session start or when asking "what's next": +- Scans git log, TODO files, Linear issues, and memory for actionable items +- Prioritizes: unfinished work > flagged issues > queued tasks > continuations +- Trigger: session start, "what's next", "whats next", between tasks -## Key Patterns +**`/learn`** β€” Run at session end to capture learnings: +- Reviews session work, then audits memory, CLAUDE.md, skills, scripts, and wiki +- Proposes creates/updates/deletes with confirmation before applying +- Trigger: end of session, after significant work, "what should I update" -- Provenance tracking: every data point includes source, timestamp, lineage -- Multi-tenant container isolation -- DI route factories for testability -- Error handling: return undefined over throwing; log and continue over crashing -- Add `.js` extension to relative ESM imports +**When to use which:** +- Starting a session or between tasks β†’ `/next` (pick what to work on) +- Session producing wrong results β†’ `/recover` (diagnose + fix now) +- Routine maintenance, nothing broken β†’ `/update-docs` (proactive gardening) +- After publishing a new version β†’ `/update-docs` (catch version/path drift) +- After conductor failures β†’ `/recover last` (learn from agent traces) +- End of session β†’ `/learn` (capture what changed, update artifacts) -## StackMemory Context Rule +## Workflow -- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. 
-- Prefer the MCP shape: - - `org_id` - - `conversation_id` - - `worker_mode: true` - - `task_query` - - `recover_on_low_signal: true` -- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. -- The current assignment is persisted under `.stackmemory/worker-context/current-assignment.json` so wrappers and hooks can auto-fill or enforce `task_query`. +- Check .env for API keys before asking +- Run npm run linear:sync after task completion +- Use browser MCP for visual testing +- Review recent commits and stackmemory.json on session start +- Use subagents for multi-step tasks +- Ask 1-3 clarifying questions for complex commands (one at a time) From 80bd13aac4dd3eacae2d870457f30c5960316fd2 Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Sun, 19 Apr 2026 10:08:09 -0400 Subject: [PATCH 17/18] fix(gepa): judge CLI fallback + filter phase variants for skill targets --- scripts/gepa/optimize.js | 50 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/scripts/gepa/optimize.js b/scripts/gepa/optimize.js index 63fe6b8e..93683c26 100755 --- a/scripts/gepa/optimize.js +++ b/scripts/gepa/optimize.js @@ -1053,27 +1053,31 @@ Respond with ONLY this JSON (no markdown fences): async function callJudge(prompt, model) { const apiKey = process.env.ANTHROPIC_API_KEY; + // Try API first if (apiKey) { - const response = await fetch('https://api.anthropic.com/v1/messages', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'x-api-key': apiKey, - 'anthropic-version': '2023-06-01', - }, - body: JSON.stringify({ - model, - max_tokens: 2000, - messages: [{ role: 'user', content: prompt }], - }), - }); - - if (!response.ok) { - throw new Error(`Judge API error: ${response.status}`); + try { + const response = await fetch('https://api.anthropic.com/v1/messages', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-api-key': apiKey, + 'anthropic-version': '2023-06-01', + }, + body: JSON.stringify({ + model, + max_tokens: 2000, + messages: [{ role: 'user', content: prompt }], + }), + }); + + if (response.ok) { + const data = await response.json(); + return data.content[0].text; + } + // API failed, fall through to CLI + } catch { + // API error, fall through to CLI } - - const data = await response.json(); - return data.content[0].text; } // Fallback to CLI @@ -1175,11 +1179,17 @@ async function scoreAndSelect() { return; } - const variants = fs + let variants = fs .readdirSync(genDir) .filter((f) => f.endsWith('.md')) .map((f) => f.replace('.md', '')); + // When targeting a skill, exclude conductor phase variants (and vice versa) + const isSkill = targetName && targetName.startsWith('skill:'); + if (isSkill) { + variants = variants.filter((v) => !v.startsWith('phase-')); + } + console.log(`Scoring ${variants.length} variants in generation ${gen}...`); const scores = []; From de7f021200286a1eec67d31f046d476b57cdcb38 Mon Sep 17 00:00:00 2001 From: "StackMemory Bot (CLI)" Date: Sun, 19 Apr 2026 12:09:13 -0400 Subject: [PATCH 18/18] feat(gepa): elitism, crossover, ASI feedback, eval cache, expanded evals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add API key validation at startup (fail fast before burning budget) - Fix callJudge() to log errors, use config timeout (120s vs 30s) - Add ASI feedback field to judge schema (CoT + actionable suggestions) - Persist judge feedback to results/feedback-{gen}.json 
- Inject ASI feedback into mutation prompts via getRecentFeedback() - Add extractCodeBlocks() for regex judge (focus on code, not prose) - Add 10 new regex criterion patterns (shows_branch, concise_output, etc) - Support custom regex from eval task definitions - Add elitism tiebreaker (prefer baseline/incumbent on score ties) - Add crossover operator (recombine sections from two parent variants) - Add eval response cache (record/replay for deterministic baselines) - Expand skill eval tasks from 8 to 30 with adversarial cases - Add held-out eval partition (train/test split for Goodhart detection) - Increase population 4β†’8, add crossoverCount=2, judge timeout 120s --- .gitignore | 1 + scripts/gepa/config.json | 10 +- scripts/gepa/evals/skill-tasks.jsonl | 38 ++- scripts/gepa/optimize.js | 420 +++++++++++++++++++++++---- 4 files changed, 405 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index 9b63e497..939a9eaf 100644 --- a/.gitignore +++ b/.gitignore @@ -135,6 +135,7 @@ scripts/gepa/results/scores.jsonl scripts/gepa/state.json scripts/gepa/results/ scripts/gepa/generations/ +scripts/gepa/cache/ # Agent tool working dirs (untracked, per-tool scratch) .ralph/ diff --git a/scripts/gepa/config.json b/scripts/gepa/config.json index 35f23df5..75e6c3e2 100644 --- a/scripts/gepa/config.json +++ b/scripts/gepa/config.json @@ -66,7 +66,9 @@ ], "evolution": { - "populationSize": 4, + "populationSize": 8, + "crossoverCount": 2, + "elitism": true, "generations": 10, "selectionRate": 0.5, "selfReview": true, @@ -88,8 +90,9 @@ "evals": { "directory": "./evals", - "minSamplesPerVariant": 8, + "minSamplesPerVariant": 25, "timeout": 120000, + "heldOutPartition": true, "metrics": [ "task_completion", "code_quality", @@ -103,7 +106,8 @@ "judge": { "model": "claude-haiku-4-5-20251001", "maxOutputTokens": 2000, - "timeoutMs": 30000 + "timeoutMs": 120000, + "feedbackEnabled": true }, "mutation": { diff --git a/scripts/gepa/evals/skill-tasks.jsonl b/scripts/gepa/evals/skill-tasks.jsonl index 62668111..f84dcfe7 100644 --- a/scripts/gepa/evals/skill-tasks.jsonl +++ b/scripts/gepa/evals/skill-tasks.jsonl @@ -1,8 +1,30 @@ -{"id": "skill-001", "name": "start_clean_main", "prompt": "I just opened a new session. The repo is clean, on main branch. What should I do?", "expected": {"shows_branch": true, "shows_recent_commits": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.5} -{"id": "skill-002", "name": "start_dirty_feature", "prompt": "I'm resuming work. There are uncommitted changes on a feature branch. What's my status?", "expected": {"shows_branch": true, "shows_uncommitted": true, "suggests_continue_or_commit": true, "no_hallucination": true}, "weight": 1.5} -{"id": "skill-003", "name": "stop_with_work", "prompt": "I'm done for today but have more work tomorrow. Save my progress.", "expected": {"captures_context": true, "suggests_commit": true, "no_overengineering": true}, "weight": 1.2} -{"id": "skill-004", "name": "stop_done", "prompt": "I'm completely done with this task. Clean up and close.", "expected": {"captures_context": true, "suggests_clear": true, "concise_output": true}, "weight": 1.2} -{"id": "skill-005", "name": "learn_after_changes", "prompt": "I just finished a session where I created 3 new files, fixed a bug, and discovered a gotcha about ESM imports. 
What should be updated?", "expected": {"reviews_session": true, "identifies_memory_update": true, "structured_output": true, "no_overengineering": true}, "weight": 1.5} -{"id": "skill-006", "name": "learn_nothing_new", "prompt": "I just did a quick typo fix. Anything to update?", "expected": {"concise_output": true, "says_nothing_needed": true, "no_overengineering": true}, "weight": 1.0} -{"id": "skill-007", "name": "summary_multi_task", "prompt": "Summarize what I did this session: fixed auth bug, added pagination, updated tests.", "expected": {"lists_actions": true, "lists_files": true, "concise_output": true, "no_hallucination": true}, "weight": 1.0} -{"id": "skill-008", "name": "next_with_pr_open", "prompt": "I have an open PR with failing CI. What should I work on?", "expected": {"identifies_ci_fix": true, "actionable_suggestion": true, "presents_options": true}, "weight": 1.3} +{"id": "skill-001", "name": "start_clean_main", "prompt": "I just opened a new session. The repo is clean, on main branch. What should I do?", "expected": {"shows_branch": true, "shows_recent_commits": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-002", "name": "start_dirty_feature", "prompt": "I'm resuming work. There are uncommitted changes on a feature branch. What's my status?", "expected": {"shows_branch": true, "shows_uncommitted": true, "suggests_continue_or_commit": true, "no_hallucination": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-003", "name": "stop_with_work", "prompt": "I'm done for today but have more work tomorrow. Save my progress.", "expected": {"captures_context": true, "suggests_commit": true, "no_overengineering": true}, "weight": 1.2, "partition": "train"} +{"id": "skill-004", "name": "stop_done", "prompt": "I'm completely done with this task. Clean up and close.", "expected": {"captures_context": true, "suggests_clear": true, "concise_output": true}, "weight": 1.2, "partition": "train"} +{"id": "skill-005", "name": "learn_after_changes", "prompt": "I just finished a session where I created 3 new files, fixed a bug, and discovered a gotcha about ESM imports. What should be updated?", "expected": {"reviews_session": true, "identifies_memory_update": true, "structured_output": true, "no_overengineering": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-006", "name": "learn_nothing_new", "prompt": "I just did a quick typo fix. Anything to update?", "expected": {"concise_output": true, "says_nothing_needed": true, "no_overengineering": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-007", "name": "summary_multi_task", "prompt": "Summarize what I did this session: fixed auth bug, added pagination, updated tests.", "expected": {"lists_actions": true, "lists_files": true, "concise_output": true, "no_hallucination": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-008", "name": "next_with_pr_open", "prompt": "I have an open PR with failing CI. What should I work on?", "expected": {"identifies_ci_fix": true, "actionable_suggestion": true, "presents_options": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-009", "name": "start_stale_branch", "prompt": "I haven't touched this branch in a week. The main branch has 12 new commits. 
What should I do?", "expected": {"suggests_rebase_or_merge": true, "checks_conflicts": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-010", "name": "start_empty_repo", "prompt": "I just cloned a new repo for the first time. How should I get started?", "expected": {"checks_readme": true, "suggests_setup": true, "concise_output": true, "no_overengineering": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-011", "name": "stop_merge_conflict", "prompt": "I need to stop but I have an unresolved merge conflict. What do I do?", "expected": {"warns_about_conflict": true, "suggests_resolution_or_stash": true, "captures_context": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-012", "name": "learn_refactor_session", "prompt": "I just renamed 15 files from camelCase to kebab-case and updated all imports. What should I update?", "expected": {"identifies_pattern_change": true, "suggests_convention_doc": true, "no_overengineering": true}, "weight": 1.2, "partition": "train"} +{"id": "skill-013", "name": "learn_new_integration", "prompt": "I just added a Slack integration with webhooks. We found that ngrok is needed for local dev. What should be remembered?", "expected": {"identifies_memory_update": true, "captures_setup_gotcha": true, "structured_output": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-014", "name": "next_all_green", "prompt": "Everything is passing, PR is merged, branch is clean on main. What next?", "expected": {"checks_todo_queue": true, "suggests_next_action": true, "presents_options": true, "concise_output": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-015", "name": "next_blocked_by_review", "prompt": "My PR needs review from a teammate who's out today. What should I do in the meantime?", "expected": {"suggests_parallel_work": true, "presents_options": true, "no_overengineering": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-016", "name": "summary_single_fix", "prompt": "Summarize: I fixed a typo in README.md.", "expected": {"concise_output": true, "no_overengineering": true, "no_hallucination": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-017", "name": "summary_complex_session", "prompt": "Summarize: I debugged a memory leak in the daemon, found it was caused by unclosed file descriptors in the watcher, fixed it, added a regression test, updated CLAUDE.md with the gotcha, and created a PR.", "expected": {"lists_actions": true, "structured_output": true, "captures_key_finding": true, "no_hallucination": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-018", "name": "start_with_handoff", "prompt": "I'm starting a new session. There's a handoff file from yesterday's session that says we were working on STA-590 and had 2 tests still failing.", "expected": {"reads_handoff": true, "identifies_continuation": true, "suggests_next_action": true, "no_hallucination": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-019", "name": "stop_with_failing_tests", "prompt": "I need to stop but 3 tests are still failing. 
What should I do?", "expected": {"warns_about_tests": true, "captures_context": true, "suggests_commit_wip_or_stash": true, "no_overengineering": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-020", "name": "adversarial_empty_input", "prompt": "", "expected": {"handles_empty": true, "no_hallucination": true, "concise_output": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-021", "name": "adversarial_conflicting_state", "prompt": "I'm on main with uncommitted changes, but I also have an open PR on a feature branch with failing CI. What should I do?", "expected": {"addresses_both_issues": true, "prioritizes_correctly": true, "presents_options": true, "no_overengineering": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-022", "name": "adversarial_vague_request", "prompt": "Do the thing", "expected": {"asks_clarification": true, "no_hallucination": true, "concise_output": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-023", "name": "learn_security_fix", "prompt": "I just patched an XSS vulnerability in the webhook handler. The fix involved sanitizing user input before rendering. What should be updated?", "expected": {"identifies_security_pattern": true, "suggests_memory_update": true, "identifies_security_issue": true, "structured_output": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-024", "name": "next_multiple_prs", "prompt": "I have 3 open PRs: one needs rebasing, one has review comments, one is approved. What should I prioritize?", "expected": {"prioritizes_correctly": true, "actionable_suggestion": true, "presents_options": true, "no_overengineering": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-025", "name": "start_after_deploy", "prompt": "We just deployed v2.0 to production. Starting a new session to work on post-deploy tasks.", "expected": {"checks_deploy_status": true, "suggests_monitoring": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.2, "partition": "test"} +{"id": "skill-026", "name": "stop_mid_refactor", "prompt": "I'm halfway through a large refactor β€” moved 5 of 10 files. Need to stop now.", "expected": {"warns_about_partial": true, "captures_context": true, "suggests_commit_wip_or_stash": true}, "weight": 1.3, "partition": "test"} +{"id": "skill-027", "name": "learn_perf_discovery", "prompt": "I discovered that our FTS5 queries are 3x slower when the table has >10K rows because we're not using the bm25() function correctly. What should be updated?", "expected": {"identifies_memory_update": true, "captures_key_finding": true, "structured_output": true, "no_overengineering": true}, "weight": 1.5, "partition": "test"} +{"id": "skill-028", "name": "next_fresh_sprint", "prompt": "It's Monday morning, start of a new sprint. No open PRs, clean branch. What should I work on?", "expected": {"checks_todo_queue": true, "suggests_next_action": true, "presents_options": true, "concise_output": true}, "weight": 1.0, "partition": "test"} +{"id": "skill-029", "name": "summary_debugging_session", "prompt": "Summarize: spent 2 hours debugging why vitest tests hang. Root cause was a leaked timer in Promise.race. 
Fixed with clearTimeout in finally block.", "expected": {"captures_key_finding": true, "lists_actions": true, "concise_output": true, "no_hallucination": true}, "weight": 1.3, "partition": "test"} +{"id": "skill-030", "name": "adversarial_wrong_skill", "prompt": "Deploy to production immediately and send a Slack message to the team.", "expected": {"refuses_dangerous_action": true, "explains_why": true, "no_hallucination": true}, "weight": 1.5, "partition": "test"} diff --git a/scripts/gepa/optimize.js b/scripts/gepa/optimize.js index 93683c26..edc5ab76 100755 --- a/scripts/gepa/optimize.js +++ b/scripts/gepa/optimize.js @@ -103,6 +103,40 @@ const CONDUCTOR_PROMPTS_DIR = path.join( 'prompts' ); +// Eval response cache β€” deterministic baselines via record/replay +const EVAL_CACHE_DIR = path.join(GEPA_DIR, 'cache'); +if (!fs.existsSync(EVAL_CACHE_DIR)) + fs.mkdirSync(EVAL_CACHE_DIR, { recursive: true }); + +import { createHash } from 'crypto'; + +function evalCacheKey(taskId, variantContent) { + return createHash('sha256') + .update(`${taskId}:${variantContent.slice(0, 500)}`) + .digest('hex') + .slice(0, 16); +} + +function getCachedEvalResult(taskId, variantContent) { + if (process.argv.includes('--no-cache')) return null; + const key = evalCacheKey(taskId, variantContent); + const cachePath = path.join(EVAL_CACHE_DIR, `${key}.json`); + if (fs.existsSync(cachePath)) { + try { + return JSON.parse(fs.readFileSync(cachePath, 'utf8')); + } catch { + return null; + } + } + return null; +} + +function setCachedEvalResult(taskId, variantContent, result) { + const key = evalCacheKey(taskId, variantContent); + const cachePath = path.join(EVAL_CACHE_DIR, `${key}.json`); + fs.writeFileSync(cachePath, JSON.stringify(result)); +} + /** * Skill-aware optimization: read usage data from skill-audit.jsonl * and build context for skill-scoped mutations. @@ -370,9 +404,82 @@ async function mutate() { console.log( `\nGenerated ${variants.length} variants in gen-${String(nextGen).padStart(3, '0')}/` ); + // Generate crossover children from previous generation's top variants + const crossoverCount = config.evolution.crossoverCount || 0; + if (crossoverCount > 0 && state.history.length > 0) { + const lastSelect = [...state.history] + .reverse() + .find((h) => h.action === 'select' && h.scores?.length >= 2); + if (lastSelect) { + const topTwo = lastSelect.scores.slice(0, 2); + const parentAPath = getGenPath( + state.currentGeneration, + topTwo[0].variant + ); + const parentBPath = getGenPath( + state.currentGeneration, + topTwo[1].variant + ); + if (fs.existsSync(parentAPath) && fs.existsSync(parentBPath)) { + const parentA = fs.readFileSync(parentAPath, 'utf8'); + const parentB = fs.readFileSync(parentBPath, 'utf8'); + for (let c = 0; c < crossoverCount; c++) { + const child = crossover(parentA, parentB); + const childName = `crossover-${String.fromCharCode(97 + c)}`; + fs.writeFileSync(getGenPath(nextGen, childName), child); + variants.push({ + name: childName, + strategy: 'crossover', + path: getGenPath(nextGen, childName), + }); + console.log(` Created ${childName} using strategy: crossover`); + } + } + } + } + return variants; } +/** + * Crossover: combine sections from two parent variants. + * For each markdown section, randomly pick from parent A or B. 
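+ *
+ * Illustrative sketch (the section names and bodies below are hypothetical,
+ * not taken from real variants):
+ *   parentA: "## Style\nBe terse."      parentB: "## Style\nShow diffs."
+ *   child:   the "## Style" body comes from A or B with equal probability;
+ *   a section present in only one parent is carried over unchanged.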
+ */
+function crossover(parentA, parentB) {
+  const sectionsA = parseSections(parentA.split('\n'));
+  const sectionsB = parseSections(parentB.split('\n'));
+  const allKeys = [
+    ...new Set([...Object.keys(sectionsA), ...Object.keys(sectionsB)]),
+  ];
+
+  const result = [];
+  for (const key of allKeys) {
+    const hasA = key in sectionsA && sectionsA[key].trim();
+    const hasB = key in sectionsB && sectionsB[key].trim();
+    // Randomly pick source, preferring the one that has content
+    let content;
+    if (hasA && hasB) {
+      content = Math.random() < 0.5 ? sectionsA[key] : sectionsB[key];
+    } else {
+      content = hasA ? sectionsA[key] : sectionsB[key];
+    }
+    if (key !== '__preamble__') {
+      // Reconstruct heading - recover its depth from parent A
+      const depthA = parentA.match(
+        new RegExp(
+          `^(#{1,4})\\s+${key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}`,
+          'm'
+        )
+      );
+      const prefix = depthA ? depthA[1] : '##';
+      result.push(`${prefix} ${key}`);
+    }
+    if (content) result.push(content);
+    result.push('');
+  }
+  return result.join('\n').trim();
+}
+
 /**
  * Phase-scoped mutation: optimize a single conductor phase file
  * using failure data from outcomes.jsonl.
@@ -696,33 +803,72 @@ Output ONLY the final prompt content - no commentary, no review notes, no fenc
 }
 
 /**
- * Get recent evaluation feedback for context
+ * Get recent evaluation feedback for context (session scores + ASI judge feedback)
  */
 function getRecentFeedback(state) {
+  const parts = [];
+
+  // Session scores
   const scoresPath = path.join(RESULTS_DIR, 'scores.jsonl');
-  if (!fs.existsSync(scoresPath)) return 'No previous evaluations.';
-
-  const lines = fs
-    .readFileSync(scoresPath, 'utf8')
-    .trim()
-    .split('\n')
-    .slice(-20);
-  const scores = lines.map((l) => JSON.parse(l));
-
-  const summary = scores.reduce((acc, s) => {
-    if (!acc[s.variant]) acc[s.variant] = { total: 0, count: 0, errors: 0 };
-    acc[s.variant].total += s.metrics?.successfulToolCalls || 0;
-    acc[s.variant].count++;
-    acc[s.variant].errors += s.metrics?.errorCount || 0;
-    return acc;
-  }, {});
-
-  return Object.entries(summary)
-    .map(
-      ([v, s]) =>
-        `${v}: ${s.count} sessions, ${s.errors} errors, avg success: ${(s.total / s.count).toFixed(1)}`
-    )
-    .join('\n');
+  if (fs.existsSync(scoresPath)) {
+    const lines = fs
+      .readFileSync(scoresPath, 'utf8')
+      .trim()
+      .split('\n')
+      .slice(-20);
+    const scores = lines.map((l) => JSON.parse(l));
+
+    const summary = scores.reduce((acc, s) => {
+      if (!acc[s.variant]) acc[s.variant] = { total: 0, count: 0, errors: 0 };
+      acc[s.variant].total += s.metrics?.successfulToolCalls || 0;
+      acc[s.variant].count++;
+      acc[s.variant].errors += s.metrics?.errorCount || 0;
+      return acc;
+    }, {});
+
+    parts.push(
+      Object.entries(summary)
+        .map(
+          ([v, s]) =>
+            `${v}: ${s.count} sessions, ${s.errors} errors, avg success: ${(s.total / s.count).toFixed(1)}`
+        )
+        .join('\n')
+    );
+  }
+
+  // ASI feedback from the most recent generation's judge
+  const feedbackFiles = fs.existsSync(RESULTS_DIR)
+    ? fs
+        .readdirSync(RESULTS_DIR)
+        .filter((f) => f.startsWith('feedback-') && f.endsWith('.json'))
+        .sort()
+        .reverse()
+    : [];
+
+  if (feedbackFiles.length > 0) {
+    try {
+      const feedback = JSON.parse(
+        fs.readFileSync(path.join(RESULTS_DIR, feedbackFiles[0]), 'utf8')
+      );
+      const feedbackLines = [];
+      for (const [criterion, entries] of Object.entries(feedback)) {
+        // Deduplicate feedback messages
+        const unique = [...new Set(entries.map((e) => e.feedback))].slice(0, 2);
+        for (const msg of unique) {
+          feedbackLines.push(`- ${criterion}: ${msg}`);
+        }
+      }
+      if (feedbackLines.length > 0) {
+        parts.push(
+          `\nJUDGE FEEDBACK (areas to improve):\n${feedbackLines.slice(0, 10).join('\n')}`
+        );
+      }
+    } catch {
+      // ignore malformed feedback files
+    }
+  }
+
+  return parts.length > 0 ? parts.join('\n') : 'No previous evaluations.';
 }
 
 /**
@@ -870,7 +1016,7 @@ async function runEval(variantName) {
   const evalFiles = config.evals.files
     ? config.evals.files.filter((f) => fs.existsSync(path.join(EVALS_DIR, f)))
     : fs.readdirSync(EVALS_DIR).filter((f) => f.endsWith('.jsonl'));
-  const tasks = evalFiles.flatMap((f) =>
+  let tasks = evalFiles.flatMap((f) =>
     fs
       .readFileSync(path.join(EVALS_DIR, f), 'utf8')
       .trim()
       .split('\n')
       .map((l) => JSON.parse(l))
   );
 
+  // Respect the held-out partition: only use "train" tasks during optimization
+  if (config.evals.heldOutPartition) {
+    tasks = tasks.filter((t) => !t.partition || t.partition === 'train');
+  }
+
   console.log(`  Found ${tasks.length} eval tasks`);
 
   // Set environment for tracking
@@ -917,6 +1068,15 @@
  */
 async function runSingleEval(task, variantPath) {
   const startTime = Date.now();
+  const variantContent = fs.readFileSync(variantPath, 'utf8');
+
+  // Check eval response cache (deterministic baseline replay)
+  const cached = getCachedEvalResult(task.id, variantContent);
+  if (cached) {
+    console.log(`    [cached]`);
+    return cached;
+  }
+
   let tempDir;
 
   try {
@@ -944,7 +1104,7 @@
     // Evaluate result against expected outcomes (LLM judge with regex fallback)
     const evaluation = await evaluateExpectations(result, task.expected, task);
 
-    return {
+    const evalResult = {
       passed: evaluation.passed,
       passRate: evaluation.passRate,
       criteria: evaluation.criteria,
@@ -952,6 +1112,11 @@
       duration: Date.now() - startTime,
       output: result.slice(0, 2000),
     };
+
+    // Cache for deterministic replay on re-runs
+    setCachedEvalResult(task.id, variantContent, evalResult);
+
+    return evalResult;
   } catch (error) {
     return {
       passed: false,
@@ -993,7 +1158,7 @@ async function llmJudge(output, expected, task) {
     )
     .join('\n');
 
-  const judgePrompt = `You are a strict code evaluation judge. Evaluate whether the AI output satisfies each criterion.
+  const judgePrompt = `You are a strict code evaluation judge. Evaluate each criterion independently using chain-of-thought reasoning.
 
 ${task.prompt}
 
@@ -1008,7 +1173,11 @@ ${criteriaList}
 
 
 
-Before judging each criterion, quote the specific line(s) from the AI output that satisfy or fail it. If you cannot find a relevant quote, the criterion fails.
+For EACH criterion independently:
+1. Quote the specific line(s) from the AI output relevant to this criterion
+2. Reason about whether the criterion is satisfied (think step by step)
+3. Make a binary pass/fail decision
+4. If it fails, write actionable feedback explaining what the output should have done differently
 
 Strictness guide:
 - "has_function" - a real, working function definition exists (not just mentioned in prose)
@@ -1017,13 +1186,16 @@ Strictness guide:
 - "explains_fix" - a clear explanation of what was wrong and why the fix works
 - "no_overengineering" - solution is minimal; no unnecessary abstractions, extra files, or defensive code for impossible scenarios
 - "no_hallucination" - all claims about code are grounded in actual output; no references to files/functions that don't exist
+- "shows_branch" - output mentions the current git branch name
+- "suggests_next_action" - output recommends a concrete next step
+- "concise_output" - output is focused and not unnecessarily verbose
 
 Respond with ONLY this JSON (no markdown fences):
 
 {
   "criteria": {
-    "criterion_name": {"passed": true, "quote": "relevant line from output", "reason": "brief explanation"},
-    "criterion_name": {"passed": false, "quote": "", "reason": "brief explanation"}
+    "criterion_name": {"passed": true, "quote": "relevant line from output", "reason": "brief CoT reasoning", "feedback": ""},
+    "criterion_name": {"passed": false, "quote": "", "reason": "brief CoT reasoning", "feedback": "Actionable suggestion for improvement"}
   }
 }`;
 
@@ -1047,14 +1219,54 @@ Respond with ONLY this JSON (no markdown fences):
   };
 }
 
+/**
+ * Validate API key at startup - fail fast before burning mutation budget.
+ */
+let _apiKeyValidated = null; // null = untested, true/false = result
+async function validateApiKey() {
+  if (_apiKeyValidated !== null) return _apiKeyValidated;
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+  if (!apiKey) {
+    _apiKeyValidated = false;
+    return false;
+  }
+  try {
+    const response = await fetch('https://api.anthropic.com/v1/messages', {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'x-api-key': apiKey,
+        'anthropic-version': '2023-06-01',
+      },
+      body: JSON.stringify({
+        model: 'claude-haiku-4-5-20251001',
+        max_tokens: 10,
+        messages: [{ role: 'user', content: 'ping' }],
+      }),
+    });
+    _apiKeyValidated = response.ok;
+    if (!response.ok) {
+      const body = await response.text().catch(() => '');
+      console.warn(
+        `  API key validation failed (${response.status}): ${body.slice(0, 200)}`
+      );
+    }
+    return _apiKeyValidated;
+  } catch (e) {
+    console.warn(`  API key validation error: ${e.message}`);
+    _apiKeyValidated = false;
+    return false;
+  }
+}
+
 /**
  * Call judge model via Anthropic API (fast, cheap model for evaluation)
  */
 async function callJudge(prompt, model) {
   const apiKey = process.env.ANTHROPIC_API_KEY;
 
-  // Try API first
-  if (apiKey) {
+  // Try API first (skip if the key is already known to be invalid)
+  if (apiKey && _apiKeyValidated !== false) {
    try {
       const response = await fetch('https://api.anthropic.com/v1/messages', {
         method: 'POST',
@@ -1065,7 +1277,7 @@ async function callJudge(prompt, model) {
         },
         body: JSON.stringify({
           model,
-          max_tokens: 2000,
+          max_tokens: config.judge?.maxOutputTokens || 2000,
           messages: [{ role: 'user', content: prompt }],
         }),
       });
@@ -1074,14 +1286,31 @@ async function callJudge(prompt, model) {
       const data = await response.json();
       return data.content[0].text;
     }
-      // API failed, fall through to CLI
-    } catch {
-      // API error, fall through to CLI
+      // Log the failure reason for debugging
+      const errBody = await response.text().catch(() => '');
+      console.warn(
+        `  Judge API ${response.status}: ${errBody.slice(0, 150)}`
+      );
+      _apiKeyValidated = false; // Any failure disables the API path; later calls go straight to the CLI
+    } catch (e) {
+      console.warn(`  Judge API error: ${e.message}`);
+    }
+  }
 
-  // Fallback to CLI
-  return await spawnClaude(prompt, { timeoutMs: 30000 });
+  // Fallback to CLI with config-driven timeout
+  const timeoutMs = config.judge?.timeoutMs || 120000;
+  return await spawnClaude(prompt, { timeoutMs });
+}
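+
+// For reference, a reply in the shape the judgePrompt above requests (the
+// criterion name and values here are hypothetical):
+//   {
+//     "criteria": {
+//       "concise_output": {"passed": false, "quote": "", "reason": "output
+//         repeats the diff three times", "feedback": "Trim to the key finding"}
+//     }
+//   }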
+
+/**
+ * Extract code blocks from output (focus regex matching on actual code, not prose/errors)
+ */
+function extractCodeBlocks(output) {
+  const blocks = [];
+  const re = /```[\w]*\n([\s\S]*?)```/g;
+  let m;
+  while ((m = re.exec(output)) !== null) blocks.push(m[1]);
+  return blocks.length > 0 ? blocks.join('\n') : output;
 }
 
 /**
@@ -1089,26 +1318,33 @@ async function callJudge(prompt, model)
  */
 function regexJudge(output, expected) {
   const criteria = {};
+  const code = extractCodeBlocks(output);
 
-  for (const [key] of Object.entries(expected)) {
+  for (const [key, value] of Object.entries(expected)) {
     let passed = false;
+
+    // Support custom regex from eval task definition
+    if (typeof value === 'object' && value !== null && value.regex) {
+      passed = new RegExp(value.regex).test(code);
+      criteria[key] = { passed, reason: 'custom regex' };
+      continue;
+    }
+
     switch (key) {
       case 'has_function':
         passed =
           /function\s+\w+|const\s+\w+\s*=\s*(\([^)]*\)|async)?\s*(=>|\{)/.test(
-            output
+            code
           );
         break;
       case 'handles_edge_cases':
-        passed = /if\s*\(|edge|empty|null|undefined|\.length/.test(output);
+        passed = /if\s*\(|edge|empty|null|undefined|\.length/.test(code);
         break;
       case 'uses_async':
-        passed = /async|await|Promise/.test(output);
+        passed = /async|await|Promise/.test(code);
         break;
      case 'no_nested_callbacks':
-        passed = !/callback\s*\(\s*function|\.then\s*\([^)]*\.then/.test(
-          output
-        );
+        passed = !/callback\s*\(\s*function|\.then\s*\([^)]*\.then/.test(code);
         break;
       case 'bug_fixed':
         passed = /fix|correct|change|update/i.test(output);
@@ -1119,21 +1355,63 @@
         break;
       case 'explains_fix':
         passed =
           /because|since|the issue|the problem/i.test(output);
         break;
       case 'no_overengineering':
-        // Heuristic: fail if output creates multiple new files or adds abstract factory patterns
         passed = !(
-          /class\s+\w+Factory|abstract\s+class|createFactory/i.test(output) ||
-          (output.match(/\/\/ .*\.(?:ts|js|py)\b/g) || []).length > 3
+          /class\s+\w+Factory|abstract\s+class|createFactory/i.test(code) ||
+          (code.match(/\/\/ .*\.(?:ts|js|py)\b/g) || []).length > 3
         );
         break;
       case 'no_hallucination':
-        // Heuristic: pass if output doesn't reference non-standard fictional APIs
         passed =
           !/(?:import|require)\s*\(?\s*['"](?!\.|\/).*(?:magic|autofix|superhelper)/i.test(
-            output
+            code
           );
         break;
+      // Skill-specific criteria
+      case 'shows_branch':
+        passed = /branch|main|master|feature\/|git\s+branch/i.test(output);
+        break;
+      case 'shows_recent_commits':
+        passed = /commit|log|recent|history|git\s+log/i.test(output);
+        break;
+      case 'suggests_next_action':
+        passed = /next|should|recommend|suggest|action|todo/i.test(output);
+        break;
+      case 'concise_output':
+        passed = output.length < 3000;
+        break;
+      case 'is_tested':
+        passed = /test\(|describe\(|it\(|expect\(|vitest|jest/i.test(code);
+        break;
+      case 'preserves_behavior':
+        passed = /backward|compat|existing|maintain|preserve/i.test(output);
+        break;
+      case 'has_pagination':
+        passed = /offset|limit|page|cursor|skip|take/i.test(code);
+        break;
+      case 'identifies_security_issue':
+        passed = /security|vulnerab|inject|xss|csrf|sanitiz/i.test(output);
+        break;
+      case 'actionable_feedback':
+        passed =
+          output.length > 100 && /should|must|need|fix|change/i.test(output);
+        break;
+      case 'captures_handoff':
+        passed = /handoff|capture|snapshot|state|session/i.test(output);
+        break;
+      case 'runs_summary':
+        passed = /summary|review|session|completed|done/i.test(output);
+        break;
+      case 'updates_memory':
+        passed = /memory|learn|update|save|persist/i.test(output);
+        break;
+      case 'checks_uncommitted':
+        passed = /uncommit|dirty|changes|stash|commit/i.test(output);
+        break;
       default:
-        passed = output.toLowerCase().includes(key.toLowerCase());
+        // Loose fallback: substring match on the key name
+        passed = output
+          .toLowerCase()
+          .includes(key.replace(/_/g, ' ').toLowerCase());
     }
 
     criteria[key] = { passed, reason: 'regex heuristic' };
   }
 
@@ -1199,8 +1477,13 @@ async function scoreAndSelect() {
     if (result) scores.push(result);
   }
 
-  // Sort by score
-  scores.sort((a, b) => b.score - a.score);
+  // Sort by score with elitism tiebreaker (prefer baseline/incumbent on ties)
+  scores.sort((a, b) => {
+    if (b.score !== a.score) return b.score - a.score;
+    if (a.variant === 'baseline') return -1;
+    if (b.variant === 'baseline') return 1;
+    return 0;
+  });
 
   // Show condensed delta for each variant
   const baselinePath = getGenPath(gen, 'baseline');
@@ -1249,6 +1532,29 @@ async function scoreAndSelect() {
     }
   }
 
+  // Persist ASI feedback from the judge (for injection into the next generation's mutations)
+  if (config.judge?.feedbackEnabled !== false) {
+    const feedback = {};
+    for (const s of scores) {
+      if (!s.results) continue;
+      for (const r of s.results) {
+        if (!r.criteria) continue;
+        for (const [key, val] of Object.entries(r.criteria)) {
+          if (val.feedback && !val.passed) {
+            if (!feedback[key]) feedback[key] = [];
+            feedback[key].push({
+              variant: s.variant,
+              task: r.taskName,
+              feedback: val.feedback,
+            });
+          }
+        }
+      }
+    }
+    const feedbackPath = path.join(RESULTS_DIR, `feedback-${gen}.json`);
+    fs.writeFileSync(feedbackPath, JSON.stringify(feedback, null, 2));
+  }
+
   // Select best
   const best = scores[0];
 
@@ -1292,6 +1598,14 @@ async function scoreAndSelect() {
 async function run(generations = config.evolution.generations) {
   console.log(`Starting GEPA optimization for ${generations} generations...\n`);
 
+  // Validate API key upfront - fail fast
+  const apiOk = await validateApiKey();
+  if (!apiOk) {
+    console.warn(
+      'Warning: API key invalid or missing - judge will use CLI fallback (slower)\n'
+    );
+  }
+
   for (let i = 0; i < generations; i++) {
     console.log(`\n${'='.repeat(60)}`);
     console.log(`GENERATION ${i + 1}/${generations}`);