stackmemoryai · jonathanpeterwu · Apr 20, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/.claude/settings.json b/.claude/settings.json
@@ -52,6 +52,16 @@
             "command": "entire hooks claude-code stop"
           }
         ]
+      },
+      {
+        "matcher": "",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "node scripts/gepa/hooks/gepa-session-hook.js",
+            "async": true
+          }
+        ]
       }
     ],
     "PreToolUse": [

diff --git a/.gitignore b/.gitignore
@@ -135,6 +135,7 @@ scripts/gepa/results/scores.jsonl
 scripts/gepa/state.json
 scripts/gepa/results/
 scripts/gepa/generations/
+scripts/gepa/cache/
 
 # Agent tool working dirs (untracked, per-tool scratch)
 .ralph/

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -258,12 +258,18 @@ For AUTOMATE and STANDARD tiers: make only the requested changes. Don't refactor
 - Prioritizes: unfinished work > flagged issues > queued tasks > continuations
 - Trigger: session start, "what's next", "whats next", between tasks
 
+**`/learn`** — Run at session end to capture learnings:
+- Reviews session work, then audits memory, CLAUDE.md, skills, scripts, and wiki
+- Proposes creates/updates/deletes with confirmation before applying
+- Trigger: end of session, after significant work, "what should I update"
+
 **When to use which:**
 - Starting a session or between tasks → `/next` (pick what to work on)
 - Session producing wrong results → `/recover` (diagnose + fix now)
 - Routine maintenance, nothing broken → `/update-docs` (proactive gardening)
 - After publishing a new version → `/update-docs` (catch version/path drift)
 - After conductor failures → `/recover last` (learn from agent traces)
+- End of session → `/learn` (capture what changed, update artifacts)
 
 ## Workflow
 

diff --git a/docs/prds/substrate-enterprise-brain.md b/docs/prds/substrate-enterprise-brain.md
diff --git a/package.json b/package.json
@@ -141,6 +141,9 @@
     "sync:start": "node scripts/background-sync-manager.js",
     "sync:setup": "./scripts/setup-background-sync.sh",
     "eval:cord": "npx tsx scripts/evals/cord-vs-flat-eval.ts",
+    "gepa:eval": "node scripts/gepa/eval-phases.js",
+    "gepa:eval:json": "node scripts/gepa/eval-phases.js --json",
+    "gepa:mine": "node scripts/gepa/gold/mine-traces.js",
     "prepare": "echo 'Prepare step completed'",
     "verify:dist": "node scripts/verify-dist.cjs",
     "test:smoke-db": "bash scripts/smoke-init-db.sh",

diff --git a/scripts/conductor/after-run.sh b/scripts/conductor/after-run.sh
@@ -1,22 +1,42 @@
 #!/usr/bin/env bash
 # Conductor after_run hook
-# Captures context from the agent run and tags it with the issue identifier
-# Called after each agent attempt (success or failure)
+# 1. Captures context from the agent run
+# 2. Triggers GEPA session hook (accumulates toward auto-optimization)
+# 3. Triggers DSPy optimization every 50 runs
 #
 # Environment: SYMPHONY_WORKSPACE_DIR, SYMPHONY_ISSUE_ID, SYMPHONY_ISSUE_IDENTIFIER
 set -euo pipefail
 
 WORKSPACE="${SYMPHONY_WORKSPACE_DIR:-$(pwd)}"
 ISSUE_ID="${SYMPHONY_ISSUE_IDENTIFIER:-${SYMPHONY_ISSUE_ID:-unknown}}"
 ATTEMPT="${SYMPHONY_ATTEMPT:-1}"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 
 cd "$WORKSPACE"
 
-# Capture context from this run, tagged with issue ID and attempt number
+# 1. Capture context from this run, tagged with issue ID and attempt number
 stackmemory conductor capture \
   --issue "$ISSUE_ID" \
   --workspace "$WORKSPACE" \
   --attempt "$ATTEMPT" \
   2>/dev/null || true
 
 echo "[conductor] Context captured for $ISSUE_ID (attempt $ATTEMPT)"
+
+# 2. Trigger GEPA session hook (accumulates sessions, auto-optimizes at threshold).
+#    Launched fully detached (nohup, stdout and stderr discarded), the same way the
+#    DSPy job below is launched, so the background process never keeps this hook's
+#    output streams open after the script itself has exited.
+GEPA_HOOK="$PROJECT_ROOT/scripts/gepa/hooks/gepa-session-hook.js"
+if [ -f "$GEPA_HOOK" ]; then
+  nohup node "$GEPA_HOOK" >/dev/null 2>&1 &
+fi
+
+# 3. Trigger DSPy optimization every 50 agent runs.
+#    The run count is the number of lines in outcomes.jsonl; the job is started only
+#    when that count is a non-zero multiple of 50.
+OUTCOMES_PATH="$HOME/.stackmemory/conductor/outcomes.jsonl"
+DSPY_OPTIMIZE="$PROJECT_ROOT/scripts/dspy/optimize.py"
+if [ -f "$OUTCOMES_PATH" ] && [ -f "$DSPY_OPTIMIZE" ]; then
+  OUTCOMES_COUNT=$(wc -l < "$OUTCOMES_PATH" 2>/dev/null || echo 0)
+  if [ $((OUTCOMES_COUNT % 50)) -eq 0 ] && [ "$OUTCOMES_COUNT" -gt 0 ]; then
+    echo "[conductor] Triggering DSPy optimization (${OUTCOMES_COUNT} runs)"
+    nohup python3 "$DSPY_OPTIMIZE" --quiet >/dev/null 2>&1 &
+  fi
+fi
diff --git a/scripts/gepa/config.json b/scripts/gepa/config.json
@@ -32,11 +32,43 @@
       "file": "CLAUDE.md",
       "evals": ["stackmemory-tasks.jsonl"],
       "description": "StackMemory project prompt"
+    },
+    {
+      "name": "skill:start",
+      "file": "~/.claude/commands/start.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session boot skill"
+    },
+    {
+      "name": "skill:stop",
+      "file": "~/.claude/commands/stop.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session close skill"
+    },
+    {
+      "name": "skill:learn",
+      "file": "~/.claude/commands/learn.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session review + artifact update skill"
+    },
+    {
+      "name": "skill:next",
+      "file": "~/.claude/commands/next.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Next action recommendation skill"
+    },
+    {
+      "name": "skill:summary",
+      "file": "~/.claude/commands/summary.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session summary skill"
     }
   ],
 
   "evolution": {
-    "populationSize": 4,
+    "populationSize": 8,
+    "crossoverCount": 2,
+    "elitism": true,
     "generations": 10,
     "selectionRate": 0.5,
     "selfReview": true,
@@ -58,8 +90,9 @@
 
   "evals": {
     "directory": "./evals",
-    "minSamplesPerVariant": 8,
+    "minSamplesPerVariant": 25,
     "timeout": 120000,
+    "heldOutPartition": true,
     "metrics": [
       "task_completion",
       "code_quality",
@@ -73,7 +106,8 @@
   "judge": {
     "model": "claude-haiku-4-5-20251001",
     "maxOutputTokens": 2000,
-    "timeoutMs": 30000
+    "timeoutMs": 120000,
+    "feedbackEnabled": true
   },
 
   "mutation": {
@@ -144,6 +178,26 @@
       "evals": {
         "files": ["conductor-tasks.jsonl"]
       }
+    },
+    "skills": {
+      "target": {
+        "file": "~/.claude/commands/start.md",
+        "scope": "user",
+        "backup": true
+      },
+      "evolution": {
+        "mutationStrategies": [
+          "simplify",
+          "add_examples",
+          "rephrase",
+          "add_constraints",
+          "reduce_overengineering",
+          "add_self_check"
+        ]
+      },
+      "evals": {
+        "files": ["skill-tasks.jsonl"]
+      }
     }
   }
 }
diff --git a/scripts/gepa/eval-phases.js b/scripts/gepa/eval-phases.js
@@ -0,0 +1,197 @@
+#!/usr/bin/env node
+/**
+ * Phase-level eval harness for GEPA.
+ *
+ * Evaluates conductor prompt phase files against gold sets.
+ * Scores each phase independently. Used by GEPA auto-optimization
+ * to validate mutations before applying.
+ *
+ * Usage:
+ *   node eval-phases.js                    # eval all phases
+ *   node eval-phases.js --phase validate   # eval single phase
+ *   node eval-phases.js --json             # JSON output for CI
+ */
+
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import { homedir } from 'os';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const GOLD_DIR = path.join(__dirname, 'gold');
+const PROMPTS_DIR = path.join(
+  homedir(),
+  '.stackmemory',
+  'conductor',
+  'prompts'
+);
+
+const PHASES = ['understand', 'implement', 'validate', 'deliver'];
+
+// Parse args
+const phaseIdx = process.argv.indexOf('--phase');
+const targetPhase = phaseIdx !== -1 ? process.argv[phaseIdx + 1] : null;
+const jsonOutput = process.argv.includes('--json');
+
+/**
+ * Load gold set for a phase
+ */
+function loadGoldSet(phase) {
+  const goldPath = path.join(GOLD_DIR, `${phase}.jsonl`);
+  if (!fs.existsSync(goldPath)) return [];
+  return fs
+    .readFileSync(goldPath, 'utf-8')
+    .split('\n')
+    .filter(Boolean)
+    .map((l) => JSON.parse(l));
+}
+
+/**
+ * Score a phase prompt against its gold set using heuristic evaluation.
+ * This is a fast, offline eval (no LLM calls) based on outcome patterns.
+ *
+ * For LLM-judge evaluation, use the full GEPA optimize.js eval pipeline.
+ */
+function evalPhase(phase) {
+  const goldSet = loadGoldSet(phase);
+  if (goldSet.length === 0) {
+    return { phase, score: 0, total: 0, passed: 0, skipped: true };
+  }
+
+  const promptPath = path.join(PROMPTS_DIR, `${phase}.md`);
+  if (!fs.existsSync(promptPath)) {
+    return { phase, score: 0, total: goldSet.length, passed: 0, missing: true };
+  }
+
+  const prompt = fs.readFileSync(promptPath, 'utf-8');
+  let passed = 0;
+  const failures = [];
+
+  for (const entry of goldSet) {
+    const expected = entry.expected;
+    if (!expected) continue;
+
+    // Heuristic: check if the prompt addresses the failure patterns
+    let entryPassed = true;
+
+    switch (phase) {
+      case 'understand': {
+        // Check if prompt guides complexity assessment
+        if (expected.complexity === 'careful' && !prompt.includes('plan')) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'implement': {
+        // Check if prompt constrains scope
+        if (!expected.scopeKept && !prompt.includes('scope')) {
+          entryPassed = false;
+        }
+        // Check ESM import guidance
+        if (
+          entry.errorTail &&
+          /import|ESM/i.test(entry.errorTail) &&
+          !prompt.includes('.js')
+        ) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'validate': {
+        // Check if prompt covers the specific failure type
+        if (expected.retryStrategy === 'fix_lint' && !prompt.includes('lint')) {
+          entryPassed = false;
+        }
+        if (expected.retryStrategy === 'fix_test' && !prompt.includes('test')) {
+          entryPassed = false;
+        }
+        if (
+          expected.retryStrategy === 'fix_build' &&
+          !prompt.includes('build')
+        ) {
+          entryPassed = false;
+        }
+        // Check --no-verify prevention
+        if (!prompt.includes('no-verify') && !prompt.includes('--no-verify')) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'deliver': {
+        // Check commit format guidance
+        if (!prompt.includes('type(scope)') && !prompt.includes('commit')) {
+          entryPassed = false;
+        }
+        break;
+      }
+    }
+
+    if (entryPassed) {
+      passed++;
+    } else {
+      failures.push({
+        issue: entry.issue,
+        outcome: entry.outcome,
+        reason: `Prompt missing guidance for: ${JSON.stringify(expected)}`,
+      });
+    }
+  }
+
+  return {
+    phase,
+    score: goldSet.length > 0 ? passed / goldSet.length : 0,
+    total: goldSet.length,
+    passed,
+    failures: failures.slice(0, 5), // top 5 failures
+  };
+}
+
+// Main
+const phases = targetPhase ? [targetPhase] : PHASES;
+const results = phases.map(evalPhase);
+
+if (jsonOutput) {
+  console.log(JSON.stringify(results, null, 2));
+} else {
+  console.log('GEPA Phase Evaluation');
+  console.log('═'.repeat(50));
+
+  let totalScore = 0;
+  let totalPhases = 0;
+
+  for (const r of results) {
+    if (r.skipped) {
+      console.log(`  ${r.phase.padEnd(12)} — no gold set`);
+      continue;
+    }
+    if (r.missing) {
+      console.log(`  ${r.phase.padEnd(12)} — prompt file missing`);
+      continue;
+    }
+
+    const pct = (r.score * 100).toFixed(1);
+    const bar = '█'.repeat(Math.round(r.score * 20)).padEnd(20, '░');
+    const status = r.score >= 0.7 ? '✓' : r.score >= 0.4 ? '~' : '✗';
+    console.log(
+      `  ${status} ${r.phase.padEnd(12)} ${bar} ${pct}% (${r.passed}/${r.total})`
+    );
+
+    if (r.failures && r.failures.length > 0) {
+      for (const f of r.failures.slice(0, 3)) {
+        console.log(`    └ ${f.issue}: ${f.reason.slice(0, 80)}`);
+      }
+    }
+
+    totalScore += r.score;
+    totalPhases++;
+  }
+
+  if (totalPhases > 0) {
+    const avg = ((totalScore / totalPhases) * 100).toFixed(1);
+    console.log('─'.repeat(50));
+    console.log(`  Average: ${avg}%`);
+  }
+}