Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2bba656
feat(conductor): GitButler virtual branch mode for workspace management
Apr 8, 2026
6c756c7
fix(conductor): state filter + labels flatten for issue dispatch
Apr 8, 2026
48d1d68
fix(linear): flatten labels in getIssues response
Apr 8, 2026
fd74c33
GitButler Workspace Commit
gitbutler-client Apr 8, 2026
019e499
feat(cross-search): multi-database frame search across projects (STA-…
Apr 9, 2026
39c1b39
feat(shared-state): add canonical instance coordination
Apr 13, 2026
10db093
feat: add deterministic harness smoke tooling
Apr 14, 2026
6bf62e9
docs: add design principles architecture note
Apr 14, 2026
b6c3afb
chore: update gepa baselines and clean GitButler hooks
Apr 14, 2026
2f8ed5f
fix(conductor): harden lane mode cleanup
Apr 16, 2026
079b395
chore: reorganize root for clarity
Apr 16, 2026
dbe9856
fix(test): mock canonicalStateStore in session tests
Apr 17, 2026
b1ca885
chore: handoff checkpoint on chore/root-reorg
Apr 17, 2026
1a429a8
feat(gepa): phase-level prompt optimization with auto-targeting
Apr 18, 2026
19171e3
chore: handoff checkpoint on chore/root-reorg
Apr 19, 2026
9c3cb9e
feat(gepa): skill .md optimization with audit hook
Apr 19, 2026
acc477e
chore(gepa): update baseline generations with current CLAUDE.md
Apr 19, 2026
80bd13a
fix(gepa): judge CLI fallback + filter phase variants for skill targets
Apr 19, 2026
de7f021
feat(gepa): elitism, crossover, ASI feedback, eval cache, expanded evals
Apr 19, 2026
046113b
merge: resolve conflicts with origin/main
Apr 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .claude/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,16 @@
"command": "entire hooks claude-code stop"
}
]
},
{
"matcher": "",
"hooks": [
{
"type": "command",
"command": "node scripts/gepa/hooks/gepa-session-hook.js",
"async": true
}
]
}
],
"PreToolUse": [
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ scripts/gepa/results/scores.jsonl
scripts/gepa/state.json
scripts/gepa/results/
scripts/gepa/generations/
scripts/gepa/cache/

# Agent tool working dirs (untracked, per-tool scratch)
.ralph/
Expand Down
6 changes: 6 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,18 @@ For AUTOMATE and STANDARD tiers: make only the requested changes. Don't refactor
- Prioritizes: unfinished work > flagged issues > queued tasks > continuations
- Trigger: session start, "what's next", "whats next", between tasks

**`/learn`** — Run at session end to capture learnings:
- Reviews session work, then audits memory, CLAUDE.md, skills, scripts, and wiki
- Proposes creates/updates/deletes with confirmation before applying
- Trigger: end of session, after significant work, "what should I update"

**When to use which:**
- Starting a session or between tasks → `/next` (pick what to work on)
- Session producing wrong results → `/recover` (diagnose + fix now)
- Routine maintenance, nothing broken → `/update-docs` (proactive gardening)
- After publishing a new version → `/update-docs` (catch version/path drift)
- After conductor failures → `/recover last` (learn from agent traces)
- End of session → `/learn` (capture what changed, update artifacts)

## Workflow

Expand Down
632 changes: 632 additions & 0 deletions docs/prds/substrate-enterprise-brain.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@
"sync:start": "node scripts/background-sync-manager.js",
"sync:setup": "./scripts/setup-background-sync.sh",
"eval:cord": "npx tsx scripts/evals/cord-vs-flat-eval.ts",
"gepa:eval": "node scripts/gepa/eval-phases.js",
"gepa:eval:json": "node scripts/gepa/eval-phases.js --json",
"gepa:mine": "node scripts/gepa/gold/mine-traces.js",
"prepare": "echo 'Prepare step completed'",
"verify:dist": "node scripts/verify-dist.cjs",
"test:smoke-db": "bash scripts/smoke-init-db.sh",
Expand Down
26 changes: 23 additions & 3 deletions scripts/conductor/after-run.sh
Original file line number Diff line number Diff line change
@@ -1,22 +1,42 @@
#!/usr/bin/env bash
# Conductor after_run hook.
# Called after each agent attempt (success or failure). In order, it:
#   1. Captures context from the agent run, tagged with the issue identifier
#      and the attempt number
#   2. Triggers the GEPA session hook (accumulates toward auto-optimization)
#   3. Triggers DSPy optimization every 50 runs
#
# Environment (every variable is optional; defaults are applied below):
#   SYMPHONY_WORKSPACE_DIR     directory to run in (default: current directory)
#   SYMPHONY_ISSUE_IDENTIFIER  preferred issue tag
#   SYMPHONY_ISSUE_ID          fallback issue tag (default: "unknown")
#   SYMPHONY_ATTEMPT           attempt number (default: 1)
set -euo pipefail

WORKSPACE="${SYMPHONY_WORKSPACE_DIR:-$(pwd)}"
# Prefer the issue identifier, then the issue id, then the literal "unknown".
ISSUE_ID="${SYMPHONY_ISSUE_IDENTIFIER:-${SYMPHONY_ISSUE_ID:-unknown}}"
ATTEMPT="${SYMPHONY_ATTEMPT:-1}"
# Resolve paths from this script's own location (before the cd below), so the
# hooks are found regardless of the workspace. PROJECT_ROOT is two levels up.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

cd "$WORKSPACE"

# 1. Capture context from this run, tagged with issue ID and attempt number.
#    Best-effort: stderr is discarded and a non-zero exit is ignored so that a
#    capture failure does not abort the hook under `set -e`.
stackmemory conductor capture \
--issue "$ISSUE_ID" \
--workspace "$WORKSPACE" \
--attempt "$ATTEMPT" \
2>/dev/null || true

# Printed unconditionally, i.e. also when the capture command above failed.
echo "[conductor] Context captured for $ISSUE_ID (attempt $ATTEMPT)"

# 2. Trigger GEPA session hook (accumulates sessions, auto-optimizes at threshold).
#    Skipped when the hook file is absent; otherwise started in the background
#    with stderr discarded, so this script does not wait for it.
GEPA_HOOK="$PROJECT_ROOT/scripts/gepa/hooks/gepa-session-hook.js"
if [ -f "$GEPA_HOOK" ]; then
node "$GEPA_HOOK" 2>/dev/null &
fi

# 3. Trigger DSPy optimization every 50 agent runs.
#    The run count is the number of lines in outcomes.jsonl; optimization starts
#    only when that count is a positive multiple of 50.
#    NOTE(review): presumably outcomes.jsonl gains exactly one line per run; if
#    it does not, this can fire repeatedly (or never) at a given count - confirm.
OUTCOMES_PATH="$HOME/.stackmemory/conductor/outcomes.jsonl"
DSPY_OPTIMIZE="$PROJECT_ROOT/scripts/dspy/optimize.py"
if [ -f "$OUTCOMES_PATH" ] && [ -f "$DSPY_OPTIMIZE" ]; then
OUTCOMES_COUNT=$(wc -l < "$OUTCOMES_PATH" 2>/dev/null || echo 0)
if [ $((OUTCOMES_COUNT % 50)) -eq 0 ] && [ "$OUTCOMES_COUNT" -gt 0 ]; then
echo "[conductor] Triggering DSPy optimization (${OUTCOMES_COUNT} runs)"
# Detached with nohup and all output discarded; runs in the background.
nohup python3 "$DSPY_OPTIMIZE" --quiet >/dev/null 2>&1 &
fi
fi
60 changes: 57 additions & 3 deletions scripts/gepa/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,43 @@
"file": "CLAUDE.md",
"evals": ["stackmemory-tasks.jsonl"],
"description": "StackMemory project prompt"
},
{
"name": "skill:start",
"file": "~/.claude/commands/start.md",
"evals": ["skill-tasks.jsonl"],
"description": "Session boot skill"
},
{
"name": "skill:stop",
"file": "~/.claude/commands/stop.md",
"evals": ["skill-tasks.jsonl"],
"description": "Session close skill"
},
{
"name": "skill:learn",
"file": "~/.claude/commands/learn.md",
"evals": ["skill-tasks.jsonl"],
"description": "Session review + artifact update skill"
},
{
"name": "skill:next",
"file": "~/.claude/commands/next.md",
"evals": ["skill-tasks.jsonl"],
"description": "Next action recommendation skill"
},
{
"name": "skill:summary",
"file": "~/.claude/commands/summary.md",
"evals": ["skill-tasks.jsonl"],
"description": "Session summary skill"
}
],

"evolution": {
"populationSize": 4,
"populationSize": 8,
"crossoverCount": 2,
"elitism": true,
"generations": 10,
"selectionRate": 0.5,
"selfReview": true,
Expand All @@ -58,8 +90,9 @@

"evals": {
"directory": "./evals",
"minSamplesPerVariant": 8,
"minSamplesPerVariant": 25,
"timeout": 120000,
"heldOutPartition": true,
"metrics": [
"task_completion",
"code_quality",
Expand All @@ -73,7 +106,8 @@
"judge": {
"model": "claude-haiku-4-5-20251001",
"maxOutputTokens": 2000,
"timeoutMs": 30000
"timeoutMs": 120000,
"feedbackEnabled": true
},

"mutation": {
Expand Down Expand Up @@ -144,6 +178,26 @@
"evals": {
"files": ["conductor-tasks.jsonl"]
}
},
"skills": {
"target": {
"file": "~/.claude/commands/start.md",
"scope": "user",
"backup": true
},
"evolution": {
"mutationStrategies": [
"simplify",
"add_examples",
"rephrase",
"add_constraints",
"reduce_overengineering",
"add_self_check"
]
},
"evals": {
"files": ["skill-tasks.jsonl"]
}
}
}
}
197 changes: 197 additions & 0 deletions scripts/gepa/eval-phases.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env node
/**
* Phase-level eval harness for GEPA.
*
* Evaluates conductor prompt phase files against gold sets.
* Scores each phase independently. Used by GEPA auto-optimization
* to validate mutations before applying.
*
* Usage:
* node eval-phases.js # eval all phases
* node eval-phases.js --phase validate # eval single phase
* node eval-phases.js --json # JSON output for CI
*/

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { homedir } from 'os';

// Directory containing this script (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Gold sets are read from the `gold` directory next to this script.
const GOLD_DIR = path.join(__dirname, 'gold');

// Phase prompt files are read from ~/.stackmemory/conductor/prompts.
const PROMPTS_DIR = path.join(homedir(), '.stackmemory', 'conductor', 'prompts');

// Phases evaluated, in this order, when no --phase flag is given.
const PHASES = ['understand', 'implement', 'validate', 'deliver'];

// Parse args: `--phase <name>` selects a single phase (the argument that
// follows the flag); `--json` switches the report to JSON.
const phaseIdx = process.argv.indexOf('--phase');
const targetPhase = phaseIdx === -1 ? null : process.argv[phaseIdx + 1];
const jsonOutput = process.argv.some((arg) => arg === '--json');

/**
 * Load the gold set for a phase from `<GOLD_DIR>/<phase>.jsonl`.
 *
 * The file is read as JSON Lines: one JSON value per line. Blank and
 * whitespace-only lines are ignored, including the lone `\r` that a blank
 * line leaves behind in a file with CRLF line endings.
 *
 * @param {string} phase - Phase name; used as the base name of the gold file.
 * @returns {Array<*>} Parsed entries in file order, or `[]` when the file
 *   does not exist.
 * @throws {SyntaxError} When a non-blank line is not valid JSON. The message
 *   names the file and the 1-based line number; the parser's own error is
 *   attached as `cause`.
 */
function loadGoldSet(phase) {
  const goldPath = path.join(GOLD_DIR, `${phase}.jsonl`);
  if (!fs.existsSync(goldPath)) return [];

  const entries = [];
  const lines = fs.readFileSync(goldPath, 'utf-8').split('\n');

  lines.forEach((rawLine, index) => {
    // Trim first so "\r" and indentation-only lines count as blank.
    const line = rawLine.trim();
    if (line === '') return;

    try {
      entries.push(JSON.parse(line));
    } catch (err) {
      throw new SyntaxError(
        `Invalid JSON in ${goldPath} at line ${index + 1}: ${err.message}`,
        { cause: err }
      );
    }
  });

  return entries;
}

// Maps a gold entry's `expected.retryStrategy` to the keyword the validate
// prompt must contain for that failure type to count as covered.
const RETRY_STRATEGY_KEYWORDS = new Map([
  ['fix_lint', 'lint'],
  ['fix_test', 'test'],
  ['fix_build', 'build'],
]);

/**
 * Heuristically decide whether a phase prompt addresses one gold entry.
 * All checks are case-sensitive substring tests against the prompt text.
 *
 * @param {string} phase - Phase name; phases without a rule always pass.
 * @param {string} prompt - Full text of the phase prompt file.
 * @param {object} entry - Gold entry; `entry.expected` must be set.
 * @returns {boolean} `true` when no check for this phase fails.
 */
function promptAddressesEntry(phase, prompt, entry) {
  const { expected, errorTail } = entry;

  switch (phase) {
    case 'understand':
      // Entries expecting "careful" complexity need a prompt containing `plan`.
      return expected.complexity !== 'careful' || prompt.includes('plan');

    case 'implement': {
      // Entries where scope was not kept need a prompt containing `scope`.
      if (!expected.scopeKept && !prompt.includes('scope')) return false;
      // Entries whose error tail mentions import/ESM need a prompt
      // containing `.js`.
      if (
        errorTail &&
        /import|ESM/i.test(errorTail) &&
        !prompt.includes('.js')
      ) {
        return false;
      }
      return true;
    }

    case 'validate': {
      // The prompt must contain the keyword for the entry's retry strategy.
      const keyword = RETRY_STRATEGY_KEYWORDS.get(expected.retryStrategy);
      if (keyword !== undefined && !prompt.includes(keyword)) return false;
      // Every validate entry also requires `no-verify` in the prompt; this
      // substring match covers `--no-verify` as well.
      return prompt.includes('no-verify');
    }

    case 'deliver':
      // The prompt must contain either `type(scope)` or `commit`.
      return prompt.includes('type(scope)') || prompt.includes('commit');

    default:
      return true;
  }
}

/**
 * Score a phase prompt against its gold set using heuristic evaluation.
 * This is a fast, offline eval (no LLM calls) based on outcome patterns.
 *
 * For LLM-judge evaluation, use the full GEPA optimize.js eval pipeline.
 *
 * Only entries that carry an `expected` label are scored. Entries without
 * one are counted in `unlabeled` and excluded from `total` and `score`, so
 * that `score === passed / total` always holds and unlabeled entries do not
 * silently count as failures.
 *
 * @param {string} phase - Phase name; selects `<phase>.jsonl` and `<phase>.md`.
 * @returns {object} One of:
 *   - `{ phase, score: 0, total: 0, passed: 0, skipped: true }` when the gold
 *     set is empty or absent;
 *   - `{ phase, score: 0, total, passed: 0, missing: true }` when the prompt
 *     file does not exist (`total` is the gold set size);
 *   - `{ phase, score, total, passed, unlabeled, failures }` otherwise, where
 *     `total` is the number of scored entries, `score` is `passed / total`
 *     (0 when nothing was scored) and `failures` holds at most 5 items.
 */
function evalPhase(phase) {
  const goldSet = loadGoldSet(phase);
  if (goldSet.length === 0) {
    return { phase, score: 0, total: 0, passed: 0, skipped: true };
  }

  const promptPath = path.join(PROMPTS_DIR, `${phase}.md`);
  if (!fs.existsSync(promptPath)) {
    return { phase, score: 0, total: goldSet.length, passed: 0, missing: true };
  }

  const prompt = fs.readFileSync(promptPath, 'utf-8');
  let scored = 0;
  let passed = 0;
  const failures = [];

  for (const entry of goldSet) {
    // `?.` also tolerates a literal `null` line in the gold file.
    const expected = entry?.expected;
    if (!expected) continue;
    scored++;

    if (promptAddressesEntry(phase, prompt, entry)) {
      passed++;
    } else {
      failures.push({
        issue: entry.issue,
        outcome: entry.outcome,
        reason: `Prompt missing guidance for: ${JSON.stringify(expected)}`,
      });
    }
  }

  return {
    phase,
    score: scored > 0 ? passed / scored : 0,
    total: scored,
    passed,
    unlabeled: goldSet.length - scored,
    failures: failures.slice(0, 5), // first 5 failures in gold-set order
  };
}

// Main: evaluate the requested phase (or every known phase), then report.
const phases = targetPhase ? [targetPhase] : PHASES;
const results = phases.map((phaseName) => evalPhase(phaseName));

/**
 * Pick the status glyph for a score: >= 0.7 passes, >= 0.4 is borderline,
 * anything else fails.
 */
function statusGlyph(score) {
  if (score >= 0.7) return '✓';
  if (score >= 0.4) return '~';
  return '✗';
}

/**
 * Print one scored phase: a status line with a 20-cell bar, followed by at
 * most three failure lines whose reasons are cut to 80 characters.
 */
function printScoredPhase(r) {
  const pct = (r.score * 100).toFixed(1);
  const filledCells = Math.round(r.score * 20);
  const bar = '█'.repeat(filledCells).padEnd(20, '░');
  console.log(
    ` ${statusGlyph(r.score)} ${r.phase.padEnd(12)} ${bar} ${pct}% (${r.passed}/${r.total})`
  );

  const shown = (r.failures || []).slice(0, 3);
  shown.forEach((f) => {
    console.log(` └ ${f.issue}: ${f.reason.slice(0, 80)}`);
  });
}

if (jsonOutput) {
  console.log(JSON.stringify(results, null, 2));
} else {
  console.log('GEPA Phase Evaluation');
  console.log('═'.repeat(50));

  // Scores of phases that were actually evaluated; skipped and missing
  // phases are reported but left out of the average.
  const scores = [];

  for (const r of results) {
    if (r.skipped) {
      console.log(` ${r.phase.padEnd(12)} — no gold set`);
    } else if (r.missing) {
      console.log(` ${r.phase.padEnd(12)} — prompt file missing`);
    } else {
      printScoredPhase(r);
      scores.push(r.score);
    }
  }

  if (scores.length > 0) {
    const sum = scores.reduce((acc, s) => acc + s, 0);
    const avg = ((sum / scores.length) * 100).toFixed(1);
    console.log('─'.repeat(50));
    console.log(` Average: ${avg}%`);
  }
}
Loading
Loading