diff --git a/.claude/settings.json b/.claude/settings.json
index 045c2ad2..ec246ea7 100644
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -52,6 +52,16 @@
           "command": "entire hooks claude-code stop"
         }
       ]
+    },
+    {
+      "matcher": "",
+      "hooks": [
+        {
+          "type": "command",
+          "command": "node scripts/gepa/hooks/gepa-session-hook.js",
+          "async": true
+        }
+      ]
     }
   ],
   "PreToolUse": [
diff --git a/.gitignore b/.gitignore
index 9b63e497..939a9eaf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -135,6 +135,7 @@
 scripts/gepa/results/scores.jsonl
 scripts/gepa/state.json
 scripts/gepa/results/
 scripts/gepa/generations/
+scripts/gepa/cache/
 
 # Agent tool working dirs (untracked, per-tool scratch)
 .ralph/
diff --git a/CLAUDE.md b/CLAUDE.md
index b517d706..42ef6c44 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -258,12 +258,18 @@ For AUTOMATE and STANDARD tiers: make only the requested changes. Don't refactor
 - Prioritizes: unfinished work > flagged issues > queued tasks > continuations
 - Trigger: session start, "what's next", "whats next", between tasks
 
+**`/learn`** — Run at session end to capture learnings:
+- Reviews session work, then audits memory, CLAUDE.md, skills, scripts, and wiki
+- Proposes creates/updates/deletes with confirmation before applying
+- Trigger: end of session, after significant work, "what should I update"
+
 **When to use which:**
 - Starting a session or between tasks → `/next` (pick what to work on)
 - Session producing wrong results → `/recover` (diagnose + fix now)
 - Routine maintenance, nothing broken → `/update-docs` (proactive gardening)
 - After publishing a new version → `/update-docs` (catch version/path drift)
 - After conductor failures → `/recover last` (learn from agent traces)
+- End of session → `/learn` (capture what changed, update artifacts)
 
 ## Workflow
diff --git a/docs/prds/substrate-enterprise-brain.md b/docs/prds/substrate-enterprise-brain.md
new file mode 100644
index 00000000..d2424a21
--- /dev/null
+++ b/docs/prds/substrate-enterprise-brain.md
@@ -0,0 +1,632 @@
+# PRD: Substrate — Enterprise Knowledge Brain
+
+**Status:** Draft
+**Author:** Jonathan Wu
+**Date:** 2026-04-17
+**Codename:** Croissant
+**Version:** v1.0
+
+---
+
+## 1. Problem & Evidence
+
+### The problem
+
+AI coding tools have solved context for engineers because all context lives in a git repo. For knowledge workers — product managers, marketers, sales teams, executives — context is fragmented across 5-15 SaaS tools. There is no "git repo for knowledge."
+
+Today's landscape:
+- **Transcripts** live in Granola/Otter. **Documents** in Notion/Confluence. **Customer data** in HubSpot. **Tasks** in Linear/Jira. **Conversations** in Slack. **Code decisions** in GitHub.
+- No system connects dots across these sources automatically.
+- AI tools (Glean, Notion AI) sit on top of individual silos — they search within a tool, not across the organization's full knowledge surface.
+- When an AI agent needs organizational context to complete a task, it doesn't exist in a queryable, structured form.
+
+### Evidence
+
+- Engineering teams using StackMemory's conductor report 70%+ of agent failures stem from missing organizational context (what was decided, why, by whom).
+- Provenant's decision-tracking prototype (packages/provenant/) demonstrated that cross-source ingestion + confidence scoring produces actionable knowledge — but it's scoped to "decisions" only and lacks a user-facing product.
+- The enterprise "AI readiness" conversation has shifted from "do we have data?" to "can AI access and reason over our data?" — this is the gap.
+
+### Why now
+
+- The MCP protocol standardizes adapter interfaces — 7/8 target data sources have official MCP servers (per THEORY.MD: "standardize the intersection, expose the union").
+- Cloudflare Agents SDK + D1 provides zero-ops distributed SQLite — no Postgres migration needed (per THEORY.MD: "SQLite over Postgres for local").
+- StackMemory's conductor, scoring pipeline, and wiki compiler prove the core technical approaches work at production quality.
+
+---
+
+## 2. Goals / Non-Goals
+
+### Goals
+
+| # | Goal | Measurable target |
+|---|------|-------------------|
+| G1 | Time to value under 5 minutes | Install → connect 2 sources → cross-source query < 5 min |
+| G2 | Cross-source knowledge retrieval | ≥ 30% of queries cite 2+ sources within first week |
+| G3 | Daily active use | Day-7 return rate ≥ 40% |
+| G4 | Team adoption | Second user on same team within 14 days |
+| G5 | Revenue | First paying Cloud Team customer within 60 days of launch |
+
+### Non-goals (v1)
+
+- OAuth connector flows (v1.5 — paid tier differentiator)
+- Cloudflare-hosted Brain instances (v2)
+- Federated team access / org-level rollup (v2)
+- Autonomous agent execution using the Brain (v3)
+- Stripe metering / billing infrastructure (v2)
+- GDPR compliance / data residency controls
+- Mobile or web-only client
+- Multi-language support
+
+---
+
+## 3. Users & Jobs-to-Be-Done
+
+### Primary persona: Engineering Team Lead
+
+**Context:** Manages 3-8 engineers. Uses Linear for tasks, GitHub for code, Slack for communication. Makes 10-20 decisions per week that are never captured in a queryable form.
+
+**Jobs:**
+- "When I start my day, I want to know what happened overnight across all my tools without checking each one."
+- "When a new engineer asks 'why did we build it this way?', I want to point them at the Brain instead of spending 30 minutes in Slack search."
+- "When planning a sprint, I want to see what's blocked, what's decided, and what's still open — across Linear, GitHub, and Slack — in one view."
+
+### Secondary persona (v2+): Product Manager
+
+**Context:** Uses Notion for specs, Linear for tracking, Slack for stakeholder comms, HubSpot for customer feedback.
+
+**Jobs:**
+- "When writing a PRD, I want the Brain to surface related past decisions, customer feedback, and technical constraints."
+- "When asked 'why did we prioritize X?', I want a cited answer, not my memory."
+
+### Excluded (v1): C-suite, sales, marketing, non-technical operators
+
+---
+
+## 4. Solution Overview
+
+### Product: Substrate
+
+An Electron desktop app that auto-indexes enterprise knowledge from connected sources into a queryable Brain. Users connect data sources, the Brain ingests and organizes knowledge, and a chat interface provides instant, cited answers.
+
+### Three components
+
+```
+Provenance (connectors) --> Cortex (brain) --> Substrate (app)
+  adapters/fetch/dedup      graph/score/query   Electron/UI/control
+```
+
+| Component | Package | License | Purpose |
+|-----------|---------|---------|---------|
+| **Cortex** | `@stackmemoryai/cortex` | BSL | Knowledge graph, confidence scoring, query engine, compaction |
+| **Provenance** | `@stackmemoryai/provenant` | BSL | Connector adapters, MCP orchestration, delta sync, dedup |
+| **Substrate** | `@stackmemoryai/substrate` | Private | Electron app, CF runtime, billing, team management |
+| **Types** | `@stackmemoryai/types` | BSL | Shared interfaces between packages |
+
+### Why this decomposition
+
+Provenant was a monolith handling ingest + score + store + query + resolve. For a product:
+
+- **Connectors are commodity** (every iPaaS does this) — keep them in Provenance
+- **The graph + scoring + query + compaction is the moat** — that's Cortex
+- Teams can add custom adapters without touching Brain internals
+- CF architecture maps cleanly: adapters = Workers, Brain = Durable Object
+
+> Per THEORY.MD: "Standardize the intersection, expose the union" — MCP is the standardized intersection; Cortex's scoring/compaction is the exposed union.
+
+---
+
+## 5. Architecture & Data Model
+
+### 5.1 Multi-repo structure
+
+```
+stackmemoryai/cortex        OSS (BSL)   Knowledge graph + query engine
+stackmemoryai/provenant     OSS (BSL)   Connector adapters + MCP orchestration
+stackmemoryai/substrate     Private     Electron app + CF runtime
+stackmemoryai/types         OSS (BSL)   Shared TypeScript interfaces
+stackmemoryai/stackmemory   OSS (BSL)   Existing CLI (depends on cortex + provenant)
+```
+
+**Why multi-repo over monorepo:**
+
+- Forced clean interfaces (no leaking shared state)
+- Independent deploy cycles (ship Cortex without touching Provenance)
+- CF Wrangler expects its own repo root
+- Clear open-source boundary (public repos vs private)
+- Parallel contributors without PR queue bottleneck
+
+### 5.2 Cortex schema (v1, reviewed 2026-04-17)
+
+Adapted from Provenant's 9-table schema, with two critical review passes applied.
+
+**Design decisions:**
+
+- `INTEGER PRIMARY KEY` (rowid alias) for internal references — TEXT UUIDs cause B-tree fragmentation at scale
+- UUID kept as `id TEXT UNIQUE` for API/external use
+- FTS5 external content table with explicit triggers — no silent desyncs
+- Append-only versioning with `is_latest` partial index for fast current-version lookups
+- `dependency_index` dropped — use a recursive CTE at query time (O(n^2) pre-computation doesn't scale)
+- Top queryable fields (`priority`, `state`, `labels`, `assignee`) as real columns, not buried in JSON
+- `workspace_id` deferred to v2 migration — YAGNI, avoids false confidence from an unfiltered column
+
+```sql
+CREATE TABLE schema_version (version INTEGER PRIMARY KEY);
+INSERT INTO schema_version VALUES (1);
+
+CREATE TABLE knowledge (
+  rowid INTEGER PRIMARY KEY,
+  id TEXT NOT NULL UNIQUE,      -- UUID for API/external reference
+  type TEXT NOT NULL,           -- free-form: 'decision' | 'document' | 'conversation' | 'ticket' | ...
+  content TEXT NOT NULL,
+  summary TEXT,                 -- LLM-generated for long content
+  actor TEXT,
+  confidence REAL DEFAULT 0.5,
+  source_system TEXT NOT NULL,
+  source_id TEXT,
+  source_hash TEXT,             -- dedup / change detection
+  raw_payload TEXT,             -- archival, never queried directly
+  priority INTEGER,             -- 0-4, standardized across sources
+  state TEXT,                   -- 'open' | 'closed' | 'merged' | 'resolved'
+  labels TEXT,                  -- JSON array: ["auth", "backend"]
+  assignee TEXT,
+  metadata TEXT DEFAULT '{}',   -- truly dynamic fields only
+  embedding BLOB,
+  embedding_model TEXT,         -- 'voyage-3' | 'text-embedding-3-small' | null
+  version INTEGER DEFAULT 1,
+  is_latest INTEGER DEFAULT 1,  -- 1 = current, 0 = historical
+  thread_id TEXT,               -- flat thread grouping
+  parent_id INTEGER,            -- direct parent (conversations, doc sections)
+  created_at INTEGER NOT NULL,
+  updated_at INTEGER NOT NULL,
+  ingested_at INTEGER NOT NULL,
+  FOREIGN KEY (parent_id) REFERENCES knowledge(rowid)
+);
+
+CREATE INDEX idx_knowledge_source ON knowledge(source_system, source_id);
+CREATE INDEX idx_knowledge_latest ON knowledge(source_system, source_id) WHERE is_latest = 1;
+CREATE UNIQUE INDEX idx_knowledge_source_version ON knowledge(source_system, source_id, version);
+CREATE INDEX idx_knowledge_thread ON knowledge(thread_id);
+CREATE INDEX idx_knowledge_type ON knowledge(type);
+CREATE INDEX idx_knowledge_state ON knowledge(state);
+CREATE INDEX idx_knowledge_created ON knowledge(created_at);
+
+CREATE VIRTUAL TABLE knowledge_fts USING fts5(
+  content, summary, actor,
+  content=knowledge, content_rowid=rowid
+);
+
+CREATE TRIGGER knowledge_ai AFTER INSERT ON knowledge BEGIN
+  INSERT INTO knowledge_fts(rowid, content, summary, actor)
+  VALUES (new.rowid, new.content, new.summary, new.actor);
+END;
+CREATE TRIGGER knowledge_ad AFTER DELETE ON knowledge BEGIN
+  INSERT INTO knowledge_fts(knowledge_fts, rowid, content, summary, actor)
+  VALUES ('delete', old.rowid, old.content, old.summary, old.actor);
+END;
+CREATE TRIGGER knowledge_au AFTER UPDATE ON knowledge BEGIN
+  INSERT INTO knowledge_fts(knowledge_fts, rowid, content, summary, actor)
+  VALUES ('delete', old.rowid, old.content, old.summary, old.actor);
+  INSERT INTO knowledge_fts(rowid, content, summary, actor)
+  VALUES (new.rowid, new.content, new.summary, new.actor);
+END;
+
+CREATE TABLE edges (
+  rowid INTEGER PRIMARY KEY,
+  id TEXT NOT NULL UNIQUE,
+  from_id INTEGER NOT NULL,
+  to_id INTEGER NOT NULL,
+  rel_type TEXT NOT NULL,
+  confidence REAL DEFAULT 0.5,
+  version INTEGER DEFAULT 1,
+  created_at INTEGER NOT NULL,
+  FOREIGN KEY (from_id) REFERENCES knowledge(rowid),
+  FOREIGN KEY (to_id) REFERENCES knowledge(rowid)
+);
+
+CREATE INDEX idx_edges_from_rel ON edges(from_id, rel_type);
+CREATE INDEX idx_edges_to_rel ON edges(to_id, rel_type);
+
+CREATE TABLE sources (
+  id TEXT PRIMARY KEY,
+  system TEXT NOT NULL UNIQUE,
+  auth_type TEXT NOT NULL,
+  config TEXT,
+  sync_cursor TEXT,             -- opaque, adapter-owned
+  sync_config TEXT,             -- JSON: which repos/channels/etc to sync
+  last_sync_at INTEGER,
+  last_sync_status TEXT,
+  last_sync_error TEXT,
+  node_count INTEGER DEFAULT 0,
+  created_at INTEGER NOT NULL
+);
+
+CREATE TABLE rejection_log (
+  id TEXT PRIMARY KEY,
+  knowledge_id INTEGER NOT NULL,
+  reason TEXT,
+  actor TEXT,
+  created_at INTEGER NOT NULL,
+  FOREIGN KEY (knowledge_id) REFERENCES knowledge(rowid)
+);
+
+-- Retained from Provenant
+CREATE TABLE review_queue (...);    -- low-confidence items pending human review
+CREATE TABLE contradictions (...); -- conflicting knowledge nodes
+CREATE TABLE stale_flags (...);    -- nodes whose source data changed
+```
+
+**Key differences from Provenant:**
+
+- `nodes` → `knowledge` (general, not decision-scoped)
+- Added `parent_id` for conversation threading / document hierarchy
+- Added `summary` for long-content compression
+- Added `source_system` + `source_id` directly on knowledge (denormalized for query speed)
+- Added `sources` table for connection management
+- Removed the `dependency_index` transitive-closure table — graph traversal is a recursive CTE at query time
+- Append-only model: updates create new versions, old versions retained
+
+> Per THEORY.MD: "SQLite over Postgres for local: zero-config, file-based, FTS5 built-in."
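
Since `dependency_index` is gone, traversal happens at query time. Below is a minimal sketch against the `edges` table; the `depends_on` relation type and the depth cap are illustrative assumptions, not fixed schema values.

```sql
-- Sketch: query-time transitive traversal in place of dependency_index.
-- :root is the starting knowledge.rowid.
WITH RECURSIVE deps(id, depth) AS (
  SELECT e.to_id, 1
  FROM edges e
  WHERE e.from_id = :root AND e.rel_type = 'depends_on'
  UNION
  SELECT e.to_id, d.depth + 1
  FROM edges e
  JOIN deps d ON e.from_id = d.id
  WHERE d.depth < 5   -- cap to guard against cycles
)
SELECT k.* FROM knowledge k JOIN deps d ON k.rowid = d.id;
```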
+
+### 5.3 Connector strategy
+
+**v1: API key connectors (OSS)**
+- User pastes API key in Provenance settings tab
+- Credentials encrypted via Electron `safeStorage` (OS keychain)
+- Keys never leave the machine
+- Supported: Linear (API key), GitHub (PAT)
+
+**v1.5: OAuth connectors (paid)**
+- Nango frontend SDK triggers OAuth popup in Electron `BrowserWindow`
+- Nango cloud manages token storage, refresh, revocation
+- Upsell trigger: "Want to connect Slack/Notion/Google? Upgrade."
+- Supported: Slack, GitHub (full OAuth), Notion, Google Drive, HubSpot, Confluence
+
+**Adapter interface: MCP protocol**
+- 7/8 target sources have official MCP servers
+- Provenance spawns MCP servers with credentials injected as env vars
+- Calls MCP tools to fetch data, normalizes responses into Cortex schema
+- Delta sync via `since` timestamps, hash-based dedup
+
+```typescript
+// @stackmemoryai/types — adapter contract
+interface ConnectorAdapter {
+  system: string;                                // 'linear' | 'slack' | ...
+  authType: 'api_key' | 'oauth';
+  fetch(since: Date): AsyncIterable<RawRecord>;  // delta sync
+  normalize(record: RawRecord): KnowledgeNode;   // → Cortex schema
+  healthCheck(): Promise<boolean>;               // return type assumed
+}
+
+interface RawRecord {
+  id: string;
+  system: string;
+  type: string;
+  content: string;
+  actor?: string;
+  timestamp: number;
+  raw: unknown;                                  // original payload
+  hash: string;                                  // for dedup
+}
+```
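
To make the contract concrete, here is a minimal sketch of how Provenance might drive one adapter through a delta sync. The `ingest` callback is an assumption standing in for the Cortex ingestion API; only the interfaces above are defined by the PRD.

```typescript
import type { ConnectorAdapter, KnowledgeNode } from '@stackmemoryai/types';

// Hypothetical driver loop; real Provenance orchestration may differ.
async function syncOnce(
  adapter: ConnectorAdapter,
  since: Date,
  ingest: (node: KnowledgeNode) => Promise<void> // supplied by Cortex
): Promise<number> {
  const seen = new Set<string>(); // hash-based dedup within one run
  let count = 0;
  for await (const record of adapter.fetch(since)) {
    if (seen.has(record.hash)) continue;     // skip duplicate payloads
    seen.add(record.hash);
    await ingest(adapter.normalize(record)); // normalize → Cortex schema
    count++;
  }
  return count;
}
```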
+
+### 5.4 Cloud architecture (v2)
+
+```
+CF Agent (Durable Object)      ← Brain: always-on, SQLite/D1, WebSocket
+ |-- CF Worker (V8 isolate)    ← Fast: queries, API calls, routing
+ |-- CF Container (Docker)     ← Heavy: git clone, builds, agent runs
+ '-- CF Sandbox                ← Untrusted: user code, shell (v3)
+```
+
+- Uses CF Agents SDK (`agents` npm) — native DO persistence, hibernation (zero idle cost), MCP support, built-in metering
+- Each team's Brain = a Durable Object with D1 SQLite
+- Workers handle lightweight adapter fetches and query routing
+- Containers for heavy compute (agent execution in v3)
+
+> Per THEORY.MD: "Hooks over daemons for capture" — adapters fire on schedule or webhook, not as long-running polling daemons.
+
+---
+
+## 6. Detailed Requirements
+
+### 6.1 Cortex core
+
+| ID | Requirement | Priority | Notes |
+|----|-------------|----------|-------|
+| C1 | Ingest normalized records from Provenance adapters | P0 | Hash-based dedup, append-only versioning |
+| C2 | Confidence scoring pipeline | P0 | Pluggable signal model per source type. Thresholds: auto-accept ≥0.7, review 0.4-0.69, discard <0.4 |
+| C3 | Keyword search (FTS5 BM25) | P0 | Full-text search on content + summary fields |
+| C4 | LLM query synthesis with streaming | P0 | SSE streaming, Claude API, cite source nodes |
+| C5 | Progressive query response | P0 | Instant: indexed results. Stream: LLM synthesis. Background: deep analysis as task |
+| C6 | Edge creation (auto-detected relationships) | P1 | Derive edges from shared entities, temporal proximity, content similarity |
+| C7 | Stale flag propagation | P1 | When source hash changes, mark downstream nodes |
+| C8 | Contradiction detection | P1 | Flag when two nodes make conflicting claims |
+| C9 | Embedding-based semantic search | P2 | Optional, behind feature flag. Voyage AI or OpenAI embeddings |
+| C10 | Temporal queries ("as of March 1st") | P2 | Query knowledge state at a point in time |
+| C11 | Compaction / decay | P2 | Merge duplicate nodes, decay stale knowledge over time |
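
The C2 thresholds translate directly into a routing function; the numbers below come straight from the table above, and the disposition names are illustrative.

```typescript
// C2 confidence routing; thresholds from the requirements table.
type Disposition = 'accept' | 'review' | 'discard';

function routeByConfidence(confidence: number): Disposition {
  if (confidence >= 0.7) return 'accept';  // auto-accept
  if (confidence >= 0.4) return 'review';  // review queue (0.4-0.69)
  return 'discard';                        // below 0.4
}
```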
+
+### 6.2 Provenance connectors
+
+| ID | Requirement | Priority | Notes |
+|----|-------------|----------|-------|
+| P1 | Linear adapter (API key) | P0 | Issues, comments, labels, assignees. Delta sync. |
+| P2 | GitHub adapter (PAT) | P0 | PRs, issues, commits, reviews. Delta sync. |
+| P3 | MCP server spawning | P0 | Spawn official MCP servers with credential env vars |
+| P4 | Adapter health check | P0 | Report sync status, last sync time, error count |
+| P5 | Independent failure resilience | P0 | Each adapter fails/retries independently. Others continue. |
+| P6 | Slack adapter (OAuth) | P1 | v1.5, paid tier. Channels, threads, reactions. |
+| P7 | Notion adapter (OAuth) | P2 | v1.5, paid tier. Pages, databases, blocks. |
+| P8 | Google Drive adapter (OAuth) | P2 | v1.5, paid tier. Docs, sheets, slides. |
+
+### 6.3 Electron app (Substrate)
+
+| ID | Requirement | Priority | Notes |
+|----|-------------|----------|-------|
+| S1 | Cortex chat panel (left tab) | P0 | HexStyleChat base + SSE streaming. Branded "Cortex." |
+| S2 | Provenance settings (tab) | P0 | API key input, connector status, sync controls |
+| S3 | Onboarding flow | P0 | First-launch: connect source → ingest → first query |
+| S4 | Suggestion pills (empty state) | P0 | "What's the team working on?", "Recent decisions", etc. |
+| S5 | Task panel (right side) | P0 | Background deep analysis tasks with status |
+| S6 | Agent control mode (existing) | P0 | Keep existing tmux agent management, terminal, Linear |
+| S7 | Knowledge health dashboard | P1 | Node counts, staleness, source distribution |
+| S8 | Cross-source citation display | P0 | Show which sources contributed to each answer |
+| S9 | Credential storage via safeStorage | P0 | OS keychain, encrypted at rest |
+| S10 | Auto-update via electron-updater | P1 | DMG distribution, GitHub Releases |
+
+### 6.4 Progressive query flow
+
+```
+User asks: "What's blocking the auth refactor?"
+
+[0ms] Cortex searches FTS5 index
+  → Returns matching knowledge nodes instantly
+  → Display in chat as "Sources found: 3 Linear issues, 2 GitHub PRs"
+
+[500ms] Cortex streams LLM synthesis
+  → Claude reads top-k nodes + edges
+  → Streams answer with inline citations: "The auth refactor [1] is blocked by..."
+  → Citations link to source nodes with confidence scores
+
+[2-5s] Answer complete. Citations panel shows:
+  → [1] Linear STA-412: "Auth middleware rewrite" (confidence: 0.89)
+  → [2] GitHub PR #847: "Remove legacy session handler" (confidence: 0.76)
+  → [3] Slack #eng-backend: "Legal flagged token storage" (confidence: 0.65)
+
+[background] If query is complex, spawn deep analysis task:
+  → Task appears in side panel: "Deep analysis: auth refactor blockers"
+  → Traverses knowledge graph (2+ hops from initial results)
+  → Updates answer with additional context when complete
+```
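
Sketched as code, the flow above is a three-stage pipeline. Every function name here is an assumption standing in for Cortex internals, not a final API.

```typescript
// Hypothetical signatures; the real Cortex APIs may differ.
declare function searchFts(q: string): Promise<unknown[]>;
declare function synthesize(q: string, hits: unknown[]): AsyncIterable<string>;
declare function isComplex(q: string): boolean;
declare function spawnDeepAnalysis(q: string, hits: unknown[]): Promise<void>;

async function answer(query: string, ui: {
  showSources: (nodes: unknown[]) => void;
  streamToken: (t: string) => void;
  queueTask: (name: string) => void;
}) {
  const hits = await searchFts(query);         // [0ms] instant FTS5 hits
  ui.showSources(hits);
  for await (const token of synthesize(query, hits)) {
    ui.streamToken(token);                     // [500ms] streamed synthesis
  }
  if (isComplex(query)) {
    ui.queueTask(`Deep analysis: ${query}`);   // [background] graph traversal
    void spawnDeepAnalysis(query, hits);
  }
}
```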
+
+---
+
+## 7. UX Flows
+
+### 7.1 Onboarding (< 5 minutes to value)
+
+```
+Step 1: Install (30s)
+  Electron app opens → Substrate branding → empty state
+  "Welcome to Substrate. Connect your first source to get started."
+
+Step 2: Connect first source (2 min)
+  Click "Add Source" → select Linear → paste API key → "Connect"
+  Progress bar: "Indexing 47 issues, 123 comments..."
+  Real-time count: nodes rising as ingestion runs
+
+Step 3: First query (30s after ingestion)
+  Suggestion pill: "What's the team working on?"
+  Cortex answers with cited Linear issues
+  AHA MOMENT: "It already knows this."
+
+Step 4: Connect second source (2 min)
+  Click "Add Source" → select GitHub → paste PAT → "Connect"
+  Progress: "Indexing 12 repos, 89 PRs, 234 issues..."
+  Cross-referencing happens automatically (shared entity detection)
+
+Step 5: Cross-source query (the magic moment)
+  "What's blocking the auth refactor?"
+  Brain pulls Linear ticket + GitHub PR + commit messages
+  HOLY SHIT MOMENT: "It connected dots I didn't."
+```
+
+### 7.2 Cortex chat panel
+
+```
++--------------------------------------------------+
+| Cortex                           [Search]  [+]   |
+|                                                  |
+|             (empty state — centered)             |
+|                                                  |
+|             Ask your Brain anything              |
+|                                                  |
+|        [What's the team working on?]             |
+|        [Recent decisions]                        |
+|        [What's blocked?]                         |
+|        [Summarize last week]                     |
+|                                                  |
+|  ____________________________________________    |
+| |                                            |   |
+| | Ask Cortex...                       [Send] |   |
+| |____________________________________________|   |
++--------------------------------------------------+
+```
+
+Active state with task panel:
+
+```
++-------------------------------+-------------------+
+| Cortex                        | Tasks             |
+|                               |                   |
+| You: What's blocking auth?    | [~] Deep analysis |
+|                               |     auth blockers |
+| Cortex: The auth refactor     |     3 sources...  |
+| is blocked by two items:      |                   |
+|                               | [v] Linear sync   |
+| 1. Legal compliance [1]       |     47 nodes      |
+| 2. PR review pending [2]      |                   |
+|                               | [v] GitHub sync   |
+| Sources:                      |     89 nodes      |
+| [1] STA-412 (0.89)            |                   |
+| [2] PR #847 (0.76)            |                   |
+| [3] #eng-backend (0.65)       |                   |
+|  ___________________________  |                   |
+| | Ask Cortex...      [Send] | |                   |
+| |___________________________| |                   |
++-------------------------------+-------------------+
+```
+
+### 7.3 Provenance settings
+
+```
++--------------------------------------------------+
+| Provenance — Connectors                          |
+|                                                  |
+| Connected Sources                                |
+|                                                  |
+| [check] Linear  API Key  Sync: 2m ago     [...]  |
+| [check] GitHub  PAT      Sync: 5m ago     [...]  |
+| [ ] Slack   OAuth   Not connected   [Connect]    |
+| [ ] Notion  OAuth   Not connected   [Connect]    |
+|                                                  |
+| [+ Add Source]                                   |
+|                                                  |
+| Sync Schedule                                    |
+| [v] Auto-sync every [15 min v]                   |
+| [ ] Sync on app launch                           |
+|                                                  |
+| Brain Health                                     |
+| Total nodes: 1,247                               |
+| Sources: Linear (623), GitHub (624)              |
+| Stale nodes: 12 (0.9%)                           |
+| Last full sync: 2 minutes ago                    |
++--------------------------------------------------+
+```
+
+---
+
+## 8. Pricing & Packaging
+
+| | OSS Self-Hosted | Cloud Free | Cloud Team | Cloud Enterprise |
+|---|---|---|---|---|
+| **Seats** | unlimited | up to 3 | up to 5 | unlimited |
+| **Price** | free | free | $99/mo + metered | custom |
+| **Auth** | API keys only | API keys only | OAuth (Nango) | SSO + OAuth |
+| **Storage** | local SQLite | cloud D1 | cloud D1 | cloud D1 |
+| **Brain instances** | 1 (local) | 1 (hosted) | 1 (hosted) | federated (multi-team) |
+| **Query** | CLI + MCP | Cortex chat | Cortex chat + API | + org rollup |
+| **Support** | community | community | email | dedicated |
+
+**Metering (Cloud Team+):**
+- LLM inference: pass-through at 2-3x Anthropic cost
+- Tracked as tokens in + tokens out across indexing and queries
+- Stripe Metering API for usage billing with margin targets
+- Storage: generous free tier (1GB included), then $/GB/mo
+
+**Upsell triggers:**
+- OSS → Cloud: "Sync across devices", "Team sharing"
+- Cloud Free → Team: "Connect Slack/Notion" (OAuth), "More than 3 seats"
+- Team → Enterprise: "Federated access", "SSO", "Org rollup"
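
The metering model reduces to simple arithmetic. A sketch with placeholder rates follows; the 2.5x multiplier is one point inside the stated 2-3x band, and the per-million-token rates are parameters, not real prices.

```typescript
// Sketch of metered LLM pricing (Cloud Team+): provider cost pass-through
// at a multiplier. All rates are placeholders.
const MULTIPLIER = 2.5; // within the stated 2-3x band

function llmCharge(
  tokensIn: number,
  tokensOut: number,
  costPerMTokIn: number,   // provider $/1M input tokens
  costPerMTokOut: number   // provider $/1M output tokens
): number {
  const providerCost =
    (tokensIn / 1e6) * costPerMTokIn + (tokensOut / 1e6) * costPerMTokOut;
  return providerCost * MULTIPLIER;
}
```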
+
+---
+
+## 9. Rollout Plan
+
+### v1 — Local Brain (2 weeks)
+
+Ship:
+- [ ] `@stackmemoryai/types` repo — shared interfaces
+- [ ] `@stackmemoryai/cortex` repo — knowledge graph, FTS5 search, streaming LLM query
+- [ ] `@stackmemoryai/provenant` repo — extracted from packages/provenant/, adapter interface + Linear + GitHub
+- [ ] Substrate Electron app — Cortex chat panel + Provenance settings + onboarding
+- [ ] API key connectors (Linear, GitHub PAT)
+- [ ] Progressive query (instant → stream → background task)
+- [ ] DMG distribution
+
+Cleanup:
+- [ ] Remove `tools/agent-viewer/` from stackmemory repo
+- [ ] Extract desktop control-plane from provenantai worktree into substrate repo
+
+### v1.5 — OAuth + Paid Tier (~4 weeks after v1)
+
+- [ ] Nango integration for OAuth flows
+- [ ] Slack, Notion, Google Drive adapters
+- [ ] Cloud Free tier (hosted D1 Brain)
+- [ ] Stripe metering integration
+- [ ] Basic telemetry + log shipping (opt-in)
+
+### v2 — Cloud + Teams (~4 weeks after v1.5)
+
+- [ ] Substrate cloud (CF Agents SDK, D1, Workers)
+- [ ] Federated team access with opt-in sharing
+- [ ] C-suite org rollup queries
+- [ ] Access controls / permissions
+- [ ] SSO via OIDC
+
+### v3 — Agent Execution (~4 weeks after v2)
+
+- [ ] Brain-powered autonomous agents
+- [ ] CF Containers for heavy compute (git, builds, tests)
+- [ ] Agent outcomes feed back into Cortex confidence model
+- [ ] Self-improving knowledge loop
+
+---
+
+## 10. Success Metrics & Instrumentation
+
+### Leading indicators (weekly)
+
+| Metric | Target | Instrumentation |
+|--------|--------|-----------------|
+| Install → first query | < 5 min | Timestamp delta (app open → first query event) |
+| Sources connected (day 1) | >= 2 per user | Source creation events |
+| Queries per user (week 1) | >= 10 | Query event counter |
+| Cross-source query rate | >= 30% | Queries citing 2+ source_system values |
+
+### Lagging indicators (monthly)
+
+| Metric | Target | Instrumentation |
+|--------|--------|-----------------|
+| Day-7 return rate | >= 40% | App open events, daily active users |
+| Second team member | within 14 days | Seat count per org |
+| Paid conversion | >= 5% of free users | Stripe subscription events |
+| NPS | >= 50 | In-app survey (after 14 days) |
+
+### Rollback indicators
+
+- Day-7 return rate < 20% → the Brain isn't sticky; investigate stale knowledge or poor answer quality
+- Cross-source query rate < 10% → single-source answers aren't compelling enough; users would just use the source's native search
+
+### Telemetry
+
+- **Local/OSS:** off by default, opt-in only. Console logs + local traces.
+- **Cloud:** basic telemetry on. Query count, source health, errors, latency percentiles. Log shipping for debugging.
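
The cross-source query rate (G2 and the leading indicator above) can be computed directly from query events. The event shape here is an assumption; the metric itself, "queries citing 2+ distinct source_system values," is from the tables.

```typescript
// Sketch: cross-source query rate from instrumented query events.
interface QueryEvent {
  id: string;
  citedSources: string[]; // source_system of each cited node (assumed field)
}

function crossSourceRate(events: QueryEvent[]): number {
  if (events.length === 0) return 0;
  const crossSource = events.filter(
    (e) => new Set(e.citedSources).size >= 2
  ).length;
  return crossSource / events.length;
}
```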
+
+---
+
+## 11. Risks & Mitigations
+
+| Risk | Impact | Likelihood | Mitigation |
+|------|--------|------------|------------|
+| **Answer quality too low** | Users churn after first query | Medium | Progressive query (show raw sources first, then synthesis). Confidence scores set expectations. |
+| **Ingestion too slow** | Onboarding > 5 min target | Low | Start querying before full ingest completes. Show partial results with "still indexing..." indicator. |
+| **MCP server instability** | Adapter failures cascade | Medium | Independent failure resilience (each adapter retries independently). Health dashboard. |
+| **Schema migration complexity** | Cortex schema changes break data | Low | Append-only model — no destructive migrations. Version field on all records. |
+| **Electron app size** | >200MB download discourages install | Medium | Tree-shake dependencies. Defer optional packages. Target <100MB. |
+| **Nango dependency (v1.5)** | Vendor lock-in for OAuth | Low | OAuth apps registered under our accounts — only token management delegated. Can self-host or swap. |
+| **CF platform risk (v2)** | Cloudflare pricing/policy changes | Low | Cortex core is SQLite-native, portable. CF is the deployment target, not the data format. |
+| **Competitor launches first** | Glean/Notion ship similar Brain | Medium | OSS distribution + local-first is our moat. Enterprise SaaS can't match zero-ops self-hosted. |
+
+---
+
+## 12. Open Questions
+
+| # | Question | Blocking? | Owner |
+|---|----------|-----------|-------|
+| OQ1 | Cortex schema: should `knowledge` table use JSON column for extensible metadata vs fixed columns? | No (start with fixed, add JSON later) | Eng |
+| OQ2 | Embedding provider for v1: skip entirely (keyword-only) or include Voyage AI behind feature flag? | No (skip for v1, keyword search is sufficient per THEORY.MD) | Eng |
+| OQ3 | ~~Electron app: migrate renderer.js to React, or extend vanilla JS?~~ | **Resolved: React** | Eng |
+| OQ4 | Auto-sync interval: what's the right default? 5min / 15min / 1hr? | No (ship with 15min, make configurable) | Product |
+| OQ5 | ~~How to handle the provenantai worktree extraction?~~ | **Resolved: copy + merge into main provenantai repo** | Eng |
+
+> **OQ3 resolved:** React for v1. Invest upfront for cleaner long-term architecture.
+
+> **OQ5 resolved:** Copy desktop control-plane from worktree into main provenantai repo (not a separate substrate repo).
diff --git a/package.json b/package.json
index 75a89d9b..ced4a85d 100644
--- a/package.json
+++ b/package.json
@@ -141,6 +141,9 @@
     "sync:start": "node scripts/background-sync-manager.js",
     "sync:setup": "./scripts/setup-background-sync.sh",
     "eval:cord": "npx tsx scripts/evals/cord-vs-flat-eval.ts",
+    "gepa:eval": "node scripts/gepa/eval-phases.js",
+    "gepa:eval:json": "node scripts/gepa/eval-phases.js --json",
+    "gepa:mine": "node scripts/gepa/gold/mine-traces.js",
     "prepare": "echo 'Prepare step completed'",
     "verify:dist": "node scripts/verify-dist.cjs",
     "test:smoke-db": "bash scripts/smoke-init-db.sh",
diff --git a/scripts/conductor/after-run.sh b/scripts/conductor/after-run.sh
index 14e551fb..c85e1d64 100755
--- a/scripts/conductor/after-run.sh
+++ b/scripts/conductor/after-run.sh
@@ -1,7 +1,8 @@
 #!/usr/bin/env bash
 # Conductor after_run hook
-# Captures context from the agent run and tags it with the issue identifier
-# Called after each agent attempt (success or failure)
+# 1. Captures context from the agent run
+# 2. Triggers GEPA session hook (accumulates toward auto-optimization)
+# 3. Triggers DSPy optimization every 50 runs
 #
 # Environment: SYMPHONY_WORKSPACE_DIR, SYMPHONY_ISSUE_ID, SYMPHONY_ISSUE_IDENTIFIER
 set -euo pipefail
@@ -9,10 +10,12 @@
 WORKSPACE="${SYMPHONY_WORKSPACE_DIR:-$(pwd)}"
 ISSUE_ID="${SYMPHONY_ISSUE_IDENTIFIER:-${SYMPHONY_ISSUE_ID:-unknown}}"
 ATTEMPT="${SYMPHONY_ATTEMPT:-1}"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 
 cd "$WORKSPACE"
 
-# Capture context from this run, tagged with issue ID and attempt number
+# 1. Capture context from this run, tagged with issue ID and attempt number
 stackmemory conductor capture \
   --issue "$ISSUE_ID" \
   --workspace "$WORKSPACE" \
@@ -20,3 +23,20 @@
   2>/dev/null || true
 
 echo "[conductor] Context captured for $ISSUE_ID (attempt $ATTEMPT)"
+
+# 2. Trigger GEPA session hook (accumulates sessions, auto-optimizes at threshold)
+GEPA_HOOK="$PROJECT_ROOT/scripts/gepa/hooks/gepa-session-hook.js"
+if [ -f "$GEPA_HOOK" ]; then
+  node "$GEPA_HOOK" 2>/dev/null &
+fi
+
+# 3. Trigger DSPy optimization every 50 agent runs
+OUTCOMES_PATH="$HOME/.stackmemory/conductor/outcomes.jsonl"
+DSPY_OPTIMIZE="$PROJECT_ROOT/scripts/dspy/optimize.py"
+if [ -f "$OUTCOMES_PATH" ] && [ -f "$DSPY_OPTIMIZE" ]; then
+  OUTCOMES_COUNT=$(wc -l < "$OUTCOMES_PATH" 2>/dev/null || echo 0)
+  if [ $((OUTCOMES_COUNT % 50)) -eq 0 ] && [ "$OUTCOMES_COUNT" -gt 0 ]; then
+    echo "[conductor] Triggering DSPy optimization (${OUTCOMES_COUNT} runs)"
+    nohup python3 "$DSPY_OPTIMIZE" --quiet >/dev/null 2>&1 &
+  fi
+fi
diff --git a/scripts/gepa/config.json b/scripts/gepa/config.json
index 52570508..75e6c3e2 100644
--- a/scripts/gepa/config.json
+++ b/scripts/gepa/config.json
@@ -32,11 +32,43 @@
       "file": "CLAUDE.md",
       "evals": ["stackmemory-tasks.jsonl"],
       "description": "StackMemory project prompt"
+    },
+    {
+      "name": "skill:start",
+      "file": "~/.claude/commands/start.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session boot skill"
+    },
+    {
+      "name": "skill:stop",
+      "file": "~/.claude/commands/stop.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session close skill"
+    },
+    {
+      "name": "skill:learn",
+      "file": "~/.claude/commands/learn.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session review + artifact update skill"
+    },
+    {
+      "name": "skill:next",
+      "file": "~/.claude/commands/next.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Next action recommendation skill"
+    },
+    {
+      "name": "skill:summary",
+      "file": "~/.claude/commands/summary.md",
+      "evals": ["skill-tasks.jsonl"],
+      "description": "Session summary skill"
     }
   ],
 
   "evolution": {
-    "populationSize": 4,
+    "populationSize": 8,
+    "crossoverCount": 2,
+    "elitism": true,
     "generations": 10,
     "selectionRate": 0.5,
     "selfReview": true,
@@ -58,8 +90,9 @@
   "evals": {
     "directory": "./evals",
-    "minSamplesPerVariant": 8,
+    "minSamplesPerVariant": 25,
     "timeout": 120000,
+    "heldOutPartition": true,
     "metrics": [
       "task_completion",
       "code_quality",
@@ -73,7 +106,8 @@
   "judge": {
     "model": "claude-haiku-4-5-20251001",
     "maxOutputTokens": 2000,
-    "timeoutMs": 30000
+    "timeoutMs": 120000,
+    "feedbackEnabled": true
   },
 
   "mutation": {
@@ -144,6 +178,26 @@
       "evals": {
         "files": ["conductor-tasks.jsonl"]
       }
+    },
+    "skills": {
+      "target": {
+        "file": "~/.claude/commands/start.md",
+        "scope": "user",
+        "backup": true
+      },
+      "evolution": {
+        "mutationStrategies": [
+          "simplify",
+          "add_examples",
+          "rephrase",
+          "add_constraints",
+          "reduce_overengineering",
+          "add_self_check"
+        ]
+      },
+      "evals": {
+        "files": ["skill-tasks.jsonl"]
+      }
     }
   }
 }
diff --git a/scripts/gepa/eval-phases.js b/scripts/gepa/eval-phases.js
new file mode 100644
index 00000000..c1e4cdc4
--- /dev/null
+++ b/scripts/gepa/eval-phases.js
@@ -0,0 +1,197 @@
+#!/usr/bin/env node
+/**
+ * Phase-level eval harness for GEPA.
+ *
+ * Evaluates conductor prompt phase files against gold sets.
+ * Scores each phase independently. Used by GEPA auto-optimization
+ * to validate mutations before applying.
+ *
+ * Usage:
+ *   node eval-phases.js                    # eval all phases
+ *   node eval-phases.js --phase validate   # eval single phase
+ *   node eval-phases.js --json             # JSON output for CI
+ */
+
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import { homedir } from 'os';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const GOLD_DIR = path.join(__dirname, 'gold');
+const PROMPTS_DIR = path.join(
+  homedir(),
+  '.stackmemory',
+  'conductor',
+  'prompts'
+);
+
+const PHASES = ['understand', 'implement', 'validate', 'deliver'];
+
+// Parse args
+const phaseIdx = process.argv.indexOf('--phase');
+const targetPhase = phaseIdx !== -1 ? process.argv[phaseIdx + 1] : null;
+const jsonOutput = process.argv.includes('--json');
+
+/**
+ * Load gold set for a phase
+ */
+function loadGoldSet(phase) {
+  const goldPath = path.join(GOLD_DIR, `${phase}.jsonl`);
+  if (!fs.existsSync(goldPath)) return [];
+  return fs
+    .readFileSync(goldPath, 'utf-8')
+    .split('\n')
+    .filter(Boolean)
+    .map((l) => JSON.parse(l));
+}
+
+/**
+ * Score a phase prompt against its gold set using heuristic evaluation.
+ * This is a fast, offline eval (no LLM calls) based on outcome patterns.
+ *
+ * For LLM-judge evaluation, use the full GEPA optimize.js eval pipeline.
+ */
+function evalPhase(phase) {
+  const goldSet = loadGoldSet(phase);
+  if (goldSet.length === 0) {
+    return { phase, score: 0, total: 0, passed: 0, skipped: true };
+  }
+
+  const promptPath = path.join(PROMPTS_DIR, `${phase}.md`);
+  if (!fs.existsSync(promptPath)) {
+    return { phase, score: 0, total: goldSet.length, passed: 0, missing: true };
+  }
+
+  const prompt = fs.readFileSync(promptPath, 'utf-8');
+  let passed = 0;
+  const failures = [];
+
+  for (const entry of goldSet) {
+    const expected = entry.expected;
+    if (!expected) continue;
+
+    // Heuristic: check if the prompt addresses the failure patterns
+    let entryPassed = true;
+
+    switch (phase) {
+      case 'understand': {
+        // Check if prompt guides complexity assessment
+        if (expected.complexity === 'careful' && !prompt.includes('plan')) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'implement': {
+        // Check if prompt constrains scope
+        if (!expected.scopeKept && !prompt.includes('scope')) {
+          entryPassed = false;
+        }
+        // Check ESM import guidance
+        if (
+          entry.errorTail &&
+          /import|ESM/i.test(entry.errorTail) &&
+          !prompt.includes('.js')
+        ) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'validate': {
+        // Check if prompt covers the specific failure type
+        if (expected.retryStrategy === 'fix_lint' && !prompt.includes('lint')) {
+          entryPassed = false;
+        }
+        if (expected.retryStrategy === 'fix_test' && !prompt.includes('test')) {
+          entryPassed = false;
+        }
+        if (
+          expected.retryStrategy === 'fix_build' &&
+          !prompt.includes('build')
+        ) {
+          entryPassed = false;
+        }
+        // Check --no-verify prevention
+        if (!prompt.includes('no-verify') && !prompt.includes('--no-verify')) {
+          entryPassed = false;
+        }
+        break;
+      }
+
+      case 'deliver': {
+        // Check commit format guidance
+        if (!prompt.includes('type(scope)') && !prompt.includes('commit')) {
+          entryPassed = false;
+        }
+        break;
+      }
+    }
+
+    if (entryPassed) {
+      passed++;
+    } else {
+      failures.push({
+        issue: entry.issue,
+        outcome: entry.outcome,
+        reason: `Prompt missing guidance for: ${JSON.stringify(expected)}`,
+      });
+    }
+  }
+
+  return {
+    phase,
+    score: goldSet.length > 0 ? passed / goldSet.length : 0,
+    total: goldSet.length,
+    passed,
+    failures: failures.slice(0, 5), // top 5 failures
+  };
+}
+
+// Main
+const phases = targetPhase ? [targetPhase] : PHASES;
+const results = phases.map(evalPhase);
+
+if (jsonOutput) {
+  console.log(JSON.stringify(results, null, 2));
+} else {
+  console.log('GEPA Phase Evaluation');
+  console.log('═'.repeat(50));
+
+  let totalScore = 0;
+  let totalPhases = 0;
+
+  for (const r of results) {
+    if (r.skipped) {
+      console.log(`  ${r.phase.padEnd(12)} — no gold set`);
+      continue;
+    }
+    if (r.missing) {
+      console.log(`  ${r.phase.padEnd(12)} — prompt file missing`);
+      continue;
+    }
+
+    const pct = (r.score * 100).toFixed(1);
+    const bar = '█'.repeat(Math.round(r.score * 20)).padEnd(20, '░');
+    const status = r.score >= 0.7 ? '✓' : r.score >= 0.4 ? '~' : '✗';
+    console.log(
+      `  ${status} ${r.phase.padEnd(12)} ${bar} ${pct}% (${r.passed}/${r.total})`
+    );
+
+    if (r.failures && r.failures.length > 0) {
+      for (const f of r.failures.slice(0, 3)) {
+        console.log(`    └ ${f.issue}: ${f.reason.slice(0, 80)}`);
+      }
+    }
+
+    totalScore += r.score;
+    totalPhases++;
+  }
+
+  if (totalPhases > 0) {
+    const avg = ((totalScore / totalPhases) * 100).toFixed(1);
+    console.log('─'.repeat(50));
+    console.log(`  Average: ${avg}%`);
+  }
+}
diff --git a/scripts/gepa/evals/conductor-provenantai.jsonl b/scripts/gepa/evals/conductor-provenantai.jsonl
new file mode 100644
index 00000000..54f933b6
--- /dev/null
+++ b/scripts/gepa/evals/conductor-provenantai.jsonl
@@ -0,0 +1,15 @@
+{"id": "pa-001", "name": "express_route_bug_fix", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-600: Fix 500 error on /api/v1/query when conversation_id is invalid UUID\n\n## Description\n\nWhen a user sends a malformed conversation_id (not a valid UUID), the query endpoint crashes with an unhandled pg error instead of returning a 400. The route is at src/routes/query.js. Use AppError from src/core/errors.js with ErrorCodes.VALIDATION.\n\nLabels: bug\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate this prompt for a ProvenantAI Express route bug fix. Check: does it guide reading the existing route file first, does it mention the AppError pattern, does it specify running npm run lint and npm test, does it mention Supertest for route testing, does it specify the DI factory pattern used in route files?", "expected": {"reads_existing_code": "prompt should instruct agent to read the existing route before modifying", "apperror_pattern": "prompt should reference AppError/ErrorCodes pattern from src/core/errors", "test_commands": "prompt should specify npm run lint and npm test (or npm run test:core)", "supertest_pattern": "prompt should mention testing with Supertest passing app not server", "di_factory_pattern": "prompt should reference createRouter(deps) DI factory pattern"}, "weight": 1.5}
+{"id": "pa-002", "name": "jest_test_conventions", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-605: Add unit tests for recipe webhook signature verification\n\n## Description\n\nThe webhook-signature.js module at src/recipes/webhook-signature.js has no test coverage.
Add tests for generateSignature() and verifySignature() including edge cases: empty payload, expired timestamps, invalid HMAC.\n\nLabels: test\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI Jest conventions. Check: does it warn against importing from @jest/globals (use globals instead), does it mention jest.clearAllMocks() resetting mockReturnValue, does it specify using catch {} not catch (_err) {} for ESLint, does it tell the agent to check existing test patterns in __tests__/?", "expected": {"no_jest_globals_import": "prompt should warn not to import from @jest/globals — use global jest", "clear_all_mocks_warning": "prompt should note jest.clearAllMocks() resets mockReturnValue — re-set in beforeEach", "eslint_catch_pattern": "prompt should specify catch {} not catch (_err) {} per ESLint rules", "check_existing_patterns": "prompt should instruct checking existing __tests__/ for patterns before writing"}, "weight": 1.8} +{"id": "pa-003", "name": "database_migration", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-610: Add prompt_version column to queries table\n\n## Description\n\nAdd a VARCHAR(64) column 'prompt_version' to the queries table to track which system prompt version was used for each query. This enables outcome attribution when GEPA optimizes prompts. Column should be nullable (existing rows won't have it).\n\nLabels: chore, database\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI migration conventions. Check: does it specify the migration file naming pattern (NNN_description.sql), does it mention checking the latest migration number, does it guide making the column nullable for backwards compatibility, does it mention auto-migration on startup, does it warn about postgres.railway.internal only being reachable from Railway?", "expected": {"migration_naming": "prompt should specify NNN_description.sql naming in src/db/migrations/", "check_latest_number": "prompt should instruct checking latest migration number to avoid conflicts", "nullable_column": "prompt should guide making new columns nullable for backwards compatibility", "auto_migration_startup": "prompt should mention auto-migration runs on startup"}, "weight": 1.5} +{"id": "pa-004", "name": "recipe_template_addition", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-615: Add competitor monitoring recipe template\n\n## Description\n\nAdd a new recipe template (#035) for competitor monitoring. Input: Google Alerts webhook. Steps: enrichment (LLM summarize + Clearbit company lookup), output to Slack channel. Follow the existing template pattern in src/db/migrations/028_expanded_recipe_templates.sql.\n\nLabels: feature, recipes\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. 
Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI recipe conventions. Check: does it guide reading existing template migrations first, does it mention the step-runner pipeline stages, does it reference the enrichment service (CJS not ESM), does it mention tier enforcement, does it specify the recipe service at src/recipes/service.js?", "expected": {"read_existing_templates": "prompt should instruct reading existing recipe template migrations for pattern", "step_runner_stages": "prompt should reference step-runner pipeline (input → enrichment → output)", "enrichment_cjs": "prompt should note enrichment service is CJS not ESM", "recipe_service_path": "prompt should reference src/recipes/service.js for lifecycle management"}, "weight": 1.3} +{"id": "pa-005", "name": "stripe_integration_change", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-620: Add annual billing option to Stripe products\n\n## Description\n\nAdd yearly pricing to existing Stripe products (Starter/Growth/Scale). Use the v2 product IDs from Railway env vars. Create prices with 20% annual discount. Update the billing routes to handle annual subscription creation.\n\nLabels: feature, billing\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI billing conventions. Check: does it warn about not hardcoding Stripe product IDs (use env vars), does it mention the v2 products (Starter/Growth/Scale), does it guide checking existing billing routes, does it warn about test mode vs live keys, does it specify Clerk webhook creates tenant + seeds CoA?", "expected": {"no_hardcoded_stripe_ids": "prompt should warn against hardcoding Stripe product/price IDs — use env vars", "v2_products": "prompt should reference v2 product tier names (Starter/Growth/Scale)", "check_billing_routes": "prompt should instruct reading existing billing route patterns", "test_vs_live_keys": "prompt should warn about test mode header (X-Test-Mode: true) for testing"}, "weight": 1.5} +{"id": "pa-006", "name": "auth_middleware_handling", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-625: Add API key rotation endpoint\n\n## Description\n\nAdd POST /api/v1/auth/rotate-key that generates a new API key for the authenticated tenant, invalidates the old one, and returns the new key. Must work with both Clerk auth and API key auth (authenticateAny middleware).\n\nLabels: feature, auth\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI auth conventions. 
Check: does it reference the auth middleware at src/auth/auth.middleware.js, does it mention the three auth modes (Clerk + API key + test mode), does it specify multi-tenant isolation (scope by tenant_id), does it warn about AES-256-GCM for credential encryption?", "expected": {"auth_middleware_reference": "prompt should reference src/auth/auth.middleware.js with authenticateAny", "three_auth_modes": "prompt should mention Clerk + API key + test mode auth paths", "tenant_isolation": "prompt should emphasize tenant_id scoping for multi-tenant isolation", "credential_encryption": "prompt should mention AES-256-GCM KMS for credential storage"}, "weight": 1.5} +{"id": "pa-007", "name": "dashboard_component", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-630: Add recipe execution history chart to dashboard\n\n## Description\n\nAdd a bar chart showing recipe execution success/failure counts over the last 30 days to the COO dashboard page. Use the existing chart components in dashboard-app/. Data comes from GET /api/v1/recipes/stats endpoint (already exists).\n\nLabels: feature, dashboard\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI dashboard conventions. Check: does it mention Vite base='/app/' + BrowserRouter basename, does it reference the design system (docs/STYLE.md), does it instruct building with 'cd dashboard-app && npx vite build', does it mention product-gating (CMO/CFO/COO)?", "expected": {"vite_base_path": "prompt should mention Vite base='/app/' and BrowserRouter basename='/app'", "design_system": "prompt should reference design system at docs/STYLE.md or docs/CONTROL_PLANE_STYLE.md", "build_command": "prompt should specify dashboard build: cd dashboard-app && npx vite build", "product_gating": "prompt should mention product-gated pages (CMO/CFO/COO)"}, "weight": 1.3} +{"id": "pa-008", "name": "llm_adapter_change", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-635: Add token usage alerts when query exceeds budget\n\n## Description\n\nAdd a warning log and optional webhook notification when a query's token usage exceeds the tenant's budget threshold. Check against tenant_budget_overrides table. The LLM adapter at src/llm/adapter.js records usage — add the check after usage recording.\n\nLabels: feature, observability\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI LLM adapter conventions. 
Check: does it reference the model tiers (fast/balanced/deep), does it mention the Anthropic-native adapter pattern, does it instruct reading src/llm/adapter.js first, does it reference trace-logger for structured observability, does it specify the cost estimation pattern?", "expected": {"model_tiers": "prompt should reference fast/balanced/deep model tier system", "anthropic_native": "prompt should mention Anthropic-native adapter (not multi-provider)", "read_adapter_first": "prompt should instruct reading src/llm/adapter.js before modifying", "trace_logger": "prompt should reference trace-logger for structured span observability"}, "weight": 1.3} +{"id": "pa-009", "name": "integration_connector", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-640: Add HubSpot connector for recipe input\n\n## Description\n\nAdd HubSpot as a connector input source for recipes. Follow the pattern of existing connectors (Stripe, Salesforce, QuickBooks). Use Pipedream Connect for OAuth (src/integrations/pipedream/pipedream-connect.js). Add the connector type to the recipe input schema.\n\nLabels: feature, integrations\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI integration conventions. Check: does it reference Pipedream Connect for managed OAuth, does it instruct reading existing connector patterns (Stripe/SF/QB), does it mention the fixture pattern at src/__fixtures__/, does it specify the integration test path (src/integrations/)?", "expected": {"pipedream_connect": "prompt should reference Pipedream Connect for managed OAuth", "existing_connector_patterns": "prompt should instruct reading existing connector implementations", "fixture_pattern": "prompt should mention test fixtures at src/__fixtures__/ for mock data", "integration_test_path": "prompt should specify test path src/integrations/ for integration tests"}, "weight": 1.3} +{"id": "pa-010", "name": "webhook_handler", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-645: Add idempotency check to recipe webhook handler\n\n## Description\n\nThe webhook handler at src/routes/webhook-recipes.js should check the webhook_idempotency table before processing duplicate events. Use the existing idempotency migration (066). Verify HMAC-SHA256 signature before the idempotency check.\n\nLabels: feature, security\nPriority: High\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI webhook conventions. 
Check: does it reference HMAC-SHA256 signature verification, does it mention the webhook-signature.js module, does it instruct reading the idempotency migration, does it specify parameterized queries for DB access, does it mention the service registry singleton pattern?", "expected": {"hmac_verification": "prompt should reference HMAC-SHA256 signature verification before processing", "webhook_signature_module": "prompt should reference src/recipes/webhook-signature.js", "parameterized_queries": "prompt should specify parameterized queries ($1, $2) — no string interpolation", "service_registry": "prompt should mention service registry singleton for route access"}, "weight": 1.5} +{"id": "pa-011", "name": "context_graph_feature", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-650: Add contradiction detection to context graph queries\n\n## Description\n\nThe context graph query service (src/ctx/graph-query.js) has a getContradictions() stub. Implement it to find ctx_nodes with conflicting content for the same entity. Return nodes with a contradiction_score based on semantic similarity of conflicting claims.\n\nLabels: feature, context-graph\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI context graph conventions. Check: does it reference the ctx_nodes/ctx_edges schema, does it mention tenant_id scoping, does it instruct reading the existing graph-query.js service, does it reference the GraphService wrapper at src/graph/service.ts?", "expected": {"ctx_schema": "prompt should reference ctx_nodes and ctx_edges table schema", "tenant_scoping": "prompt should emphasize tenant_id scoping for all graph queries", "existing_service": "prompt should instruct reading src/ctx/graph-query.js first", "graph_service_wrapper": "prompt should reference GraphService at src/graph/service.ts"}, "weight": 1.3} +{"id": "pa-012", "name": "e2e_test_addition", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-655: Add E2E test for recipe CRUD flow\n\n## Description\n\nAdd a Playwright E2E spec testing the full recipe lifecycle: create → list → get → update → delete. Use the existing E2E fixtures pattern at e2e/fixtures/test-fixtures.ts. Test against the API project (not browser).\n\nLabels: test, e2e\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI E2E conventions. 
Check: does it reference Playwright config with project names (api, business, chromium), does it instruct reading existing E2E specs, does it mention the X-Test-Mode header for test auth, does it specify npm run e2e:api for API tests?", "expected": {"playwright_projects": "prompt should reference Playwright projects (api, business, chromium)", "existing_e2e_patterns": "prompt should instruct reading existing e2e/ specs for patterns", "test_mode_header": "prompt should mention X-Test-Mode: true header for test authentication", "e2e_api_command": "prompt should specify npm run e2e:api for API-only E2E tests"}, "weight": 1.3} +{"id": "pa-013", "name": "observability_addition", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-660: Add trace spans to recipe step execution\n\n## Description\n\nThe step-runner at src/core/step-runner.js executes recipe pipeline steps but has no tracing. Add trace-logger spans around each step execution with step name, duration, success/failure, and input/output token counts for LLM steps.\n\nLabels: feature, observability\nPriority: Medium\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI observability conventions. Check: does it reference trace-logger at src/utils/trace-logger.js, does it show the traceSpan pattern, does it instruct not logging PII, does it mention structured spans with metadata?", "expected": {"trace_logger_reference": "prompt should reference src/utils/trace-logger.js", "trace_span_pattern": "prompt should demonstrate traceLogger.traceSpan(name, metadata) pattern", "no_pii_logging": "prompt should warn against logging PII (emails, tokens, card numbers)", "structured_metadata": "prompt should specify structured span metadata (not free-form strings)"}, "weight": 1.3} +{"id": "pa-014", "name": "no_description_provenantai", "prompt": "You are an AI coding agent given this prompt template output for an issue with minimal context:\n\n---\nYou are working on Linear issue STA-670: Fix flaky scheduler test\n\nLabels: bug\nPriority: Low\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate how this prompt handles a ProvenantAI issue with no description. 
Check: does it guide searching for the test file (test/unit/scheduler.test.js is known flaky), does it instruct reading the test to understand what's flaky (timer-dependent), does it mention jest.useFakeTimers() as a common fix, does it degrade gracefully without a description?", "expected": {"guides_file_search": "prompt should guide searching codebase for the relevant test file", "understand_flakiness": "prompt should instruct understanding the root cause of flakiness", "graceful_degradation": "prompt should still be useful without a description", "codebase_context": "prompt should tell agent to gather context from the codebase when description is missing"}, "weight": 1.8} +{"id": "pa-015", "name": "security_sensitive_change", "prompt": "You are an AI coding agent given this prompt template output:\n\n---\nYou are working on Linear issue STA-675: Add PII scrubbing to LLM context documents\n\n## Description\n\nBefore sending context_documents to the LLM adapter, scrub PII (emails, phone numbers, credit card numbers) from the text field. Add a scrubPII() utility. This is a security requirement — see src/auth/auth.middleware.js for tenant context.\n\nLabels: security\nPriority: Urgent\n\n## Instructions\n\n1. Read the issue description carefully\n2. Implement the requested changes\n3. Write or update tests as needed\n4. Run lint and tests to verify\n5. Commit your changes with a descriptive message\n\nWork in the current directory. All changes will be on a dedicated branch.\n---\n\nEvaluate for ProvenantAI security conventions. Check: does it reference the security rules (.claude/rules/security.md), does it emphasize input validation at route boundary, does it mention parameterized queries, does it warn against committing secrets, does it specify AES-256-GCM for credentials?", "expected": {"security_rules_reference": "prompt should reference security rules or conventions", "input_validation": "prompt should emphasize input validation at system boundaries", "no_secrets_in_commits": "prompt should warn against committing secrets (.env, API keys, tokens)", "pii_scrubbing_guidance": "prompt should provide guidance on PII patterns to scrub (emails, phones, cards)"}, "weight": 2.0} diff --git a/scripts/gepa/evals/skill-tasks.jsonl b/scripts/gepa/evals/skill-tasks.jsonl new file mode 100644 index 00000000..f84dcfe7 --- /dev/null +++ b/scripts/gepa/evals/skill-tasks.jsonl @@ -0,0 +1,30 @@ +{"id": "skill-001", "name": "start_clean_main", "prompt": "I just opened a new session. The repo is clean, on main branch. What should I do?", "expected": {"shows_branch": true, "shows_recent_commits": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-002", "name": "start_dirty_feature", "prompt": "I'm resuming work. There are uncommitted changes on a feature branch. What's my status?", "expected": {"shows_branch": true, "shows_uncommitted": true, "suggests_continue_or_commit": true, "no_hallucination": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-003", "name": "stop_with_work", "prompt": "I'm done for today but have more work tomorrow. Save my progress.", "expected": {"captures_context": true, "suggests_commit": true, "no_overengineering": true}, "weight": 1.2, "partition": "train"} +{"id": "skill-004", "name": "stop_done", "prompt": "I'm completely done with this task. 
Clean up and close.", "expected": {"captures_context": true, "suggests_clear": true, "concise_output": true}, "weight": 1.2, "partition": "train"} +{"id": "skill-005", "name": "learn_after_changes", "prompt": "I just finished a session where I created 3 new files, fixed a bug, and discovered a gotcha about ESM imports. What should be updated?", "expected": {"reviews_session": true, "identifies_memory_update": true, "structured_output": true, "no_overengineering": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-006", "name": "learn_nothing_new", "prompt": "I just did a quick typo fix. Anything to update?", "expected": {"concise_output": true, "says_nothing_needed": true, "no_overengineering": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-007", "name": "summary_multi_task", "prompt": "Summarize what I did this session: fixed auth bug, added pagination, updated tests.", "expected": {"lists_actions": true, "lists_files": true, "concise_output": true, "no_hallucination": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-008", "name": "next_with_pr_open", "prompt": "I have an open PR with failing CI. What should I work on?", "expected": {"identifies_ci_fix": true, "actionable_suggestion": true, "presents_options": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-009", "name": "start_stale_branch", "prompt": "I haven't touched this branch in a week. The main branch has 12 new commits. What should I do?", "expected": {"suggests_rebase_or_merge": true, "checks_conflicts": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-010", "name": "start_empty_repo", "prompt": "I just cloned a new repo for the first time. How should I get started?", "expected": {"checks_readme": true, "suggests_setup": true, "concise_output": true, "no_overengineering": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-011", "name": "stop_merge_conflict", "prompt": "I need to stop but I have an unresolved merge conflict. What do I do?", "expected": {"warns_about_conflict": true, "suggests_resolution_or_stash": true, "captures_context": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-012", "name": "learn_refactor_session", "prompt": "I just renamed 15 files from camelCase to kebab-case and updated all imports. What should I update?", "expected": {"identifies_pattern_change": true, "suggests_convention_doc": true, "no_overengineering": true}, "weight": 1.2, "partition": "train"} +{"id": "skill-013", "name": "learn_new_integration", "prompt": "I just added a Slack integration with webhooks. We found that ngrok is needed for local dev. What should be remembered?", "expected": {"identifies_memory_update": true, "captures_setup_gotcha": true, "structured_output": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-014", "name": "next_all_green", "prompt": "Everything is passing, PR is merged, branch is clean on main. What next?", "expected": {"checks_todo_queue": true, "suggests_next_action": true, "presents_options": true, "concise_output": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-015", "name": "next_blocked_by_review", "prompt": "My PR needs review from a teammate who's out today. 
What should I do in the meantime?", "expected": {"suggests_parallel_work": true, "presents_options": true, "no_overengineering": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-016", "name": "summary_single_fix", "prompt": "Summarize: I fixed a typo in README.md.", "expected": {"concise_output": true, "no_overengineering": true, "no_hallucination": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-017", "name": "summary_complex_session", "prompt": "Summarize: I debugged a memory leak in the daemon, found it was caused by unclosed file descriptors in the watcher, fixed it, added a regression test, updated CLAUDE.md with the gotcha, and created a PR.", "expected": {"lists_actions": true, "structured_output": true, "captures_key_finding": true, "no_hallucination": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-018", "name": "start_with_handoff", "prompt": "I'm starting a new session. There's a handoff file from yesterday's session that says we were working on STA-590 and had 2 tests still failing.", "expected": {"reads_handoff": true, "identifies_continuation": true, "suggests_next_action": true, "no_hallucination": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-019", "name": "stop_with_failing_tests", "prompt": "I need to stop but 3 tests are still failing. What should I do?", "expected": {"warns_about_tests": true, "captures_context": true, "suggests_commit_wip_or_stash": true, "no_overengineering": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-020", "name": "adversarial_empty_input", "prompt": "", "expected": {"handles_empty": true, "no_hallucination": true, "concise_output": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-021", "name": "adversarial_conflicting_state", "prompt": "I'm on main with uncommitted changes, but I also have an open PR on a feature branch with failing CI. What should I do?", "expected": {"addresses_both_issues": true, "prioritizes_correctly": true, "presents_options": true, "no_overengineering": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-022", "name": "adversarial_vague_request", "prompt": "Do the thing", "expected": {"asks_clarification": true, "no_hallucination": true, "concise_output": true}, "weight": 1.0, "partition": "train"} +{"id": "skill-023", "name": "learn_security_fix", "prompt": "I just patched an XSS vulnerability in the webhook handler. The fix involved sanitizing user input before rendering. What should be updated?", "expected": {"identifies_security_pattern": true, "suggests_memory_update": true, "identifies_security_issue": true, "structured_output": true}, "weight": 1.5, "partition": "train"} +{"id": "skill-024", "name": "next_multiple_prs", "prompt": "I have 3 open PRs: one needs rebasing, one has review comments, one is approved. What should I prioritize?", "expected": {"prioritizes_correctly": true, "actionable_suggestion": true, "presents_options": true, "no_overengineering": true}, "weight": 1.3, "partition": "train"} +{"id": "skill-025", "name": "start_after_deploy", "prompt": "We just deployed v2.0 to production. Starting a new session to work on post-deploy tasks.", "expected": {"checks_deploy_status": true, "suggests_monitoring": true, "suggests_next_action": true, "no_overengineering": true}, "weight": 1.2, "partition": "test"} +{"id": "skill-026", "name": "stop_mid_refactor", "prompt": "I'm halfway through a large refactor — moved 5 of 10 files. 
Need to stop now.", "expected": {"warns_about_partial": true, "captures_context": true, "suggests_commit_wip_or_stash": true}, "weight": 1.3, "partition": "test"} +{"id": "skill-027", "name": "learn_perf_discovery", "prompt": "I discovered that our FTS5 queries are 3x slower when the table has >10K rows because we're not using the bm25() function correctly. What should be updated?", "expected": {"identifies_memory_update": true, "captures_key_finding": true, "structured_output": true, "no_overengineering": true}, "weight": 1.5, "partition": "test"} +{"id": "skill-028", "name": "next_fresh_sprint", "prompt": "It's Monday morning, start of a new sprint. No open PRs, clean branch. What should I work on?", "expected": {"checks_todo_queue": true, "suggests_next_action": true, "presents_options": true, "concise_output": true}, "weight": 1.0, "partition": "test"} +{"id": "skill-029", "name": "summary_debugging_session", "prompt": "Summarize: spent 2 hours debugging why vitest tests hang. Root cause was a leaked timer in Promise.race. Fixed with clearTimeout in finally block.", "expected": {"captures_key_finding": true, "lists_actions": true, "concise_output": true, "no_hallucination": true}, "weight": 1.3, "partition": "test"} +{"id": "skill-030", "name": "adversarial_wrong_skill", "prompt": "Deploy to production immediately and send a Slack message to the team.", "expected": {"refuses_dangerous_action": true, "explains_why": true, "no_hallucination": true}, "weight": 1.5, "partition": "test"} diff --git a/scripts/gepa/generations/gen-000/baseline.md b/scripts/gepa/generations/gen-000/baseline.md deleted file mode 100644 index 0c86cace..00000000 --- a/scripts/gepa/generations/gen-000/baseline.md +++ /dev/null @@ -1,74 +0,0 @@ -# croissant.ai — Agent Guide - -Tool-agnostic reference for AI coding agents working in this repository. 
- -## Stack - -Node.js / Express / PostgreSQL / Redis -Railway deployment | Stripe / Salesforce / QuickBooks integrations - -## Project Structure - -``` -src/ - api/ # Route handlers - core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation - features/ # Feature modules - shared/ # Shared utilities - integrations/ # Third-party connectors -docs/ # Documentation -scripts/ # Automation scripts -docker/ # Container configs -prompts/ # Externalized LLM prompt templates -``` - -## Commands - -```bash -npm run dev # Start dev server -npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) -npm run lint # Lint check -npm run migrate # Run DB migrations -docker-compose up -d # Start local DBs -``` - -## Git Conventions - -- Branch prefixes: `feature/`, `fix/`, `chore/` -- Commit format: `type(scope): message` -- Do NOT add `Co-Authored-By` lines to commits -- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots - -## Testing Rules - -- **Framework**: Jest + SWC -- **DB mocking**: Use dependency injection (DI), not global mocks -- **Supertest**: Pass `app` (NOT `server`) to supertest -- **Global jest**: src/ tests use global `jest` — do NOT import from `@jest/globals` (causes redeclaration errors) -- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` — always re-set mocks in `beforeEach` -- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline - -## ESLint Rules - -- Use `catch {}` not `catch (_err) {}` — underscore prefix not in the allowed pattern -- CJS format for JS files in `src/` - -## Key Patterns - -- Provenance tracking: every data point includes source, timestamp, lineage -- Multi-tenant container isolation -- DI route factories for testability -- Error handling: return undefined over throwing; log and continue over crashing -- Add `.js` extension to relative ESM imports - -## StackMemory Context Rule - -- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. -- Prefer the MCP shape: - - `org_id` - - `conversation_id` - - `worker_mode: true` - - `task_query` - - `recover_on_low_signal: true` -- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. -- The current assignment is persisted under `.stackmemory/worker-context/current-assignment.json` so wrappers and hooks can auto-fill or enforce `task_query`. diff --git a/scripts/gepa/generations/gen-001/baseline.md b/scripts/gepa/generations/gen-001/baseline.md deleted file mode 100644 index 0c86cace..00000000 --- a/scripts/gepa/generations/gen-001/baseline.md +++ /dev/null @@ -1,74 +0,0 @@ -# croissant.ai — Agent Guide - -Tool-agnostic reference for AI coding agents working in this repository. 
- -## Stack - -Node.js / Express / PostgreSQL / Redis -Railway deployment | Stripe / Salesforce / QuickBooks integrations - -## Project Structure - -``` -src/ - api/ # Route handlers - core/ # monitoring-service, cache-service, queue-service, master-agent, api-validation - features/ # Feature modules - shared/ # Shared utilities - integrations/ # Third-party connectors -docs/ # Documentation -scripts/ # Automation scripts -docker/ # Container configs -prompts/ # Externalized LLM prompt templates -``` - -## Commands - -```bash -npm run dev # Start dev server -npm run test # Run test suites (3 parallel Jest workers, maxWorkers=4) -npm run lint # Lint check -npm run migrate # Run DB migrations -docker-compose up -d # Start local DBs -``` - -## Git Conventions - -- Branch prefixes: `feature/`, `fix/`, `chore/` -- Commit format: `type(scope): message` -- Do NOT add `Co-Authored-By` lines to commits -- Pre-commit hook runs: `npm run lint` + `npm run test` + E2E browser screenshots - -## Testing Rules - -- **Framework**: Jest + SWC -- **DB mocking**: Use dependency injection (DI), not global mocks -- **Supertest**: Pass `app` (NOT `server`) to supertest -- **Global jest**: src/ tests use global `jest` — do NOT import from `@jest/globals` (causes redeclaration errors) -- **Mock reset**: `jest.clearAllMocks()` resets `mockReturnValue` — always re-set mocks in `beforeEach` -- **Test runner**: `npm test` is long-running; run in a background process or sub-agent, not inline - -## ESLint Rules - -- Use `catch {}` not `catch (_err) {}` — underscore prefix not in the allowed pattern -- CJS format for JS files in `src/` - -## Key Patterns - -- Provenance tracking: every data point includes source, timestamp, lineage -- Multi-tenant container isolation -- DI route factories for testability -- Error handling: return undefined over throwing; log and continue over crashing -- Add `.js` extension to relative ESM imports - -## StackMemory Context Rule - -- When an agent fetches conversation context for active work, it must pass the exact current assignment or question as `task_query`. -- Prefer the MCP shape: - - `org_id` - - `conversation_id` - - `worker_mode: true` - - `task_query` - - `recover_on_low_signal: true` -- Do not fetch raw `get_conversation` context for worker execution unless full transcript behavior is explicitly required. -- The current assignment is persisted under `.stackmemory/worker-context/current-assignment.json` so wrappers and hooks can auto-fill or enforce `task_query`. 
diff --git a/scripts/gepa/gold/deliver.jsonl b/scripts/gepa/gold/deliver.jsonl new file mode 100644 index 00000000..b7ab4daa --- /dev/null +++ b/scripts/gepa/gold/deliver.jsonl @@ -0,0 +1,10 @@ +{"issue":"STA-561","attempt":1,"outcome":"success","phase":"committing","toolCalls":69,"filesModified":9,"durationMs":385588,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-559","attempt":1,"outcome":"success","phase":"committing","toolCalls":48,"filesModified":3,"durationMs":412555,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-563","attempt":1,"outcome":"success","phase":"committing","toolCalls":95,"filesModified":10,"durationMs":460221,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-562","attempt":1,"outcome":"success","phase":"committing","toolCalls":42,"filesModified":5,"durationMs":311117,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-560","attempt":1,"outcome":"success","phase":"committing","toolCalls":95,"filesModified":17,"durationMs":619257,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-576","attempt":1,"outcome":"failure","phase":"committing","toolCalls":55,"filesModified":5,"durationMs":267800,"hasCommits":false,"errorTail":"error: could not apply fa39187... feat(sync): add incremental update\nCONFLICT (content): Merge conflict in src/services/linear-sync.ts","expected":{"hasCommits":false,"success":false}} +{"issue":"STA-577","attempt":2,"outcome":"success","phase":"committing","toolCalls":87,"filesModified":6,"durationMs":342100,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-564","attempt":1,"outcome":"success","phase":"committing","toolCalls":39,"filesModified":3,"durationMs":427562,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-572","attempt":1,"outcome":"success","phase":"committing","toolCalls":109,"filesModified":18,"durationMs":800902,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} +{"issue":"STA-566","attempt":1,"outcome":"success","phase":"committing","toolCalls":107,"filesModified":7,"durationMs":809672,"hasCommits":true,"errorTail":null,"expected":{"hasCommits":true,"success":true}} diff --git a/scripts/gepa/gold/implement.jsonl b/scripts/gepa/gold/implement.jsonl new file mode 100644 index 00000000..17cf3aae --- /dev/null +++ b/scripts/gepa/gold/implement.jsonl @@ -0,0 +1 @@ +{"issue":"STA-574","attempt":1,"outcome":"failure","phase":"implementing","toolCalls":98,"filesModified":8,"durationMs":600000,"hasCommits":false,"errorTail":"Agent timed out after 600000ms during implementation. Last activity: editing src/integrations/mcp/tools/search.ts","expected":{"filesModified":8,"scopeKept":false}} diff --git a/scripts/gepa/gold/mine-traces.js b/scripts/gepa/gold/mine-traces.js new file mode 100644 index 00000000..7a828779 --- /dev/null +++ b/scripts/gepa/gold/mine-traces.js @@ -0,0 +1,130 @@ +#!/usr/bin/env node +/** + * Mine conductor outcomes + traces for gold set candidates. + * + * Reads outcomes.jsonl and traces.db, generates per-phase gold set + * candidates in gold/*.jsonl for manual curation. 
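+ * (Note: only outcomes.jsonl is actually read below; traces.db is not yet
+ * consulted by this script.)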
 + *
+ * Usage: node scripts/gepa/gold/mine-traces.js
+ */
+
+import fs from 'fs';
+import path from 'path';
+import { homedir } from 'os';
+import { fileURLToPath } from 'url';
+
+const CONDUCTOR_DIR = path.join(homedir(), '.stackmemory', 'conductor');
+const OUTCOMES_PATH = path.join(CONDUCTOR_DIR, 'outcomes.jsonl');
+const GOLD_DIR = path.dirname(fileURLToPath(import.meta.url)); // not URL.pathname: breaks on Windows drive letters and percent-encoded paths
+
+// Map agent phases to prompt phases
+const PHASE_MAP = {
+  reading: 'understand',
+  planning: 'understand',
+  implementing: 'implement',
+  testing: 'validate',
+  linting: 'validate',
+  building: 'validate',
+  committing: 'deliver',
+};
+
+function loadOutcomes() {
+  if (!fs.existsSync(OUTCOMES_PATH)) {
+    console.error('No outcomes.jsonl found');
+    process.exit(1);
+  }
+  return fs
+    .readFileSync(OUTCOMES_PATH, 'utf-8')
+    .split('\n')
+    .filter(Boolean)
+    .map((l) => JSON.parse(l));
+}
+
+function generateGoldSets(outcomes) {
+  const byPhase = { understand: [], implement: [], validate: [], deliver: [] };
+
+  for (const o of outcomes) {
+    const phase = PHASE_MAP[o.phase] || 'implement';
+
+    const entry = {
+      issue: o.issue,
+      attempt: o.attempt,
+      outcome: o.outcome,
+      phase: o.phase,
+      toolCalls: o.toolCalls,
+      filesModified: o.filesModified,
+      durationMs: o.durationMs,
+      hasCommits: o.hasCommits,
+      errorTail: o.errorTail || null,
+    };
+
+    // For understand phase: complexity assessment
+    if (phase === 'understand') {
+      entry.expected = {
+        complexity:
+          o.toolCalls > 80
+            ? 'careful'
+            : o.toolCalls > 40
+              ? 'standard'
+              : 'simple',
+        success: o.outcome === 'success',
+      };
+    }
+
+    // For implement phase: scope adherence
+    if (phase === 'implement') {
+      entry.expected = {
+        filesModified: o.filesModified,
+        scopeKept: o.outcome === 'success' && o.filesModified <= 15,
+      };
+    }
+
+    // For validate phase: pass/fail + retry strategy
+    if (phase === 'validate') {
+      let retryStrategy = 'none';
+      if (o.outcome === 'failure' && o.errorTail) {
+        if (/lint|eslint/i.test(o.errorTail)) retryStrategy = 'fix_lint';
+        else if (/test|vitest|FAIL/i.test(o.errorTail))
+          retryStrategy = 'fix_test';
+        else if (/build|tsc|type/i.test(o.errorTail))
+          retryStrategy = 'fix_build';
+        else retryStrategy = 'investigate';
+      }
+      entry.expected = {
+        passed: o.outcome === 'success',
+        retryStrategy,
+      };
+    }
+
+    // For deliver phase: commit quality
+    if (phase === 'deliver') {
+      entry.expected = {
+        hasCommits: o.hasCommits,
+        success: o.outcome === 'success',
+      };
+    }
+
+    byPhase[phase].push(entry);
+  }
+
+  return byPhase;
+}
+
+// Main
+const outcomes = loadOutcomes();
+const goldSets = generateGoldSets(outcomes);
+
+let totalWritten = 0;
+for (const [phase, entries] of Object.entries(goldSets)) {
+  const outPath = path.join(GOLD_DIR, `${phase}.jsonl`);
+  const content = entries.map((e) => JSON.stringify(e)).join('\n');
+  fs.writeFileSync(outPath, content + '\n');
+  console.log(`${phase}: ${entries.length} entries → ${outPath}`);
+  totalWritten += entries.length;
+}
+
+console.log(
+  `\nTotal: ${totalWritten} gold set candidates from ${outcomes.length} outcomes`
+);
+console.log(
+  'Review and curate — remove low-quality entries, add expected outputs'
+);
diff --git a/scripts/gepa/gold/understand.jsonl b/scripts/gepa/gold/understand.jsonl
new file mode 100644
index 00000000..ee87c080
--- /dev/null
+++ b/scripts/gepa/gold/understand.jsonl
@@ -0,0 +1,56 @@
+{"issue":"STA-575","attempt":1,"outcome":"failure","phase":"reading","toolCalls":22,"filesModified":1,"durationMs":112400,"hasCommits":false,"errorTail":"API rate limit exceeded (HTTP 429). Retry-After: 60s. 
Backing off globally.","expected":{"complexity":"simple","success":false}} +{"issue":"STA-480","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2238,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-480","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":5856,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-479","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2338,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-480","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3648,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-479","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":5214,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-576","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":985,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-577","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1849,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-576","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3355,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-577","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4124,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-578","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1073,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-578","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3351,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-580","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2026,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-579","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1361,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-580","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4308,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-579","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3633,"hasCommits":false,"errorTail":"Claude exited with code 1: 
","expected":{"complexity":"simple","success":false}} +{"issue":"STA-581","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1190,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-581","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3482,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-583","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2029,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-582","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1657,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-583","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4316,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-582","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3919,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-585","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1749,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-586","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1154,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-585","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4071,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-586","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3482,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-584","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1028,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-584","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3273,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-588","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1793,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-590","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2537,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-590","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4845,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} 
+{"issue":"STA-588","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4111,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-591","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1213,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-587","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1985,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-591","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3607,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-587","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4398,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-589","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1362,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-589","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3725,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-597","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1971,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-594","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1406,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-597","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4224,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-594","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3658,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-593","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1728,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-596","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1234,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-593","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4008,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-596","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3447,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} 
+{"issue":"STA-595","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1094,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-595","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":3362,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-567","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2324,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-484","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":1835,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-567","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4824,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-484","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4267,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-483","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2918,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-482","attempt":1,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":2326,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-483","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":5344,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} +{"issue":"STA-482","attempt":2,"outcome":"failure","phase":"reading","toolCalls":0,"filesModified":0,"durationMs":4678,"hasCommits":false,"errorTail":"Claude exited with code 1: ","expected":{"complexity":"simple","success":false}} diff --git a/scripts/gepa/gold/validate.jsonl b/scripts/gepa/gold/validate.jsonl new file mode 100644 index 00000000..8862f808 --- /dev/null +++ b/scripts/gepa/gold/validate.jsonl @@ -0,0 +1,4 @@ +{"issue":"STA-570","attempt":1,"outcome":"failure","phase":"testing","toolCalls":45,"filesModified":4,"durationMs":187200,"hasCommits":false,"errorTail":"ESLint: 3 errors (no-unused-vars, @typescript-eslint/no-explicit-any). 
Fix lint errors before committing.","expected":{"passed":false,"retryStrategy":"fix_lint"}} +{"issue":"STA-571","attempt":1,"outcome":"failure","phase":"testing","toolCalls":62,"filesModified":6,"durationMs":234500,"hasCommits":false,"errorTail":"eslint found 5 problems (2 errors, 3 warnings) in src/services/sync.ts","expected":{"passed":false,"retryStrategy":"fix_lint"}} +{"issue":"STA-572","attempt":1,"outcome":"failure","phase":"testing","toolCalls":78,"filesModified":7,"durationMs":298000,"hasCommits":false,"errorTail":"FAIL src/core/database/__tests__/sqlite-adapter.test.ts > search > should return ranked results\nAssertionError: expected 0 to be greater than 2","expected":{"passed":false,"retryStrategy":"fix_test"}} +{"issue":"STA-573","attempt":1,"outcome":"failure","phase":"testing","toolCalls":34,"filesModified":3,"durationMs":156300,"hasCommits":false,"errorTail":"vitest run failed: 2 tests failed in src/core/context/__tests__/frame-manager.test.ts - TypeError: Cannot read properties of undefined (reading \"id\")","expected":{"passed":false,"retryStrategy":"fix_test"}} diff --git a/scripts/gepa/hooks/gepa-session-hook.js b/scripts/gepa/hooks/gepa-session-hook.js index 737621e7..15adc602 100644 --- a/scripts/gepa/hooks/gepa-session-hook.js +++ b/scripts/gepa/hooks/gepa-session-hook.js @@ -105,7 +105,7 @@ function triggerOptimization(hookState) { const optimizePath = path.join(GEPA_DIR, 'optimize.js'); const reflectPath = path.join(GEPA_DIR, 'hooks', 'reflect.js'); - // Run reflect → optimize as a background pipeline + // Run reflect → phase-targeted optimize as a background pipeline const script = ` // Reflect first (generates insights for mutation context) try { @@ -113,16 +113,18 @@ function triggerOptimization(hookState) { await generateReflection(); } catch {} - // Then optimize (1 generation, quick) + // Then optimize — use --auto-phase to target worst phase from outcomes const { execSync } = await import('child_process'); try { - execSync('node ${optimizePath} mutate', { stdio: 'pipe', timeout: 300000 }); + execSync('node ${optimizePath} mutate --auto-phase', { stdio: 'pipe', timeout: 300000 }); execSync('node ${optimizePath} score', { stdio: 'pipe', timeout: 300000 }); // Read result and notify const fs = await import('fs'); const state = JSON.parse(fs.readFileSync('${STATE_PATH}', 'utf8')); - const msg = \`[GEPA] Auto-optimized: gen \${state.currentGeneration}, best=\${state.bestVariant} (\${(state.bestScore * 100).toFixed(1)}%). Run 'node ${optimizePath} apply' to apply.\`; + const lastAction = state.history?.[state.history.length - 1]; + const phaseInfo = lastAction?.phase ? \` (phase: \${lastAction.phase})\` : ''; + const msg = \`[GEPA] Auto-optimized\${phaseInfo}: gen \${state.currentGeneration}, best=\${state.bestVariant} (\${(state.bestScore * 100).toFixed(1)}%). Run 'node ${optimizePath} apply' to apply.\`; process.stderr.write(msg + '\\n'); } catch (e) { process.stderr.write('[GEPA] Auto-optimize failed: ' + e.message + '\\n'); diff --git a/scripts/gepa/optimize.js b/scripts/gepa/optimize.js index 40f86403..edc5ab76 100755 --- a/scripts/gepa/optimize.js +++ b/scripts/gepa/optimize.js @@ -91,6 +91,195 @@ const GENERATIONS_DIR = path.join(GEPA_DIR, 'generations'); const RESULTS_DIR = path.join(GEPA_DIR, 'results'); const EVALS_DIR = path.join(GEPA_DIR, 'evals'); +// --phase scopes optimization to a single conductor phase file +const phaseIdx = process.argv.indexOf('--phase'); +const phaseName = phaseIdx !== -1 ? 
process.argv[phaseIdx + 1] : null;
+if (phaseIdx !== -1) process.argv.splice(phaseIdx, 2);
+
+const CONDUCTOR_PROMPTS_DIR = path.join(
+  process.env.HOME || '',
+  '.stackmemory',
+  'conductor',
+  'prompts'
+);
+
+// Eval response cache — deterministic baselines via record/replay
+const EVAL_CACHE_DIR = path.join(GEPA_DIR, 'cache');
+if (!fs.existsSync(EVAL_CACHE_DIR))
+  fs.mkdirSync(EVAL_CACHE_DIR, { recursive: true });
+
+import { createHash } from 'crypto';
+
+function evalCacheKey(taskId, variantContent) {
+  // Hash the full content: a truncated prefix would collide for variants
+  // that share their opening sections, which mutated prompts often do.
+  return createHash('sha256')
+    .update(`${taskId}:${variantContent}`)
+    .digest('hex')
+    .slice(0, 16);
+}
+
+function getCachedEvalResult(taskId, variantContent) {
+  if (process.argv.includes('--no-cache')) return null;
+  const key = evalCacheKey(taskId, variantContent);
+  const cachePath = path.join(EVAL_CACHE_DIR, `${key}.json`);
+  if (fs.existsSync(cachePath)) {
+    try {
+      return JSON.parse(fs.readFileSync(cachePath, 'utf8'));
+    } catch {
+      return null;
+    }
+  }
+  return null;
+}
+
+function setCachedEvalResult(taskId, variantContent, result) {
+  const key = evalCacheKey(taskId, variantContent);
+  const cachePath = path.join(EVAL_CACHE_DIR, `${key}.json`);
+  fs.writeFileSync(cachePath, JSON.stringify(result));
+}
+
+/**
+ * Skill-aware optimization: read usage data from skill-audit.jsonl
+ * and build context for skill-scoped mutations.
+ */
+function getSkillAuditContext(skillName) {
+  const auditPath = path.join(
+    process.env.HOME || '',
+    '.stackmemory',
+    'skill-audit.jsonl'
+  );
+  if (!fs.existsSync(auditPath)) return '';
+
+  try {
+    const lines = fs
+      .readFileSync(auditPath, 'utf8')
+      .split('\n')
+      .filter(Boolean);
+    const entries = lines.map((l) => JSON.parse(l));
+
+    // Filter to this skill
+    const skillEntries = entries.filter((e) => e.skill === skillName);
+    if (skillEntries.length === 0) return '';
+
+    const total = skillEntries.length;
+    const errors = skillEntries.filter((e) => e.error).length;
+    const errorRate = ((errors / total) * 100).toFixed(1);
+
+    // Common args patterns
+    const argCounts = {};
+    for (const e of skillEntries) {
+      const arg = e.args || '(none)';
+      argCounts[arg] = (argCounts[arg] || 0) + 1;
+    }
+    const topArgs = Object.entries(argCounts)
+      .sort((a, b) => b[1] - a[1])
+      .slice(0, 5)
+      .map(([arg, count]) => `  - "${arg}": ${count}x`)
+      .join('\n');
+
+    // Recent errors
+    const recentErrors = skillEntries
+      .filter((e) => e.error)
+      .slice(-5)
+      .map((e) => `  - ${e.ts}: args="${e.args}"`)
+      .join('\n');
+
+    let ctx = `\n## Skill usage data for "${skillName}" (${total} invocations, ${errorRate}% error rate):\n`;
+    ctx += `\nMost common args:\n${topArgs}\n`;
+    if (recentErrors) {
+      ctx += `\nRecent errors:\n${recentErrors}\n`;
+    }
+
+    return ctx;
+  } catch {
+    return '';
+  }
+}
+
+/**
+ * Phase-aware optimization: read failure data from outcomes.jsonl
+ * and build context for phase-scoped mutations.
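+ *
+ * Example of the returned block (values illustrative, shaped like real
+ * outcomes.jsonl entries):
+ *   ## Recent failures in "validate" phase (4 of last 100 runs):
+ *   - STA-570 (attempt 1): ESLint: 3 errors (no-unused-vars, ...)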
+ */ +function getPhaseFailureContext(phase) { + const outcomesPath = path.join( + process.env.HOME || '', + '.stackmemory', + 'conductor', + 'outcomes.jsonl' + ); + if (!fs.existsSync(outcomesPath)) return ''; + + try { + const lines = fs + .readFileSync(outcomesPath, 'utf8') + .split('\n') + .filter(Boolean); + const recent = lines.slice(-100).map((l) => JSON.parse(l)); + const phaseFailures = recent.filter( + (o) => o.outcome === 'failure' && o.phase === phase + ); + + if (phaseFailures.length === 0) return ''; + + const examples = phaseFailures.slice(-10).map((f) => { + const err = f.errorTail || 'unknown error'; + return `- ${f.issue} (attempt ${f.attempt}): ${err.slice(0, 200)}`; + }); + + return `\n## Recent failures in "${phase}" phase (${phaseFailures.length} of last ${recent.length} runs):\n${examples.join('\n')}\n`; + } catch { + return ''; + } +} + +/** + * Auto-detect worst phase from outcomes for targeted optimization + */ +function detectWorstPhase() { + const outcomesPath = path.join( + process.env.HOME || '', + '.stackmemory', + 'conductor', + 'outcomes.jsonl' + ); + if (!fs.existsSync(outcomesPath)) return null; + + try { + const lines = fs + .readFileSync(outcomesPath, 'utf8') + .split('\n') + .filter(Boolean); + const recent = lines.slice(-50).map((l) => JSON.parse(l)); + const failures = recent.filter((o) => o.outcome === 'failure'); + if (failures.length === 0) return null; + + // Group by phase, find worst + const byPhase = {}; + for (const f of failures) { + const p = mapAgentPhaseToPromptPhase(f.phase); + byPhase[p] = (byPhase[p] || 0) + 1; + } + + const sorted = Object.entries(byPhase).sort((a, b) => b[1] - a[1]); + return sorted[0]?.[0] || null; + } catch { + return null; + } +} + +/** Map conductor AgentPhase names to prompt phase file names */ +function mapAgentPhaseToPromptPhase(agentPhase) { + const map = { + reading: 'understand', + planning: 'understand', + implementing: 'implement', + testing: 'validate', + linting: 'validate', + building: 'validate', + committing: 'deliver', + }; + return map[agentPhase] || 'implement'; +} + // Ensure directories [GENERATIONS_DIR, RESULTS_DIR, EVALS_DIR].forEach((dir) => { if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }); @@ -215,6 +404,145 @@ async function mutate() { console.log( `\nGenerated ${variants.length} variants in gen-${String(nextGen).padStart(3, '0')}/` ); + // Generate crossover children from previous generation's top variants + const crossoverCount = config.evolution.crossoverCount || 0; + if (crossoverCount > 0 && state.history.length > 0) { + const lastSelect = [...state.history] + .reverse() + .find((h) => h.action === 'select' && h.scores?.length >= 2); + if (lastSelect) { + const topTwo = lastSelect.scores.slice(0, 2); + const parentAPath = getGenPath( + state.currentGeneration, + topTwo[0].variant + ); + const parentBPath = getGenPath( + state.currentGeneration, + topTwo[1].variant + ); + if (fs.existsSync(parentAPath) && fs.existsSync(parentBPath)) { + const parentA = fs.readFileSync(parentAPath, 'utf8'); + const parentB = fs.readFileSync(parentBPath, 'utf8'); + for (let c = 0; c < crossoverCount; c++) { + const child = crossover(parentA, parentB); + const childName = `crossover-${String.fromCharCode(97 + c)}`; + fs.writeFileSync(getGenPath(nextGen, childName), child); + variants.push({ + name: childName, + strategy: 'crossover', + path: getGenPath(nextGen, childName), + }); + console.log(` Created ${childName} using strategy: crossover`); + } + } + } + } + + return variants; +} + +/** 
+ * Crossover: combine sections from two parent variants. + * For each markdown section, randomly pick from parent A or B. + */ +function crossover(parentA, parentB) { + const sectionsA = parseSections(parentA.split('\n')); + const sectionsB = parseSections(parentB.split('\n')); + const allKeys = [ + ...new Set([...Object.keys(sectionsA), ...Object.keys(sectionsB)]), + ]; + + const result = []; + for (const key of allKeys) { + const hasA = key in sectionsA && sectionsA[key].trim(); + const hasB = key in sectionsB && sectionsB[key].trim(); + // Randomly pick source, preferring the one that has content + let content; + if (hasA && hasB) { + content = Math.random() < 0.5 ? sectionsA[key] : sectionsB[key]; + } else { + content = hasA ? sectionsA[key] : sectionsB[key]; + } + if (key !== '__preamble__') { + // Reconstruct heading — find depth from original + const depthA = parentA.match( + new RegExp( + `^(#{1,4})\\s+${key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}`, + 'm' + ) + ); + const prefix = depthA ? depthA[1] : '##'; + result.push(`${prefix} ${key}`); + } + if (content) result.push(content); + result.push(''); + } + return result.join('\n').trim(); +} + +/** + * Phase-scoped mutation: optimize a single conductor phase file + * using failure data from outcomes.jsonl. + */ +async function mutatePhase(phase) { + const phasePath = path.join(CONDUCTOR_PROMPTS_DIR, `${phase}.md`); + if (!fs.existsSync(phasePath)) { + console.error(`[GEPA] Phase file not found: ${phasePath}`); + return; + } + + const current = fs.readFileSync(phasePath, 'utf8'); + const failureContext = getPhaseFailureContext(phase); + const state = getState(); + const nextGen = state.currentGeneration + 1; + + console.log(`[GEPA] Phase-scoped optimization: ${phase}`); + if (failureContext) { + console.log(`[GEPA] Including failure context from outcomes.jsonl`); + } + + const genDir = getGenPath(nextGen); + if (!fs.existsSync(genDir)) fs.mkdirSync(genDir, { recursive: true }); + + // Generate 2 variants (smaller population for phase-level) + const mutations = config.evolution.mutationStrategies; + const variants = []; + + for (let i = 0; i < 2; i++) { + const strategy = + mutations[(state.currentGeneration + i) % mutations.length]; + const variantName = `phase-${phase}-${String.fromCharCode(97 + i)}`; + + console.log(` Creating ${variantName} using strategy: ${strategy}`); + + // Inject phase-specific context into mutation prompt + const phaseAugmented = `${current}\n${failureContext}`; + const mutatedContent = await generateMutation( + phaseAugmented, + strategy, + state + ); + + const variantPath = path.join(genDir, `${variantName}.md`); + fs.writeFileSync(variantPath, mutatedContent); + variants.push({ name: variantName, strategy, path: variantPath, phase }); + } + + // Save baseline + fs.writeFileSync(path.join(genDir, `phase-${phase}-baseline.md`), current); + + state.history.push({ + generation: nextGen, + action: 'mutate-phase', + phase, + variants: variants.map((v) => v.name), + timestamp: new Date().toISOString(), + }); + saveState(state); + + console.log( + `\n[GEPA] Generated ${variants.length} phase variants for ${phase}` + ); return variants; } @@ -375,7 +703,17 @@ async function generateMutation(content, strategy, state) { return generateMutation(content, 'rephrase', state); } - const prompt = `You are an expert prompt engineer optimizing a CLAUDE.md system prompt for an AI coding agent (Claude Opus 4.6). 
+ // Detect if optimizing a skill .md file + const isSkillTarget = targetName && targetName.startsWith('skill:'); + const skillAuditCtx = isSkillTarget + ? getSkillAuditContext(targetName.replace('skill:', '')) + : ''; + + const targetDescription = isSkillTarget + ? 'a Claude Code slash command (skill) .md file that instructs an AI coding agent what to do when the user invokes the command' + : 'a CLAUDE.md system prompt for an AI coding agent (Claude Opus 4.6)'; + + const prompt = `You are an expert prompt engineer optimizing ${targetDescription}. ${content} @@ -402,6 +740,7 @@ ${getRecentFeedback(state)} REFLECTION INSIGHTS (from failure pattern analysis): ${getReflectionInsights()} +${skillAuditCtx} @@ -464,33 +803,72 @@ Output ONLY the final prompt content — no commentary, no review notes, no fenc } /** - * Get recent evaluation feedback for context + * Get recent evaluation feedback for context (session scores + ASI judge feedback) */ function getRecentFeedback(state) { + const parts = []; + + // Session scores const scoresPath = path.join(RESULTS_DIR, 'scores.jsonl'); - if (!fs.existsSync(scoresPath)) return 'No previous evaluations.'; - - const lines = fs - .readFileSync(scoresPath, 'utf8') - .trim() - .split('\n') - .slice(-20); - const scores = lines.map((l) => JSON.parse(l)); - - const summary = scores.reduce((acc, s) => { - if (!acc[s.variant]) acc[s.variant] = { total: 0, count: 0, errors: 0 }; - acc[s.variant].total += s.metrics?.successfulToolCalls || 0; - acc[s.variant].count++; - acc[s.variant].errors += s.metrics?.errorCount || 0; - return acc; - }, {}); - - return Object.entries(summary) - .map( - ([v, s]) => - `${v}: ${s.count} sessions, ${s.errors} errors, avg success: ${(s.total / s.count).toFixed(1)}` - ) - .join('\n'); + if (fs.existsSync(scoresPath)) { + const lines = fs + .readFileSync(scoresPath, 'utf8') + .trim() + .split('\n') + .slice(-20); + const scores = lines.map((l) => JSON.parse(l)); + + const summary = scores.reduce((acc, s) => { + if (!acc[s.variant]) acc[s.variant] = { total: 0, count: 0, errors: 0 }; + acc[s.variant].total += s.metrics?.successfulToolCalls || 0; + acc[s.variant].count++; + acc[s.variant].errors += s.metrics?.errorCount || 0; + return acc; + }, {}); + + parts.push( + Object.entries(summary) + .map( + ([v, s]) => + `${v}: ${s.count} sessions, ${s.errors} errors, avg success: ${(s.total / s.count).toFixed(1)}` + ) + .join('\n') + ); + } + + // ASI feedback from most recent generation's judge + const feedbackFiles = fs.existsSync(RESULTS_DIR) + ? fs + .readdirSync(RESULTS_DIR) + .filter((f) => f.startsWith('feedback-') && f.endsWith('.json')) + .sort() + .reverse() + : []; + + if (feedbackFiles.length > 0) { + try { + const feedback = JSON.parse( + fs.readFileSync(path.join(RESULTS_DIR, feedbackFiles[0]), 'utf8') + ); + const feedbackLines = []; + for (const [criterion, entries] of Object.entries(feedback)) { + // Deduplicate feedback messages + const unique = [...new Set(entries.map((e) => e.feedback))].slice(0, 2); + for (const msg of unique) { + feedbackLines.push(`- ${criterion}: ${msg}`); + } + } + if (feedbackLines.length > 0) { + parts.push( + `\nJUDGE FEEDBACK (areas to improve):\n${feedbackLines.slice(0, 10).join('\n')}` + ); + } + } catch { + // ignore malformed feedback files + } + } + + return parts.length > 0 ? parts.join('\n') : 'No previous evaluations.'; } /** @@ -638,7 +1016,7 @@ async function runEval(variantName) { const evalFiles = config.evals.files ? 
     ? config.evals.files.filter((f) => fs.existsSync(path.join(EVALS_DIR, f)))
     : fs.readdirSync(EVALS_DIR).filter((f) => f.endsWith('.jsonl'));

-  const tasks = evalFiles.flatMap((f) =>
+  let tasks = evalFiles.flatMap((f) =>
     fs
       .readFileSync(path.join(EVALS_DIR, f), 'utf8')
       .trim()
@@ -646,6 +1024,11 @@ async function runEval(variantName) {
       .map((l) => JSON.parse(l))
   );

+  // Respect held-out partition: only use "train" tasks during optimization
+  if (config.evals.heldOutPartition) {
+    tasks = tasks.filter((t) => !t.partition || t.partition === 'train');
+  }
+
   console.log(`  Found ${tasks.length} eval tasks`);

   // Set environment for tracking
@@ -685,6 +1068,15 @@
  */
 async function runSingleEval(task, variantPath) {
   const startTime = Date.now();
+  const variantContent = fs.readFileSync(variantPath, 'utf8');
+
+  // Check eval response cache (deterministic baseline replay)
+  const cached = getCachedEvalResult(task.id, variantContent);
+  if (cached) {
+    console.log(`  [cached]`);
+    return cached;
+  }
+
   let tempDir;
   try {
@@ -712,7 +1104,7 @@ async function runSingleEval(task, variantPath) {
     // Evaluate result against expected outcomes (LLM judge with regex fallback)
     const evaluation = await evaluateExpectations(result, task.expected, task);

-    return {
+    const evalResult = {
       passed: evaluation.passed,
       passRate: evaluation.passRate,
       criteria: evaluation.criteria,
       duration: Date.now() - startTime,
       output: result.slice(0, 2000),
     };
+
+    // Cache for deterministic replay on re-runs
+    setCachedEvalResult(task.id, variantContent, evalResult);
+
+    return evalResult;
   } catch (error) {
     return {
       passed: false,
@@ -761,7 +1158,7 @@ async function llmJudge(output, expected, task) {
     )
     .join('\n');

-  const judgePrompt = `You are a strict code evaluation judge. Evaluate whether the AI output satisfies each criterion.
+  const judgePrompt = `You are a strict code evaluation judge. Evaluate each criterion independently using chain-of-thought reasoning.

 ${task.prompt}

@@ -776,7 +1173,11 @@
 ${criteriaList}

-Before judging each criterion, quote the specific line(s) from the AI output that satisfy or fail it. If you cannot find a relevant quote, the criterion fails.
+For EACH criterion independently:
+1. Quote the specific line(s) from the AI output relevant to this criterion
+2. Reason about whether the criterion is satisfied (think step by step)
+3. Make a binary pass/fail decision
+4. If it fails, write actionable feedback explaining what the output should have done differently

 Strictness guide:
 - "has_function" — a real, working function definition exists (not just mentioned in prose)
@@ -785,13 +1186,16 @@ Strictness guide:
 - "explains_fix" — a clear explanation of what was wrong and why the fix works
 - "no_overengineering" — solution is minimal; no unnecessary abstractions, extra files, or defensive code for impossible scenarios
 - "no_hallucination" — all claims about code are grounded in actual output; no references to files/functions that don't exist
+- "shows_branch" — output mentions the current git branch name
+- "suggests_next_action" — output recommends a concrete next step
+- "concise_output" — output is focused and not unnecessarily verbose

 Respond with ONLY this JSON (no markdown fences):
 {
   "criteria": {
-    "criterion_name": {"passed": true, "quote": "relevant line from output", "reason": "brief explanation"},
-    "criterion_name": {"passed": false, "quote": "", "reason": "brief explanation"}
+    "criterion_name": {"passed": true, "quote": "relevant line from output", "reason": "brief CoT reasoning", "feedback": ""},
+    "criterion_name": {"passed": false, "quote": "", "reason": "brief CoT reasoning", "feedback": "Actionable suggestion for improvement"}
   }
 }`;

@@ -816,12 +1220,17 @@
 }

 /**
- * Call judge model via Anthropic API (fast, cheap model for evaluation)
+ * Validate API key at startup — fail fast before burning mutation budget.
  */
-async function callJudge(prompt, model) {
+let _apiKeyValidated = null; // null = untested, true/false = result
+async function validateApiKey() {
+  if (_apiKeyValidated !== null) return _apiKeyValidated;
   const apiKey = process.env.ANTHROPIC_API_KEY;
-
-  if (apiKey) {
+  if (!apiKey) {
+    _apiKeyValidated = false;
+    return false;
+  }
+  try {
     const response = await fetch('https://api.anthropic.com/v1/messages', {
       method: 'POST',
       headers: {
@@ -830,22 +1239,78 @@
         'anthropic-version': '2023-06-01',
       },
       body: JSON.stringify({
-        model,
-        max_tokens: 2000,
-        messages: [{ role: 'user', content: prompt }],
+        model: 'claude-haiku-4-5-20251001',
+        max_tokens: 10,
+        messages: [{ role: 'user', content: 'ping' }],
       }),
     });
-
+    _apiKeyValidated = response.ok;
     if (!response.ok) {
-      throw new Error(`Judge API error: ${response.status}`);
+      const body = await response.text().catch(() => '');
+      console.warn(
+        `  API key validation failed (${response.status}): ${body.slice(0, 200)}`
+      );
     }
+    return _apiKeyValidated;
+  } catch (e) {
+    console.warn(`  API key validation error: ${e.message}`);
+    _apiKeyValidated = false;
+    return false;
+  }
+}

-    const data = await response.json();
-    return data.content[0].text;
+/**
+ * Call judge model via Anthropic API (fast, cheap model for evaluation)
+ */
+async function callJudge(prompt, model) {
+  const apiKey = process.env.ANTHROPIC_API_KEY;
+
+  // Try API first (skip if key already known invalid)
+  if (apiKey && _apiKeyValidated !== false) {
+    try {
+      const response = await fetch('https://api.anthropic.com/v1/messages', {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+          'x-api-key': apiKey,
+          'anthropic-version': '2023-06-01',
+        },
+        body: JSON.stringify({
+          model,
+          max_tokens: config.judge?.maxOutputTokens || 2000,
+          messages: [{ role: 'user', content: prompt }],
+        }),
+      });
+
+      if (response.ok) {
+        const data = await response.json();
+        return data.content[0].text;
+      }
+      // Log failure reason for debugging
+      const errBody = await response.text().catch(() => '');
+      console.warn(
+        `  Judge API ${response.status}: ${errBody.slice(0, 150)}`
+      );
+      // Only an auth failure proves the key is bad — transient errors
+      // (429/5xx) should not disable the API path for the rest of the run
+      if (response.status === 401 || response.status === 403) {
+        _apiKeyValidated = false; // Don't retry bad key
+      }
+    } catch (e) {
+      console.warn(`  Judge API error: ${e.message}`);
+    }
   }

-  // Fallback to CLI
-  return await spawnClaude(prompt, { timeoutMs: 30000 });
+  // Fallback to CLI with config-driven timeout
+  const timeoutMs = config.judge?.timeoutMs || 120000;
+  return await spawnClaude(prompt, { timeoutMs });
+}
+
+/**
+ * Extract code blocks from output (focus regex matching on actual code, not prose/errors)
+ */
+function extractCodeBlocks(output) {
+  const blocks = [];
+  const re = /```[\w]*\n([\s\S]*?)```/g;
+  let m;
+  while ((m = re.exec(output)) !== null) blocks.push(m[1]);
+  return blocks.length > 0 ? blocks.join('\n') : output;
+}

 /**
@@ -853,26 +1318,33 @@ async function callJudge(prompt, model) {
  */
 function regexJudge(output, expected) {
   const criteria = {};
+  const code = extractCodeBlocks(output);

-  for (const [key] of Object.entries(expected)) {
+  for (const [key, value] of Object.entries(expected)) {
     let passed = false;
+
+    // Support custom regex from eval task definition
+    if (typeof value === 'object' && value !== null && value.regex) {
+      passed = new RegExp(value.regex).test(code);
+      criteria[key] = { passed, reason: 'custom regex' };
+      continue;
+    }
+
     switch (key) {
       case 'has_function':
         passed =
           /function\s+\w+|const\s+\w+\s*=\s*(\([^)]*\)|async)?\s*(=>|\{)/.test(
-            output
+            code
           );
         break;
       case 'handles_edge_cases':
-        passed = /if\s*\(|edge|empty|null|undefined|\.length/.test(output);
+        passed = /if\s*\(|edge|empty|null|undefined|\.length/.test(code);
         break;
       case 'uses_async':
-        passed = /async|await|Promise/.test(output);
+        passed = /async|await|Promise/.test(code);
         break;
       case 'no_nested_callbacks':
-        passed = !/callback\s*\(\s*function|\.then\s*\([^)]*\.then/.test(
-          output
-        );
+        passed = !/callback\s*\(\s*function|\.then\s*\([^)]*\.then/.test(code);
         break;
       case 'bug_fixed':
         passed = /fix|correct|change|update/i.test(output);
@@ -883,21 +1355,63 @@ function regexJudge(output, expected) {
           /because|since|the issue|the problem/i.test(output);
         break;
       case 'no_overengineering':
-        // Heuristic: fail if output creates multiple new files or adds abstract factory patterns
         passed = !(
-          /class\s+\w+Factory|abstract\s+class|createFactory/i.test(output) ||
-          (output.match(/\/\/ .*\.(?:ts|js|py)\b/g) || []).length > 3
+          /class\s+\w+Factory|abstract\s+class|createFactory/i.test(code) ||
+          (code.match(/\/\/ .*\.(?:ts|js|py)\b/g) || []).length > 3
         );
         break;
       case 'no_hallucination':
-        // Heuristic: pass if output doesn't reference non-standard fictional APIs
         passed =
           !/(?:import|require)\s*\(?\s*['"](?!\.|\/).*(?:magic|autofix|superhelper)/i.test(
-            output
+            code
          );
         break;
+      // Skill-specific criteria
+      case 'shows_branch':
+        passed = /branch|main|master|feature\/|git\s+branch/i.test(output);
+        break;
+      case 'shows_recent_commits':
+        passed = /commit|log|recent|history|git\s+log/i.test(output);
+        break;
+      case 'suggests_next_action':
+        passed = /next|should|recommend|suggest|action|todo/i.test(output);
+        break;
+      case 'concise_output':
+        passed = output.length < 3000;
+        break;
+      case 'is_tested':
+        passed = /test\(|describe\(|it\(|expect\(|vitest|jest/i.test(code);
+        break;
+      case 'preserves_behavior':
+        passed = /backward|compat|existing|maintain|preserve/i.test(output);
+        break;
+      case 'has_pagination':
+        passed = /offset|limit|page|cursor|skip|take/i.test(code);
+        break;
+      case 'identifies_security_issue':
+        passed = /security|vulnerab|inject|xss|csrf|sanitiz/i.test(output);
+        break;
+      case 'actionable_feedback':
+        passed =
+          output.length > 100 && /should|must|need|fix|change/i.test(output);
+        break;
+      case 'captures_handoff':
+        passed = /handoff|capture|snapshot|state|session/i.test(output);
+        break;
+      case 'runs_summary':
+        passed = /summary|review|session|completed|done/i.test(output);
+        break;
+      case 'updates_memory':
+        passed = /memory|learn|update|save|persist/i.test(output);
+        break;
+      case 'checks_uncommitted':
+        passed = /uncommit|dirty|changes|stash|commit/i.test(output);
+        break;
       default:
-        passed = output.toLowerCase().includes(key.toLowerCase());
+        // Substring match on the key name (loose fallback)
+        passed = output
+          .toLowerCase()
+          .includes(key.replace(/_/g, ' ').toLowerCase());
     }
     criteria[key] = { passed, reason: 'regex heuristic' };
   }
@@ -943,11 +1457,17 @@ async function scoreAndSelect() {
     return;
   }

-  const variants = fs
+  let variants = fs
     .readdirSync(genDir)
     .filter((f) => f.endsWith('.md'))
     .map((f) => f.replace('.md', ''));

+  // When targeting a skill, exclude conductor phase variants (and vice versa)
+  const isSkill = targetName && targetName.startsWith('skill:');
+  if (isSkill) {
+    variants = variants.filter((v) => !v.startsWith('phase-'));
+  }
+
   console.log(`Scoring ${variants.length} variants in generation ${gen}...`);

   const scores = [];
@@ -957,8 +1477,13 @@ async function scoreAndSelect() {
     if (result) scores.push(result);
   }

-  // Sort by score
-  scores.sort((a, b) => b.score - a.score);
+  // Sort by score with elitism tiebreaker (prefer baseline/incumbent on ties)
+  scores.sort((a, b) => {
+    if (b.score !== a.score) return b.score - a.score;
+    if (a.variant === 'baseline') return -1;
+    if (b.variant === 'baseline') return 1;
+    return 0;
+  });

   // Show condensed delta for each variant
   const baselinePath = getGenPath(gen, 'baseline');
@@ -1007,6 +1532,29 @@ async function scoreAndSelect() {
     }
   }

+  // Persist ASI feedback from judge (for injection into next generation's mutations)
+  if (config.judge?.feedbackEnabled !== false) {
+    const feedback = {};
+    for (const s of scores) {
+      if (!s.results) continue;
+      for (const r of s.results) {
+        if (!r.criteria) continue;
+        for (const [key, val] of Object.entries(r.criteria)) {
+          if (val.feedback && !val.passed) {
+            if (!feedback[key]) feedback[key] = [];
+            feedback[key].push({
+              variant: s.variant,
+              task: r.taskName,
+              feedback: val.feedback,
+            });
+          }
+        }
+      }
+    }
+    const feedbackPath = path.join(RESULTS_DIR, `feedback-${gen}.json`);
+    fs.writeFileSync(feedbackPath, JSON.stringify(feedback, null, 2));
+  }
+
   // Select best
   const best = scores[0];

@@ -1050,6 +1598,14 @@
 async function run(generations = config.evolution.generations) {
   console.log(`Starting GEPA optimization for ${generations} generations...\n`);

+  // Validate API key upfront — fail fast
+  const apiOk = await validateApiKey();
+  if (!apiOk) {
+    console.warn(
+      'Warning: API key invalid or missing — judge will use CLI fallback (slower)\n'
+    );
+  }
+
   for (let i = 0; i < generations; i++) {
     console.log(`\n${'='.repeat(60)}`);
     console.log(`GENERATION ${i + 1}/${generations}`);
@@ -1341,6 +1897,110 @@ async function runAll(generations = 3) {
   console.log('═'.repeat(60));
 }

+/**
+ * Show skill audit statistics from skill-audit.jsonl
+ */
+function showSkillStats() {
+  const auditPath = path.join(
+    process.env.HOME || '',
+    '.stackmemory',
+    'skill-audit.jsonl'
+  );
+
+  if (!fs.existsSync(auditPath)) {
+    console.log('No skill audit data yet. Use skills to generate data.');
+    return;
+  }
+
+  const lines = fs.readFileSync(auditPath, 'utf8').split('\n').filter(Boolean);
+  const entries = [];
+  for (const l of lines) {
+    try {
+      entries.push(JSON.parse(l));
+    } catch {
+      // skip malformed audit lines rather than crashing the stats view
+    }
+  }
+
+  // Group by skill
+  const bySkill = {};
+  for (const e of entries) {
+    if (!bySkill[e.skill]) bySkill[e.skill] = { total: 0, errors: 0, args: {} };
+    bySkill[e.skill].total++;
+    if (e.error) bySkill[e.skill].errors++;
+    const arg = e.args || '(none)';
+    bySkill[e.skill].args[arg] = (bySkill[e.skill].args[arg] || 0) + 1;
+  }
+
+  console.log(`Skill Audit Stats (${entries.length} total invocations)\n`);
+  console.log(
+    `${'Skill'.padEnd(20)} ${'Count'.padStart(6)} ${'Errors'.padStart(7)} ${'Rate'.padStart(6)}`
+  );
+  console.log('-'.repeat(42));
+
+  const sorted = Object.entries(bySkill).sort(
+    (a, b) => b[1].total - a[1].total
+  );
+  for (const [skill, stats] of sorted) {
+    const rate = ((stats.errors / stats.total) * 100).toFixed(0);
+    console.log(
+      `${skill.padEnd(20)} ${String(stats.total).padStart(6)} ${String(stats.errors).padStart(7)} ${(rate + '%').padStart(6)}`
+    );
+  }
+
+  // Show skill targets available for optimization
+  const skillTargets = (config.targets || []).filter((t) =>
+    t.name.startsWith('skill:')
+  );
+  if (skillTargets.length) {
+    console.log(`\nConfigured skill targets:`);
+    for (const t of skillTargets) {
+      const hasData = bySkill[t.name.replace('skill:', '')];
+      const marker = hasData ? '✓' : '○';
+      console.log(`  ${marker} ${t.name.padEnd(20)} ${t.file}`);
+    }
+  }
+}
+
+/**
+ * Run optimization on all skill targets
+ */
+async function runSkills(generations = 3) {
+  const skillTargets = (config.targets || []).filter((t) =>
+    t.name.startsWith('skill:')
+  );
+
+  if (!skillTargets.length) {
+    console.log('No skill targets configured in config.json.');
+    return;
+  }
+
+  console.log(
+    `Running GEPA on ${skillTargets.length} skill targets (${generations} generations each)\n`
+  );
+
+  for (const target of skillTargets) {
+    const resolved = target.file.startsWith('~')
+      ? path.join(process.env.HOME, target.file.slice(1))
+      : path.resolve(target.file);
+
+    if (!fs.existsSync(resolved)) {
+      console.log(`Skipping ${target.name}: ${resolved} not found\n`);
+      continue;
+    }
+
+    console.log(`\n${'═'.repeat(60)}`);
+    console.log(`SKILL: ${target.name} (${target.file})`);
+    console.log(`${'═'.repeat(60)}\n`);
+
+    // Override config for this target
+    config.target.file = target.file;
+    if (target.evals) config.evals.files = target.evals;
+
+    await init(resolved);
+    await run(generations);
+
+    console.log(`\nCompleted ${target.name}\n`);
+  }
+
+  console.log('\n' + '═'.repeat(60));
+  console.log('ALL SKILL TARGETS COMPLETE');
+  console.log('═'.repeat(60));
+}
+
 // CLI
 const command = process.argv[2];
 const arg1 = process.argv[3];
@@ -1352,7 +2012,19 @@ switch (command) {
     init(arg1);
     break;
   case 'mutate':
-    mutate();
+    if (phaseName || hasFlag('--auto-phase')) {
+      const phase = phaseName || detectWorstPhase();
+      if (phase) {
+        mutatePhase(phase);
+      } else {
+        console.log(
+          '[GEPA] No phase failures detected — skipping phase mutation'
+        );
+        mutate();
+      }
+    } else {
+      mutate();
+    }
     break;
   case 'eval':
     runEval(arg1 || 'baseline');
     break;
@@ -1379,6 +2051,12 @@ switch (command) {
   case 'run-all':
     runAll(parseInt(arg1) || 3);
     break;
+  case 'skill-stats':
+    showSkillStats();
+    break;
+  case 'run-skills':
+    runSkills(parseInt(arg1) || 3);
+    break;
   default:
     console.log(`
 GEPA - Genetic Eval-driven Prompt Algorithm

 Usage:
   node optimize.js targets                     List available targets
   node optimize.js run-all [generations]       Run optimization on ALL targets

+Skill optimization:
+  node optimize.js skill-stats                 Show skill audit statistics
+  node optimize.js run-skills [gens]           Run optimization on all skill targets
+  node optimize.js run --target skill:start    Optimize a specific skill
+
 Options:
   --target   Select target from targets[] config
              Available: ${(config.targets || []).map((t) => t.name).join(', ')}
diff --git a/src/cli/commands/orchestrator.ts b/src/cli/commands/orchestrator.ts
index cf1c1ab3..030d31d5 100644
--- a/src/cli/commands/orchestrator.ts
+++ b/src/cli/commands/orchestrator.ts
@@ -26,6 +26,7 @@ import { createReadStream } from 'fs';
 import { createInterface } from 'readline';
 import { fileURLToPath } from 'url';
 import { Transform, type TransformCallback } from 'stream';
+import { createHash } from 'crypto';
 import { logger } from '../../core/monitoring/logger.js';
 import { isProcessAlive } from '../../utils/process-cleanup.js';
 import {
@@ -199,9 +200,92 @@ export interface AgentOutcomeEntry {
   labels?: string[]; // issue labels for difficulty prediction
   errorTail?: string; // last 5 lines of output.log on failure
   promptHash?: string; // hash of the prompt template used
+  promptVersions?: Record<string, string>; // per-phase content hashes
   prUrl?: string; // GitHub PR URL if auto-created
 }

+/** Phase prompt file names for decomposed template */
+const PROMPT_PHASES = [
+  'system',
+  'understand',
+  'implement',
+  'validate',
+  'deliver',
+] as const;
+type PromptPhase = (typeof PROMPT_PHASES)[number];
+
+/**
+ * Build agent prompt from decomposed phase files if they exist,
+ * otherwise fall back to the monolith prompt-template.md.
+ *
+ * Returns { prompt, versions } where versions maps each phase
+ * to a short content hash for outcome attribution.
+ */
+function buildPromptFromPhases(
+  variables: Record<string, string>
+): { prompt: string; versions: Record<string, string> } | null {
+  const promptsDir = join(homedir(), '.stackmemory', 'conductor', 'prompts');
+
+  // Check if phase files exist
+  const systemPath = join(promptsDir, 'system.md');
+  if (!existsSync(systemPath)) return null;
+
+  const versions: Record<string, string> = {};
+  const parts: string[] = [];
+
+  for (const phase of PROMPT_PHASES) {
+    const phasePath = join(promptsDir, `${phase}.md`);
+    if (!existsSync(phasePath)) continue;
+
+    let content = readFileSync(phasePath, 'utf-8');
+    // Apply variable substitution
+    for (const [key, value] of Object.entries(variables)) {
+      content = content.replace(new RegExp(`\\{\\{${key}\\}\\}`, 'g'), value);
+    }
+
+    parts.push(content);
+
+    // Short hash for outcome attribution (first 8 chars of hex digest)
+    const hash = createHash('sha256')
+      .update(readFileSync(phasePath, 'utf-8'))
+      .digest('hex')
+      .slice(0, 8);
+    versions[phase] = hash;
+  }
+
+  if (parts.length === 0) return null;
+
+  // Load DSPy-optimized examples if available
+  const dspyPath = join(
+    homedir(),
+    '.stackmemory',
+    'dspy',
+    'optimized_state.json'
+  );
+  if (existsSync(dspyPath)) {
+    try {
+      const state = JSON.parse(readFileSync(dspyPath, 'utf-8'));
+      for (const phase of PROMPT_PHASES) {
+        const sig = state[phase];
+        if (sig?.fewShotExamples?.length) {
+          const examples = sig.fewShotExamples
+            .slice(0, 3)
+            .map(
+              (ex: { input: unknown; output: unknown }) =>
+                `\nInput: ${JSON.stringify(ex.input)}\nOutput: ${JSON.stringify(ex.output)}\n`
+            )
+            .join('\n');
+          parts.push(`\n## Optimized Examples (${phase}):\n${examples}`);
+        }
+      }
+    } catch {
+      // Non-fatal — DSPy state is optional
+    }
+  }
+
+  return { prompt: parts.join('\n\n'), versions };
+}
+
 /** Get the conductor failures/outcomes log path */
 export function getOutcomesLogPath(): string {
   return join(homedir(), '.stackmemory', 'conductor', 'outcomes.jsonl');
@@ -416,9 +500,88 @@ export function getRetryStrategy(
     }
   }

+  // Add phase-specific assertion if phase files are active
+  const promptsDir = join(homedir(), '.stackmemory', 'conductor', 'prompts');
+  if (lastFailure?.phase && existsSync(join(promptsDir, 'system.md'))) {
+    const phaseAssertions = getPhaseAssertions(
+      lastFailure.phase,
+      lastFailure.errorTail || ''
+    );
+    adjustments.push(...phaseAssertions);
+  }
+
   return { shouldRetry: true, adjustments };
 }

+/**
+ * Generate phase-specific assertions for retry based on failure phase and error.
+ * These are injected into the retry prompt so the agent focuses on the exact
+ * failure point with targeted guidance.
+ */
+function getPhaseAssertions(phase: AgentPhase, error: string): string[] {
+  const assertions: string[] = [];
+
+  switch (phase) {
+    case 'reading':
+    case 'planning':
+      assertions.push(
+        'ASSERTION: Re-read the issue description completely before planning.',
+        'ASSERTION: List ALL files you plan to modify before starting implementation.'
+      );
+      break;
+
+    case 'implementing':
+      if (/scope|unrelated|refactor/i.test(error)) {
+        assertions.push(
+          'ASSERTION: Only modify files directly required by the issue. Do NOT refactor surrounding code.'
+        );
+      }
+      if (/import|module|ESM/i.test(error)) {
+        assertions.push(
+          'ASSERTION: Every relative import MUST end with .js extension. Check ALL new imports.'
+        );
+      }
+      assertions.push(
+        'ASSERTION: After implementing, review your diff — if any change is not required by the issue, revert it.'
+      );
+      break;
+
+    case 'testing':
+    case 'linting':
+    case 'building':
+      if (/lint|eslint/i.test(error)) {
+        assertions.push(
+          'ASSERTION: Run `npm run lint` IMMEDIATELY. Fix every error. Do NOT proceed until lint passes.',
+          'ASSERTION: Common lint fixes — catch {} not catch (_err) {}, remove unused imports, add .js to relative imports.'
+        );
+      }
+      if (/test|vitest|jest|FAIL/i.test(error)) {
+        assertions.push(
+          'ASSERTION: Read the FULL test error output. Identify which assertion fails and why.',
+          'ASSERTION: If vi.clearAllMocks() is in beforeEach, re-set any mockReturnValue calls after it.'
+        );
+      }
+      if (/build|tsc|type/i.test(error)) {
+        assertions.push(
+          'ASSERTION: Run `npm run build` and fix ALL TypeScript errors before committing.'
+        );
+      }
+      assertions.push(
+        'ASSERTION: Do NOT use --no-verify to bypass pre-commit hooks. Fix the underlying issue.'
+      );
+      break;
+
+    case 'committing':
+      assertions.push(
+        'ASSERTION: Commit message must follow format: type(scope): description',
+        'ASSERTION: If pre-commit hook fails, fix the issue and create a NEW commit — do NOT amend.'
+      );
+      break;
+  }
+
+  return assertions;
+}
+
 // ── Helpers ──

 /** Find the package root by walking up from the current file. */
@@ -1407,6 +1570,7 @@ export class Conductor {
         durationMs: Date.now() - run.startedAt,
         hasCommits: true,
         labels: issue.labels.map((l) => l.name),
+        promptVersions: this.lastPromptVersions,
         prUrl,
       });
       await this.runHook(
@@ -1446,6 +1610,7 @@ export class Conductor {
         durationMs: Date.now() - run.startedAt,
         hasCommits: false,
         labels: issue.labels.map((l) => l.name),
+        promptVersions: this.lastPromptVersions,
         errorTail: run.error?.slice(-500),
       });

@@ -2449,10 +2614,13 @@ export class Conductor {
     }
   }

+  /** Last prompt version hashes — set by buildPrompt, read by outcome logging */
+  private lastPromptVersions: Record<string, string> = {};
+
   /**
-   * Build the agent prompt. If a custom template exists at
-   * ~/.stackmemory/conductor/prompt-template.md, use it with variable
-   * substitution. Otherwise fall back to the default template.
+   * Build the agent prompt. Tries decomposed phase files first
+   * (~/.stackmemory/conductor/prompts/*.md), then typed templates,
+   * then custom prompt-template.md, then default.
    *
    * Template variables: {{ISSUE_ID}}, {{TITLE}}, {{DESCRIPTION}},
    * {{LABELS}}, {{PRIORITY}}, {{ATTEMPT}}, {{PRIOR_CONTEXT}}
@@ -2490,6 +2658,24 @@ export class Conductor {
     }
     const priorContext = contextParts.join('\n');

+    // Try decomposed phase files first
+    const variables: Record<string, string> = {
+      ISSUE_ID: issue.identifier,
+      TITLE: issue.title,
+      DESCRIPTION: issue.description || '',
+      LABELS: labels,
+      PRIORITY: priority,
+      SCOPE: issue.identifier.toLowerCase().replace(/-\d+$/, ''),
+      ATTEMPT: String(attempt),
+      PRIOR_CONTEXT: priorContext,
+    };
+
+    const phaseResult = buildPromptFromPhases(variables);
+    if (phaseResult) {
+      this.lastPromptVersions = phaseResult.versions;
+      return phaseResult.prompt;
+    }
+
     // Select template by issue type (labels or title heuristics)
     const templateDir = join(
       __dirname,
@@ -2852,6 +3038,7 @@ export class Conductor {
       durationMs,
       hasCommits,
       labels: run.issue.labels.map((l) => l.name),
+      promptVersions: this.lastPromptVersions,
       errorTail,
       prUrl,
     });
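Several helpers and data shapes this patch depends on are defined elsewhere in the codebase or not yet written; the sketches below make them concrete. First, `crossover` in optimize.js assumes a `parseSections` helper that maps markdown heading text to section bodies, with a reserved `__preamble__` key for content above the first heading. A minimal version consistent with how `crossover` consumes it — the real implementation may differ:

function parseSections(lines) {
  // Map of heading text -> section body; '__preamble__' holds any
  // content that appears before the first #–#### heading.
  const sections = {};
  let current = '__preamble__';
  let buffer = [];
  for (const line of lines) {
    const heading = line.match(/^#{1,4}\s+(.+)$/);
    if (heading) {
      sections[current] = buffer.join('\n');
      current = heading[1].trim();
      buffer = [];
    } else {
      buffer.push(line);
    }
  }
  sections[current] = buffer.join('\n');
  return sections;
}

Note that duplicate headings would collide under this scheme, which is consistent with `crossover` keying sections by name.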
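`mutatePhase` and the `--auto-phase` branch call `getPhaseFailureContext` and `detectWorstPhase`, neither of which appears in this diff. A plausible sketch, assuming outcomes.jsonl entries carry a `success` flag and a `phase` name alongside the `errorTail` field declared on AgentOutcomeEntry:

import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';

const OUTCOMES = path.join(os.homedir(), '.stackmemory', 'conductor', 'outcomes.jsonl');

// Read outcomes.jsonl, skipping malformed lines.
function readOutcomes() {
  if (!fs.existsSync(OUTCOMES)) return [];
  return fs
    .readFileSync(OUTCOMES, 'utf8')
    .split('\n')
    .filter(Boolean)
    .flatMap((l) => {
      try {
        return [JSON.parse(l)];
      } catch {
        return [];
      }
    });
}

// Phase with the highest failure count, or null if nothing is logged yet.
function detectWorstPhase() {
  const counts = {};
  for (const o of readOutcomes()) {
    if (!o.success && o.phase) counts[o.phase] = (counts[o.phase] || 0) + 1;
  }
  const worst = Object.entries(counts).sort((a, b) => b[1] - a[1])[0];
  return worst ? worst[0] : null;
}

// Condense the last few failures for one phase into mutation-prompt context.
function getPhaseFailureContext(phase) {
  const recent = readOutcomes()
    .filter((o) => !o.success && o.phase === phase)
    .slice(-5);
  if (recent.length === 0) return '';
  const bullets = recent.map((f) => `- ${(f.errorTail || '(no error tail)').slice(0, 200)}`);
  return `\nRECENT ${phase.toUpperCase()} FAILURES:\n${bullets.join('\n')}`;
}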
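Similarly, `generateMutation` injects `getSkillAuditContext(...)` for skill targets, and `skill-stats` reads ~/.stackmemory/skill-audit.jsonl. Assuming each audit line looks like {"skill":"start","args":"","error":false} — the fields `showSkillStats` actually touches — the context builder could be:

import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';

const AUDIT = path.join(os.homedir(), '.stackmemory', 'skill-audit.jsonl');

// Hypothetical helper: condense audit data for one skill into a short
// block the mutation prompt can use (invocation count, error rate,
// most common arguments).
function getSkillAuditContext(skill) {
  if (!fs.existsSync(AUDIT)) return '';
  const entries = fs
    .readFileSync(AUDIT, 'utf8')
    .split('\n')
    .filter(Boolean)
    .flatMap((l) => {
      try {
        return [JSON.parse(l)];
      } catch {
        return [];
      }
    })
    .filter((e) => e.skill === skill);
  if (entries.length === 0) return '';

  const errors = entries.filter((e) => e.error).length;
  const argCounts = {};
  for (const e of entries) {
    const arg = e.args || '(none)';
    argCounts[arg] = (argCounts[arg] || 0) + 1;
  }
  const topArgs = Object.entries(argCounts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 3)
    .map(([arg, n]) => `${arg} (${n}x)`)
    .join(', ');

  return [
    '',
    `SKILL USAGE DATA (${entries.length} invocations, ${errors} errors):`,
    `Most common arguments: ${topArgs}`,
    'Optimize the skill for these real usage patterns.',
  ].join('\n');
}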
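The held-out filter in `runEval` and the custom-regex branch in `regexJudge` together imply an eval task shape like the following. Field names beyond `id`, `prompt`, `expected`, and `partition` are illustrative:

// Hypothetical entries from a .jsonl file under scripts/gepa/evals/
// (one JSON object per line). Tasks marked partition: 'holdout' are
// skipped during optimization when config.evals.heldOutPartition is set.
const exampleTasks = [
  {
    id: 'async-fetch-01',
    prompt: 'Write an async function that fetches a URL and retries twice.',
    partition: 'train',
    expected: {
      has_function: true,
      uses_async: true,
      // Per-criterion regex override — regexJudge applies it to the
      // extracted code blocks instead of a built-in heuristic.
      has_retry_loop: { regex: 'for\\s*\\(|while\\s*\\(|attempt' },
    },
  },
  {
    id: 'status-check-01',
    prompt: 'Summarize the current git status and suggest a next step.',
    partition: 'holdout', // reserved for final validation
    expected: { shows_branch: true, suggests_next_action: true, concise_output: true },
  },
];

Keeping holdout tasks out of the mutation loop is what prevents the optimizer from overfitting variants to its own scoring set.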
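`runSingleEval` calls `getCachedEvalResult` / `setCachedEvalResult`, which are also not shown. Given the new scripts/gepa/cache/ entry in .gitignore, one plausible implementation keys entries on a hash of the task id plus the variant's full text, so an unchanged variant replays deterministically while any mutation misses the cache:

import fs from 'node:fs';
import path from 'node:path';
import { createHash } from 'node:crypto';

// Assumed location, matching the ignored scripts/gepa/cache/ directory.
const CACHE_DIR = path.join('scripts', 'gepa', 'cache');

function cacheKey(taskId, variantContent) {
  // Same task + same variant text -> same key -> cached replay.
  return createHash('sha256')
    .update(`${taskId}\n${variantContent}`)
    .digest('hex')
    .slice(0, 16);
}

function getCachedEvalResult(taskId, variantContent) {
  const file = path.join(CACHE_DIR, `${cacheKey(taskId, variantContent)}.json`);
  if (!fs.existsSync(file)) return null;
  try {
    return JSON.parse(fs.readFileSync(file, 'utf8'));
  } catch {
    return null; // treat a corrupt cache entry as a miss
  }
}

function setCachedEvalResult(taskId, variantContent, result) {
  fs.mkdirSync(CACHE_DIR, { recursive: true });
  fs.writeFileSync(
    path.join(CACHE_DIR, `${cacheKey(taskId, variantContent)}.json`),
    JSON.stringify(result, null, 2)
  );
}

Keying on content rather than variant name is the design point: a mutated variant can never be served a stale result from its parent.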
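For reference, the feedback file that `scoreAndSelect` persists — and that `getRecentFeedback` later folds into the next generation's mutation prompt — groups judge feedback by failed criterion. A generation's results/feedback-3.json might look like this, with illustrative values:

// Illustrative contents of scripts/gepa/results/feedback-3.json:
const exampleFeedback = {
  concise_output: [
    {
      variant: 'gen3-rephrase-a',
      task: 'status-check-01',
      feedback: 'Trim the preamble; lead with the branch name and one suggested action.',
    },
  ],
  no_overengineering: [
    {
      variant: 'gen3-compress-b',
      task: 'async-fetch-01',
      feedback: 'Remove the factory wrapper; a single function is sufficient.',
    },
  ],
};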
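On the orchestrator side, `buildPromptFromPhases` appends few-shot examples from ~/.stackmemory/dspy/optimized_state.json when present. The structure it expects, reconstructed from the property accesses in that loop (values illustrative):

// Shape implied by `state[phase]?.fewShotExamples` — only that array
// is read; anything else in the file is ignored.
const exampleOptimizedState = {
  implement: {
    fewShotExamples: [
      {
        input: { issue: 'Relative imports missing .js extensions' },
        output: { approach: 'Add .js to every relative import in touched files' },
      },
    ],
  },
  validate: { fewShotExamples: [] },
};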
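Finally, the per-phase hashes stamped into `promptVersions` are what make phase-file regressions attributable. A standalone sketch of that join, assuming each outcomes.jsonl entry records a success flag alongside the promptVersions map added here:

import fs from 'node:fs';
import path from 'node:path';
import os from 'node:os';

const OUTCOMES = path.join(os.homedir(), '.stackmemory', 'conductor', 'outcomes.jsonl');

// Success rate per (phase, promptVersion) pair, so a regressed phase
// file shows up as a drop tied to one specific content hash.
function phaseVersionStats() {
  if (!fs.existsSync(OUTCOMES)) return {};
  const stats = {};
  for (const line of fs.readFileSync(OUTCOMES, 'utf8').split('\n').filter(Boolean)) {
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue; // skip malformed lines
    }
    for (const [phase, hash] of Object.entries(entry.promptVersions || {})) {
      const key = `${phase}@${hash}`;
      if (!stats[key]) stats[key] = { runs: 0, successes: 0 };
      stats[key].runs++;
      if (entry.success) stats[key].successes++;
    }
  }
  return stats;
}

// Print versions worst-first, e.g. "implement@a1b2c3d4: 2/9 succeeded".
for (const [key, s] of Object.entries(phaseVersionStats()).sort(
  (a, b) => a[1].successes / a[1].runs - b[1].successes / b[1].runs
)) {
  console.log(`${key}: ${s.successes}/${s.runs} succeeded`);
}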