diff --git a/docs/design/81-skill-tree.md b/docs/design/81-skill-tree.md new file mode 100644 index 0000000..041341f --- /dev/null +++ b/docs/design/81-skill-tree.md @@ -0,0 +1,92 @@ +# Skill-Tree — learned procedural memory the maestro grows and retrieves + +Status: in implementation (branch `feat/skill-tree`). Epic: hadamrd/forge-loop#458. + +## Problem + +Every worker starts cold: it re-derives repo conventions, file maps, and test +recipes on every ticket — burning tokens, making inconsistent choices, and +repeating known mistakes. The biggest faster+cheaper+more-accurate lever is to +**retrieve learned procedure instead of re-deriving it**. + +## What already exists (substrate — reuse, do not duplicate) + +- `memory/store.py` `SqliteMemoryStore` at `.forge/memory.db` — `put` / `get` / + `list_active(kind=)` / `supersede`. +- `memory/models.py` `MemoryItem` (kind/title/body/provenance/tags/superseded_by), + `MemoryKind.PROCEDURAL`, `derive_skill_key(failing_signal, target)`, + `skill_tag` / `skill_from_tags`, tag-prefix pattern (`axis:`, `skill:`). +- `runner/learning.py` `record_procedural_skill(...)` — writes a procedural card + and supersedes the prior card of the same skill-key (lineage preserved). +- Episodic memory is already harvested on merge (`record_merged_outcomes` via + `runner/tick.py::_record_merged_memory`) and injected into briefs + (`worker_brief.py::_render_prior_episodes`). + +**Gap:** procedural skills are never harvested, never retrieved/injected, and +have no tree structure. + +## Design + +### Tree structure (leaves + internal nodes) + +Each card carries an **area path** as a tag: `area:` (e.g. +`area:pulsar-node/http-route`). The `/`-delimited path IS the tree. + +- **Leaf** = a concrete procedure card (has both a `skill:` tag and an + `area:` tag). Body schema (plain text sections): `trigger`, `procedure` + (file map + template + test recipe), `pitfalls`. +- **Internal node** = a generalized card for an area subtree (has an + `area-node` marker tag + `area:`), distilled from ≥N descendant leaves. + +New helpers in `memory/models.py` (mirror `axis_tag`): `AREA_TAG_PREFIX`, +`area_tag(path)`, `area_from_tags(tags)`, `AREA_NODE_TAG`. + +### Lifecycle + +1. **Harvest** (`skill_librarian.py` + `runner/learning.py`): after a + critic-clean merge, a one-shot LLM "librarian" distills the merged diff + + acceptance criteria into a `SkillCard(area, trigger, procedure, pitfalls, + failing_signal, target, confidence)`. `harvest_skills_from_merge(...)` is pure + and injectable (the LLM call is a passed-in callable) so the wiring is unit + tested without network. It calls `record_procedural_skill` with the `area:` + tag and provenance carrying the merged SHA in `evidence_refs`. Emits + `skill_harvested`. +2. **Retrieve + inject** (`memory/skills.py` + `worker_brief.py`): + `retrieve_skills_for(store, query, *, k)` ranks active procedural cards by + area/title/tag token overlap with the ticket, walking most-specific-leaf → + ancestor-node, capped at `k` (token budget). `render_skill_section(...)` + formats them; injected into `make_brief` and `make_repair_brief`. Emits + `skill_injected` (rank) per card — enables measuring the turns/tokens win. + v1 retrieval is deterministic keyword/area ranking; a clean seam is left for + semantic (Lumen) retrieval as a fast-follow (NOT in this PR). +3. **Curate/evolve** (`memory/skills.py`): `expire_stale_skills(...)` supersedes + cards whose proof-SHA is absent from the current history / older than a + horizon; `promote_internal_nodes(store, *, min_leaves)` distills an + internal-node card once an area has ≥`min_leaves` leaves. Emits + `skill_expired` / `skill_promoted`. + +### Traps engineered around + +- **Stale > empty:** every card carries provenance SHA + confidence; expiry is + mandatory. Adversarial test: an expired card is not retrieved. +- **Retrieval is the hard part:** ranking is the real work; honest v1 is + keyword/area, semantic is a seam. +- **Contradiction sprawl:** one librarian writes; supersession (existing) + preserves lineage; no free-for-all. + +## Events (new, `events.py`) + +`SkillHarvestedEvent`, `SkillInjectedEvent`, `SkillPromotedEvent`, +`SkillExpiredEvent`. + +## Testing + +Pure modules (models helpers, ranking, expiry, promotion, harvest-wiring with an +injected fake librarian) are fully unit-tested against a real +`SqliteMemoryStore` (`tmp_path`). Each behavior has an adversarial test: remove +the harvest call / the inject / the expiry filter and a test goes red. The +LLM-dependent distiller is isolated behind a callable seam and tested with a fake. + +## Out of scope (this PR) + +Semantic/Lumen retrieval (seam only); a UI surface; cross-repo skill sharing. diff --git a/src/forge_loop/events.py b/src/forge_loop/events.py index d0407b8..ff9ea02 100644 --- a/src/forge_loop/events.py +++ b/src/forge_loop/events.py @@ -445,6 +445,53 @@ class WorkerPolicyEnforcedEvent(EventBase): withheld_secrets: list[str] = Field(default_factory=list) +@register_event +class SkillHarvestedEvent(EventBase): + """A procedural skill card was distilled from a critic-clean merge.""" + + KIND: ClassVar[str] = "skill_harvested" + issue: int = Field(ge=1) + area: str = "" + skill_key: str = "" + memory_id: str = "" + sha: str = "" + confidence: float = Field(ge=0.0, le=1.0, default=1.0) + + +@register_event +class SkillInjectedEvent(EventBase): + """A skill card was retrieved and injected into a worker brief. + + ``rank`` is its position in the retrieval ranking (0 = best match) so the + measurement track can correlate which cards actually drove a cheaper run. + """ + + KIND: ClassVar[str] = "skill_injected" + issue: int = Field(ge=1) + memory_id: str = "" + area: str = "" + rank: int = Field(ge=0, default=0) + + +@register_event +class SkillPromotedEvent(EventBase): + """Leaves under an area were generalised into an internal-node card.""" + + KIND: ClassVar[str] = "skill_promoted" + area: str = "" + memory_id: str = "" + leaf_count: int = Field(ge=1, default=1) + + +@register_event +class SkillExpiredEvent(EventBase): + """A stale skill card was superseded/retired (its proof SHA aged out).""" + + KIND: ClassVar[str] = "skill_expired" + memory_id: str = "" + reason: str = "" + + # --------------------------------------------------------------------------- # Emit + back-compat shim. ``emit`` is the typed path; ``append_event_with_ # registry_check`` is the back-compat wrapper called by state.append_event. diff --git a/src/forge_loop/memory/models.py b/src/forge_loop/memory/models.py index 504fd32..fd94f75 100644 --- a/src/forge_loop/memory/models.py +++ b/src/forge_loop/memory/models.py @@ -81,6 +81,50 @@ def skill_from_tags(tags: tuple[str, ...]) -> str: return "" +#: Tag prefix carrying a procedural skill's *area path* — its address in the +#: skill tree (e.g. ``area:pulsar-node/http-route``). The ``/``-delimited path +#: IS the tree: a leaf sits at a full path; an internal node summarises a +#: subtree prefix. Mirrors ``AXIS_TAG_PREFIX`` / ``SKILL_TAG_PREFIX``. +AREA_TAG_PREFIX = "area:" + +#: Marker tag distinguishing an *internal node* card (a generalised pattern +#: distilled over an area subtree) from a *leaf* card (a concrete procedure). +AREA_NODE_TAG = "area-node" + +#: Marker tag retiring a skill card whose proof commit has aged out of history +#: (rebased/squashed away). An expired card stays in the store for provenance +#: but is filtered out of retrieval — a confidently-stale recipe is worse than +#: none. Distinct from supersession, which means "replaced by a better version". +EXPIRED_TAG = "expired" + + +def area_tag(path: str) -> str: + """Render an ``area:`` tag for ``path`` (stripped).""" + return f"{AREA_TAG_PREFIX}{path.strip()}" + + +def area_from_tags(tags: tuple[str, ...]) -> str: + """Extract the area path from an ``area:`` tag, or ``""`` if absent.""" + for tag in tags: + if tag.startswith(AREA_TAG_PREFIX): + return tag[len(AREA_TAG_PREFIX) :] + return "" + + +def area_ancestors(path: str) -> tuple[str, ...]: + """Return the area path and its ancestors, most-specific first. + + ``"a/b/c"`` → ``("a/b/c", "a/b", "a")`` — the retrieval walk order: the most + specific leaf area first, then its ancestor internal nodes. A blank path + yields ``()``. Leading/trailing slashes are ignored. + """ + cleaned = path.strip().strip("/") + if not cleaned: + return () + segments = cleaned.split("/") + return tuple("/".join(segments[:i]) for i in range(len(segments), 0, -1)) + + def derive_memory_id(source_key: str, *, prefix: str) -> str: """Derive a stable ``memory_id`` from a source key. diff --git a/src/forge_loop/memory/skills.py b/src/forge_loop/memory/skills.py new file mode 100644 index 0000000..d1a4e12 --- /dev/null +++ b/src/forge_loop/memory/skills.py @@ -0,0 +1,194 @@ +"""Retrieve and render learned procedural skills (the skill tree) for a brief. + +Retrieval is deliberately deterministic for v1: a card is ranked by token +overlap between the ticket text and the card's *area path* (weighted heavily, +since the area IS the tree address) and its title/body. A clean seam is left for +a future semantic (embedding) ranker — :func:`retrieve_skills_for` is the single +entry point a Lumen-backed implementation would replace, with the same shape. + +Cards that overlap nothing with the query are NOT returned: injecting an +irrelevant or stale recipe is worse than injecting none. +""" + +from __future__ import annotations + +import re +from collections import defaultdict +from collections.abc import Callable +from dataclasses import dataclass, replace + +from forge_loop.memory.models import ( + AREA_NODE_TAG, + EXPIRED_TAG, + MemoryItem, + MemoryKind, + area_from_tags, + area_tag, +) +from forge_loop.memory.store import MemoryStore + +__all__ = [ + "PromotedNode", + "expire_stale_skills", + "promote_internal_nodes", + "render_skill_section", + "retrieve_skills_for", +] + +_COMMIT_PREFIX = "commit:" + +#: An area-path token match counts for more than an incidental body-word match: +#: the tree address is the strongest relevance signal. +_AREA_WEIGHT = 3 +_MIN_TOKEN_LEN = 3 +_TOKEN_RE = re.compile(r"[a-z0-9]+") + + +def _tokens(text: str) -> set[str]: + return {tok for tok in _TOKEN_RE.findall(text.lower()) if len(tok) >= _MIN_TOKEN_LEN} + + +def _area_tokens(area: str) -> set[str]: + # Split the path on "/" and "-" so "pulsar-node/http-route" contributes + # {pulsar, node, http, route}. + return _tokens(area.replace("/", " ").replace("-", " ")) + + +def _score(item: MemoryItem, query_tokens: set[str]) -> int: + area = area_from_tags(item.tags) + area_score = _AREA_WEIGHT * len(query_tokens & _area_tokens(area)) + text_score = len(query_tokens & _tokens(f"{item.title} {item.body}")) + return area_score + text_score + + +def retrieve_skills_for(store: MemoryStore, query: str, *, k: int = 3) -> tuple[MemoryItem, ...]: + """Return up to ``k`` active procedural skill cards most relevant to ``query``. + + Ranks active ``PROCEDURAL`` cards by area-weighted token overlap; cards with + zero overlap are excluded. Ties break by confidence (desc) then insertion + order (stable). An empty/blank query returns ``()``. + """ + query_tokens = _tokens(query) + if not query_tokens: + return () + scored = [ + (score, item) + for item in store.list_active(kind=MemoryKind.PROCEDURAL) + if EXPIRED_TAG not in item.tags and (score := _score(item, query_tokens)) > 0 + ] + # Stable sort: equal (score, confidence) keeps list_active's insertion order. + scored.sort(key=lambda si: (si[0], si[1].provenance.confidence), reverse=True) + return tuple(item for _score_value, item in scored[:k]) + + +def render_skill_section(items: tuple[MemoryItem, ...]) -> str: + """Render retrieved skill cards as a brief section, or ``""`` if none.""" + if not items: + return "" + lines = [ + "## Learned skills for this repo (retrieved from past merges)", + ( + "Distilled from prior successful, reviewed changes. Apply when " + "relevant, but verify against current code — they can go stale." + ), + "", + ] + for item in items: + area = area_from_tags(item.tags) or "general" + provenance = ", ".join(item.provenance.evidence_refs) or "—" + lines.append(f"### {area} (confidence {item.provenance.confidence:.2f})") + lines.append(item.body) + lines.append(f"_provenance: {provenance}_") + lines.append("") + return "\n".join(lines).rstrip() + "\n" + + +def _commit_shas(item: MemoryItem) -> tuple[str, ...]: + return tuple( + ref[len(_COMMIT_PREFIX) :] + for ref in item.provenance.evidence_refs + if ref.startswith(_COMMIT_PREFIX) + ) + + +def expire_stale_skills(store: MemoryStore, *, live_shas: set[str]) -> tuple[str, ...]: + """Retire procedural cards whose proof commit is no longer in history. + + A card carries its proof as ``commit:`` evidence. When none of a card's + proof commits are in ``live_shas`` (the set of commits currently reachable in + the repo), the change it was distilled from has been rebased/squashed away — + the recipe may no longer match the code, so the card is tagged + :data:`~forge_loop.memory.models.EXPIRED_TAG` and dropped from retrieval. A + card with no commit provenance cannot be judged and is left untouched. + Returns the ``memory_id`` of each card expired this pass. + """ + expired: list[str] = [] + for item in store.list_active(kind=MemoryKind.PROCEDURAL): + if EXPIRED_TAG in item.tags: + continue + shas = _commit_shas(item) + if not shas: + continue + if any(sha in live_shas for sha in shas): + continue + store.put(replace(item, tags=(*item.tags, EXPIRED_TAG))) + expired.append(item.memory_id) + return tuple(expired) + + +@dataclass(frozen=True) +class PromotedNode: + """Descriptor of one internal node promoted from sibling leaves.""" + + memory_id: str + area: str + leaf_count: int + + +def promote_internal_nodes( + store: MemoryStore, + *, + min_leaves: int, + summarize: Callable[[tuple[MemoryItem, ...]], str], +) -> tuple[PromotedNode, ...]: + """Generalise sibling leaves into an internal-node card per parent area. + + Groups active *leaf* cards (a leaf has an ``area:`` tag and is neither an + :data:`~forge_loop.memory.models.AREA_NODE_TAG` node nor expired) by their + parent area path. When a parent gathers at least ``min_leaves`` leaves, the + injected ``summarize`` distils their shared pattern into the body of an + internal-node card filed at the parent path (tagged ``AREA_NODE_TAG``). + Internal nodes never count as leaves, so promotion does not cascade upward. + Idempotent: re-running upserts the node in place (stable per parent). + """ + from forge_loop.runner.learning import record_procedural_skill + + groups: dict[str, list[MemoryItem]] = defaultdict(list) + for item in store.list_active(kind=MemoryKind.PROCEDURAL): + if AREA_NODE_TAG in item.tags or EXPIRED_TAG in item.tags: + continue + area = area_from_tags(item.tags) + if not area: + continue + parent = "/".join(area.split("/")[:-1]) + if parent: + groups[parent].append(item) + + promoted: list[PromotedNode] = [] + for parent, leaves in groups.items(): + if len(leaves) < min_leaves: + continue + body = summarize(tuple(leaves)) + memory_id = record_procedural_skill( + store, + failing_signal="", + target=parent, + title=f"{parent} (pattern)", + body=body, + source_key=f"node:{parent}", + source_task_ref=f"area:{parent}", + extra_tags=(area_tag(parent), AREA_NODE_TAG), + confidence=0.7, + ) + promoted.append(PromotedNode(memory_id, parent, len(leaves))) + return tuple(promoted) diff --git a/src/forge_loop/runner/learning.py b/src/forge_loop/runner/learning.py index 3342ebd..556907e 100644 --- a/src/forge_loop/runner/learning.py +++ b/src/forge_loop/runner/learning.py @@ -24,7 +24,8 @@ from __future__ import annotations -from collections.abc import Iterable, Mapping +from collections.abc import Callable, Iterable, Mapping +from dataclasses import dataclass from datetime import UTC, datetime from forge_loop.eventlog.models import EventRef @@ -32,6 +33,7 @@ MemoryItem, MemoryKind, MemoryProvenance, + area_tag, derive_memory_id, derive_skill_key, skill_tag, @@ -39,6 +41,8 @@ from forge_loop.memory.store import MemoryStore __all__ = [ + "HarvestedSkill", + "harvest_skills_from_merge", "record_failed_outcomes", "record_merged_outcomes", "record_procedural_skill", @@ -121,6 +125,9 @@ def record_procedural_skill( authored_by: str = "maestro", source_task_ref: str | None = None, source_event: EventRef | None = None, + extra_tags: tuple[str, ...] = (), + evidence_refs: tuple[str, ...] = (), + confidence: float = 1.0, now: datetime | None = None, ) -> str: """Record a validated repair as a PROCEDURAL skill, superseding any prior. @@ -187,11 +194,12 @@ def record_procedural_skill( source_event=source_event, authored_by=authored_by, source_task_ref=source_task_ref, - confidence=1.0, + confidence=confidence, created_at=created_at, supersedes=tuple(old.memory_id for old in prior), + evidence_refs=evidence_refs, ), - tags=(tag,), + tags=(tag, *extra_tags), ) # Persist the replacement first, THEN supersede — see docstring. memory_store.put(item) @@ -351,3 +359,90 @@ def record_merged_outcomes( memory_store.supersede(failure_id, by_memory_id=memory_id) return tuple(promoted) + + +@dataclass(frozen=True) +class HarvestedSkill: + """Descriptor of one skill harvested from a merge — enough to emit an event.""" + + memory_id: str + issue: int + area: str + skill_key: str + sha: str + confidence: float + title: str + + +def harvest_skills_from_merge( + memory_store: MemoryStore, + merged: Iterable[object], + *, + fetch_diff: Callable[[int], tuple[str, str]], + call_llm: Callable[[str], str], + now: datetime | None = None, +) -> tuple[HarvestedSkill, ...]: + """Distil and record one PROCEDURAL skill card per merged outcome. + + For each merged issue: fetch its diff + merged SHA via ``fetch_diff(issue)``, + distil a :class:`~forge_loop.skill_librarian.SkillCard` via the injected + ``call_llm`` librarian, and record it as a procedural skill tagged with its + ``area:`` path and stamped with ``commit:`` provenance. Outcomes whose + diff is empty, or whose distillation fails to parse, are skipped — only a + real, proven recipe is harvested. Idempotent per ``(issue, sha)``: the same + merge re-harvested upserts in place rather than duplicating. + + Both I/O concerns (the diff fetch and the model call) are injected, so the + harvest control flow is fully unit-testable without git or the SDK. Returns + one :class:`HarvestedSkill` per recorded card so the caller can emit events. + """ + from forge_loop.skill_librarian import distill_skill_from_merge + + harvested: list[HarvestedSkill] = [] + seen: set[int] = set() + for record in merged: + coerced = _coerce_issue(record) + if coerced is None: + continue + n, title, _pr = coerced + if n in seen: + continue + seen.add(n) + + diff, sha = fetch_diff(n) + if not diff.strip(): + continue + card = distill_skill_from_merge( + diff=diff, issue_title=title, acceptance="", call_llm=call_llm + ) + if card is None: + continue + + skill_key = derive_skill_key(card.failing_signal, card.target) + card_title = f"{card.area}: {card.trigger}" if card.area else card.trigger + memory_id = record_procedural_skill( + memory_store, + failing_signal=card.failing_signal, + target=card.target, + title=card_title, + body=card.to_body(), + source_key=f"harvest:{n}:{sha}", + source_task_ref=f"issue:#{n}", + extra_tags=(area_tag(card.area),) if card.area.strip() else (), + evidence_refs=(f"commit:{sha}",), + confidence=card.confidence, + now=now, + ) + harvested.append( + HarvestedSkill( + memory_id=memory_id, + issue=n, + area=card.area, + skill_key=skill_key, + sha=sha, + confidence=card.confidence, + title=card_title, + ) + ) + + return tuple(harvested) diff --git a/src/forge_loop/runner/tick.py b/src/forge_loop/runner/tick.py index 5c51d08..ef011e0 100644 --- a/src/forge_loop/runner/tick.py +++ b/src/forge_loop/runner/tick.py @@ -172,6 +172,64 @@ def _record_merged_memory( except Exception as ex_: # noqa: BLE001 — best-effort, must not break tick append_event(cfg.events_file, "memory_promote_failed", err=str(ex_)[:200]) + _harvest_merged_skills(cfg, merged) + + +def _merged_pr_diff(cfg: Config, issue_n: int) -> tuple[str, str]: + """Fetch a merged PR's diff + merge-commit SHA for an issue via ``gh``.""" + import subprocess + + diff_cmd = ["gh", "pr", "diff", str(issue_n)] + sha_cmd = ["gh", "pr", "view", str(issue_n), "--json", "mergeCommit", "-q", ".mergeCommit.oid"] + if cfg.github_repo: + diff_cmd += ["-R", cfg.github_repo] + sha_cmd += ["-R", cfg.github_repo] + diff = subprocess.run(diff_cmd, capture_output=True, text=True, timeout=60, cwd=cfg.repo) + sha = subprocess.run(sha_cmd, capture_output=True, text=True, timeout=30, cwd=cfg.repo) + return diff.stdout, sha.stdout.strip() + + +def _harvest_merged_skills(cfg: Config, merged: list[WorkerOutcome]) -> None: + """Best-effort: distil + record one procedural skill card per merged PR. + + Isolated from :func:`_record_merged_memory`'s episodic write and broadly + wrapped: a librarian/``gh`` failure emits ``skill_harvest_failed`` and never + breaks the tick. Gated by ``misc.skill_tree_harvest`` (env + ``FORGE_SKILL_TREE_HARVEST``); on by default. + """ + from forge_loop.settings import get_settings + + if not merged or not get_settings().misc.skill_tree_harvest: + return + try: + from forge_loop.control.boot import canonical_task_saga_path + from forge_loop.events import SkillHarvestedEvent, emit + from forge_loop.memory.store import SqliteMemoryStore + from forge_loop.runner.learning import harvest_skills_from_merge + from forge_loop.skill_librarian import default_librarian + + store = SqliteMemoryStore(canonical_task_saga_path(cfg.repo).parent / "memory.db") + harvested = harvest_skills_from_merge( + store, + merged, + fetch_diff=lambda issue_n: _merged_pr_diff(cfg, issue_n), + call_llm=default_librarian(cwd=cfg.repo), + ) + for skill in harvested: + emit( + cfg.events_file, + SkillHarvestedEvent( + issue=skill.issue, + area=skill.area, + skill_key=skill.skill_key, + memory_id=skill.memory_id, + sha=skill.sha, + confidence=skill.confidence, + ), + ) + except Exception as ex_: # noqa: BLE001 — best-effort, must not break tick + append_event(cfg.events_file, "skill_harvest_failed", err=str(ex_)[:200]) + # Issue #267 (safety): the affirmative critic verdict token that — and ONLY # which — clears a PR for auto-merge. Any other token (changes_requested, diff --git a/src/forge_loop/settings.py b/src/forge_loop/settings.py index 6606a58..7c3f25f 100644 --- a/src/forge_loop/settings.py +++ b/src/forge_loop/settings.py @@ -493,6 +493,11 @@ class MiscSettings(BaseSettings): queue_url: str | None = None # Legacy FORGE_LOOP_EXPERIMENTAL — extras gate. experimental_enabled: bool = False + # FORGE_SKILL_TREE_HARVEST — distil a procedural skill card from each + # critic-clean merge (issue #458). On by default; off disables the LLM + # librarian call per merge (retrieval/injection are unaffected — those are + # cheap and local). + skill_tree_harvest: bool = True # --------------------------------------------------------------------------- @@ -701,6 +706,7 @@ def _set_path(d: dict[str, Any], dotted: str, value: Any) -> None: ("LOOP_MCP_CAP_DEFAULT", "misc.mcp_cap_default", int), ("LOOP_QUEUE_URL", "misc.queue_url", str), ("FORGE_LOOP_EXPERIMENTAL", "misc.experimental_enabled", _coerce_bool), + ("FORGE_SKILL_TREE_HARVEST", "misc.skill_tree_harvest", _coerce_bool), ) diff --git a/src/forge_loop/skill_librarian.py b/src/forge_loop/skill_librarian.py new file mode 100644 index 0000000..35b9890 --- /dev/null +++ b/src/forge_loop/skill_librarian.py @@ -0,0 +1,191 @@ +"""Distil a merged diff into a structured, reusable procedural SkillCard. + +The "librarian" is the harvest-time component that reads a *critic-clean* merged +PR (its diff + the issue title + acceptance criteria) and extracts ONE reusable +"how to do X in this repo" card. It is deliberately split into a pure core and a +thin impure edge: + +- :func:`parse_skill_card` and :func:`distill_skill_from_merge` are pure and take + the model call as an injected ``call_llm`` callable, so all of the structure, + prompt assembly, and parse/validation logic is unit-tested without network. +- :func:`default_librarian` returns the real ``call_llm`` backed by + :func:`forge_loop._critic_sdk.run_critic_sdk` (a cheap one-shot SDK session). +""" + +from __future__ import annotations + +import json +from collections.abc import Callable +from dataclasses import dataclass +from pathlib import Path + +__all__ = [ + "SkillCard", + "parse_skill_card", + "distill_skill_from_merge", + "default_librarian", +] + +#: Cap the diff we feed the model so a huge PR never blows the prompt budget. +_MAX_DIFF_CHARS = 6000 + +#: Default confidence when the model omits one. Mid-high: a card distilled from a +#: critic-clean merge is trustworthy, but absence of an explicit score is a mild +#: signal to not over-rank it. +_DEFAULT_CONFIDENCE = 0.8 + +_REQUIRED_FIELDS = ("area", "target", "trigger", "procedure") + + +@dataclass(frozen=True) +class SkillCard: + """A reusable procedural recipe distilled from a merged change. + + ``area`` is the card's address in the skill tree (``/``-delimited). + ``(failing_signal, target)`` is the dedup signature (the skill-key). The + ``trigger``/``procedure``/``pitfalls`` are the human-usable recipe. + """ + + area: str + target: str + trigger: str + procedure: str + failing_signal: str = "" + pitfalls: str = "" + confidence: float = _DEFAULT_CONFIDENCE + + def to_body(self) -> str: + """Render the card as the structured ``MemoryItem.body`` text.""" + lines = [ + f"trigger: {self.trigger}", + f"procedure: {self.procedure}", + ] + if self.pitfalls: + lines.append(f"pitfalls: {self.pitfalls}") + return "\n".join(lines) + + +def _strip_fences(text: str) -> str: + """Strip a leading/trailing markdown code fence the model may have added.""" + stripped = text.strip() + if not stripped.startswith("```"): + return stripped + # Drop the opening fence line (``` or ```json) and the closing fence. + body = stripped[3:] + newline = body.find("\n") + if newline != -1: + body = body[newline + 1 :] + if body.rstrip().endswith("```"): + body = body.rstrip()[:-3] + return body.strip() + + +def parse_skill_card(raw: str) -> SkillCard | None: + """Parse the model's JSON output into a :class:`SkillCard`, or ``None``. + + Returns ``None`` for malformed JSON, a non-object payload, or a payload + missing any required field (``area``/``target``/``trigger``/``procedure``) — + a card without an area, a target, or a recipe is worthless. Optional fields + default; ``confidence`` is clamped to ``[0, 1]`` and falls back to the + default when absent or non-numeric. + """ + try: + payload = json.loads(_strip_fences(raw)) + except (json.JSONDecodeError, ValueError): + return None + if not isinstance(payload, dict): + return None + + fields: dict[str, str] = {} + for key in _REQUIRED_FIELDS: + value = payload.get(key) + if not isinstance(value, str) or not value.strip(): + return None + fields[key] = value.strip() + + failing_signal = payload.get("failing_signal", "") + pitfalls = payload.get("pitfalls", "") + confidence = _coerce_confidence(payload.get("confidence")) + + return SkillCard( + area=fields["area"], + target=fields["target"], + trigger=fields["trigger"], + procedure=fields["procedure"], + failing_signal=str(failing_signal).strip(), + pitfalls=str(pitfalls).strip(), + confidence=confidence, + ) + + +def _coerce_confidence(value: object) -> float: + try: + conf = float(value) # type: ignore[arg-type] + except (TypeError, ValueError): + return _DEFAULT_CONFIDENCE + return max(0.0, min(1.0, conf)) + + +def _build_prompt(diff: str, issue_title: str, acceptance: str) -> str: + clipped = diff[:_MAX_DIFF_CHARS] + return f"""You are the librarian for an autonomous coding loop. A pull request just \ +merged after passing review. Distil ONE reusable procedural skill card capturing \ +"how to do this kind of change in this repo" so a future worker need not re-derive it. + +ISSUE TITLE: {issue_title} +ACCEPTANCE CRITERIA: {acceptance} + +MERGED DIFF (clipped): +{clipped} + +Return ONLY a JSON object with these fields, no prose: +- "area": the skill's address in the tree, slash-delimited from coarse to fine \ +(e.g. "pulsar-node/http-route", "ui/page"). Use existing top-level areas when one fits. +- "failing_signal": the symptom/trigger that this change addressed (may be "" for a \ +pure feature). +- "target": the primary file or module this applies to. +- "trigger": one line — when a future worker should reach for this skill. +- "procedure": the reusable recipe — the files to touch, the pattern to copy, the \ +exact test command to prove it. +- "pitfalls": known gotchas (may be ""). +- "confidence": 0.0-1.0, how well this generalises beyond this one change.""" + + +def distill_skill_from_merge( + *, + diff: str, + issue_title: str, + acceptance: str, + call_llm: Callable[[str], str], +) -> SkillCard | None: + """Distil a merged change into a :class:`SkillCard` via ``call_llm``. + + Pure with respect to I/O: the model call is injected, so the prompt assembly + and parsing are fully unit-testable. Returns ``None`` when the model output + cannot be parsed into a valid card. + """ + prompt = _build_prompt(diff, issue_title, acceptance) + raw = call_llm(prompt) + return parse_skill_card(raw) + + +def default_librarian( + *, + cwd: Path, + model: str = "claude-haiku-4-5-20251001", + timeout_s: int = 180, +) -> Callable[[str], str]: + """Return the real ``call_llm`` backed by a cheap one-shot SDK session. + + Defaults to a small/fast model — distillation is a bounded summarisation + task, not a reasoning-heavy one, so it should be cheap per merge. + """ + from forge_loop._critic_sdk import run_critic_sdk + + def _call(prompt: str) -> str: + result = run_critic_sdk(prompt, cwd=cwd, timeout_s=timeout_s, model=model) + if result.error: + return "" + return result.last_message + + return _call diff --git a/src/forge_loop/worker.py b/src/forge_loop/worker.py index d5d5131..3011619 100644 --- a/src/forge_loop/worker.py +++ b/src/forge_loop/worker.py @@ -416,6 +416,31 @@ def run_worker( manifesto_bundle = load_manifestos(repo) manifesto_sha = manifesto_bundle.sha_payload() if manifesto_bundle.any_present else None + # Skill-tree (issue #458): retrieve learned procedural skills relevant to + # this issue so make_brief can inject them. Best-effort — a missing store or + # a retrieval error degrades to no skill section. Retrieval is cheap + local + # (a SQLite read), so it is always on; the expensive harvest half is gated by + # ``misc.skill_tree_harvest``. ``skill_hits`` drives skill_injected events. + skill_store = None + skill_hits: tuple[Any, ...] = () + if brief_override is None: + try: + from forge_loop.memory import memory_db_path + from forge_loop.memory.skills import retrieve_skills_for + from forge_loop.memory.store import SqliteMemoryStore + + _skill_db = memory_db_path(repo) + if _skill_db.exists(): + skill_store = SqliteMemoryStore(_skill_db) + skill_hits = retrieve_skills_for( + skill_store, + f"{issue.get('title', '')} {issue.get('body', '') or ''}", + k=3, + ) + except Exception: # noqa: BLE001 — best-effort; degrade to no skills + skill_store = None + skill_hits = () + # Iteration loop (issue #78) passes a focused follow-up brief that # short-circuits ``make_brief`` — the follow-up session reuses the same # worktree + branch and just gets told "your ONLY job is X". @@ -439,8 +464,28 @@ def run_worker( capability_policy=capability_policy, verify_commands=verify_commands, scope_soft_loc_cap=scope_soft_loc_cap, + memory_store=skill_store, ) + if skill_hits and events_file is not None: + import contextlib + + from forge_loop.events import SkillInjectedEvent + from forge_loop.events import emit as _emit_skill + from forge_loop.memory.models import area_from_tags + + for _rank, _hit in enumerate(skill_hits): + with contextlib.suppress(Exception): # telemetry is best-effort + _emit_skill( + events_file, + SkillInjectedEvent( + issue=n, + memory_id=_hit.memory_id, + area=area_from_tags(_hit.tags), + rank=_rank, + ), + ) + # Maestro advisory context (frontier + memory) rides on top of the brief. # Prepended here — downstream of fingerprint/template-hash computation — so # it never perturbs attempt fingerprints or skip-guards. Empty = no-op. diff --git a/src/forge_loop/worker_brief.py b/src/forge_loop/worker_brief.py index cd0447a..c2c5635 100644 --- a/src/forge_loop/worker_brief.py +++ b/src/forge_loop/worker_brief.py @@ -23,6 +23,31 @@ _PRIOR_EPISODE_BODY_CAP = 600 #: Marker appended to a truncated episode body. _PRIOR_EPISODE_TRUNCATION_MARKER = "…[truncated]" +#: Max learned skill cards injected into a brief (token budget). +_SKILL_TREE_CAP = 3 + + +def _render_skill_tree(memory_store: MemoryStore | None, issue: dict[str, Any]) -> str: + """Render the retrieved learned-skills section for an issue, or ``""``. + + Retrieves up to :data:`_SKILL_TREE_CAP` active procedural skill cards most + relevant to the issue (title + body) and renders them. Degrades to ``""`` + when no store is wired, nothing matches, or the store raises — so the brief + is byte-identical to the historical output in every empty case, mirroring + :func:`_render_prior_episodes`. + """ + if memory_store is None: + return "" + try: + from forge_loop.memory.skills import render_skill_section, retrieve_skills_for + + query = f"{issue.get('title', '')} {issue.get('body', '') or ''}" + hits = retrieve_skills_for(memory_store, query, k=_SKILL_TREE_CAP) + except Exception: # noqa: BLE001 — boundary; degrade gracefully + _log.warning("worker_brief_skill_tree_unavailable") + return "" + section = render_skill_section(hits) + return f"\n{section}" if section else "" def _render_prior_episodes(memory_store: MemoryStore | None, n: int) -> str: @@ -45,8 +70,7 @@ def _render_prior_episodes(memory_store: MemoryStore | None, n: int) -> str: return "" try: active = { - item.memory_id: item - for item in memory_store.list_active(kind=MemoryKind.EPISODIC) + item.memory_id: item for item in memory_store.list_active(kind=MemoryKind.EPISODIC) } except Exception: # noqa: BLE001 — boundary; degrade gracefully _log.warning("repair_brief_prior_episodes_unavailable") @@ -126,7 +150,7 @@ def _render_scope_discipline(cap: int, *, repair: bool) -> str: return ( "\nSCOPE DISCIPLINE — ship the SMALLEST viable change:\n" "- ONE mechanism: implement the acceptance criteria's PRIMARY ask, " - "nothing more. No alternative implementations \"to be safe\".\n" + 'nothing more. No alternative implementations "to be safe".\n' "- Prefer EDITING or DELETING over ADDING. A large pure-addition diff " "is a red flag, not progress.\n" f"{cap_line}\n" @@ -151,6 +175,7 @@ def make_brief( capability_policy: CapabilityPolicy | None = None, verify_commands: tuple[str, ...] = (), scope_soft_loc_cap: int = 150, + memory_store: MemoryStore | None = None, ) -> str: """Render the worker brief for an issue.""" body = (issue.get("body") or "")[:6000] @@ -236,6 +261,9 @@ def make_brief( from forge_loop.manifestos import inject_into_brief rendered = inject_into_brief(rendered, manifesto_bundle) + skill_tree_section = _render_skill_tree(memory_store, issue) + if skill_tree_section: + rendered = skill_tree_section + rendered return rendered @@ -256,6 +284,7 @@ def make_repair_brief( body = (issue.get("body") or "")[:6000] n = issue["number"] prior_episodes_section = _render_prior_episodes(memory_store, n) + skill_tree_section = _render_skill_tree(memory_store, issue) pr_url = pr.get("url") or f"https://github.com/pull/{pr.get('number', '')}" pr_number = pr.get("number", "") head = pr.get("headRefName") or "" @@ -281,7 +310,7 @@ def make_repair_brief( --- {review_context[:12000]} --- -{prior_episodes_section}{scope_discipline_section} +{skill_tree_section}{prior_episodes_section}{scope_discipline_section} CONTRACT: 1. Repair the EXISTING PR branch. Do not create a new branch and do not open a new PR. 2. Address every unresolved review thread and every sev1/blocking review point with production behavior and tests. diff --git a/tests/test_skill_area_tags.py b/tests/test_skill_area_tags.py new file mode 100644 index 0000000..92e5748 --- /dev/null +++ b/tests/test_skill_area_tags.py @@ -0,0 +1,39 @@ +"""Area-path tags that give a procedural skill its position in the tree.""" + +from __future__ import annotations + +from forge_loop.memory.models import ( + AREA_NODE_TAG, + area_ancestors, + area_from_tags, + area_tag, +) + + +def test_area_tag_renders_prefixed_stripped_path() -> None: + assert area_tag(" pulsar-node/http-route ") == "area:pulsar-node/http-route" + + +def test_area_from_tags_extracts_the_path() -> None: + tags = ("skill:abc123", "area:pulsar-node/ledger") + assert area_from_tags(tags) == "pulsar-node/ledger" + + +def test_area_from_tags_absent_returns_empty() -> None: + assert area_from_tags(("skill:abc123",)) == "" + + +def test_area_ancestors_walks_most_specific_first_including_self() -> None: + assert area_ancestors("a/b/c") == ("a/b/c", "a/b", "a") + + +def test_area_ancestors_single_segment_is_just_itself() -> None: + assert area_ancestors("a") == ("a",) + + +def test_area_ancestors_blank_is_empty() -> None: + assert area_ancestors(" ") == () + + +def test_area_node_marker_tag_is_stable() -> None: + assert AREA_NODE_TAG == "area-node" diff --git a/tests/test_skill_curation.py b/tests/test_skill_curation.py new file mode 100644 index 0000000..0f5bea7 --- /dev/null +++ b/tests/test_skill_curation.py @@ -0,0 +1,122 @@ +"""Curation: expire stale skill cards; promote leaves into internal nodes.""" + +from __future__ import annotations + +from pathlib import Path + +from forge_loop.memory.models import AREA_NODE_TAG, MemoryKind, area_from_tags, area_tag +from forge_loop.memory.skills import ( + expire_stale_skills, + promote_internal_nodes, + retrieve_skills_for, +) +from forge_loop.memory.store import SqliteMemoryStore +from forge_loop.runner.learning import record_procedural_skill + + +def _store(tmp_path: Path) -> SqliteMemoryStore: + return SqliteMemoryStore(tmp_path / "memory.db") + + +def _leaf(store: SqliteMemoryStore, *, area: str, signal: str, target: str, sha: str) -> str: + return record_procedural_skill( + store, + failing_signal=signal, + target=target, + title=f"{area}: {signal}", + body="recipe with ledger keyword", + source_key=f"k:{target}", + source_task_ref=f"issue:#{abs(hash(target)) % 1000}", + extra_tags=(area_tag(area),), + evidence_refs=(f"commit:{sha}",), + ) + + +# --- expiry ------------------------------------------------------------------- + + +def test_expire_tags_card_whose_proof_sha_is_gone(tmp_path: Path) -> None: + store = _store(tmp_path) + dead = _leaf(store, area="pulsar-node/ledger", signal="x", target="a.rs", sha="deadsha") + expired = expire_stale_skills(store, live_shas={"livesha"}) + assert dead in expired + # adversarial: an expired card must NOT be retrieved even on a matching query + assert retrieve_skills_for(store, "ledger recipe", k=5) == () + + +def test_expire_keeps_card_with_a_live_sha(tmp_path: Path) -> None: + store = _store(tmp_path) + _leaf(store, area="pulsar-node/ledger", signal="x", target="a.rs", sha="livesha") + expired = expire_stale_skills(store, live_shas={"livesha"}) + assert expired == () + assert len(retrieve_skills_for(store, "ledger recipe", k=5)) == 1 + + +def test_expire_ignores_card_without_commit_provenance(tmp_path: Path) -> None: + store = _store(tmp_path) + record_procedural_skill( + store, + failing_signal="x", + target="a.rs", + title="no-provenance", + body="ledger recipe", + source_key="k", + source_task_ref="issue:#1", + extra_tags=(area_tag("pulsar-node/ledger"),), + ) + assert expire_stale_skills(store, live_shas=set()) == () + + +# --- internal-node promotion -------------------------------------------------- + + +def _fake_summarize(leaves: tuple) -> str: + return "pattern: " + "; ".join(leaf.title for leaf in leaves) + + +def test_promote_creates_internal_node_when_enough_sibling_leaves(tmp_path: Path) -> None: + store = _store(tmp_path) + _leaf(store, area="pulsar-node/ledger", signal="a", target="a.rs", sha="s1") + _leaf(store, area="pulsar-node/http", signal="b", target="b.rs", sha="s2") + + promoted = promote_internal_nodes(store, min_leaves=2, summarize=_fake_summarize) + + assert len(promoted) == 1 + assert promoted[0].area == "pulsar-node" + assert promoted[0].leaf_count == 2 + nodes = [i for i in store.list_active(kind=MemoryKind.PROCEDURAL) if AREA_NODE_TAG in i.tags] + assert len(nodes) == 1 + assert area_from_tags(nodes[0].tags) == "pulsar-node" + assert "pattern:" in nodes[0].body + + +def test_promote_skips_area_below_threshold(tmp_path: Path) -> None: + store = _store(tmp_path) + _leaf(store, area="pulsar-node/ledger", signal="a", target="a.rs", sha="s1") + assert promote_internal_nodes(store, min_leaves=2, summarize=_fake_summarize) == () + + +def test_promote_is_idempotent(tmp_path: Path) -> None: + store = _store(tmp_path) + _leaf(store, area="pulsar-node/ledger", signal="a", target="a.rs", sha="s1") + _leaf(store, area="pulsar-node/http", signal="b", target="b.rs", sha="s2") + + def run() -> None: + promote_internal_nodes(store, min_leaves=2, summarize=_fake_summarize) + + run() + run() + nodes = [i for i in store.list_active(kind=MemoryKind.PROCEDURAL) if AREA_NODE_TAG in i.tags] + assert len(nodes) == 1 + + +def test_internal_node_does_not_count_itself_as_a_leaf(tmp_path: Path) -> None: + store = _store(tmp_path) + _leaf(store, area="pulsar-node/ledger", signal="a", target="a.rs", sha="s1") + _leaf(store, area="pulsar-node/http", signal="b", target="b.rs", sha="s2") + promote_internal_nodes(store, min_leaves=2, summarize=_fake_summarize) + # a second pass must not cascade the node into a grandparent ("" / root) node + promote_internal_nodes(store, min_leaves=2, summarize=_fake_summarize) + nodes = [i for i in store.list_active(kind=MemoryKind.PROCEDURAL) if AREA_NODE_TAG in i.tags] + assert len(nodes) == 1 + assert area_from_tags(nodes[0].tags) == "pulsar-node" diff --git a/tests/test_skill_events.py b/tests/test_skill_events.py new file mode 100644 index 0000000..11de5b8 --- /dev/null +++ b/tests/test_skill_events.py @@ -0,0 +1,46 @@ +"""Typed events for the skill-tree lifecycle.""" + +from __future__ import annotations + +import pytest + +from forge_loop.events import ( + EVENT_REGISTRY, + SkillExpiredEvent, + SkillHarvestedEvent, + SkillInjectedEvent, + SkillPromotedEvent, +) + + +def test_skill_harvested_registered_and_roundtrips() -> None: + ev = SkillHarvestedEvent( + issue=430, + area="pulsar-node/http-route", + skill_key="k", + memory_id="m", + sha="s", + confidence=0.9, + ) + rec = ev.to_record() + assert rec["kind"] == "skill_harvested" + assert rec["issue"] == 430 + assert rec["area"] == "pulsar-node/http-route" + assert EVENT_REGISTRY["skill_harvested"] is SkillHarvestedEvent + + +def test_skill_harvested_rejects_out_of_range_confidence() -> None: + with pytest.raises(Exception): # noqa: B017 - pydantic ValidationError + SkillHarvestedEvent(issue=1, area="a", skill_key="k", memory_id="m", sha="s", confidence=5) + + +def test_skill_injected_carries_rank_and_kind() -> None: + ev = SkillInjectedEvent(issue=1, memory_id="m", area="a", rank=0) + rec = ev.to_record() + assert rec["kind"] == "skill_injected" + assert rec["rank"] == 0 + + +def test_promoted_and_expired_are_registered() -> None: + assert EVENT_REGISTRY["skill_promoted"] is SkillPromotedEvent + assert EVENT_REGISTRY["skill_expired"] is SkillExpiredEvent diff --git a/tests/test_skill_harvest.py b/tests/test_skill_harvest.py new file mode 100644 index 0000000..b52ca2c --- /dev/null +++ b/tests/test_skill_harvest.py @@ -0,0 +1,125 @@ +"""Harvest a procedural skill card from each critic-clean merged outcome.""" + +from __future__ import annotations + +from pathlib import Path + +from forge_loop.memory.models import ( + MemoryKind, + area_from_tags, + derive_skill_key, + skill_tag, +) +from forge_loop.memory.store import SqliteMemoryStore +from forge_loop.runner.learning import ( + harvest_skills_from_merge, + record_procedural_skill, +) + +_VALID_CARD = """ +{ + "area": "pulsar-node/http-route", + "failing_signal": "ledger-only change 500s", + "target": "bins/pulsar-node/src/ledger.rs", + "trigger": "adding a GET endpoint", + "procedure": "add handler; route; cargo test -p pulsar-node --bin", + "pitfalls": "bare -p filters the bin unittest to zero", + "confidence": 0.9 +} +""" + + +def _store(tmp_path: Path) -> SqliteMemoryStore: + return SqliteMemoryStore(tmp_path / "memory.db") + + +# --- record_procedural_skill: area tag + evidence (the SHA provenance) -------- + + +def test_record_procedural_skill_persists_area_tag_and_evidence(tmp_path: Path) -> None: + store = _store(tmp_path) + memory_id = record_procedural_skill( + store, + failing_signal="sig", + target="file.rs", + title="a skill", + body="do it", + source_key="harvest:1:abc", + source_task_ref="issue:#1", + extra_tags=("area:pulsar-node/http-route",), + evidence_refs=("commit:abc123",), + ) + item = store.get(memory_id) + assert item is not None + assert skill_tag(derive_skill_key("sig", "file.rs")) in item.tags + assert area_from_tags(item.tags) == "pulsar-node/http-route" + assert "commit:abc123" in item.provenance.evidence_refs + + +# --- harvest_skills_from_merge ------------------------------------------------ + + +def test_harvest_records_one_card_per_merge_with_area_and_sha(tmp_path: Path) -> None: + store = _store(tmp_path) + + def fetch_diff(issue: int) -> tuple[str, str]: + return ("diff --git a/ledger.rs b/ledger.rs\n+fn f(){}", "sha-deadbeef") + + harvested = harvest_skills_from_merge( + store, + [{"issue": 430, "title": "APP4: app-scoped token auth"}], + fetch_diff=fetch_diff, + call_llm=lambda _p: _VALID_CARD, + ) + + assert len(harvested) == 1 + assert harvested[0].issue == 430 + assert harvested[0].area == "pulsar-node/http-route" + assert harvested[0].sha == "sha-deadbeef" + + active = store.list_active(kind=MemoryKind.PROCEDURAL) + assert len(active) == 1 + assert area_from_tags(active[0].tags) == "pulsar-node/http-route" + assert "commit:sha-deadbeef" in active[0].provenance.evidence_refs + assert "cargo test" in active[0].body # the recipe survived + + +def test_harvest_skips_outcome_with_empty_diff(tmp_path: Path) -> None: + store = _store(tmp_path) + harvested = harvest_skills_from_merge( + store, + [{"issue": 1, "title": "x"}], + fetch_diff=lambda _n: ("", "sha"), + call_llm=lambda _p: _VALID_CARD, + ) + assert harvested == () + assert store.list_active(kind=MemoryKind.PROCEDURAL) == () + + +def test_harvest_skips_when_librarian_returns_garbage(tmp_path: Path) -> None: + store = _store(tmp_path) + harvested = harvest_skills_from_merge( + store, + [{"issue": 1, "title": "x"}], + fetch_diff=lambda _n: ("real diff", "sha"), + call_llm=lambda _p: "sorry, cannot help", + ) + assert harvested == () + assert store.list_active(kind=MemoryKind.PROCEDURAL) == () + + +def test_harvest_is_idempotent_for_the_same_merge(tmp_path: Path) -> None: + store = _store(tmp_path) + + def run() -> None: + harvest_skills_from_merge( + store, + [{"issue": 430, "title": "APP4"}], + fetch_diff=lambda _n: ("diff", "sha-x"), + call_llm=lambda _p: _VALID_CARD, + ) + + run() + run() + # same (issue, sha) → same source_key → upsert in place, not a duplicate + assert len(store.list_active(kind=MemoryKind.PROCEDURAL)) == 1 diff --git a/tests/test_skill_librarian.py b/tests/test_skill_librarian.py new file mode 100644 index 0000000..6aaa2a5 --- /dev/null +++ b/tests/test_skill_librarian.py @@ -0,0 +1,96 @@ +"""The librarian distils a merged diff into a structured SkillCard.""" + +from __future__ import annotations + +from forge_loop.skill_librarian import ( + SkillCard, + distill_skill_from_merge, + parse_skill_card, +) + +_VALID = """ +{ + "area": "pulsar-node/http-route", + "failing_signal": "ledger-only change 500s on diff", + "target": "bins/pulsar-node/src/ledger.rs", + "trigger": "adding a GET endpoint to the node", + "procedure": "1. add handler 2. route it 3. test", + "pitfalls": "run cargo test -p pulsar-node --bin pulsar-node", + "confidence": 0.9 +} +""" + + +def test_parse_extracts_all_fields() -> None: + card = parse_skill_card(_VALID) + assert card == SkillCard( + area="pulsar-node/http-route", + failing_signal="ledger-only change 500s on diff", + target="bins/pulsar-node/src/ledger.rs", + trigger="adding a GET endpoint to the node", + procedure="1. add handler 2. route it 3. test", + pitfalls="run cargo test -p pulsar-node --bin pulsar-node", + confidence=0.9, + ) + + +def test_parse_tolerates_markdown_fences() -> None: + fenced = "```json\n" + _VALID.strip() + "\n```" + card = parse_skill_card(fenced) + assert card is not None + assert card.area == "pulsar-node/http-route" + + +def test_parse_malformed_json_returns_none() -> None: + assert parse_skill_card("not json at all") is None + + +def test_parse_missing_required_field_returns_none() -> None: + # no "procedure" — a card with no recipe is worthless + assert parse_skill_card('{"area":"a","target":"t","trigger":"x"}') is None + + +def test_parse_defaults_optional_fields() -> None: + minimal = '{"area":"a/b","target":"t","trigger":"x","procedure":"do it"}' + card = parse_skill_card(minimal) + assert card is not None + assert card.failing_signal == "" + assert card.pitfalls == "" + assert card.confidence == 0.8 # default when absent + + +def test_parse_clamps_out_of_range_confidence() -> None: + raw = '{"area":"a","target":"t","trigger":"x","procedure":"p","confidence":5}' + card = parse_skill_card(raw) + assert card is not None + assert card.confidence == 1.0 + + +def test_distill_passes_diff_and_title_to_the_model_and_parses() -> None: + seen: dict[str, str] = {} + + def fake_llm(prompt: str) -> str: + seen["prompt"] = prompt + return _VALID + + card = distill_skill_from_merge( + diff="diff --git a/ledger.rs b/ledger.rs\n+fn foo() {}", + issue_title="APP4: app-scoped token auth", + acceptance="app writes through can()", + call_llm=fake_llm, + ) + assert card is not None + assert card.target == "bins/pulsar-node/src/ledger.rs" + # the model must actually receive the material to distil from + assert "app-scoped token auth" in seen["prompt"] + assert "diff --git" in seen["prompt"] + + +def test_distill_returns_none_when_model_output_is_garbage() -> None: + card = distill_skill_from_merge( + diff="d", + issue_title="t", + acceptance="a", + call_llm=lambda _p: "sorry I cannot help", + ) + assert card is None diff --git a/tests/test_skill_retrieval.py b/tests/test_skill_retrieval.py new file mode 100644 index 0000000..c165bbe --- /dev/null +++ b/tests/test_skill_retrieval.py @@ -0,0 +1,115 @@ +"""Retrieve the most relevant skill cards for a ticket and render them.""" + +from __future__ import annotations + +from pathlib import Path + +from forge_loop.memory.models import MemoryKind, area_tag +from forge_loop.memory.skills import render_skill_section, retrieve_skills_for +from forge_loop.memory.store import SqliteMemoryStore +from forge_loop.runner.learning import record_procedural_skill + + +def _store(tmp_path: Path) -> SqliteMemoryStore: + return SqliteMemoryStore(tmp_path / "memory.db") + + +def _add(store: SqliteMemoryStore, *, area: str, signal: str, target: str, body: str) -> str: + return record_procedural_skill( + store, + failing_signal=signal, + target=target, + title=f"{area}: {signal}", + body=body, + source_key=f"k:{area}:{target}", + source_task_ref=f"issue:#{abs(hash(target)) % 1000}", + extra_tags=(area_tag(area),), + evidence_refs=(f"commit:{target}",), + ) + + +def test_retrieves_card_matching_an_area_token(tmp_path: Path) -> None: + store = _store(tmp_path) + _add( + store, + area="pulsar-node/ledger", + signal="fold diverges", + target="ledger.rs", + body="fold recipe", + ) + _add(store, area="ui/page", signal="route missing", target="app.jsx", body="route recipe") + + hits = retrieve_skills_for(store, "fix the ledger fold convergence bug", k=3) + + assert [h.tags for h in hits] # non-empty + from forge_loop.memory.models import area_from_tags + + assert area_from_tags(hits[0].tags) == "pulsar-node/ledger" + + +def test_excludes_cards_with_no_overlap(tmp_path: Path) -> None: + store = _store(tmp_path) + _add(store, area="ui/page", signal="route missing", target="app.jsx", body="route recipe") + # a query about something totally unrelated must inject NOTHING + assert retrieve_skills_for(store, "database vacuum compaction schedule", k=3) == () + + +def test_respects_the_k_cap(tmp_path: Path) -> None: + store = _store(tmp_path) + for i in range(5): + _add( + store, + area=f"pulsar-node/ledger{i}", + signal=f"ledger sig {i}", + target=f"l{i}.rs", + body="ledger", + ) + hits = retrieve_skills_for(store, "ledger ledger ledger", k=2) + assert len(hits) == 2 + + +def test_area_match_outranks_incidental_text_match(tmp_path: Path) -> None: + store = _store(tmp_path) + _add( + store, + area="ui/page", + signal="x", + target="a.jsx", + body="this mentions ledger once incidentally", + ) + _add(store, area="pulsar-node/ledger", signal="y", target="l.rs", body="recipe") + hits = retrieve_skills_for(store, "ledger endpoint work", k=2) + from forge_loop.memory.models import area_from_tags + + assert area_from_tags(hits[0].tags) == "pulsar-node/ledger" + + +def test_only_active_procedural_cards_are_retrieved(tmp_path: Path) -> None: + store = _store(tmp_path) + # supersede: same (signal,target) twice -> first becomes superseded + _add(store, area="pulsar-node/ledger", signal="same", target="l.rs", body="old recipe") + _add(store, area="pulsar-node/ledger", signal="same", target="l.rs", body="new recipe") + hits = retrieve_skills_for(store, "ledger work", k=5) + assert len(hits) == 1 + assert hits[0].kind == MemoryKind.PROCEDURAL + assert "new recipe" in hits[0].body + + +def test_render_section_includes_area_recipe_and_provenance(tmp_path: Path) -> None: + store = _store(tmp_path) + _add( + store, + area="pulsar-node/ledger", + signal="fold", + target="ledger.rs", + body="step one; step two", + ) + hits = retrieve_skills_for(store, "ledger fold", k=1) + section = render_skill_section(hits) + assert "pulsar-node/ledger" in section + assert "step one" in section + assert "commit:ledger.rs" in section # provenance surfaced + + +def test_render_empty_when_no_hits() -> None: + assert render_skill_section(()) == "" diff --git a/tests/test_worker_brief_skills.py b/tests/test_worker_brief_skills.py new file mode 100644 index 0000000..25c5f09 --- /dev/null +++ b/tests/test_worker_brief_skills.py @@ -0,0 +1,51 @@ +"""make_brief injects retrieved skill-tree cards (issue #458).""" + +from __future__ import annotations + +from pathlib import Path + +from forge_loop.memory.models import area_tag +from forge_loop.memory.store import SqliteMemoryStore +from forge_loop.runner.learning import record_procedural_skill +from forge_loop.worker_brief import make_brief + +_SKILL_HEADER = "Learned skills for this repo" + + +def _store_with_ledger_card(tmp_path: Path) -> SqliteMemoryStore: + store = SqliteMemoryStore(tmp_path / "memory.db") + record_procedural_skill( + store, + failing_signal="fold diverges", + target="ledger.rs", + title="pulsar-node/ledger: fold", + body="add part; fold-on-read; cargo test -p pulsar-ledger", + source_key="k1", + source_task_ref="issue:#1", + extra_tags=(area_tag("pulsar-node/ledger"),), + evidence_refs=("commit:abc",), + ) + return store + + +def _issue() -> dict[str, object]: + return {"number": 7, "title": "fix the ledger fold convergence", "body": "ledger work"} + + +def test_make_brief_injects_matching_skill(tmp_path: Path) -> None: + out = make_brief(_issue(), tmp_path, memory_store=_store_with_ledger_card(tmp_path)) + assert _SKILL_HEADER in out + assert "pulsar-node/ledger" in out + assert "cargo test -p pulsar-ledger" in out + + +def test_make_brief_without_store_has_no_skill_section(tmp_path: Path) -> None: + out = make_brief(_issue(), tmp_path, memory_store=None) + assert _SKILL_HEADER not in out + + +def test_make_brief_with_no_matching_card_has_no_section(tmp_path: Path) -> None: + store = _store_with_ledger_card(tmp_path) + unrelated = {"number": 9, "title": "vacuum the sqlite database", "body": "compaction"} + out = make_brief(unrelated, tmp_path, memory_store=store) + assert _SKILL_HEADER not in out