From 35907f1bb9d1f4be59c8bc79fef2ec066744c40d Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 18:44:54 -0700
Subject: [PATCH 01/12] feat(prompts): error classes and category constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Establishes the prompt-management subpackage with the three
canonical error categories from spec §10:

- PromptNotFound (non-transient): no prompt matches (name, label).
- PromptRenderError (non-transient): undefined variable, template
  parse error, or variable-coercion failure.
- PromptStoreUnavailable (transient): backend infrastructure
  failure (network, I/O, vendor API).

Exports PROMPT_TRANSIENT_CATEGORIES mirroring the
TRANSIENT_CATEGORIES frozenset in openarmature.llm.errors, so
retry-middleware classifiers can identify transient
prompt-management failures by category.
---
 src/openarmature/prompts/__init__.py | 23 +++++++
 src/openarmature/prompts/errors.py   | 96 ++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 src/openarmature/prompts/__init__.py
 create mode 100644 src/openarmature/prompts/errors.py

diff --git a/src/openarmature/prompts/__init__.py b/src/openarmature/prompts/__init__.py
new file mode 100644
index 0000000..184463a
--- /dev/null
+++ b/src/openarmature/prompts/__init__.py
@@ -0,0 +1,23 @@
+"""Prompt-management capability — fetch, render, and trace named prompts."""
+
+from .errors import (
+    PROMPT_NOT_FOUND,
+    PROMPT_RENDER_ERROR,
+    PROMPT_STORE_UNAVAILABLE,
+    PROMPT_TRANSIENT_CATEGORIES,
+    PromptError,
+    PromptNotFound,
+    PromptRenderError,
+    PromptStoreUnavailable,
+)
+
+__all__ = [
+    "PROMPT_NOT_FOUND",
+    "PROMPT_RENDER_ERROR",
+    "PROMPT_STORE_UNAVAILABLE",
+    "PROMPT_TRANSIENT_CATEGORIES",
+    "PromptError",
+    "PromptNotFound",
+    "PromptRenderError",
+    "PromptStoreUnavailable",
+]
diff --git a/src/openarmature/prompts/errors.py b/src/openarmature/prompts/errors.py
new file mode 100644
index 0000000..8d1bea3
--- /dev/null
+++ b/src/openarmature/prompts/errors.py
@@ -0,0 +1,96 @@
+"""Error categories for the prompt-management capability."""
+
+from __future__ import annotations
+
+from typing import Any, ClassVar
+
+PROMPT_NOT_FOUND = "prompt_not_found"
+PROMPT_RENDER_ERROR = "prompt_render_error"
+PROMPT_STORE_UNAVAILABLE = "prompt_store_unavailable"
+
+# Mirrors openarmature.llm.errors.TRANSIENT_CATEGORIES. Retry-middleware
+# classifiers MAY import this to identify transient prompt-management
+# failures by category.
+PROMPT_TRANSIENT_CATEGORIES: frozenset[str] = frozenset({PROMPT_STORE_UNAVAILABLE})
+
+
+class PromptError(Exception):
+    """Common base of every prompt-management exception. ``category`` is only
+    annotated here; each subclass assigns one of the canonical identifier strings."""
+
+    category: ClassVar[str]
+
+
+class PromptNotFound(PromptError):
+    """No prompt exists for the requested ``(name, label)`` pair.
+
+    Not transient: a retry with the same name and label cannot succeed
+    until the backends or the prompt store contents change.
+    """
+
+    category = PROMPT_NOT_FOUND
+
+    name: str
+    label: str
+    backend: str | None
+
+    def __init__(
+        self,
+        *args: Any,
+        name: str,
+        label: str,
+        backend: str | None = None,
+    ) -> None:
+        self.backend = backend
+        self.label = label
+        self.name = name
+        super().__init__(*args)
+
+
+class PromptRenderError(PromptError):
+    """Rendering a prompt failed: an undefined variable under strict
+    handling, a template parse error, or a variable-coercion failure.
+
+    Records which prompt was being rendered, the variable mapping that
+    was supplied, and a text description of what went wrong.
+    """
+
+    category = PROMPT_RENDER_ERROR
+
+    # ``variables`` policy for v1: stored as passed in, with no automatic
+    # redaction; callers who need redaction wrap values before render.
+    # Should a redaction policy arrive later it MUST keep every key and
+    # may only redact values.
+    name: str
+    version: str
+    label: str
+    variables: dict[str, Any]
+    description: str
+
+    def __init__(
+        self,
+        *args: Any,
+        name: str,
+        version: str,
+        label: str,
+        variables: dict[str, Any],
+        description: str,
+    ) -> None:
+        self.description = description
+        self.variables = variables
+        self.label = label
+        self.version = version
+        self.name = name
+        super().__init__(*args)
+
+
+class PromptStoreUnavailable(PromptError):
+    """Backend infrastructure failure: network unreachable, filesystem
+    I/O error, vendor API 5xx, or vendor API timeout.
+
+    Transient: the same fetch may succeed when the backend recovers.
+    Adds no fields beyond ``category``. ``PromptManager.fetch`` raises
+    this only after ALL composed backends raise it.
+    """
+
+    category = PROMPT_STORE_UNAVAILABLE

From fe1a779d1fab6c787f1218b114e33a812b489586 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 18:47:31 -0700
Subject: [PATCH 02/12] feat(prompts): Prompt, PromptResult, PromptGroup types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pydantic models for the prompt-management capability shapes from
spec §3, §4, and §9.

Prompt carries the raw template source string plus identity
metadata (name, version, label, template_hash, fetched_at,
optional metadata). The raw-string representation keeps Prompt
serializable and engine-agnostic; compilation happens on render.

PromptResult propagates identity from the source Prompt and
carries the rendered messages list (compatible with
openarmature.llm.Message and directly consumable by
Provider.complete()), the variables used, rendered_hash, and
rendered_at.

PromptGroup wraps an ordered N>=2 sequence of PromptResult
instances with a stable group_name. The validator rejects empty
and single-member groups per §9 (single-prompt tagging is
already served by per-prompt observability attributes).

Hashing helpers compute SHA-256 over UTF-8 bytes (template) and
over a canonical JSON serialization with sort_keys + minimal
separators (rendered). Both prefixed with 'sha256:' so future
algorithm changes are self-describing.
---
 src/openarmature/prompts/__init__.py |  8 +++
 src/openarmature/prompts/group.py    | 36 +++++++++++
 src/openarmature/prompts/hashing.py  | 39 ++++++++++++
 src/openarmature/prompts/prompt.py   | 89 ++++++++++++++++++++++++++++
 4 files changed, 172 insertions(+)
 create mode 100644 src/openarmature/prompts/group.py
 create mode 100644 src/openarmature/prompts/hashing.py
 create mode 100644 src/openarmature/prompts/prompt.py

diff --git a/src/openarmature/prompts/__init__.py b/src/openarmature/prompts/__init__.py
index 184463a..df78126 100644
--- a/src/openarmature/prompts/__init__.py
+++ b/src/openarmature/prompts/__init__.py
@@ -10,14 +10,22 @@
     PromptRenderError,
     PromptStoreUnavailable,
 )
+from .group import PromptGroup
+from .hashing import compute_rendered_hash, compute_template_hash
+from .prompt import Prompt, PromptResult
 
 __all__ = [
     "PROMPT_NOT_FOUND",
     "PROMPT_RENDER_ERROR",
     "PROMPT_STORE_UNAVAILABLE",
     "PROMPT_TRANSIENT_CATEGORIES",
+    "Prompt",
     "PromptError",
+    "PromptGroup",
     "PromptNotFound",
     "PromptRenderError",
+    "PromptResult",
     "PromptStoreUnavailable",
+    "compute_rendered_hash",
+    "compute_template_hash",
 ]
diff --git a/src/openarmature/prompts/group.py b/src/openarmature/prompts/group.py
new file mode 100644
index 0000000..62bbe3b
--- /dev/null
+++ b/src/openarmature/prompts/group.py
@@ -0,0 +1,36 @@
+"""PromptGroup — composition pattern for tracing related prompts together."""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, ConfigDict, model_validator
+
+from .prompt import PromptResult
+
+
+class PromptGroup(BaseModel):
+    """Two or more PromptResult instances, in order, sharing a single
+    observability group name.
+
+    A group only labels related prompts for tracing; it does not drive
+    control flow, and user code still executes each member's LLM call
+    itself. What the group contributes is ``group_name``, which
+    observability propagates onto every member call's span so trace
+    UIs can render them as one unit.
+
+    Attributes:
+        group_name: Stable identifier for this group pattern.
+        members: PromptResult instances in the application's intended
+            call order (the spec does not require sequential execution).
+            Validation rejects lists with fewer than two entries.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    group_name: str
+    members: list[PromptResult]
+
+    @model_validator(mode="after")
+    def _check_min_two_members(self) -> PromptGroup:
+        if len(self.members) >= 2:
+            return self
+        raise ValueError("prompt group: members MUST contain at least two PromptResult instances")
diff --git a/src/openarmature/prompts/hashing.py b/src/openarmature/prompts/hashing.py
new file mode 100644
index 0000000..7e62b2a
--- /dev/null
+++ b/src/openarmature/prompts/hashing.py
@@ -0,0 +1,39 @@
+"""Content-derived hash helpers for prompt-management identity."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+
+from openarmature.llm.messages import Message
+
+# All hashes carry a ``sha256:`` prefix so future algorithm changes are
+# self-describing. Spec §3 / §4 mark the hash function as SHOULD
+# (cryptographic) and the canonical serialization as MUST be
+# deterministic.
+_HASH_PREFIX = "sha256:"
+
+
+def compute_template_hash(template_source: str) -> str:
+    """Return the prefixed hex SHA-256 of the template source's UTF-8 bytes."""
+    raw = template_source.encode("utf-8")
+    return f"{_HASH_PREFIX}{hashlib.sha256(raw).hexdigest()}"
+
+
+def compute_rendered_hash(messages: list[Message]) -> str:
+    """Hash the rendered ``messages`` list into a prefixed SHA-256 digest.
+
+    Every message is dumped with ``model_dump(mode="json")`` and the list
+    is serialized as compact JSON with sorted keys and non-ASCII text kept
+    verbatim. The digest is taken over the UTF-8 bytes of that string, so
+    message order and every dumped field influence the result.
+    """
+    dumped = [message.model_dump(mode="json") for message in messages]
+    payload = json.dumps(
+        dumped,
+        ensure_ascii=False,
+        separators=(",", ":"),
+        sort_keys=True,
+    )
+    encoded = payload.encode("utf-8")
+    return f"{_HASH_PREFIX}{hashlib.sha256(encoded).hexdigest()}"
diff --git a/src/openarmature/prompts/prompt.py b/src/openarmature/prompts/prompt.py
new file mode 100644
index 0000000..fe18f93
--- /dev/null
+++ b/src/openarmature/prompts/prompt.py
@@ -0,0 +1,89 @@
+"""Prompt and PromptResult records."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict
+
+from openarmature.llm.messages import Message
+
+
+class Prompt(BaseModel):
+    """An unrendered template plus identity metadata.
+
+    A prompt carries enough information to be rendered, traced, and
+    content-addressed without a backend round-trip. ``template`` is
+    the raw template source string (Jinja2 syntax in Python); it is
+    compiled on render, keeping ``Prompt`` serializable and
+    engine-agnostic. Unknown fields are rejected (``extra="forbid"``).
+
+    Attributes:
+        name: Stable identifier within the backend.
+        version: Backend-defined version string. Two distinct version
+            strings denote distinct prompt contents.
+        label: The label under which this prompt was fetched
+            (e.g., "production", "latest", "variant-a").
+        template: Raw template source.
+        template_hash: SHA-256 of the raw template source, formatted as
+            ``"sha256:<hex>"``. Not verified against ``template`` here.
+        fetched_at: Time the prompt was fetched from its backend.
+            When a caching backend serves a cached result,
+            ``fetched_at`` MUST reflect the original fetch time, not
+            the cache hit time.
+        metadata: Optional backend-supplied metadata; defaults to ``None``.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str
+    version: str
+    label: str
+    template: str
+    template_hash: str
+    fetched_at: datetime
+    metadata: dict[str, Any] | None = None
+
+
+class PromptResult(BaseModel):
+    """The rendered output of applying variables to a prompt.
+
+    Holds the rendered ``Message`` sequence (ready to pass to
+    ``Provider.complete()``), the identity fields copied from the
+    source prompt, and a ``rendered_hash`` that captures the rendered
+    content. Unknown fields are rejected (``extra="forbid"``).
+
+    The ``rendered_hash`` is the cache-key value most useful to
+    downstream consumers: two renders with the same template AND
+    the same variables produce the same hash.
+
+    Attributes:
+        name: Propagated from the source Prompt.
+        version: Propagated from the source Prompt.
+        label: Propagated from the source Prompt.
+        template_hash: Propagated from the source Prompt.
+        rendered_hash: SHA-256 of the canonical serialization of
+            the rendered messages list.
+        messages: Ordered ``Message`` records; this model does not check non-emptiness.
+        variables: Variable mapping used to render. v1 policy:
+            pass-through unchanged (no automatic redaction). Keys
+            are always preserved; future redaction policies would
+            redact values, never strip keys.
+        fetched_at: Propagated from the source Prompt.
+        rendered_at: Time this PromptResult was rendered. Distinct
+            from ``fetched_at``: a single fetched prompt may render
+            many times.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str
+    version: str
+    label: str
+    template_hash: str
+    rendered_hash: str
+    messages: list[Message]
+    variables: dict[str, Any]
+    fetched_at: datetime
+    rendered_at: datetime

From f448e2a4fd5565cbab15646913f00b2cc1f931be Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 18:49:56 -0700
Subject: [PATCH 03/12] feat(prompts): PromptBackend protocol, PromptManager,
 jinja2 dep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PromptBackend is a runtime-checkable Protocol with a single async
fetch(name, label) method, matching the openarmature.llm.Provider
pattern. The docstring restates the §5 contract: reentrant, no
mutation, raises PromptNotFound / PromptStoreUnavailable, and
the rule that cached results MUST preserve the original fetched_at.

PromptManager composes one or more PromptBackends and exposes:

- fetch: §8 fallback semantics. First successful fetch wins;
  PromptNotFound STOPS the chain (logical absence MUST NOT
  silently substitute); PromptStoreUnavailable continues to the
  next backend; all-exhausted raises PromptStoreUnavailable with
  the last unavailable chained as __cause__. WARN-level log on
  each fallback per §8.
- render: synchronous string transform via Jinja2 with
  StrictUndefined per §7. Produces a single UserMessage in v1
  (multi-message decomposition deferred). UndefinedError and
  TemplateError both map to PromptRenderError carrying the
  prompt's identity + the variables + a description. Pydantic
  ValidationError on the UserMessage(content=rendered_text)
  construction (empty-string render case) also maps to
  PromptRenderError per §10's 'variable's value not coercible'
  framing.
- get: convenience equivalent to render(await fetch(...), variables).

Adds jinja2>=3.1 to runtime dependencies.
---
 pyproject.toml                       |   1 +
 src/openarmature/prompts/__init__.py |   4 +
 src/openarmature/prompts/backend.py  |  36 ++++++
 src/openarmature/prompts/manager.py  | 157 +++++++++++++++++++++++++++
 uv.lock                              |   2 +
 5 files changed, 200 insertions(+)
 create mode 100644 src/openarmature/prompts/backend.py
 create mode 100644 src/openarmature/prompts/manager.py

diff --git a/pyproject.toml b/pyproject.toml
index 2aa829e..f16133f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "pydantic>=2.7",
     "httpx>=0.27",
     "jsonschema>=4.0",
+    "jinja2>=3.1",
 ]
 
 [project.optional-dependencies]
diff --git a/src/openarmature/prompts/__init__.py b/src/openarmature/prompts/__init__.py
index df78126..d231112 100644
--- a/src/openarmature/prompts/__init__.py
+++ b/src/openarmature/prompts/__init__.py
@@ -1,5 +1,6 @@
 """Prompt-management capability — fetch, render, and trace named prompts."""
 
+from .backend import PromptBackend
 from .errors import (
     PROMPT_NOT_FOUND,
     PROMPT_RENDER_ERROR,
@@ -12,6 +13,7 @@
 )
 from .group import PromptGroup
 from .hashing import compute_rendered_hash, compute_template_hash
+from .manager import PromptManager
 from .prompt import Prompt, PromptResult
 
 __all__ = [
@@ -20,8 +22,10 @@
     "PROMPT_STORE_UNAVAILABLE",
     "PROMPT_TRANSIENT_CATEGORIES",
     "Prompt",
+    "PromptBackend",
     "PromptError",
     "PromptGroup",
+    "PromptManager",
     "PromptNotFound",
     "PromptRenderError",
     "PromptResult",
diff --git a/src/openarmature/prompts/backend.py b/src/openarmature/prompts/backend.py
new file mode 100644
index 0000000..3a79443
--- /dev/null
+++ b/src/openarmature/prompts/backend.py
@@ -0,0 +1,36 @@
+"""PromptBackend protocol."""
+
+from __future__ import annotations
+
+from typing import Protocol, runtime_checkable
+
+from .prompt import Prompt
+
+
+@runtime_checkable
+class PromptBackend(Protocol):
+    """Backend protocol — implementations and sibling packages plug into this.
+
+    One operation: ``fetch`` a prompt by name and label (backends never
+    render; that is the manager's concern). Being ``runtime_checkable``,
+    ``isinstance`` only confirms ``fetch`` exists, not its signature.
+
+    Operation semantics:
+
+    - ``fetch()`` MUST be reentrant: multiple concurrent calls on the
+      same backend are permitted.
+    - ``fetch()`` does NOT render or otherwise mutate the template.
+    - ``fetch()`` MUST raise ``PromptNotFound`` when no prompt matches
+      ``(name, label)``.
+    - ``fetch()`` MUST raise ``PromptStoreUnavailable`` when the
+      backend is unreachable (network failure, filesystem I/O error,
+      vendor API timeout).
+
+    Backends MAY cache their own results internally. When a backend
+    serves a cached result, the returned Prompt's ``template_hash``
+    MUST still be correct for the served template (caching MUST NOT
+    break content-addressing), and ``fetched_at`` MUST reflect the
+    original fetch time, not the cache hit time.
+    """
+
+    async def fetch(self, name: str, label: str = "production") -> Prompt: ...
diff --git a/src/openarmature/prompts/manager.py b/src/openarmature/prompts/manager.py
new file mode 100644
index 0000000..6d1b3fc
--- /dev/null
+++ b/src/openarmature/prompts/manager.py
@@ -0,0 +1,157 @@
+"""PromptManager — user-facing fetch + render + composite-fallback."""
+
+from __future__ import annotations
+
+import logging
+from datetime import UTC, datetime
+from typing import Any
+
+import jinja2
+from pydantic import ValidationError
+
+from openarmature.llm.messages import Message, UserMessage
+
+from .backend import PromptBackend
+from .errors import PromptNotFound, PromptRenderError, PromptStoreUnavailable
+from .hashing import compute_rendered_hash
+from .prompt import Prompt, PromptResult
+
+_log = logging.getLogger(__name__)
+
+
+class PromptManager:
+    """Composes one or more PromptBackends and exposes fetch + render.
+
+    Users interact with the manager; backends are an implementation
+    detail of construction. The manager owns:
+
+    - ``fetch`` — consults backends in order per §8 fallback semantics.
+    - ``render`` — synchronous local string transform in a sandboxed
+      Jinja2 environment; produces a ``PromptResult``.
+    - ``get`` — convenience: ``render(await fetch(...), variables)``.
+    """
+
+    def __init__(self, *backends: PromptBackend) -> None:
+        """Store ``backends`` in consultation order; at least one is required."""
+        if not backends:
+            raise ValueError("PromptManager requires at least one backend")
+        self._backends: tuple[PromptBackend, ...] = backends
+
+    async def fetch(self, name: str, label: str = "production") -> Prompt:
+        """Consult composed backends in order, applying §8 fallback.
+
+        - First successful fetch wins; further backends are not consulted.
+        - ``PromptNotFound`` from any backend STOPS the chain — the
+          error propagates. Logical absence MUST NOT silently
+          substitute a stale alternative.
+        - ``PromptStoreUnavailable`` from a backend is logged at WARNING
+          and the next backend is tried. Once ALL backends have failed
+          that way, a new ``PromptStoreUnavailable`` chains the last one.
+        """
+        last_unavailable: PromptStoreUnavailable | None = None
+        for backend in self._backends:
+            try:
+                return await backend.fetch(name, label)
+            except PromptNotFound:
+                raise
+            except PromptStoreUnavailable as exc:
+                last_unavailable = exc
+                _log.warning(
+                    "prompt backend %r unavailable for (%r, %r); falling back",
+                    backend,
+                    name,
+                    label,
+                )
+        assert last_unavailable is not None
+        raise PromptStoreUnavailable(
+            f"all prompt backends unavailable for ({name!r}, {label!r})"
+        ) from last_unavailable
+
+    def render(
+        self,
+        prompt: Prompt,
+        variables: dict[str, Any] | None = None,
+    ) -> PromptResult:
+        """Apply ``variables`` to ``prompt.template`` and return a PromptResult.
+
+        Render is synchronous — no I/O. Variables are strict by
+        default per §7: a template reference to a name not in
+        ``variables`` raises ``PromptRenderError``, as do template
+        errors and a rendered text that ``UserMessage`` rejects.
+
+        The render output is always a single ``UserMessage`` carrying
+        the rendered text in v1. Multi-message decomposition (system
+        + user split) is deferred to a follow-on; callers needing
+        that today construct the messages list manually.
+        """
+        from jinja2.sandbox import SandboxedEnvironment
+
+        variables = variables or {}
+        # Templates come from backends (files, vendor stores), not from
+        # trusted code: the sandbox rejects unsafe attribute access with
+        # SecurityError, a TemplateError subclass handled below.
+        env = SandboxedEnvironment(
+            undefined=jinja2.StrictUndefined,
+            autoescape=False,
+            keep_trailing_newline=True,
+        )
+        try:
+            template = env.from_string(prompt.template)
+            rendered_text = template.render(**variables)
+        except jinja2.UndefinedError as exc:
+            raise PromptRenderError(
+                f"undefined variable rendering ({prompt.name!r}, {prompt.label!r}): {exc}",
+                name=prompt.name,
+                version=prompt.version,
+                label=prompt.label,
+                variables=variables,
+                description=str(exc),
+            ) from exc
+        except jinja2.TemplateError as exc:
+            raise PromptRenderError(
+                f"template error rendering ({prompt.name!r}, {prompt.label!r}): {exc}",
+                name=prompt.name,
+                version=prompt.version,
+                label=prompt.label,
+                variables=variables,
+                description=str(exc),
+            ) from exc
+
+        # A template can render to an empty string (e.g.
+        # ``{{ x if x else '' }}`` with ``x=None``); ``UserMessage(content="")``
+        # then raises ValidationError per messages.py's non-empty rule,
+        # which counts as a render failure under §10's coercion framing.
+        try:
+            messages: list[Message] = [UserMessage(content=rendered_text)]
+            rendered_hash = compute_rendered_hash(messages)
+        except ValidationError as exc:
+            raise PromptRenderError(
+                f"rendered output invalid for ({prompt.name!r}, {prompt.label!r}): {exc}",
+                name=prompt.name,
+                version=prompt.version,
+                label=prompt.label,
+                variables=variables,
+                description=str(exc),
+            ) from exc
+
+        return PromptResult(
+            name=prompt.name,
+            version=prompt.version,
+            label=prompt.label,
+            template_hash=prompt.template_hash,
+            rendered_hash=rendered_hash,
+            messages=messages,
+            variables=variables,
+            fetched_at=prompt.fetched_at,
+            rendered_at=datetime.now(UTC),
+        )
+
+    async def get(
+        self,
+        name: str,
+        label: str = "production",
+        variables: dict[str, Any] | None = None,
+    ) -> PromptResult:
+        """Convenience equivalent to ``render(await fetch(name, label), variables)``."""
+        prompt = await self.fetch(name, label)
+        return self.render(prompt, variables)
diff --git a/uv.lock b/uv.lock
index c24ddf9..868e158 100644
--- a/uv.lock
+++ b/uv.lock
@@ -889,6 +889,7 @@ version = "0.5.0"
 source = { editable = "." }
 dependencies = [
     { name = "httpx" },
+    { name = "jinja2" },
     { name = "jsonschema" },
     { name = "pydantic" },
 ]
@@ -926,6 +927,7 @@ examples = [
 [package.metadata]
 requires-dist = [
     { name = "httpx", specifier = ">=0.27" },
+    { name = "jinja2", specifier = ">=3.1" },
     { name = "jsonschema", specifier = ">=4.0" },
     { name = "opentelemetry-api", marker = "extra == 'otel'", specifier = ">=1.27,<3" },
     { name = "opentelemetry-instrumentation-logging", marker = "extra == 'otel'", specifier = ">=0.62.0b1" },

From e5d8f1bfce8f679ba84df20e218b3d791c9dc3ff Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 18:54:48 -0700
Subject: [PATCH 04/12] feat(prompts): FilesystemPromptBackend + OTel attribute
 propagation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FilesystemPromptBackend reads prompts from <root>/<label>/<name>.j2.
The subdirectory-per-label layout keeps name-collisions across
labels distinct without prefix-escape concerns. version is
derived from the first 12 hex chars of the template_hash so two
file contents map deterministically to two distinct versions
without needing a sidecar metadata file (spec §3 lets backends
pick any stable identifier). The docstring notes that future
caching backends MUST preserve the original fetched_at on
returned Prompts per spec §3.

Adds the context-variable propagation mechanism for spec §11
LLM-call span attributes:

- openarmature.prompts.context module exposes
  with_active_prompt(result) and with_active_prompt_group(group)
  context managers plus current_prompt_result() /
  current_prompt_group() inspectors.
- OTelObserver._handle_llm_event reads the two ContextVars at LLM-
  call span start and surfaces:
    openarmature.prompt.name
    openarmature.prompt.version
    openarmature.prompt.label
    openarmature.prompt.template_hash
    openarmature.prompt.rendered_hash
    openarmature.prompt.group_name
- Nesting is innermost-wins (matches Python's natural ContextVar
  token-stacking behavior; spec §11 doesn't mandate a policy).

The attribute names match spec §11's normative list. The
mechanism (context variables) is one of the two example
mechanisms §11 names; bundling it now keeps the §11 surface
discoverable from the moment prompt-management lands.
---
 .../observability/otel/observer.py            | 18 +++++
 src/openarmature/prompts/__init__.py          | 12 +++
 src/openarmature/prompts/backends/__init__.py |  5 ++
 .../prompts/backends/filesystem.py            | 64 +++++++++++++++
 src/openarmature/prompts/context.py           | 80 +++++++++++++++++++
 5 files changed, 179 insertions(+)
 create mode 100644 src/openarmature/prompts/backends/__init__.py
 create mode 100644 src/openarmature/prompts/backends/filesystem.py
 create mode 100644 src/openarmature/prompts/context.py

diff --git a/src/openarmature/observability/otel/observer.py b/src/openarmature/observability/otel/observer.py
index 959e110..12de9f7 100644
--- a/src/openarmature/observability/otel/observer.py
+++ b/src/openarmature/observability/otel/observer.py
@@ -487,6 +487,24 @@ def _handle_llm_event(self, event: NodeEvent) -> None:
             cid = current_correlation_id()
             if cid is not None:
                 attrs["openarmature.correlation_id"] = cid
+            # Per prompt-management spec §11, surface prompt identity
+            # on the LLM-call span when the call fired inside a
+            # with_active_prompt / with_active_prompt_group context.
+            from openarmature.prompts.context import (
+                current_prompt_group,
+                current_prompt_result,
+            )
+
+            active_prompt = current_prompt_result()
+            if active_prompt is not None:
+                attrs["openarmature.prompt.name"] = active_prompt.name
+                attrs["openarmature.prompt.version"] = active_prompt.version
+                attrs["openarmature.prompt.label"] = active_prompt.label
+                attrs["openarmature.prompt.template_hash"] = active_prompt.template_hash
+                attrs["openarmature.prompt.rendered_hash"] = active_prompt.rendered_hash
+            active_group = current_prompt_group()
+            if active_group is not None:
+                attrs["openarmature.prompt.group_name"] = active_group.group_name
             span = self._tracer.start_span(
                 name="openarmature.llm.complete",
                 context=cast("Any", parent_ctx),
diff --git a/src/openarmature/prompts/__init__.py b/src/openarmature/prompts/__init__.py
index d231112..d8b16e7 100644
--- a/src/openarmature/prompts/__init__.py
+++ b/src/openarmature/prompts/__init__.py
@@ -1,6 +1,13 @@
 """Prompt-management capability — fetch, render, and trace named prompts."""
 
 from .backend import PromptBackend
+from .backends import FilesystemPromptBackend
+from .context import (
+    current_prompt_group,
+    current_prompt_result,
+    with_active_prompt,
+    with_active_prompt_group,
+)
 from .errors import (
     PROMPT_NOT_FOUND,
     PROMPT_RENDER_ERROR,
@@ -21,6 +28,7 @@
     "PROMPT_RENDER_ERROR",
     "PROMPT_STORE_UNAVAILABLE",
     "PROMPT_TRANSIENT_CATEGORIES",
+    "FilesystemPromptBackend",
     "Prompt",
     "PromptBackend",
     "PromptError",
@@ -32,4 +40,8 @@
     "PromptStoreUnavailable",
     "compute_rendered_hash",
     "compute_template_hash",
+    "current_prompt_group",
+    "current_prompt_result",
+    "with_active_prompt",
+    "with_active_prompt_group",
 ]
diff --git a/src/openarmature/prompts/backends/__init__.py b/src/openarmature/prompts/backends/__init__.py
new file mode 100644
index 0000000..55aa0de
--- /dev/null
+++ b/src/openarmature/prompts/backends/__init__.py
@@ -0,0 +1,5 @@
+"""Concrete PromptBackend implementations."""
+
+from .filesystem import FilesystemPromptBackend
+
+__all__ = ["FilesystemPromptBackend"]
diff --git a/src/openarmature/prompts/backends/filesystem.py b/src/openarmature/prompts/backends/filesystem.py
new file mode 100644
index 0000000..1dd0c51
--- /dev/null
+++ b/src/openarmature/prompts/backends/filesystem.py
@@ -0,0 +1,64 @@
+"""Reference filesystem PromptBackend."""
+
+from __future__ import annotations
+
+import asyncio
+from datetime import UTC, datetime
+from pathlib import Path
+
+from ..errors import PromptNotFound, PromptStoreUnavailable
+from ..hashing import compute_template_hash
+from ..prompt import Prompt
+
+
+class FilesystemPromptBackend:
+    """Reads prompts from the directory tree under ``root`` (``Path`` or ``str``).
+
+    Layout convention: ``<root>/<label>/<name>.j2``; the ``label``
+    subdirectory keeps same-named prompts distinct across labels (e.g.,
+    ``prompts/production/greeting.j2`` vs ``prompts/staging/greeting.j2``).
+    Spec §5 permits filesystem backends to interpret label as "a
+    subdirectory or filename suffix"; this backend picks subdirectory.
+
+    ``version`` is the first 12 hex chars of the template content hash,
+    so it tracks file content deterministically without a sidecar
+    metadata file; per spec §3 this satisfies "stable identifier".
+
+    Every fetch reads from disk — no caching. A caching backend (e.g.,
+    openarmature-langfuse) MUST preserve the original ``fetched_at`` on
+    the returned Prompt, not the cache-hit time, per spec §3.
+    """
+
+    def __init__(self, root: Path | str) -> None:
+        self._root = Path(root)
+
+    async def fetch(self, name: str, label: str = "production") -> Prompt:
+        """Read ``<root>/<label>/<name>.j2`` as UTF-8 in a worker thread.
+
+        Missing file → ``PromptNotFound``; other ``OSError`` → ``PromptStoreUnavailable``.
+        """
+        path = self._root / label / f"{name}.j2"
+        try:
+            template_source = await asyncio.to_thread(path.read_text, encoding="utf-8")
+        except FileNotFoundError as exc:
+            raise PromptNotFound(
+                f"prompt ({name!r}, {label!r}) not found under {self._root}",
+                name=name,
+                label=label,
+                backend=str(self._root),
+            ) from exc
+        except OSError as exc:
+            raise PromptStoreUnavailable(
+                f"filesystem I/O error reading ({name!r}, {label!r}): {exc}"
+            ) from exc
+
+        template_hash = compute_template_hash(template_source)
+        version = template_hash.removeprefix("sha256:")[:12]
+        return Prompt(
+            name=name,
+            version=version,
+            label=label,
+            template=template_source,
+            template_hash=template_hash,
+            fetched_at=datetime.now(UTC),
+        )
diff --git a/src/openarmature/prompts/context.py b/src/openarmature/prompts/context.py
new file mode 100644
index 0000000..4bf8fc4
--- /dev/null
+++ b/src/openarmature/prompts/context.py
@@ -0,0 +1,80 @@
+"""Context variables for propagating prompt identity to observability.
+
+Spec §11 leaves the propagation mechanism implementation-defined.
+This module provides the Python implementation: two ``ContextVar``s
+set by two context managers (``with_active_prompt`` and
+``with_active_prompt_group``), which observers read back via
+``current_prompt_result`` / ``current_prompt_group`` to surface the
+normative ``openarmature.prompt.*`` span attributes.
+
+Nesting policy: innermost-wins. When two ``with_active_prompt``
+contexts nest, the inner result is the active one for the
+duration of the inner block; the same applies to
+``with_active_prompt_group``. This matches Python's natural
+``ContextVar`` token-stacking behavior; spec §11 doesn't mandate
+a nesting policy.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from contextlib import contextmanager
+from contextvars import ContextVar
+
+from .group import PromptGroup
+from .prompt import PromptResult
+
+_active_prompt: ContextVar[PromptResult | None] = ContextVar(
+    "openarmature_active_prompt",
+    default=None,
+)
+_active_prompt_group: ContextVar[PromptGroup | None] = ContextVar(
+    "openarmature_active_prompt_group",
+    default=None,
+)
+
+
+@contextmanager
+def with_active_prompt(result: PromptResult) -> Iterator[None]:
+    """Make ``result`` the active prompt for the duration of the block.
+
+    LLM calls issued inside the block can read it back through
+    ``current_prompt_result()``; with the observability extra installed
+    the OTel observer uses it to stamp ``openarmature.prompt.name`` /
+    ``version`` / ``label`` / ``template_hash`` / ``rendered_hash`` on
+    the LLM-call span. Nested blocks are innermost-wins.
+    """
+    restore_token = _active_prompt.set(result)
+    try:
+        yield
+    finally:
+        # Always restore the enclosing value, even if the body raised.
+        _active_prompt.reset(restore_token)
+
+
+@contextmanager
+def with_active_prompt_group(group: PromptGroup) -> Iterator[None]:
+    """Make ``group`` the active prompt group for the duration of the block.
+
+    LLM calls issued inside the block can read it back through
+    ``current_prompt_group()``; the OTel observer uses it to stamp
+    ``openarmature.prompt.group_name`` on the LLM-call span, next to
+    any per-prompt attributes from a concurrently active
+    ``with_active_prompt``. Nested blocks are innermost-wins.
+    """
+    restore_token = _active_prompt_group.set(group)
+    try:
+        yield
+    finally:
+        # Always restore the enclosing value, even if the body raised.
+        _active_prompt_group.reset(restore_token)
+
+
+def current_prompt_result() -> PromptResult | None:
+    """Return the PromptResult set by the innermost ``with_active_prompt``, or ``None``."""
+    return _active_prompt.get()
+
+
+def current_prompt_group() -> PromptGroup | None:
+    """Return the PromptGroup set by the innermost ``with_active_prompt_group``, or ``None``."""
+    return _active_prompt_group.get()

From 9bdbcb7e21732343dcc6418088f903d4fdffb5ca Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 18:59:30 -0700
Subject: [PATCH 05/12] test(conformance): prompt-management harness and 12
 fixtures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds prompt-management as the fifth conformance capability:

- harness/prompt_management.py — typed YAML models for the new
  fixture shape (backends + manager + calls with target /
  operation / capture_as, plus per-call and top-level expected
  blocks for raises / result_equivalence / prompt_group /
  rendered_hash_equal / rendered_hash_different).
- harness/fixtures.py — PromptManagementFixture added to the
  discriminated union; the discriminator recognizes top-level
  'backends:' (without 'mock_provider:') as the prompt-management
  shape.
- harness/loader.py — 'prompt-management' added to CAPABILITIES
  so test_fixture_parsing.py discovers and parses the new
  fixtures.

test_prompt_management.py drives all 12 spec fixtures
(001-fetch-success through 012-prompt-result-rendered-hash-stability)
against the real PromptManager + a MockPromptBackend that
implements the protocol with optional simulate_unavailable +
preloaded prompts + a call_count for fixtures that assert
fallback chain visits. All 12 fixtures pass.
---
 tests/conformance/harness/fixtures.py         |  23 +-
 tests/conformance/harness/loader.py           |   1 +
 .../conformance/harness/prompt_management.py  | 174 ++++++++++
 tests/conformance/test_prompt_management.py   | 307 ++++++++++++++++++
 4 files changed, 500 insertions(+), 5 deletions(-)
 create mode 100644 tests/conformance/harness/prompt_management.py
 create mode 100644 tests/conformance/test_prompt_management.py

diff --git a/tests/conformance/harness/fixtures.py b/tests/conformance/harness/fixtures.py
index e14da05..7cc81ec 100644
--- a/tests/conformance/harness/fixtures.py
+++ b/tests/conformance/harness/fixtures.py
@@ -47,6 +47,7 @@
     StateSchema,
 )
 from .expectations import ExpectedBlock, LlmProviderExpected
+from .prompt_management import PromptManagementFixture
 
 
 class _ForbidExtras(BaseModel):
@@ -234,13 +235,19 @@ class GraphFixture(_ForbidExtras):
 # ---------------------------------------------------------------------------
 
 
-def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph"]:
+def _discriminate_fixture(
+    value: Any,
+) -> Literal["llm_provider", "prompt_management", "cases", "graph"]:
     """Pick the fixture shape from a raw YAML dict.
 
-    Order matters: ``mock_provider`` wins over ``cases`` because some
-    llm-provider fixtures (e.g. 003-message-validation) have BOTH —
-    ``mock_provider`` is the load-bearing discriminator, ``cases`` is just
-    the table style for sub-cases.
+    Order matters:
+
+    - ``mock_provider`` wins over ``cases`` because some llm-provider
+      fixtures (e.g. 003-message-validation) have BOTH — ``mock_provider``
+      is the load-bearing discriminator, ``cases`` is the table style.
+    - ``backends`` at the top level (without ``mock_provider``) picks
+      the prompt-management shape. Spec/prompt-management fixtures
+      always carry ``backends:``.
 
     Also handle the serialization path (where the value is a concrete
     variant) so a future ``model_dump`` through the top-level union
@@ -248,6 +255,8 @@ def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph
     """
     if isinstance(value, LlmProviderFixture):
         return "llm_provider"
+    if isinstance(value, PromptManagementFixture):
+        return "prompt_management"
     if isinstance(value, CasesFixture):
         return "cases"
     if isinstance(value, GraphFixture):
@@ -255,6 +264,8 @@ def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph
     if isinstance(value, dict):
         if "mock_provider" in value:
             return "llm_provider"
+        if "backends" in value:
+            return "prompt_management"
         if "cases" in value:
             return "cases"
     return "graph"
@@ -262,6 +273,7 @@ def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph
 
 Fixture = Annotated[
     Annotated[LlmProviderFixture, Tag("llm_provider")]
+    | Annotated[PromptManagementFixture, Tag("prompt_management")]
     | Annotated[CasesFixture, Tag("cases")]
     | Annotated[GraphFixture, Tag("graph")],
     Discriminator(_discriminate_fixture),
@@ -275,5 +287,6 @@ def _discriminate_fixture(value: Any) -> Literal["llm_provider", "cases", "graph
     "GraphFixture",
     "LlmProviderExpected",
     "LlmProviderFixture",
+    "PromptManagementFixture",
     "SubgraphDefinition",
 ]
diff --git a/tests/conformance/harness/loader.py b/tests/conformance/harness/loader.py
index 271e561..fa2def4 100644
--- a/tests/conformance/harness/loader.py
+++ b/tests/conformance/harness/loader.py
@@ -31,6 +31,7 @@
     "llm-provider",
     "pipeline-utilities",
     "observability",
+    "prompt-management",
 )
 
 CONFORMANCE_ROOT = Path(__file__).resolve().parents[3] / "openarmature-spec" / "spec"
diff --git a/tests/conformance/harness/prompt_management.py b/tests/conformance/harness/prompt_management.py
new file mode 100644
index 0000000..3eafe42
--- /dev/null
+++ b/tests/conformance/harness/prompt_management.py
@@ -0,0 +1,174 @@
+"""Typed YAML models for prompt-management conformance fixtures.
+
+Fixture shape (different from the llm-provider / graph shapes):
+
+- ``backends:`` — list of mock backend specs (each with ``name``,
+  optional ``simulate_unavailable``, and a list of ``prompts``).
+- ``manager:`` — optional manager composition (a list of backend
+  names, in fallback order).
+- ``calls:`` — list of operations to drive. Each call has a
+  ``target`` (``{backend: <name>}`` for direct backend operations,
+  ``manager`` for manager operations, or ``construct_prompt_group``,
+  which takes no ``operation``), an ``operation``, inputs, optional
+  ``capture_as`` (binds the result to a name usable by later calls /
+  final expectations), and optional per-call ``expected``.
+- ``expected:`` — optional top-level expectation block for
+  PromptGroup shape or cross-call result-equivalence assertions
+  that need access to ``capture_as`` bindings.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+class _StrictModel(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
+
+class _PermissiveModel(BaseModel):
+    """For fixture sub-shapes that vary across fixtures and don't
+    warrant a per-shape enumeration."""
+
+    model_config = ConfigDict(extra="allow")
+
+
+# ---------------------------------------------------------------------------
+# Backend / manager configuration
+# ---------------------------------------------------------------------------
+
+
+class FixturePromptSpec(_StrictModel):
+    name: str
+    label: str
+    version: str
+    template: str
+    template_hash: str
+
+
+class FixtureBackendSpec(_StrictModel):
+    name: str
+    prompts: list[FixturePromptSpec] = []
+    simulate_unavailable: bool = False
+
+
+class FixtureManagerSpec(_StrictModel):
+    backends: list[str]
+
+
+# ---------------------------------------------------------------------------
+# Call targets, operations, and expectations
+# ---------------------------------------------------------------------------
+
+
+class BackendTarget(_StrictModel):
+    backend: str
+
+
+CallTarget = BackendTarget | Literal["manager", "construct_prompt_group"]
+
+
+class FixtureExpectedRaises(_PermissiveModel):
+    category: str
+    # Optional extra carries — fixture 005 uses ``description_mentions``,
+    # ``name``, ``version``, ``label``. fixture 008 uses
+    # ``secondary_backend_call_count``. Permissive on this shape so
+    # fixtures evolve.
+    carries: dict[str, Any] | None = None
+
+
+class FixtureExpectedPrompt(_PermissiveModel):
+    """Per-call ``expected.prompt`` shape (fetch ops)."""
+
+
+class FixtureExpectedPromptResult(_PermissiveModel):
+    """Per-call ``expected.prompt_result`` shape (render / get ops)."""
+
+
+class FixtureExpectedPerCall(_StrictModel):
+    prompt: FixtureExpectedPrompt | None = None
+    prompt_result: FixtureExpectedPromptResult | None = None
+    raises: FixtureExpectedRaises | None = None
+    # Fixture 008's extra: assert how many times the secondary
+    # backend's fetch was called. Lives alongside ``raises``.
+    secondary_backend_call_count: int | None = None
+    # Fixture 009's extra: assert per-backend call counts (named
+    # backends → expected call count) after a fetch that exhausts
+    # all of them.
+    backend_call_counts: dict[str, int] | None = None
+
+
+class FixtureCall(_StrictModel):
+    target: CallTarget
+    # ``operation`` is required for fetch / render / get calls. The
+    # ``construct_prompt_group`` shape uses the target as the operation
+    # indicator (no separate operation field on the call).
+    operation: Literal["fetch", "render", "get"] | None = None
+    name: str | None = None
+    label: str | None = None
+    variables: dict[str, Any] | None = None
+    # Render-only inputs — either an inline ``fetched_prompt`` (which
+    # the harness fetches first, then renders) or a ``fetched_prompt_ref``
+    # pointing at an earlier ``capture_as``.
+    fetched_prompt: dict[str, str] | None = None
+    fetched_prompt_ref: str | None = None
+    # construct_prompt_group-only inputs.
+    group_name: str | None = None
+    members_refs: list[str] | None = None
+    capture_as: str | None = None
+    expected: FixtureExpectedPerCall | None = None
+
+
+# ---------------------------------------------------------------------------
+# Top-level expected
+# ---------------------------------------------------------------------------
+
+
+class FixtureExpectedPromptGroup(_PermissiveModel):
+    """Top-level ``expected.prompt_group`` shape (fixture 011)."""
+
+    of: str
+    group_name: str
+    member_count: int
+    member_order_preserved: bool | None = None
+    member_names: list[str] | None = None
+
+
+class FixtureExpectedResultEquivalence(_PermissiveModel):
+    """Top-level ``expected.result_equivalence`` shape (fixtures 006,
+    010, 012). Asserts equality across two or more captured results on
+    a configurable set of fields."""
+
+    of: list[str]
+    fields_must_match: list[str]
+    fields_may_differ: list[str] = []
+    # fixture 012 — assert two different captures have a DIFFERENT
+    # value on a given field.
+    fields_must_differ: list[str] = []
+
+
+class FixtureExpectedTopLevel(_StrictModel):
+    prompt_group: FixtureExpectedPromptGroup | None = None
+    result_equivalence: FixtureExpectedResultEquivalence | None = None
+    # Some fixtures (012) have multiple result-equivalence blocks; keep
+    # a plural list-form too. Empty by default.
+    result_equivalences: list[FixtureExpectedResultEquivalence] = []
+    # Fixture 012's per-pair rendered_hash equality / inequality
+    # assertions. Each entry is a 2-element list of capture names; the
+    # pair MUST share (resp. differ on) ``rendered_hash``.
+    rendered_hash_equal: list[list[str]] = []
+    rendered_hash_different: list[list[str]] = []
+
+
+# ---------------------------------------------------------------------------
+# Fixture root
+# ---------------------------------------------------------------------------
+
+
+class PromptManagementFixture(_StrictModel):
+    backends: list[FixtureBackendSpec]
+    manager: FixtureManagerSpec | None = None
+    calls: list[FixtureCall]
+    expected: FixtureExpectedTopLevel | None = None
diff --git a/tests/conformance/test_prompt_management.py b/tests/conformance/test_prompt_management.py
new file mode 100644
index 0000000..0f1751c
--- /dev/null
+++ b/tests/conformance/test_prompt_management.py
@@ -0,0 +1,307 @@
+"""Run every spec prompt-management conformance fixture against the real subpackage.
+
+The fixtures (``spec/prompt-management/conformance/``) describe
+backend + manager behavior in terms of in-process mock backends and
+``PromptManager`` operations. Unlike the llm-provider fixtures
+(which mock a remote wire), the prompt-management harness instantiates
+real ``PromptManager``s and runs them against ``MockPromptBackend``s —
+no I/O or network involved.
+"""
+
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+import pytest
+import yaml
+
+from openarmature.prompts import (
+    Prompt,
+    PromptError,
+    PromptGroup,
+    PromptManager,
+    PromptNotFound,
+    PromptResult,
+    PromptStoreUnavailable,
+)
+
+from .harness.loader import CONFORMANCE_ROOT
+from .harness.prompt_management import (
+    FixtureBackendSpec,
+    FixtureCall,
+    FixtureExpectedResultEquivalence,
+    PromptManagementFixture,
+)
+
+_CAPABILITY_DIR = CONFORMANCE_ROOT / "prompt-management" / "conformance"
+
+
+def _fixture_paths() -> list[Path]:
+    return sorted(_CAPABILITY_DIR.glob("[0-9][0-9][0-9]-*.yaml"))
+
+
+def _fixture_id(path: Path) -> str:
+    return path.stem
+
+
+# ---------------------------------------------------------------------------
+# MockPromptBackend — backend stand-in for the fixtures
+# ---------------------------------------------------------------------------
+
+
+class MockPromptBackend:
+    """In-process PromptBackend built from one fixture ``backends[i]`` entry.
+
+    ``name`` identifies the backend in ``backend_call_counts`` assertions.
+    Canned prompts are keyed by ``(name, label)``; when the spec sets
+    ``simulate_unavailable``, every fetch raises ``PromptStoreUnavailable``
+    instead of consulting them.
+
+    ``call_count`` goes up on every ``fetch`` entry — including failing
+    ones — so fixtures 008 and 009 can assert how often each backend ran.
+    """
+
+    def __init__(self, spec: FixtureBackendSpec) -> None:
+        self.name = spec.name
+        self.call_count = 0
+        self._simulate_unavailable = spec.simulate_unavailable
+        # One shared timestamp: every canned prompt reports the same fetched_at.
+        stamp = datetime.now(UTC)
+        self._prompts: dict[tuple[str, str], Prompt] = {
+            (entry.name, entry.label): Prompt(
+                name=entry.name,
+                version=entry.version,
+                label=entry.label,
+                template=entry.template,
+                template_hash=entry.template_hash,
+                fetched_at=stamp,
+            )
+            for entry in spec.prompts
+        }
+
+    async def fetch(self, name: str, label: str = "production") -> Prompt:
+        self.call_count += 1
+        if self._simulate_unavailable:
+            raise PromptStoreUnavailable(
+                f"mock backend {self.name!r} simulating unavailable for ({name!r}, {label!r})"
+            )
+        canned = self._prompts.get((name, label))
+        if canned is None:
+            raise PromptNotFound(
+                f"mock backend {self.name!r} has no prompt for ({name!r}, {label!r})",
+                name=name,
+                label=label,
+                backend=self.name,
+            )
+        return canned
+
+
+# ---------------------------------------------------------------------------
+# Fixture runner
+# ---------------------------------------------------------------------------
+
+
+async def _run_call(
+    call: FixtureCall,
+    backends: dict[str, MockPromptBackend],
+    manager: PromptManager | None,
+    captures: dict[str, Any],
+) -> tuple[Any, BaseException | None]:
+    """Drive one fixture call and report the outcome as ``(result, raised)``.
+
+    A ``PromptError`` comes back as ``(None, exc)``; success as ``(value, None)``.
+    """
+    target = call.target
+    operation = call.operation
+
+    try:
+        if target == "construct_prompt_group":
+            # Synthetic op: build a PromptGroup out of earlier captures.
+            assert call.group_name is not None
+            assert call.members_refs is not None
+            members = [captures[ref] for ref in call.members_refs]
+            return PromptGroup(group_name=call.group_name, members=members), None
+
+        if target == "manager":
+            assert manager is not None
+            if operation == "fetch":
+                assert call.name is not None and call.label is not None
+                return await manager.fetch(call.name, call.label), None
+            if operation == "get":
+                assert call.name is not None and call.label is not None
+                return await manager.get(call.name, call.label, call.variables or {}), None
+            if operation != "render":
+                raise AssertionError(f"unsupported manager operation: {operation!r}")
+            # Render a captured prompt when a ref is given; otherwise
+            # fetch the inline ``fetched_prompt`` through the manager first.
+            if call.fetched_prompt_ref is not None:
+                prompt = captures[call.fetched_prompt_ref]
+            else:
+                inline = call.fetched_prompt
+                assert inline is not None
+                prompt = await manager.fetch(inline["name"], inline["label"])
+            return manager.render(prompt, call.variables or {}), None
+
+        # ``target: {backend: <name>}`` — direct backend op.
+        assert not isinstance(target, str)
+        backend = backends[target.backend]
+        if operation == "fetch":
+            assert call.name is not None and call.label is not None
+            return await backend.fetch(call.name, call.label), None
+        raise AssertionError(f"unsupported backend operation: {operation!r}")
+    except PromptError as exc:
+        return None, exc
+
+
+# ---------------------------------------------------------------------------
+# Expectation assertions
+# ---------------------------------------------------------------------------
+
+
+def _assert_per_call(
+    call: FixtureCall,
+    result: Any,
+    raised: BaseException | None,
+    backends: dict[str, MockPromptBackend],
+) -> None:
+    """Check one call's outcome against its per-call ``expected`` block.
+
+    The raise expectation (or the absence of a raise) is checked first,
+    then backend call counts — on every call, raising or not, so a count
+    expectation is never silently skipped — then the returned value.
+    """
+    expected = call.expected
+    if expected is None:
+        return
+
+    if expected.raises is None:
+        assert raised is None, f"unexpected raise: {raised!r}"
+    else:
+        category = expected.raises.category
+        assert raised is not None, f"expected raise of category {category!r}, got result {result!r}"
+        actual = getattr(raised, "category", None)
+        assert actual == category, f"expected category {category!r}, got {actual!r} ({raised!r})"
+        for key, expected_value in (expected.raises.carries or {}).items():
+            if key == "description_mentions":
+                # Fall back to str(exc) when the error has no ``description``.
+                description = getattr(raised, "description", "") or str(raised)
+                assert expected_value in description, (
+                    f"expected description to mention {expected_value!r}, got {description!r}"
+                )
+                continue
+            actual_attr = getattr(raised, key, None)
+            assert actual_attr == expected_value, f"expected {key}={expected_value!r}, got {actual_attr!r}"
+
+    if expected.secondary_backend_call_count is not None:
+        # The backend named "secondary" is hard-wired for this expectation.
+        secondary_calls = backends["secondary"].call_count
+        assert secondary_calls == expected.secondary_backend_call_count, (
+            f"expected secondary call_count={expected.secondary_backend_call_count}, got {secondary_calls}"
+        )
+    for name, count in (expected.backend_call_counts or {}).items():
+        actual_count = backends[name].call_count
+        assert actual_count == count, f"expected {name} call_count={count}, got {actual_count}"
+
+    if raised is not None:
+        return
+
+    if expected.prompt is not None:
+        assert isinstance(result, Prompt), f"expected Prompt, got {type(result).__name__}"
+        for key, value in expected.prompt.model_dump(exclude_none=True).items():
+            actual_attr = getattr(result, key)
+            assert actual_attr == value, f"prompt.{key}: expected {value!r}, got {actual_attr!r}"
+
+    if expected.prompt_result is not None:
+        assert isinstance(result, PromptResult), f"expected PromptResult, got {type(result).__name__}"
+        for key, value in expected.prompt_result.model_dump(exclude_none=True).items():
+            if key == "rendered_hash_present":
+                if value:
+                    assert result.rendered_hash, "expected rendered_hash present"
+                continue
+            if key == "rendered_hash_non_empty_string":
+                if value:
+                    assert isinstance(result.rendered_hash, str)
+                    assert len(result.rendered_hash) > 0
+                continue
+            if key == "messages":
+                expected_messages: list[dict[str, Any]] = value
+                actual_messages = [m.model_dump(exclude_none=True) for m in result.messages]
+                # Drop fields the fixture doesn't constrain.
+                normalized = [{k: m[k] for k in m if k in {"role", "content"}} for m in actual_messages]
+                assert normalized == expected_messages, (
+                    f"messages: expected {expected_messages!r}, got {normalized!r}"
+                )
+                continue
+            actual_attr = getattr(result, key)
+            assert actual_attr == value, f"prompt_result.{key}: expected {value!r}, got {actual_attr!r}"
+
+
+def _assert_result_equivalence(eq: FixtureExpectedResultEquivalence, captures: dict[str, Any]) -> None:
+    """Compare the first capture named in ``eq.of`` with each later one:
+    ``fields_must_match`` must be equal, ``fields_must_differ`` unequal.
+    """
+    resolved = [captures[ref] for ref in eq.of]
+    reference = resolved[0]
+    for candidate in resolved[1:]:
+        for field in eq.fields_must_match:
+            assert getattr(reference, field) == getattr(candidate, field), (
+                f"result_equivalence: field {field!r} differs across {eq.of!r}"
+            )
+        for field in eq.fields_must_differ:
+            assert getattr(reference, field) != getattr(candidate, field), (
+                f"result_equivalence: field {field!r} matched across {eq.of!r} but MUST differ"
+            )
+
+
+# ---------------------------------------------------------------------------
+# Parametrized test
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("fixture_path", _fixture_paths(), ids=_fixture_id)
+async def test_prompt_management_fixture(fixture_path: Path) -> None:
+    """Load one fixture, drive its calls in order, then check top-level expectations."""
+    # Read as UTF-8 explicitly rather than the platform's default encoding.
+    raw: Any = yaml.safe_load(fixture_path.read_text(encoding="utf-8"))
+    fixture = PromptManagementFixture.model_validate(raw)
+
+    backends: dict[str, MockPromptBackend] = {spec.name: MockPromptBackend(spec) for spec in fixture.backends}
+    manager: PromptManager | None = None
+    if fixture.manager is not None:
+        ordered = [backends[name] for name in fixture.manager.backends]
+        manager = PromptManager(*ordered)
+
+    captures: dict[str, Any] = {}
+    for call in fixture.calls:
+        result, raised = await _run_call(call, backends, manager, captures)
+        _assert_per_call(call, result, raised, backends)
+        if call.capture_as is not None and raised is None:
+            captures[call.capture_as] = result
+
+    if fixture.expected is None:
+        return
+
+    if fixture.expected.prompt_group is not None:
+        pg_expected = fixture.expected.prompt_group
+        group = captures[pg_expected.of]
+        assert isinstance(group, PromptGroup)
+        assert group.group_name == pg_expected.group_name
+        assert len(group.members) == pg_expected.member_count
+        if pg_expected.member_names is not None:
+            assert [m.name for m in group.members] == pg_expected.member_names
+
+    if fixture.expected.result_equivalence is not None:
+        _assert_result_equivalence(fixture.expected.result_equivalence, captures)
+    for eq in fixture.expected.result_equivalences:
+        _assert_result_equivalence(eq, captures)
+
+    for a, b in fixture.expected.rendered_hash_equal:
+        assert captures[a].rendered_hash == captures[b].rendered_hash, (
+            f"rendered_hash differs between {a!r} and {b!r} but fixture expects equal"
+        )
+    for a, b in fixture.expected.rendered_hash_different:
+        assert captures[a].rendered_hash != captures[b].rendered_hash, (
+            f"rendered_hash matches between {a!r} and {b!r} but fixture expects different"
+        )

From 853b6d52c450468395b9a259b9c331cf8ecf6703 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 19:03:32 -0700
Subject: [PATCH 06/12] test(unit): prompts subpackage + OTel attribute
 propagation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds tests/unit/test_prompts.py (25 tests) covering gaps the
conformance fixtures don't exercise directly:

- error categories match spec §10 strings; PROMPT_TRANSIENT_CATEGORIES
  contains only prompt_store_unavailable.
- error attribute carriage (PromptNotFound name/label/backend,
  PromptRenderError name/version/label/variables/description).
- template_hash / rendered_hash determinism, prefix, and length;
  divergence for different inputs.
- Prompt extra-field rejection; PromptGroup 0/1-member rejection
  and 2+ acceptance.
- PromptManager construction (zero-backend rejection).
- Empty-string render output boundary wrap (the spec-agent's
  concern about Jinja2 cleanly rendering '' but UserMessage
  rejecting empty content — verified to surface as
  PromptRenderError).
- Identity-field propagation from Prompt to PromptResult on
  render.
- FilesystemPromptBackend disk I/O: success path, missing file
  raises PromptNotFound, OSError that isn't FileNotFoundError
  raises PromptStoreUnavailable.
- Context-var propagation: with_active_prompt / _prompt_group
  set + reset, innermost-wins nesting, async-task visibility.
- PromptManager fallback gaps: first-match short-circuits later
  backends; render returns a UserMessage carrying the rendered
  text.

Adds two OTel observer tests under tests/unit/test_observability_otel.py:

- Active prompt + active prompt group propagates the six
  openarmature.prompt.* span attributes (name, version, label,
  template_hash, rendered_hash, group_name) on the
  openarmature.llm.complete span.
- Without an active prompt, the LLM-call span carries no
  openarmature.prompt.* attributes.
---
 tests/unit/test_observability_otel.py | 134 +++++++++
 tests/unit/test_prompts.py            | 382 ++++++++++++++++++++++++++
 2 files changed, 516 insertions(+)
 create mode 100644 tests/unit/test_prompts.py

diff --git a/tests/unit/test_observability_otel.py b/tests/unit/test_observability_otel.py
index 0187423..4c46fe9 100644
--- a/tests/unit/test_observability_otel.py
+++ b/tests/unit/test_observability_otel.py
@@ -256,6 +256,140 @@ async def test_checkpoint_save_emits_zero_duration_span() -> None:
 # ---------------------------------------------------------------------------
 
 
+async def test_active_prompt_propagates_to_llm_span_attributes() -> None:
+    """Spec prompt-management §11: when an LLM call fires inside a
+    ``with_active_prompt`` context, the OTel observer MUST surface
+    ``openarmature.prompt.*`` attributes on the LLM-call span.
+    ``with_active_prompt_group`` adds ``openarmature.prompt.group_name``."""
+    from datetime import UTC, datetime
+
+    from openarmature.graph.events import NodeEvent
+    from openarmature.llm.messages import UserMessage
+    from openarmature.llm.providers.openai import _LlmEventState
+    from openarmature.observability.correlation import (
+        _reset_invocation_id,
+        _set_invocation_id,
+    )
+    from openarmature.prompts import (
+        Prompt,
+        PromptGroup,
+        PromptResult,
+        with_active_prompt,
+        with_active_prompt_group,
+    )
+
+    exporter = InMemorySpanExporter()
+    observer = OTelObserver(span_processor=SimpleSpanProcessor(exporter))
+
+    now = datetime.now(UTC)
+    prompt = Prompt(
+        name="greeting",
+        version="v1",
+        label="production",
+        template="Hello, {{ user }}!",
+        template_hash="sha256:tpl",
+        fetched_at=now,
+    )
+    result = PromptResult(
+        name=prompt.name,
+        version=prompt.version,
+        label=prompt.label,
+        template_hash=prompt.template_hash,
+        rendered_hash="sha256:rendered",
+        messages=[UserMessage(content="Hello, Alice!")],
+        variables={"user": "Alice"},
+        fetched_at=now,
+        rendered_at=now,
+    )
+    group = PromptGroup(group_name="classifier_chain", members=[result, result])
+
+    token = _set_invocation_id("inv-1")
+    try:
+        with with_active_prompt(result), with_active_prompt_group(group):
+            started = NodeEvent(
+                node_name="openarmature.llm.complete",
+                namespace=("openarmature.llm.complete",),
+                step=-1,
+                phase="started",
+                pre_state=_LlmEventState(call_id="test-call-prompt", model="test-m"),
+                post_state=None,
+                error=None,
+                parent_states=(),
+            )
+            completed = NodeEvent(
+                node_name="openarmature.llm.complete",
+                namespace=("openarmature.llm.complete",),
+                step=-1,
+                phase="completed",
+                pre_state=_LlmEventState(call_id="test-call-prompt", model="test-m", finish_reason="stop"),
+                post_state=None,
+                error=None,
+                parent_states=(),
+            )
+            await observer(started)
+            await observer(completed)
+    finally:
+        _reset_invocation_id(token)
+
+    observer.shutdown()
+    llm_spans = [s for s in exporter.get_finished_spans() if s.name == "openarmature.llm.complete"]
+    assert len(llm_spans) == 1
+    attrs = llm_spans[0].attributes or {}
+    assert attrs.get("openarmature.prompt.name") == "greeting"
+    assert attrs.get("openarmature.prompt.version") == "v1"
+    assert attrs.get("openarmature.prompt.label") == "production"
+    assert attrs.get("openarmature.prompt.template_hash") == "sha256:tpl"
+    assert attrs.get("openarmature.prompt.rendered_hash") == "sha256:rendered"
+    assert attrs.get("openarmature.prompt.group_name") == "classifier_chain"
+
+
+async def test_llm_span_has_no_prompt_attributes_when_no_active_prompt() -> None:
+    """Without ``with_active_prompt``, the LLM-call span MUST NOT carry
+    ``openarmature.prompt.*`` attributes."""
+    from openarmature.graph.events import NodeEvent
+    from openarmature.llm.providers.openai import _LlmEventState
+    from openarmature.observability.correlation import (
+        _reset_invocation_id,
+        _set_invocation_id,
+    )
+
+    exporter = InMemorySpanExporter()
+    observer = OTelObserver(span_processor=SimpleSpanProcessor(exporter))
+
+    token = _set_invocation_id("inv-2")
+    try:
+        started = NodeEvent(
+            node_name="openarmature.llm.complete",
+            namespace=("openarmature.llm.complete",),
+            step=-1,
+            phase="started",
+            pre_state=_LlmEventState(call_id="test-call-noprompt", model="test-m"),
+            post_state=None,
+            error=None,
+            parent_states=(),
+        )
+        completed = NodeEvent(
+            node_name="openarmature.llm.complete",
+            namespace=("openarmature.llm.complete",),
+            step=-1,
+            phase="completed",
+            pre_state=_LlmEventState(call_id="test-call-noprompt", model="test-m", finish_reason="stop"),
+            post_state=None,
+            error=None,
+            parent_states=(),
+        )
+        await observer(started)
+        await observer(completed)
+    finally:
+        _reset_invocation_id(token)
+    observer.shutdown()
+
+    llm_spans = [s for s in exporter.get_finished_spans() if s.name == "openarmature.llm.complete"]
+    assert len(llm_spans) == 1
+    attrs = llm_spans[0].attributes or {}
+    assert not any(k.startswith("openarmature.prompt.") for k in attrs)
+
+
 async def test_disable_llm_spans_skips_llm_provider_span() -> None:
     """Spec §5.5: ``disable_llm_spans=True`` MUST suppress the
     LLM-provider span emission while leaving all other spans intact."""
diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py
new file mode 100644
index 0000000..b3ffe76
--- /dev/null
+++ b/tests/unit/test_prompts.py
@@ -0,0 +1,382 @@
+"""Focused unit tests for the prompts subpackage.
+
+The conformance suite (``tests/conformance/test_prompt_management.py``)
+covers the spec's behavioral surface end-to-end against fixtures
+001-012. These unit tests fill gaps the conformance fixtures don't
+exercise directly: per-class construction validation,
+FilesystemPromptBackend disk I/O, hashing helpers, context-variable
+propagation, and the empty-string-render boundary wrap.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from openarmature.llm.messages import Message, UserMessage
+from openarmature.prompts import (
+    PROMPT_NOT_FOUND,
+    PROMPT_RENDER_ERROR,
+    PROMPT_STORE_UNAVAILABLE,
+    PROMPT_TRANSIENT_CATEGORIES,
+    FilesystemPromptBackend,
+    Prompt,
+    PromptError,
+    PromptGroup,
+    PromptManager,
+    PromptNotFound,
+    PromptRenderError,
+    PromptResult,
+    PromptStoreUnavailable,
+    compute_rendered_hash,
+    compute_template_hash,
+    current_prompt_group,
+    current_prompt_result,
+    with_active_prompt,
+    with_active_prompt_group,
+)
+
+# ---------------------------------------------------------------------------
+# Error class hierarchy + categories
+# ---------------------------------------------------------------------------
+
+
+def test_error_categories_match_spec() -> None:
+    assert PromptNotFound.category == "prompt_not_found"
+    assert PromptRenderError.category == "prompt_render_error"
+    assert PromptStoreUnavailable.category == "prompt_store_unavailable"
+    assert PROMPT_NOT_FOUND == "prompt_not_found"
+    assert PROMPT_RENDER_ERROR == "prompt_render_error"
+    assert PROMPT_STORE_UNAVAILABLE == "prompt_store_unavailable"
+
+
+def test_transient_categories_contains_only_store_unavailable() -> None:
+    assert PROMPT_TRANSIENT_CATEGORIES == frozenset({PROMPT_STORE_UNAVAILABLE})
+
+
+def test_prompt_not_found_carries_identity_attributes() -> None:
+    exc = PromptNotFound("nope", name="greeting", label="production", backend="local")
+    assert exc.name == "greeting"
+    assert exc.label == "production"
+    assert exc.backend == "local"
+    assert isinstance(exc, PromptError)
+
+
+def test_prompt_render_error_carries_identity_and_variables() -> None:
+    exc = PromptRenderError(
+        "boom",
+        name="greeting",
+        version="v1",
+        label="production",
+        variables={"user": "Alice"},
+        description="undefined: day",
+    )
+    assert exc.name == "greeting"
+    assert exc.version == "v1"
+    assert exc.label == "production"
+    assert exc.variables == {"user": "Alice"}
+    assert exc.description == "undefined: day"
+
+
+# ---------------------------------------------------------------------------
+# Hashing helpers
+# ---------------------------------------------------------------------------
+
+
+def test_template_hash_is_deterministic_and_prefixed() -> None:
+    a = compute_template_hash("Hello, {{ user }}!")
+    b = compute_template_hash("Hello, {{ user }}!")
+    assert a == b
+    assert a.startswith("sha256:")
+    assert len(a) == len("sha256:") + 64
+
+
+def test_template_hash_differs_for_different_inputs() -> None:
+    a = compute_template_hash("Hello!")
+    b = compute_template_hash("Goodbye!")
+    assert a != b
+
+
+def test_rendered_hash_is_deterministic() -> None:
+    msgs: list[Message] = [UserMessage(content="Hello, Alice!")]
+    a = compute_rendered_hash(msgs)
+    b = compute_rendered_hash(msgs)
+    assert a == b
+    assert a.startswith("sha256:")
+
+
+def test_rendered_hash_differs_for_different_message_content() -> None:
+    msgs_a: list[Message] = [UserMessage(content="Hello, Alice!")]
+    msgs_b: list[Message] = [UserMessage(content="Hello, Bob!")]
+    a = compute_rendered_hash(msgs_a)
+    b = compute_rendered_hash(msgs_b)
+    assert a != b
+
+
+# ---------------------------------------------------------------------------
+# Type construction
+# ---------------------------------------------------------------------------
+
+
+def _make_prompt(template: str = "Hello, {{ user }}!") -> Prompt:
+    return Prompt(
+        name="greeting",
+        version="v1",
+        label="production",
+        template=template,
+        template_hash=compute_template_hash(template),
+        fetched_at=datetime.now(UTC),
+    )
+
+
+def test_prompt_extra_fields_forbidden() -> None:
+    with pytest.raises(ValueError, match="extra"):
+        Prompt.model_validate(
+            {
+                "name": "greeting",
+                "version": "v1",
+                "label": "production",
+                "template": "Hi",
+                "template_hash": "sha256:abc",
+                "fetched_at": datetime.now(UTC),
+                "unknown_field": "not allowed",
+            }
+        )
+
+
+def test_prompt_group_rejects_zero_members() -> None:
+    with pytest.raises(ValueError, match="at least two"):
+        PromptGroup(group_name="g", members=[])
+
+
+def test_prompt_group_rejects_one_member() -> None:
+    prompt = _make_prompt()
+    pr = PromptResult(
+        name=prompt.name,
+        version=prompt.version,
+        label=prompt.label,
+        template_hash=prompt.template_hash,
+        rendered_hash="sha256:abc",
+        messages=[UserMessage(content="x")],
+        variables={},
+        fetched_at=prompt.fetched_at,
+        rendered_at=datetime.now(UTC),
+    )
+    with pytest.raises(ValueError, match="at least two"):
+        PromptGroup(group_name="g", members=[pr])
+
+
+def test_prompt_group_accepts_two_or_more_members() -> None:
+    prompt = _make_prompt()
+    pr = PromptResult(
+        name=prompt.name,
+        version=prompt.version,
+        label=prompt.label,
+        template_hash=prompt.template_hash,
+        rendered_hash="sha256:abc",
+        messages=[UserMessage(content="x")],
+        variables={},
+        fetched_at=prompt.fetched_at,
+        rendered_at=datetime.now(UTC),
+    )
+    PromptGroup(group_name="g", members=[pr, pr])
+    PromptGroup(group_name="g", members=[pr, pr, pr])
+
+
+# ---------------------------------------------------------------------------
+# PromptManager — construction + render edge cases
+# ---------------------------------------------------------------------------
+
+
+def test_manager_requires_at_least_one_backend() -> None:
+    with pytest.raises(ValueError, match="at least one backend"):
+        PromptManager()
+
+
+def test_render_empty_string_output_maps_to_prompt_render_error() -> None:
+    # The boundary-wrap from the spec-agent's concern: a template that
+    # renders cleanly to "" through Jinja2 would construct
+    # UserMessage(content="") which Pydantic rejects.
+    prompt = _make_prompt(template="{{ x if x else '' }}")
+
+    class _NullBackend:
+        async def fetch(self, name: str, label: str = "production") -> Prompt:
+            return prompt
+
+    manager = PromptManager(_NullBackend())
+    with pytest.raises(PromptRenderError) as exc_info:
+        manager.render(prompt, {"x": None})
+    assert exc_info.value.name == "greeting"
+    assert exc_info.value.label == "production"
+
+
+def test_render_propagates_identity_fields() -> None:
+    prompt = _make_prompt()
+
+    class _Backend:
+        async def fetch(self, name: str, label: str = "production") -> Prompt:
+            return prompt
+
+    manager = PromptManager(_Backend())
+    result = manager.render(prompt, {"user": "Alice"})
+    assert result.name == prompt.name
+    assert result.version == prompt.version
+    assert result.label == prompt.label
+    assert result.template_hash == prompt.template_hash
+    assert result.fetched_at == prompt.fetched_at
+    assert result.variables == {"user": "Alice"}
+    assert len(result.messages) == 1
+
+
+# ---------------------------------------------------------------------------
+# FilesystemPromptBackend
+# ---------------------------------------------------------------------------
+
+
+async def test_filesystem_backend_fetch_success(tmp_path: Path) -> None:
+    label_dir = tmp_path / "production"
+    label_dir.mkdir()
+    (label_dir / "greeting.j2").write_text("Hello, {{ user }}!", encoding="utf-8")
+
+    backend = FilesystemPromptBackend(tmp_path)
+    prompt = await backend.fetch("greeting", "production")
+    assert prompt.name == "greeting"
+    assert prompt.label == "production"
+    assert prompt.template == "Hello, {{ user }}!"
+    assert prompt.template_hash == compute_template_hash("Hello, {{ user }}!")
+    # version derived from first 12 hex chars of template_hash
+    assert prompt.version == prompt.template_hash.removeprefix("sha256:")[:12]
+
+
+async def test_filesystem_backend_fetch_missing_file_raises_not_found(tmp_path: Path) -> None:
+    backend = FilesystemPromptBackend(tmp_path)
+    with pytest.raises(PromptNotFound) as exc_info:
+        await backend.fetch("missing", "production")
+    assert exc_info.value.name == "missing"
+    assert exc_info.value.label == "production"
+    assert exc_info.value.backend == str(tmp_path)
+
+
+async def test_filesystem_backend_io_error_raises_store_unavailable(tmp_path: Path) -> None:
+    # Make the label dir a regular file, not a directory; reading a path
+    # beneath it then raises an OSError that is NOT FileNotFoundError.
+    (tmp_path / "production").write_text("not a directory", encoding="utf-8")
+    backend = FilesystemPromptBackend(tmp_path)
+    with pytest.raises(PromptStoreUnavailable):
+        await backend.fetch("foo", "production")
+
+
+# ---------------------------------------------------------------------------
+# Context-variable propagation
+# ---------------------------------------------------------------------------
+
+
+def _make_prompt_result() -> PromptResult:
+    prompt = _make_prompt()
+    return PromptResult(
+        name=prompt.name,
+        version=prompt.version,
+        label=prompt.label,
+        template_hash=prompt.template_hash,
+        rendered_hash="sha256:rendered",
+        messages=[UserMessage(content="hi")],
+        variables={"user": "Alice"},
+        fetched_at=prompt.fetched_at,
+        rendered_at=datetime.now(UTC),
+    )
+
+
+def test_current_prompt_result_default_is_none() -> None:
+    assert current_prompt_result() is None
+
+
+def test_with_active_prompt_sets_and_resets() -> None:
+    pr = _make_prompt_result()
+    assert current_prompt_result() is None
+    with with_active_prompt(pr):
+        assert current_prompt_result() is pr
+    assert current_prompt_result() is None
+
+
+def test_with_active_prompt_innermost_wins() -> None:
+    outer = _make_prompt_result()
+    inner = _make_prompt_result()
+    with with_active_prompt(outer):
+        assert current_prompt_result() is outer
+        with with_active_prompt(inner):
+            assert current_prompt_result() is inner
+        assert current_prompt_result() is outer
+    assert current_prompt_result() is None
+
+
+def test_with_active_prompt_group_default_none_and_sets() -> None:
+    pr1 = _make_prompt_result()
+    pr2 = _make_prompt_result()
+    group = PromptGroup(group_name="g", members=[pr1, pr2])
+    assert current_prompt_group() is None
+    with with_active_prompt_group(group):
+        assert current_prompt_group() is group
+    assert current_prompt_group() is None
+
+
+async def test_active_prompt_visible_from_nested_async_function() -> None:
+    pr = _make_prompt_result()
+
+    async def _read_in_task() -> PromptResult | None:
+        await asyncio.sleep(0)
+        return current_prompt_result()
+
+    with with_active_prompt(pr):
+        result = await _read_in_task()
+    assert result is pr
+
+
+# ---------------------------------------------------------------------------
+# PromptManager fallback semantics (gaps the fixtures don't cover)
+# ---------------------------------------------------------------------------
+
+
+async def test_manager_fetch_first_match_short_circuits() -> None:
+    """Once a backend returns a Prompt, later backends are not consulted."""
+    prompt = _make_prompt()
+
+    class _Hit:
+        def __init__(self) -> None:
+            self.calls = 0
+
+        async def fetch(self, name: str, label: str = "production") -> Prompt:
+            self.calls += 1
+            return prompt
+
+    class _Second:
+        def __init__(self) -> None:
+            self.calls = 0
+
+        async def fetch(self, name: str, label: str = "production") -> Prompt:
+            self.calls += 1
+            return prompt
+
+    first = _Hit()
+    second = _Second()
+    manager = PromptManager(first, second)
+    await manager.fetch("greeting", "production")
+    assert first.calls == 1
+    assert second.calls == 0
+
+
+async def test_manager_render_signature_returns_user_message() -> None:
+    prompt = _make_prompt()
+
+    class _Backend:
+        async def fetch(self, name: str, label: str = "production") -> Prompt:
+            return prompt
+
+    manager = PromptManager(_Backend())
+    result = manager.render(prompt, {"user": "Alice"})
+    assert isinstance(result.messages[0], UserMessage)
+    msg_content: Any = result.messages[0].content
+    assert msg_content == "Hello, Alice!"

From ad2e896382362dbae5a1930b339ae8a17b842e15 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 19:07:22 -0700
Subject: [PATCH 07/12] docs: prompts concept page, API reference, changelog

docs/concepts/prompts.md walks through the prompt-management
capability: the fetch + render split (and why both, not just
get()), Prompt identity fields, strict-by-default variables,
composite-backend fallback (PromptStoreUnavailable continues,
PromptNotFound stops), the three error categories, PromptGroup
for tracing related prompts, observability propagation via
with_active_prompt and the six normative openarmature.prompt.*
attributes, determinism + content-addressed caching, a minimal
example, and what's out of scope (vendor backends, versioning
workflows, cache invalidation, multi-message decomposition).

docs/reference/prompts.md is an mkdocstrings autodoc page in
the same shape as docs/reference/llm.md.

mkdocs.yml gains the two new pages in the Concepts and
Reference nav sections.

CHANGELOG.md adds two entries under [Unreleased]:

- the new openarmature.prompts subpackage with PromptManager,
  the three error categories, FilesystemPromptBackend, and the
  jinja2>=3.1 runtime dependency.
- the observability propagation surface in
  openarmature.prompts.context plus the OTel observer wiring.
---
 CHANGELOG.md              |   2 +
 docs/concepts/prompts.md  | 287 ++++++++++++++++++++++++++++++++++++++
 docs/reference/prompts.md |   7 +
 mkdocs.yml                |   2 +
 4 files changed, 298 insertions(+)
 create mode 100644 docs/concepts/prompts.md
 create mode 100644 docs/reference/prompts.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2519495..146cb4d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The
 
 ### Added
 
+- **Prompt-management capability (proposal 0017, introduced in spec v0.15.0).** New `openarmature.prompts` subpackage. `PromptManager` composes one or more `PromptBackend`s, exposes `fetch` / `render` / `get`, applies the §8 fallback semantics (`prompt_store_unavailable` continues to the next backend; `prompt_not_found` stops the chain), and renders templates with Jinja2's `StrictUndefined` per §7. `Prompt` / `PromptResult` / `PromptGroup` are Pydantic models matching spec §3 / §4 / §9. Three error categories (`PromptNotFound`, `PromptRenderError`, `PromptStoreUnavailable`) with `PROMPT_TRANSIENT_CATEGORIES` exported for retry-middleware classifiers. `FilesystemPromptBackend` is the minimum local-filesystem reference backend (layout: `<root>/<label>/<name>.j2`; `version` derived from the first 12 chars of `template_hash`). New runtime dependency: `jinja2>=3.1`.
+- **`openarmature.prompts.context` — observability propagation per spec §11.** `with_active_prompt(result)` and `with_active_prompt_group(group)` context managers + `current_prompt_result()` / `current_prompt_group()` inspectors. When the OTel observer is active and an LLM call fires inside `with_active_prompt`, the `openarmature.llm.complete` span carries the normative `openarmature.prompt.*` attributes (`name`, `version`, `label`, `template_hash`, `rendered_hash`, `group_name`). Nesting is innermost-wins.
 - **Image content blocks for user messages (proposal 0015, introduced in spec v0.13.0).** `UserMessage.content` now accepts `str | list[ContentBlock]`. The block surface introduces `TextBlock`, `ImageBlock`, `ImageSourceURL`, `ImageSourceInline`, and the `ContentBlock` / `ImageSource` discriminated unions over the block / source `type` field. `ImageBlock` carries a `media_type` (required for inline sources; ignored for URL sources; typed as `str | None` so callers MAY pass any `image/*` type the bound model supports) and an optional `detail` hint (`"auto"` / `"low"` / `"high"`; `None` default omits the field from the wire so providers apply their own default). System, assistant, and tool messages stay text-string-only; image inputs are user-only in v1.
 - **`OpenAIProvider` content-array wire mapping.** When `UserMessage.content` is a content-block sequence, the wire body uses OpenAI's `content` array per §8.1.1. `TextBlock → {type: "text", text}`. `ImageBlock` with a URL source maps to `{type: "image_url", image_url: {url, detail?}}`. `ImageBlock` with an inline source constructs an RFC 2397 `data:<media_type>;base64,<base64_data>` URI and goes through the same `image_url` entry shape. Inline bytes pass through unchanged — no inspection, transcoding, or re-encoding.
 - **New error category `ProviderUnsupportedContentBlock` (non-transient).** Raised when the bound model rejects a content block type / media variant. Distinct from `ProviderInvalidRequest` (which covers spec-shape malformation): this category surfaces a *capability* mismatch, letting callers route differently (e.g., fall back to a multimodal-capable provider) without overloading the malformed-request category. Carries `block_type` ("image" / "audio" / "video") and `reason` (provider's human-readable message) when those are recoverable from the rejection. `OpenAIProvider` detects content rejection via HTTP 400 bodies — heuristic on `error.code` (known set: `image_content_not_supported`, `unsupported_image_media_type`, `audio_content_not_supported`, etc.), `error.type` (`image_parse_error`), and `error.message` ("does not support" + image/audio/video).
diff --git a/docs/concepts/prompts.md b/docs/concepts/prompts.md
new file mode 100644
index 0000000..a61f330
--- /dev/null
+++ b/docs/concepts/prompts.md
@@ -0,0 +1,287 @@
+# Prompts
+
+Named, versioned, content-addressed prompts. OpenArmature's
+prompt-management capability separates *fetching* a template
+from *rendering* it, lets you compose multiple backends with
+explicit fallback, and propagates prompt identity to your
+observability backend so trace UIs can pivot on the prompt
+that produced a call.
+
+Skip ahead to [a minimal example](#a-minimal-example) if you
+want code first.
+
+## The two halves: fetch and render
+
+A `PromptBackend` knows how to find a template by `name` and
+`label`; nothing more. A `PromptManager` composes one or more
+backends and adds rendering on top:
+
+```python
+from openarmature.prompts import PromptManager, FilesystemPromptBackend
+
+manager = PromptManager(FilesystemPromptBackend("./prompts"))
+
+# Fetch returns a Prompt (the raw template + identity metadata).
+prompt = await manager.fetch("greeting", "production")
+
+# Render applies variables and returns a PromptResult (the
+# rendered messages plus a content-addressed identity).
+result = manager.render(prompt, {"user": "Alice"})
+
+# Or do both in one shot:
+result = await manager.get("greeting", "production", {"user": "Alice"})
+```
+
+Why two operations instead of one? Three reasons:
+
+- **Inspect templates without binding variables.** Schema
+  validation, prompt diffing, tooling that walks the prompt
+  catalogue.
+- **Cache templates separately from rendered output.** The
+  fetch step is the I/O step; rendering is pure local
+  computation.
+- **Render the same template with different variables in
+  tight loops.** Map-reduce over chunks, batch evaluation,
+  fan-out fixtures.
+
+The convenience `get()` operation gives you the single-call
+shape when you want it without removing the separability.
+
+## Prompt identity
+
+Every `Prompt` carries five identity fields:
+
+- `name` — your stable identifier (`"greeting"`).
+- `version` — the backend's version string. Implementation-defined:
+  a backend MAY use semver, monotonic integers, content
+  hashes, git short-SHAs, or any stable identifier. The
+  filesystem backend derives it from the template content
+  hash.
+- `label` — the slot the prompt was fetched from
+  (`"production"`, `"latest"`, `"variant-a"`). The label is
+  part of the query.
+- `template_hash` — SHA-256 of the raw template source.
+  Two prompts with different content always have different
+  hashes.
+- `fetched_at` — when the prompt was fetched. Cached
+  backends preserve the original fetch time, not the
+  cache-hit time.
+
+The `name + version + label` triple identifies the prompt;
+the `template_hash` lets you tell two prompts apart by
+*content*, which matters when a vendor backend serves
+different content under the same `latest` label over time.
+
+A `PromptResult` propagates all of those, plus:
+
+- `rendered_hash` — SHA-256 over the rendered messages.
+  Same template + same variables → same hash. This is the
+  cache-key value a memoization layer wants.
+- `messages` — the rendered output as an LLM-ready
+  `list[Message]`. Directly consumable by
+  `Provider.complete()`.
+- `variables` — what was applied. Audit-trail friendly.
+- `rendered_at` — when the render happened. Distinct from
+  `fetched_at`.
+
+## Strict variables by default
+
+A template that references a variable not in the mapping
+raises `PromptRenderError`:
+
+```python
+prompt = await manager.fetch("greeting", "production")  # "Hello, {{ user }}! Today is {{ day }}."
+manager.render(prompt, {"user": "Alice"})  # raises — "day" is undefined
+```
+
+This is intentional. Silently substituting empty strings for
+missing variables masks bugs: a typo'd variable name produces
+a working-but-wrong prompt, often invisibly. If you need
+lenient behavior, wrap your variables in your own defaulting
+layer before passing them to `render()`.
+
+The Python implementation uses Jinja2's `StrictUndefined`.
+
+## Composite backends and fallback
+
+A manager constructed with multiple backends consults them in
+order. The fallback rule distinguishes infrastructure failure
+from logical absence:
+
+```python
+from openarmature.prompts import FilesystemPromptBackend, PromptManager
+from openarmature_langfuse import LangfusePromptBackend  # hypothetical sibling
+
+manager = PromptManager(
+    LangfusePromptBackend(api_key=...),
+    FilesystemPromptBackend("./prompts"),  # local fallback
+)
+```
+
+- **`PromptStoreUnavailable` from a backend → try the next.**
+  Network's down, vendor API is 5xx-ing, filesystem hiccupped —
+  the manager falls back. This is the "Langfuse is degraded,
+  use the local copy" case.
+- **`PromptNotFound` from a backend → STOP the chain.** The
+  error propagates. This is the "operator deliberately
+  deleted the prompt from Langfuse to retire it" case —
+  falling back here would silently resurface a stale local
+  copy under a name the operator wanted gone.
+- **All backends `PromptStoreUnavailable` → manager raises
+  `PromptStoreUnavailable`.** Everything's down.
+
+The two error categories have different operational
+meanings; the manager keeps them separated.
+
+## Errors
+
+Three categories cover every failure mode:
+
+| Error                     | When                                                                | Transient |
+| ------------------------- | ------------------------------------------------------------------- | --------- |
+| `PromptNotFound`          | No prompt matches `(name, label)` in any backend (after §8 rules)   | No        |
+| `PromptRenderError`       | Undefined variable, template parse error, coercion failure          | No        |
+| `PromptStoreUnavailable`  | Backend infrastructure failure (network, I/O, vendor API)           | Yes       |
+
+`PROMPT_TRANSIENT_CATEGORIES` is exported as a frozenset for
+retry-middleware classifiers — the same pattern
+`openarmature.llm` uses with its `TRANSIENT_CATEGORIES`.
+
+## PromptGroup — tracing related prompts together
+
+A `PromptGroup` is a structural grouping of two or more
+`PromptResult` instances under a stable `group_name`. The
+group itself doesn't execute anything; it gives observability
+a shared name to render related calls under.
+
+```python
+from openarmature.prompts import PromptGroup, with_active_prompt_group
+
+classify = await manager.get("classify", variables={"input": user_query})
+answer = await manager.get("answer", variables={"input": user_query, ...})
+
+group = PromptGroup(group_name="classifier_chain", members=[classify, answer])
+with with_active_prompt_group(group):
+    # Every LLM call in this scope carries
+    # openarmature.prompt.group_name="classifier_chain".
+    classification = await provider.complete(classify.messages, ...)
+    final = await provider.complete(answer.messages, ...)
+```
+
+Canonical patterns the primitive covers:
+
+- **Multi-stage classification** — `[coarse, fine, answer]`.
+- **RAG with reranking** — `[query_rewrite, retrieve, rerank, answer]`.
+- **Self-correction loops** — `[generate, critique, revise]`.
+- **Map-reduce over chunks** — `[chunk_classify_1..N, synthesize]`.
+
+The N=2 case ("classifier + follow-up") is the simplest;
+larger groups work under the same primitive. The group rejects
+empty and single-member shapes — single-prompt tagging is
+already served by the per-prompt observability attributes
+below.
+
+## Observability propagation
+
+When an LLM call fires inside `with_active_prompt(result)`, the
+OTel observer surfaces five prompt-identity attributes on the
+`openarmature.llm.complete` span; an active
+`with_active_prompt_group(group)` adds the sixth, `group_name`:
+
+- `openarmature.prompt.name`
+- `openarmature.prompt.version`
+- `openarmature.prompt.label`
+- `openarmature.prompt.template_hash`
+- `openarmature.prompt.rendered_hash`
+- `openarmature.prompt.group_name`
+
+Pattern:
+
+```python
+result = await manager.get("greeting", "production", {"user": "Alice"})
+with with_active_prompt(result):
+    response = await provider.complete(result.messages, ...)
+```
+
+Trace UIs can then pivot on `prompt.name`, filter on
+`prompt.template_hash` to find every call that used a given
+template version, or surface `prompt.group_name` to group
+related calls into a single workflow view.
+
+Nesting is innermost-wins. If you activate a result inside
+another active result, the inner one wins for the duration
+of the inner block.
+
+## Determinism and content-addressed caching
+
+`render` is deterministic: same `Prompt`, same `variables` →
+bytewise-identical `messages` and `rendered_hash` across
+calls. This is the cache-key contract — `rendered_hash`
+gives a downstream memoization layer the right equivalence
+relation for free.
+
+Templates MAY reference user-supplied variables that capture
+nondeterministic values (`now=datetime.now(UTC)`); the
+determinism contract applies to the render operation given
+fixed inputs, not to user-supplied variable content.
+
+## A minimal example
+
+```python
+import asyncio
+from pathlib import Path
+
+from openarmature.prompts import (
+    FilesystemPromptBackend,
+    PromptManager,
+    with_active_prompt,
+)
+
+
+async def main() -> None:
+    manager = PromptManager(FilesystemPromptBackend(Path("./prompts")))
+    result = await manager.get(
+        "greeting",
+        "production",
+        variables={"user": "Alice"},
+    )
+    print(result.messages[0].content)         # rendered text
+    print(result.rendered_hash)               # cache key
+    # Run an LLM call inside the active-prompt context so the
+    # OTel observer can surface prompt.* span attributes.
+    # with with_active_prompt(result):
+    #     response = await provider.complete(result.messages)
+    _ = with_active_prompt  # marker for the snippet above
+
+
+asyncio.run(main())
+```
+
+The filesystem backend layout is
+`<root>/<label>/<name>.j2` — for the example above,
+`./prompts/production/greeting.j2`.
+
+## What's out of scope (for now)
+
+- **Specific vendor backends** — Langfuse, PromptLayer, etc.,
+  ship as sibling packages (`openarmature-langfuse`, …). The
+  core ships the protocol + a filesystem reference.
+- **Prompt versioning workflows** — how versions are assigned,
+  promoted, pinned. Per project. The spec defines the
+  `version` field; the discipline is yours.
+- **Cache invalidation policies** — `template_hash` and
+  `rendered_hash` are the keys; the cache itself is a
+  separate concern.
+- **Prompt linting / evaluation** — quality checks belong to
+  separate tools (or the future eval capability).
+- **Multi-message render decomposition** — v1 emits a single
+  `UserMessage` carrying the rendered text. If you need
+  `system + user` splits, construct the messages list
+  manually outside `render()` for now.
+
+## Where to next
+
+- **[Model Providers](../model-providers/index.md)** —
+  what to pass `result.messages` into.
+- **[API reference: `openarmature.prompts`](../reference/prompts.md)** —
+  the full public surface.
diff --git a/docs/reference/prompts.md b/docs/reference/prompts.md
new file mode 100644
index 0000000..922f20f
--- /dev/null
+++ b/docs/reference/prompts.md
@@ -0,0 +1,7 @@
+# openarmature.prompts
+
+::: openarmature.prompts
+    options:
+      show_root_heading: false
+      show_source: false
+      heading_level: 2
diff --git a/mkdocs.yml b/mkdocs.yml
index 0a81d59..702d8e0 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -95,6 +95,7 @@ nav:
     - Composition: concepts/composition.md
     - Fan-out: concepts/fan-out.md
     - LLMs: concepts/llms.md
+    - Prompts: concepts/prompts.md
     - Observability: concepts/observability.md
     - Checkpointing: concepts/checkpointing.md
   - Model Providers:
@@ -104,6 +105,7 @@ nav:
     - reference/index.md
     - openarmature.graph: reference/graph.md
     - openarmature.llm: reference/llm.md
+    - openarmature.prompts: reference/prompts.md
     - openarmature.checkpoint: reference/checkpoint.md
     - openarmature.observability: reference/observability.md
 

From 0f458edb1e656e5c8ad025252b1103508c2bb760 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 19:39:14 -0700
Subject: [PATCH 08/12] fix: Copilot review pass on PR #45
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- manager.py: hoist Jinja2 Environment to module-level singleton
  (stateless config; thread-safe for compile + render; avoids
  re-parsing config on every render call), keep the
  autoescape-disabled-by-design comment.
- errors.py: PromptStoreUnavailable carries optional name / label /
  backends_tried for operator diagnosability; PromptManager's
  aggregate raise populates backends_tried with the ordered list
  of consulted backends. PromptRenderError docstring documents
  spec §10's non-transient mandate.
- backends/filesystem.py: widen the version-prefix length from 12
  to 16 hex chars (~64 bits; birthday-paradox boundary at ~4B
  templates), document the rationale + the wider-prefix /
  alternative-identifier guidance for higher-scale backends. Also
  carries name / label on PromptStoreUnavailable raises.
- observability/otel/observer.py: hoist prompts.context import to
  module top-level (no longer optional; cost off the per-event
  hot path).
- harness/fixtures.py: tighten the prompt-management discriminator
  from `backends:` alone to `backends:` co-occurring with
  `calls:` AND absence of graph-shape keys; avoids silently
  misrouting future fixtures that introduce a backends list for
  some other purpose.
- test_prompt_management.py: lift per-call call-count assertions
  out of the raises branch so they apply on both success and
  error paths; add internal-consistency check that a fixture's
  fields_must_match and fields_may_differ sets don't overlap.
- test_prompts.py: mock Path.read_text for the OSError-routing
  test instead of relying on platform-dependent
  NotADirectoryError behavior; update the version-prefix length
  assertion to match the widened 16-char prefix.
---
 docs/concepts/prompts.md                      | 11 +------
 .../observability/otel/observer.py            |  7 ++---
 .../prompts/backends/filesystem.py            | 19 ++++++++----
 src/openarmature/prompts/errors.py            | 25 +++++++++++++++-
 src/openarmature/prompts/manager.py           | 24 ++++++++++-----
 tests/conformance/harness/fixtures.py         | 12 +++++++-
 tests/conformance/test_prompt_management.py   | 29 +++++++++++++------
 tests/unit/test_prompts.py                    | 19 ++++++++----
 8 files changed, 101 insertions(+), 45 deletions(-)

diff --git a/docs/concepts/prompts.md b/docs/concepts/prompts.md
index a61f330..62fb9b0 100644
--- a/docs/concepts/prompts.md
+++ b/docs/concepts/prompts.md
@@ -231,11 +231,7 @@ fixed inputs, not to user-supplied variable content.
 import asyncio
 from pathlib import Path
 
-from openarmature.prompts import (
-    FilesystemPromptBackend,
-    PromptManager,
-    with_active_prompt,
-)
+from openarmature.prompts import FilesystemPromptBackend, PromptManager
 
 
 async def main() -> None:
@@ -247,11 +243,6 @@ async def main() -> None:
     )
     print(result.messages[0].content)         # rendered text
     print(result.rendered_hash)               # cache key
-    # Run an LLM call inside the active-prompt context so the
-    # OTel observer can surface prompt.* span attributes.
-    # with with_active_prompt(result):
-    #     response = await provider.complete(result.messages)
-    _ = with_active_prompt  # marker for the snippet above
 
 
 asyncio.run(main())
diff --git a/src/openarmature/observability/otel/observer.py b/src/openarmature/observability/otel/observer.py
index 12de9f7..d94e6cf 100644
--- a/src/openarmature/observability/otel/observer.py
+++ b/src/openarmature/observability/otel/observer.py
@@ -93,6 +93,8 @@
 )
 from opentelemetry.trace.propagation import set_span_in_context
 
+from openarmature.prompts.context import current_prompt_group, current_prompt_result
+
 if TYPE_CHECKING:
     from openarmature.graph.events import NodeEvent
 
@@ -490,11 +492,6 @@ def _handle_llm_event(self, event: NodeEvent) -> None:
             # Per prompt-management spec §11, surface prompt identity
             # on the LLM-call span when the call fired inside a
             # with_active_prompt / with_active_prompt_group context.
-            from openarmature.prompts.context import (
-                current_prompt_group,
-                current_prompt_result,
-            )
-
             active_prompt = current_prompt_result()
             if active_prompt is not None:
                 attrs["openarmature.prompt.name"] = active_prompt.name
diff --git a/src/openarmature/prompts/backends/filesystem.py b/src/openarmature/prompts/backends/filesystem.py
index 1dd0c51..37a3bd9 100644
--- a/src/openarmature/prompts/backends/filesystem.py
+++ b/src/openarmature/prompts/backends/filesystem.py
@@ -22,10 +22,15 @@ class FilesystemPromptBackend:
     suffix"; this backend picks subdirectory.
 
     The ``version`` field is derived from the template content hash
-    (first 12 hex chars of the SHA-256) so two file contents map
-    deterministically to two distinct version strings without
-    needing a sidecar metadata file. Per spec §3, this satisfies
-    the "stable identifier" requirement.
+    (first 16 hex chars of the SHA-256, ~64 bits) so two file
+    contents map deterministically to two distinct version strings
+    without needing a sidecar metadata file. Per spec §3, this
+    satisfies the "stable identifier" requirement. The 16-char
+    prefix puts the birthday-paradox collision boundary at ~4B
+    distinct templates — well past any realistic single-backend
+    exposure. Higher-scale backends should widen further or pick a
+    different stable identifier (semver from a sidecar metadata
+    file, git short-SHAs, etc.).
 
     This backend reads from disk on every fetch — no caching. A
     caching backend (e.g., openarmature-langfuse) that returns
@@ -49,11 +54,13 @@ async def fetch(self, name: str, label: str = "production") -> Prompt:
             ) from exc
         except OSError as exc:
             raise PromptStoreUnavailable(
-                f"filesystem I/O error reading ({name!r}, {label!r}): {exc}"
+                f"filesystem I/O error reading ({name!r}, {label!r}): {exc}",
+                name=name,
+                label=label,
             ) from exc
 
         template_hash = compute_template_hash(template_source)
-        version = template_hash.removeprefix("sha256:")[:12]
+        version = template_hash.removeprefix("sha256:")[:16]
         return Prompt(
             name=name,
             version=version,
diff --git a/src/openarmature/prompts/errors.py b/src/openarmature/prompts/errors.py
index 8d1bea3..dd36c77 100644
--- a/src/openarmature/prompts/errors.py
+++ b/src/openarmature/prompts/errors.py
@@ -53,6 +53,11 @@ class PromptRenderError(PromptError):
 
     Carries the source prompt's identity plus the variable mapping
     and a description of the render failure.
+
+    Non-transient per spec §10: retrying the same render with the
+    same prompt + variables will not succeed. If the backend later
+    serves a corrected template, callers should re-fetch and re-render
+    rather than rely on retry middleware to retry the failed render.
     """
 
     category = PROMPT_RENDER_ERROR
@@ -90,7 +95,25 @@ class PromptStoreUnavailable(PromptError):
 
     Transient: the same fetch may succeed when the backend recovers.
     ``PromptManager.fetch`` raises this only after ALL composed
-    backends raise it.
+    backends raise it; in that aggregate case ``backends_tried``
+    lists the backends consulted (in order) for operator visibility.
+    The ``__cause__`` chain preserves per-backend failure reasons.
     """
 
     category = PROMPT_STORE_UNAVAILABLE
+
+    name: str | None
+    label: str | None
+    backends_tried: list[str] | None
+
+    def __init__(
+        self,
+        *args: Any,
+        name: str | None = None,
+        label: str | None = None,
+        backends_tried: list[str] | None = None,
+    ) -> None:
+        super().__init__(*args)
+        self.name = name
+        self.label = label
+        self.backends_tried = backends_tried
diff --git a/src/openarmature/prompts/manager.py b/src/openarmature/prompts/manager.py
index 6d1b3fc..c7f7255 100644
--- a/src/openarmature/prompts/manager.py
+++ b/src/openarmature/prompts/manager.py
@@ -18,6 +18,18 @@
 
 _log = logging.getLogger(__name__)
 
+# Module-level singleton. Stateless given the configuration (no
+# filters, globals, or per-call mutation), and jinja2.Environment is
+# documented as thread-safe for compile + render — so a single
+# shared instance avoids re-parsing the env config on every render
+# call. autoescape disabled by design: render output goes to an LLM
+# API call (plain text), not an HTML response.
+_RENDER_ENV = jinja2.Environment(
+    undefined=jinja2.StrictUndefined,
+    autoescape=False,
+    keep_trailing_newline=True,
+)
+
 
 class PromptManager:
     """Composes one or more PromptBackends and exposes fetch + render.
@@ -64,7 +76,10 @@ async def fetch(self, name: str, label: str = "production") -> Prompt:
                 continue
         assert last_unavailable is not None
         raise PromptStoreUnavailable(
-            f"all prompt backends unavailable for ({name!r}, {label!r})"
+            f"all prompt backends unavailable for ({name!r}, {label!r})",
+            name=name,
+            label=label,
+            backends_tried=[type(b).__name__ for b in self._backends],
         ) from last_unavailable
 
     def render(
@@ -85,15 +100,10 @@ def render(
         list manually.
         """
         variables = variables or {}
-        env = jinja2.Environment(
-            undefined=jinja2.StrictUndefined,
-            autoescape=False,
-            keep_trailing_newline=True,
-        )
 
         rendered_text: str
         try:
-            template = env.from_string(prompt.template)
+            template = _RENDER_ENV.from_string(prompt.template)
             rendered_text = template.render(**variables)
         except jinja2.UndefinedError as exc:
             raise PromptRenderError(
diff --git a/tests/conformance/harness/fixtures.py b/tests/conformance/harness/fixtures.py
index 7cc81ec..d76bcd5 100644
--- a/tests/conformance/harness/fixtures.py
+++ b/tests/conformance/harness/fixtures.py
@@ -264,7 +264,17 @@ def _discriminate_fixture(
     if isinstance(value, dict):
         if "mock_provider" in value:
             return "llm_provider"
-        if "backends" in value:
+        # PM fixtures uniquely have ``backends:`` AND ``calls:`` and
+        # none of the graph-shape keys. Co-occurrence is the
+        # discriminator until a spec-side ``kind:`` field lands —
+        # checking ``backends:`` alone would silently misroute any
+        # future fixture that introduces a backends list for some
+        # other purpose.
+        if (
+            "backends" in value
+            and "calls" in value
+            and not any(k in value for k in ("nodes", "edges", "state", "entry"))
+        ):
             return "prompt_management"
         if "cases" in value:
             return "cases"
diff --git a/tests/conformance/test_prompt_management.py b/tests/conformance/test_prompt_management.py
index 0f1751c..51f9afc 100644
--- a/tests/conformance/test_prompt_management.py
+++ b/tests/conformance/test_prompt_management.py
@@ -169,6 +169,21 @@ def _assert_per_call(
     if call.expected is None:
         return
 
+    # Call-count assertions apply regardless of raise / success path —
+    # a future fixture asserting short-circuit on a non-raising call
+    # (e.g., first-backend hit means second-backend isn't consulted)
+    # would have its expectation silently ignored if these checks
+    # only ran inside the ``raises`` branch.
+    if call.expected.secondary_backend_call_count is not None:
+        assert backends["secondary"].call_count == call.expected.secondary_backend_call_count, (
+            f"expected secondary call_count={call.expected.secondary_backend_call_count}, "
+            f"got {backends['secondary'].call_count}"
+        )
+    if call.expected.backend_call_counts is not None:
+        for name, count in call.expected.backend_call_counts.items():
+            actual_count = backends[name].call_count
+            assert actual_count == count, f"expected {name} call_count={count}, got {actual_count}"
+
     if call.expected.raises is not None:
         assert raised is not None, (
             f"expected raise of category {call.expected.raises.category!r}, got result {result!r}"
@@ -190,15 +205,6 @@ def _assert_per_call(
                 assert actual_attr == expected_value, (
                     f"expected {key}={expected_value!r}, got {actual_attr!r}"
                 )
-        if call.expected.secondary_backend_call_count is not None:
-            assert backends["secondary"].call_count == call.expected.secondary_backend_call_count, (
-                f"expected secondary call_count={call.expected.secondary_backend_call_count}, "
-                f"got {backends['secondary'].call_count}"
-            )
-        if call.expected.backend_call_counts is not None:
-            for name, count in call.expected.backend_call_counts.items():
-                actual_count = backends[name].call_count
-                assert actual_count == count, f"expected {name} call_count={count}, got {actual_count}"
         return
 
     assert raised is None, f"unexpected raise: {raised!r}"
@@ -242,6 +248,11 @@ def _assert_result_equivalence(
     eq: FixtureExpectedResultEquivalence,
     captures: dict[str, Any],
 ) -> None:
+    overlap = set(eq.fields_must_match) & set(eq.fields_may_differ)
+    assert not overlap, (
+        f"fixture inconsistency: fields {sorted(overlap)} appear in both "
+        f"fields_must_match and fields_may_differ"
+    )
     members = [captures[ref] for ref in eq.of]
     first = members[0]
     for other in members[1:]:
diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py
index b3ffe76..333e60a 100644
--- a/tests/unit/test_prompts.py
+++ b/tests/unit/test_prompts.py
@@ -14,6 +14,7 @@
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
+from unittest.mock import patch
 
 import pytest
 
@@ -249,7 +250,7 @@ async def test_filesystem_backend_fetch_success(tmp_path: Path) -> None:
     assert prompt.template == "Hello, {{ user }}!"
     assert prompt.template_hash == compute_template_hash("Hello, {{ user }}!")
-    # version derived from first 12 hex chars of template_hash
+    # version derived from first 16 hex chars of template_hash
-    assert prompt.version == prompt.template_hash.removeprefix("sha256:")[:12]
+    assert prompt.version == prompt.template_hash.removeprefix("sha256:")[:16]
 
 
 async def test_filesystem_backend_fetch_missing_file_raises_not_found(tmp_path: Path) -> None:
@@ -262,12 +263,18 @@ async def test_filesystem_backend_fetch_missing_file_raises_not_found(tmp_path:
 
 
 async def test_filesystem_backend_io_error_raises_store_unavailable(tmp_path: Path) -> None:
-    # Make the label dir a file, not a directory; the read_text path
-    # construction will surface an OSError that is NOT FileNotFoundError.
-    (tmp_path / "production").write_text("not a directory", encoding="utf-8")
+    # Mock ``Path.read_text`` to raise a generic ``OSError`` so the
+    # test isolates the OSError-but-not-FileNotFoundError branch
+    # without depending on platform-specific filesystem semantics
+    # (Linux surfaces NotADirectoryError for "file where directory
+    # expected"; Windows can surface PermissionError or other
+    # OSError subclasses).
+    (tmp_path / "production").mkdir()
+    (tmp_path / "production" / "foo.j2").write_text("template", encoding="utf-8")
     backend = FilesystemPromptBackend(tmp_path)
-    with pytest.raises(PromptStoreUnavailable):
-        await backend.fetch("foo", "production")
+    with patch("pathlib.Path.read_text", side_effect=OSError("simulated I/O error")):
+        with pytest.raises(PromptStoreUnavailable):
+            await backend.fetch("foo", "production")
 
 
 # ---------------------------------------------------------------------------

From f5eba2abe60977d65dc9ff8d6744ca034cb4da21 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 19:49:35 -0700
Subject: [PATCH 09/12] docs(prompts): drop em dashes from concept page

Memory rule: no em dashes in user-facing copy. Reworded the new
docs/concepts/prompts.md to use colons, semicolons, parens, or
sentence restructuring in place of em dashes.
---
 docs/concepts/prompts.md | 64 ++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/docs/concepts/prompts.md b/docs/concepts/prompts.md
index 62fb9b0..41aea4f 100644
--- a/docs/concepts/prompts.md
+++ b/docs/concepts/prompts.md
@@ -51,19 +51,19 @@ shape when you want it without removing the separability.
 
 Every `Prompt` carries five identity fields:
 
-- `name` — your stable identifier (`"greeting"`).
-- `version` — the backend's version string. Implementation-defined:
+- `name`: your stable identifier (`"greeting"`).
+- `version`: the backend's version string. Implementation-defined:
   a backend MAY use semver, monotonic integers, content
   hashes, git short-SHAs, or any stable identifier. The
   filesystem backend derives it from the template content
   hash.
-- `label` — the slot the prompt was fetched from
+- `label`: the slot the prompt was fetched from
   (`"production"`, `"latest"`, `"variant-a"`). The label is
   part of the query.
-- `template_hash` — SHA-256 of the raw template source.
+- `template_hash`: SHA-256 of the raw template source.
   Two prompts with different content always have different
   hashes.
-- `fetched_at` — when the prompt was fetched. Cached
+- `fetched_at`: when the prompt was fetched. Cached
   backends preserve the original fetch time, not the
   cache-hit time.
 
@@ -74,14 +74,14 @@ different content under the same `latest` label over time.
 
 A `PromptResult` propagates all of those, plus:
 
-- `rendered_hash` — SHA-256 over the rendered messages.
+- `rendered_hash`: SHA-256 over the rendered messages.
   Same template + same variables → same hash. This is the
   cache-key value a memoization layer wants.
-- `messages` — the rendered output as an LLM-ready
+- `messages`: the rendered output as an LLM-ready
   `list[Message]`. Directly consumable by
   `Provider.complete()`.
-- `variables` — what was applied. Audit-trail friendly.
-- `rendered_at` — when the render happened. Distinct from
+- `variables`: what was applied. Audit-trail friendly.
+- `rendered_at`: when the render happened. Distinct from
   `fetched_at`.
 
 ## Strict variables by default
@@ -91,7 +91,7 @@ raises `PromptRenderError`:
 
 ```python
 prompt = await manager.fetch("greeting", "production")  # "Hello, {{ user }}! Today is {{ day }}."
-manager.render(prompt, {"user": "Alice"})  # raises — "day" is undefined
+manager.render(prompt, {"user": "Alice"})  # raises: "day" is undefined
 ```
 
 This is intentional. Silently substituting empty strings for
@@ -119,14 +119,14 @@ manager = PromptManager(
 ```
 
 - **`PromptStoreUnavailable` from a backend → try the next.**
-  Network's down, vendor API is 5xx-ing, filesystem hiccupped —
-  the manager falls back. This is the "Langfuse is degraded,
+  Network's down, vendor API is 5xx-ing, filesystem hiccupped,
+  so the manager falls back. This is the "Langfuse is degraded,
   use the local copy" case.
 - **`PromptNotFound` from a backend → STOP the chain.** The
-  error propagates. This is the "operator deliberately
-  deleted the prompt from Langfuse to retire it" case —
-  falling back here would silently resurface a stale local
-  copy under a name the operator wanted gone.
+  error propagates. This is the "operator deliberately deleted
+  the prompt from Langfuse to retire it" case; falling back here
+  would silently resurface a stale local copy under a name the
+  operator wanted gone.
 - **All backends `PromptStoreUnavailable` → manager raises
   `PromptStoreUnavailable`.** Everything's down.
 
@@ -144,10 +144,10 @@ Three categories cover every failure mode:
 | `PromptStoreUnavailable`  | Backend infrastructure failure (network, I/O, vendor API)           | Yes       |
 
 `PROMPT_TRANSIENT_CATEGORIES` is exported as a frozenset for
-retry-middleware classifiers — the same pattern
+retry-middleware classifiers, matching the pattern
 `openarmature.llm` uses with its `TRANSIENT_CATEGORIES`.
 
-## PromptGroup — tracing related prompts together
+## PromptGroup: tracing related prompts together
 
 A `PromptGroup` is a structural grouping of two or more
 `PromptResult` instances under a stable `group_name`. The
@@ -170,14 +170,14 @@ with with_active_prompt_group(group):
 
 Canonical patterns the primitive covers:
 
-- **Multi-stage classification** — `[coarse, fine, answer]`.
-- **RAG with reranking** — `[query_rewrite, retrieve, rerank, answer]`.
-- **Self-correction loops** — `[generate, critique, revise]`.
-- **Map-reduce over chunks** — `[chunk_classify_1..N, synthesize]`.
+- **Multi-stage classification**: `[coarse, fine, answer]`.
+- **RAG with reranking**: `[query_rewrite, retrieve, rerank, answer]`.
+- **Self-correction loops**: `[generate, critique, revise]`.
+- **Map-reduce over chunks**: `[chunk_classify_1..N, synthesize]`.
 
 The N=2 case ("classifier + follow-up") is the simplest;
 larger groups work under the same primitive. The group rejects
-empty and single-member shapes — single-prompt tagging is
+empty and single-member shapes; single-prompt tagging is
 already served by the per-prompt observability attributes
 below.
 
@@ -216,7 +216,7 @@ of the inner block.
 
 `render` is deterministic: same `Prompt`, same `variables` →
 bytewise-identical `messages` and `rendered_hash` across
-calls. This is the cache-key contract — `rendered_hash`
+calls. This is the cache-key contract: `rendered_hash`
 gives a downstream memoization layer the right equivalence
 relation for free.
 
@@ -249,30 +249,30 @@ asyncio.run(main())
 ```
 
 The filesystem backend layout is
-`<root>/<label>/<name>.j2` — for the example above,
+`<root>/<label>/<name>.j2`; for the example above,
 `./prompts/production/greeting.j2`.
 
 ## What's out of scope (for now)
 
-- **Specific vendor backends** — Langfuse, PromptLayer, etc.,
+- **Specific vendor backends**: Langfuse, PromptLayer, etc.,
   ship as sibling packages (`openarmature-langfuse`, …). The
   core ships the protocol + a filesystem reference.
-- **Prompt versioning workflows** — how versions are assigned,
+- **Prompt versioning workflows**: how versions are assigned,
   promoted, pinned. Per project. The spec defines the
   `version` field; the discipline is yours.
-- **Cache invalidation policies** — `template_hash` and
+- **Cache invalidation policies**: `template_hash` and
   `rendered_hash` are the keys; the cache itself is a
   separate concern.
-- **Prompt linting / evaluation** — quality checks belong to
+- **Prompt linting / evaluation**: quality checks belong to
   separate tools (or the future eval capability).
-- **Multi-message render decomposition** — v1 emits a single
+- **Multi-message render decomposition**: v1 emits a single
   `UserMessage` carrying the rendered text. If you need
   `system + user` splits, construct the messages list
   manually outside `render()` for now.
 
 ## Where to next
 
-- **[Model Providers](../model-providers/index.md)** —
+- **[Model Providers](../model-providers/index.md)**:
   what to pass `result.messages` into.
-- **[API reference: `openarmature.prompts`](../reference/prompts.md)** —
+- **[API reference: `openarmature.prompts`](../reference/prompts.md)**:
   the full public surface.

From 85b3561582ab6aa58a176243de70144001f33e63 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 19:53:04 -0700
Subject: [PATCH 10/12] docs: drop em dashes from llms.md,
 model-providers/{index,authoring}.md

Sweep of leftover em dashes from PR-1/PR-2 docs that slipped past
the no-em-dashes-in-user-facing-copy rule. Same substitutions as
the prompts.md cleanup (colons, semicolons, parens, or sentence
restructuring).
---
 docs/concepts/llms.md             | 10 +++++-----
 docs/model-providers/authoring.md |  6 +++---
 docs/model-providers/index.md     |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/concepts/llms.md b/docs/concepts/llms.md
index 9c8c154..1d8f407 100644
--- a/docs/concepts/llms.md
+++ b/docs/concepts/llms.md
@@ -196,7 +196,7 @@ response post-receive against the supplied schema; strict is a
 wire-level optimization, not a correctness requirement.
 
 `strict_mode_supported(schema)` (exported from `openarmature.llm`)
-performs the deep recursive check. The heuristic is conservative —
+performs the deep recursive check. The heuristic is conservative:
 anything not on the list below trips to `strict: false`:
 
 - Top-level schema is `type: "object"`.
@@ -240,7 +240,7 @@ A text block is the array-form equivalent of a text-string message:
 text block is normatively equivalent to one with `content="describe
 this"`.
 
-An image block carries one source — URL or inline base64 — plus an
+An image block carries one source (URL or inline base64) plus an
 optional `detail` hint:
 
 ```python
@@ -302,7 +302,7 @@ fidelity: `"auto"`, `"low"`, or `"high"`. The class default is `None`,
 which **omits the field from the wire** and lets the provider apply
 its own default (conceptually `"auto"`). Setting `detail="auto"`
 explicitly on the spec block forces the wire to carry an explicit
-`"auto"` — usually unnecessary, since the provider's default is the
+`"auto"`, usually unnecessary since the provider's default is the
 same value.
 
 ### When the model can't handle the block
@@ -324,12 +324,12 @@ provider on this category) compose cleanly against it.
 "audio", "video") and `reason` (the provider's human-readable
 message) when those are recoverable from the rejection.
 
-`OpenAIProvider` detects content rejection via the response body —
+`OpenAIProvider` detects content rejection via the response body:
 HTTP 400 with an error code like `image_content_not_supported` or a
 message like "does not support image inputs." Pre-send capability
 checks (failing fast before the wire trip when you know the model
 doesn't support images) live above the provider as userland
-middleware — the provider doesn't ship a static model-capability
+middleware; the provider doesn't ship a static model-capability
 catalog.
 
 ## Routing on parsed fields
diff --git a/docs/model-providers/authoring.md b/docs/model-providers/authoring.md
index 97c4648..28931cb 100644
--- a/docs/model-providers/authoring.md
+++ b/docs/model-providers/authoring.md
@@ -69,7 +69,7 @@ class MyProvider:
         response_schema: dict[str, Any] | type[BaseModel] | None = None,
     ) -> Response:
         # response_schema is part of the Protocol; a skeleton provider
-        # MUST NOT silently ignore it — callers expect either
+        # MUST NOT silently ignore it: callers expect either
         # Response.parsed populated or a StructuredOutputInvalid raise.
         # Until the wire path is implemented, raise
         # ProviderInvalidRequest when response_schema is set. A
@@ -206,8 +206,8 @@ of:
   `ImageSourceInline`) are stable across providers; only the wire
   shape differs. Provider authors targeting non-multimodal models
   MUST surface `ProviderUnsupportedContentBlock` when the request
-  carries blocks the bound model can't serve — pre-send or
-  post-receive per §7.
+  carries blocks the bound model can't serve (pre-send or
+  post-receive per §7).
 - **Structured output.** Threading `response_schema` through the
   request body (native `response_format` if the underlying wire
   supports it; prompt-augmentation fallback otherwise) and validating
diff --git a/docs/model-providers/index.md b/docs/model-providers/index.md
index 1a87b82..09f4579 100644
--- a/docs/model-providers/index.md
+++ b/docs/model-providers/index.md
@@ -89,7 +89,7 @@ in the LLMs concept page for the multimodal contract; see
 
 `OpenAIProvider` detects unsupported-content-block rejections via
 the response body (HTTP 400 with an error code or message indicating
-content rejection) — a post-receive mapping rather than a static
+content rejection): a post-receive mapping rather than a static
 pre-send capability check. Pre-send protection is a userland
 middleware pattern when callers know the bound model's capabilities
 up front.

From bcf63a273d0e063f91b692442241d4f5ca6a8ddc Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 20:01:01 -0700
Subject: [PATCH 11/12] fix: Copilot review round-2 pass on PR #45
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- CHANGELOG.md: update 12 → 16 hex chars to match the widened
  FilesystemPromptBackend.version derivation.
- prompt.py: PromptResult.messages gains Field(min_length=1) so
  the spec §4 'Ordered non-empty sequence' mandate is enforced at
  the type boundary, not just by the construction path.
- errors.py: PromptStoreUnavailable gains an optional causes
  list[BaseException] attribute carrying per-backend exceptions
  index-aligned to backends_tried.
- manager.py: aggregate raise populates causes with the
  per-backend exceptions in fallback order, while keeping the
  __cause__ chain pointing at the last unavailable for
  stack-trace continuity.
- manager.py: PromptManager carries a per-instance
  dict[str, jinja2.Template] keyed by template_hash. Render
  consults the cache and only re-parses on miss. Unbounded for
  v1 (typical apps have O(10) prompts; an LRU follow-on can
  land if benchmarks show memory pressure). template_hash is
  content-derived, so cache invalidation is automatic when a
  backend returns updated content.
- test_prompts.py: new tests for empty-messages rejection and
  for the compiled-template cache hit behavior.
---
 CHANGELOG.md                        |  2 +-
 src/openarmature/prompts/errors.py  | 10 +++++++--
 src/openarmature/prompts/manager.py | 21 ++++++++++++++-----
 src/openarmature/prompts/prompt.py  |  4 ++--
 tests/unit/test_prompts.py          | 32 +++++++++++++++++++++++++++++
 5 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 146cb4d..ffc04d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The
 
 ### Added
 
-- **Prompt-management capability (proposal 0017, introduced in spec v0.15.0).** New `openarmature.prompts` subpackage. `PromptManager` composes one or more `PromptBackend`s, exposes `fetch` / `render` / `get`, applies the §8 fallback semantics (`prompt_store_unavailable` continues to the next backend; `prompt_not_found` stops the chain), and renders templates with Jinja2's `StrictUndefined` per §7. `Prompt` / `PromptResult` / `PromptGroup` are Pydantic models matching spec §3 / §4 / §9. Three error categories (`PromptNotFound`, `PromptRenderError`, `PromptStoreUnavailable`) with `PROMPT_TRANSIENT_CATEGORIES` exported for retry-middleware classifiers. `FilesystemPromptBackend` is the minimum local-filesystem reference backend (layout: `<root>/<label>/<name>.j2`; `version` derived from the first 12 chars of `template_hash`). New runtime dependency: `jinja2>=3.1`.
+- **Prompt-management capability (proposal 0017, introduced in spec v0.15.0).** New `openarmature.prompts` subpackage. `PromptManager` composes one or more `PromptBackend`s, exposes `fetch` / `render` / `get`, applies the §8 fallback semantics (`prompt_store_unavailable` continues to the next backend; `prompt_not_found` stops the chain), and renders templates with Jinja2's `StrictUndefined` per §7. `Prompt` / `PromptResult` / `PromptGroup` are Pydantic models matching spec §3 / §4 / §9. Three error categories (`PromptNotFound`, `PromptRenderError`, `PromptStoreUnavailable`) with `PROMPT_TRANSIENT_CATEGORIES` exported for retry-middleware classifiers. `FilesystemPromptBackend` is the minimum local-filesystem reference backend (layout: `<root>/<label>/<name>.j2`; `version` derived from the first 16 hex chars of `template_hash`). New runtime dependency: `jinja2>=3.1`.
 - **`openarmature.prompts.context` — observability propagation per spec §11.** `with_active_prompt(result)` and `with_active_prompt_group(group)` context managers + `current_prompt_result()` / `current_prompt_group()` inspectors. When the OTel observer is active and an LLM call fires inside `with_active_prompt`, the `openarmature.llm.complete` span carries the normative `openarmature.prompt.*` attributes (`name`, `version`, `label`, `template_hash`, `rendered_hash`, `group_name`). Nesting is innermost-wins.
 - **Image content blocks for user messages (proposal 0015, introduced in spec v0.13.0).** `UserMessage.content` now accepts `str | list[ContentBlock]`. The block surface introduces `TextBlock`, `ImageBlock`, `ImageSourceURL`, `ImageSourceInline`, and the `ContentBlock` / `ImageSource` discriminated unions over the block / source `type` field. `ImageBlock` carries a `media_type` (required for inline sources; ignored for URL sources; typed as `str | None` so callers MAY pass any `image/*` type the bound model supports) and an optional `detail` hint (`"auto"` / `"low"` / `"high"`; `None` default omits the field from the wire so providers apply their own default). System, assistant, and tool messages stay text-string-only; image inputs are user-only in v1.
 - **`OpenAIProvider` content-array wire mapping.** When `UserMessage.content` is a content-block sequence, the wire body uses OpenAI's `content` array per §8.1.1. `TextBlock → {type: "text", text}`. `ImageBlock` with a URL source maps to `{type: "image_url", image_url: {url, detail?}}`. `ImageBlock` with an inline source constructs an RFC 2397 `data:<media_type>;base64,<base64_data>` URI and goes through the same `image_url` entry shape. Inline bytes pass through unchanged — no inspection, transcoding, or re-encoding.
diff --git a/src/openarmature/prompts/errors.py b/src/openarmature/prompts/errors.py
index dd36c77..b4d7e14 100644
--- a/src/openarmature/prompts/errors.py
+++ b/src/openarmature/prompts/errors.py
@@ -96,8 +96,11 @@ class PromptStoreUnavailable(PromptError):
     Transient: the same fetch may succeed when the backend recovers.
     ``PromptManager.fetch`` raises this only after ALL composed
     backends raise it; in that aggregate case ``backends_tried``
-    lists the backends consulted (in order) for operator visibility.
-    The ``__cause__`` chain preserves per-backend failure reasons.
+    lists the backends consulted (in order) and ``causes`` carries
+    the per-backend exceptions index-aligned to ``backends_tried``
+    so operators can distinguish "backend A 503 + backend B 503"
+    from "backend A 503 + backend B OSError". The ``__cause__`` chain
+    still points at the last unavailable for stack-trace continuity.
     """
 
     category = PROMPT_STORE_UNAVAILABLE
@@ -105,6 +108,7 @@ class PromptStoreUnavailable(PromptError):
     name: str | None
     label: str | None
     backends_tried: list[str] | None
+    causes: list[BaseException] | None
 
     def __init__(
         self,
@@ -112,8 +116,10 @@ def __init__(
         name: str | None = None,
         label: str | None = None,
         backends_tried: list[str] | None = None,
+        causes: list[BaseException] | None = None,
     ) -> None:
         super().__init__(*args)
         self.name = name
         self.label = label
         self.backends_tried = backends_tried
+        self.causes = causes
diff --git a/src/openarmature/prompts/manager.py b/src/openarmature/prompts/manager.py
index c7f7255..0f7df97 100644
--- a/src/openarmature/prompts/manager.py
+++ b/src/openarmature/prompts/manager.py
@@ -47,6 +47,13 @@ def __init__(self, *backends: PromptBackend) -> None:
         if not backends:
             raise ValueError("PromptManager requires at least one backend")
         self._backends: tuple[PromptBackend, ...] = backends
+        # template_hash → compiled jinja2 Template. Per-manager,
+        # unbounded. Correct by construction: template_hash is
+        # content-derived, so a backend returning updated content
+        # surfaces a fresh hash and a fresh cache entry. An LRU
+        # eviction policy can land if benchmarks ever show memory
+        # pressure; typical apps have O(10) prompts.
+        self._template_cache: dict[str, jinja2.Template] = {}
 
     async def fetch(self, name: str, label: str = "production") -> Prompt:
         """Consult composed backends in order, applying §8 fallback.
@@ -59,14 +66,14 @@ async def fetch(self, name: str, label: str = "production") -> Prompt:
           next. After ALL backends are exhausted with unavailable
           failures, the manager raises ``PromptStoreUnavailable``.
         """
-        last_unavailable: PromptStoreUnavailable | None = None
+        causes: list[BaseException] = []
         for backend in self._backends:
             try:
                 return await backend.fetch(name, label)
             except PromptNotFound:
                 raise
             except PromptStoreUnavailable as exc:
-                last_unavailable = exc
+                causes.append(exc)
                 _log.warning(
                     "prompt backend %r unavailable for (%r, %r); falling back",
                     backend,
@@ -74,13 +81,14 @@ async def fetch(self, name: str, label: str = "production") -> Prompt:
                     label,
                 )
                 continue
-        assert last_unavailable is not None
+        assert causes
         raise PromptStoreUnavailable(
             f"all prompt backends unavailable for ({name!r}, {label!r})",
             name=name,
             label=label,
             backends_tried=[type(b).__name__ for b in self._backends],
-        ) from last_unavailable
+            causes=list(causes),
+        ) from causes[-1]
 
     def render(
         self,
@@ -103,7 +111,10 @@ def render(
 
         rendered_text: str
         try:
-            template = _RENDER_ENV.from_string(prompt.template)
+            template = self._template_cache.get(prompt.template_hash)
+            if template is None:
+                template = _RENDER_ENV.from_string(prompt.template)
+                self._template_cache[prompt.template_hash] = template
             rendered_text = template.render(**variables)
         except jinja2.UndefinedError as exc:
             raise PromptRenderError(
diff --git a/src/openarmature/prompts/prompt.py b/src/openarmature/prompts/prompt.py
index fe18f93..0ceaee0 100644
--- a/src/openarmature/prompts/prompt.py
+++ b/src/openarmature/prompts/prompt.py
@@ -5,7 +5,7 @@
 from datetime import datetime
 from typing import Any
 
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 
 from openarmature.llm.messages import Message
 
@@ -83,7 +83,7 @@ class PromptResult(BaseModel):
     label: str
     template_hash: str
     rendered_hash: str
-    messages: list[Message]
+    messages: list[Message] = Field(min_length=1)
     variables: dict[str, Any]
     fetched_at: datetime
     rendered_at: datetime
diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py
index 333e60a..c4efacd 100644
--- a/tests/unit/test_prompts.py
+++ b/tests/unit/test_prompts.py
@@ -149,6 +149,22 @@ def test_prompt_extra_fields_forbidden() -> None:
         )
 
 
+def test_prompt_result_rejects_empty_messages() -> None:
+    prompt = _make_prompt()
+    with pytest.raises(ValueError):
+        PromptResult(
+            name=prompt.name,
+            version=prompt.version,
+            label=prompt.label,
+            template_hash=prompt.template_hash,
+            rendered_hash="sha256:abc",
+            messages=[],
+            variables={},
+            fetched_at=prompt.fetched_at,
+            rendered_at=datetime.now(UTC),
+        )
+
+
 def test_prompt_group_rejects_zero_members() -> None:
     with pytest.raises(ValueError, match="at least two"):
         PromptGroup(group_name="g", members=[])
@@ -375,6 +391,22 @@ async def fetch(self, name: str, label: str = "production") -> Prompt:
     assert second.calls == 0
 
 
+def test_manager_render_caches_compiled_templates_by_hash() -> None:
+    prompt = _make_prompt()
+
+    class _Backend:
+        async def fetch(self, name: str, label: str = "production") -> Prompt:
+            return prompt
+
+    manager = PromptManager(_Backend())
+    manager.render(prompt, {"user": "Alice"})
+    assert prompt.template_hash in manager._template_cache  # noqa: SLF001
+    cached = manager._template_cache[prompt.template_hash]  # noqa: SLF001
+    # Second render reuses the same compiled Template instance.
+    manager.render(prompt, {"user": "Bob"})
+    assert manager._template_cache[prompt.template_hash] is cached  # noqa: SLF001
+
+
 async def test_manager_render_signature_returns_user_message() -> None:
     prompt = _make_prompt()
 

From 420ce205d8c89f1e13b74976d62a1dde2afa8620 Mon Sep 17 00:00:00 2001
From: chris-colinsky <chris@lunarcommand.xyz>
Date: Fri, 15 May 2026 20:11:22 -0700
Subject: [PATCH 12/12] fix: CoPilot review round-3 pass on PR #45

- harness/prompt_management.py: fix misleading comment on
  FixtureExpectedRaises.carries (secondary_backend_call_count is
  a sibling field on FixtureExpectedPerCall, not inside carries).
- manager.py: replace 'assert causes' with an explicit
  'if not causes: raise RuntimeError(...)' guard so the
  invariant is still checked under 'python -O' (asserts
  stripped) and a violation surfaces as a clear RuntimeError
  rather than an opaque IndexError if a future change ever
  silently swallows an exception in the fallback loop.
- test_prompts.py: rewrite the active-prompt-in-nested-async-function
  test to spawn via asyncio.create_task so it actually exercises
  context-copy across the task boundary, matching the function
  name's implied claim. The previous form's await ran in the same
  context where ContextVar propagation is trivially expected.
---
 src/openarmature/prompts/manager.py            | 14 +++++++++++++-
 tests/conformance/harness/prompt_management.py | 10 ++++++----
 tests/unit/test_prompts.py                     |  3 ++-
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/openarmature/prompts/manager.py b/src/openarmature/prompts/manager.py
index 0f7df97..4ac3c3f 100644
--- a/src/openarmature/prompts/manager.py
+++ b/src/openarmature/prompts/manager.py
@@ -81,7 +81,19 @@ async def fetch(self, name: str, label: str = "production") -> Prompt:
                     label,
                 )
                 continue
-        assert causes
+        if not causes:
+            # Unreachable under current control flow: the constructor
+            # guarantees ``len(self._backends) >= 1`` and the only
+            # fall-through path from the for-loop appends to
+            # ``causes``. Explicit guard rather than ``assert`` so
+            # the invariant holds under ``python -O`` (asserts get
+            # stripped) — a future change that silently swallowed an
+            # exception in the loop would surface here as a clear
+            # RuntimeError instead of an opaque IndexError on the
+            # next line.
+            raise RuntimeError(
+                "PromptManager.fetch internal invariant violated: no backends consulted but loop exhausted"
+            )
         raise PromptStoreUnavailable(
             f"all prompt backends unavailable for ({name!r}, {label!r})",
             name=name,
diff --git a/tests/conformance/harness/prompt_management.py b/tests/conformance/harness/prompt_management.py
index 3eafe42..e908346 100644
--- a/tests/conformance/harness/prompt_management.py
+++ b/tests/conformance/harness/prompt_management.py
@@ -72,10 +72,12 @@ class BackendTarget(_StrictModel):
 
 class FixtureExpectedRaises(_PermissiveModel):
     category: str
-    # Optional extra carries — fixture 005 uses ``description_mentions``,
-    # ``name``, ``version``, ``label``. fixture 008 uses
-    # ``secondary_backend_call_count``. Permissive on this shape so
-    # fixtures evolve.
+    # Optional extra carries. Fixture 005 surfaces
+    # ``description_mentions`` / ``name`` / ``version`` / ``label``
+    # here. Permissive on this shape so fixtures evolve;
+    # per-backend call-count assertions live on the parent
+    # ``FixtureExpectedPerCall`` (see ``secondary_backend_call_count``
+    # and ``backend_call_counts`` below), not in ``carries``.
     carries: dict[str, Any] | None = None
 
 
diff --git a/tests/unit/test_prompts.py b/tests/unit/test_prompts.py
index c4efacd..f9ccfb7 100644
--- a/tests/unit/test_prompts.py
+++ b/tests/unit/test_prompts.py
@@ -354,7 +354,8 @@ async def _read_in_task() -> PromptResult | None:
         return current_prompt_result()
 
     with with_active_prompt(pr):
-        result = await _read_in_task()
+        task = asyncio.create_task(_read_in_task())
+        result = await task
     assert result is pr