From 2384f75d0af48797491047ea196469e9689413e6 Mon Sep 17 00:00:00 2001 From: Antawari Date: Wed, 27 May 2026 17:09:55 -0600 Subject: [PATCH 1/2] handlers: StewardHandler returns SKIPPED when no merge actually happened MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix, StewardHandler.handle returned TaskStatus.COMPLETED for every non-error path — even when verdict was "reject", verdict was missing, or pr_number couldn't be identified. The closer's contract is to seal the work; reporting "completed" for cases where no merge happened makes the pipeline lie about its actual outcomes. Downstream gates and CLI summaries see COMPLETED → assume the PR landed → propagate the false-positive. ## Fix src/bonfire/handlers/steward.py: - Compute final_status from the actual-merge precondition: merge_actually_happened = verdict == "approve" and pr_number is not None final_status = COMPLETED if merge_actually_happened else SKIPPED - Reject verdict + pr_number → SKIPPED - Empty verdict + pr_number → SKIPPED - Approve verdict without pr_number → SKIPPED (nothing to merge) - Approve + pr_number → COMPLETED (the only true-merge path) - All other return-envelope behavior preserved: same metadata (steward_verdict + steward_pr), same result string, same error path. SKIPPED is the right semantics over FAILED — the closer evaluated its preconditions and intentionally declined; it didn't crash. ## Tests tests/unit/test_steward_handler_skipped_on_non_approve.py (new · Knight RED): Five tests across two classes: - TestStatusReflectsActualMerge (4): * reject verdict → SKIPPED * missing verdict → SKIPPED * approve verdict without pr_number → SKIPPED * approve + pr_number → COMPLETED (regression guard for the only true-merge path, plus a sanity check that merge_pr was actually invoked on the github_client mock) - TestStatusPreservesEnvelopeStructure (1): * SKIPPED path still attaches steward_verdict + steward_pr metadata + preserves the result string — only the status changes. ## Verification pytest tests/unit/test_steward_handler_skipped_on_non_approve.py 5 passed (Knight RED → GREEN verified — 4 failed pre-fix, all green post-fix; the 1 pre-fix-passing test is the COMPLETED regression guard) pytest tests/unit/test_steward_handler.py (existing 570-line suite) 30 passed (zero regressions — the existing reject-verdict tests only asserted "merge_pr was not called", not the resulting TaskStatus, so they're unchanged by this fix) ruff check + format on changed files: clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/bonfire/handlers/steward.py | 13 +- ..._steward_handler_skipped_on_non_approve.py | 197 ++++++++++++++++++ 2 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_steward_handler_skipped_on_non_approve.py diff --git a/src/bonfire/handlers/steward.py b/src/bonfire/handlers/steward.py index 39f1af1..b18ff2a 100644 --- a/src/bonfire/handlers/steward.py +++ b/src/bonfire/handlers/steward.py @@ -156,10 +156,21 @@ async def handle( "steward_verdict": verdict, "steward_pr": str(pr_number) if pr_number else "", } + # Reserve COMPLETED for the only path that actually performed + # a merge — verdict=="approve" AND pr_number was identifiable. + # All other paths (reject verdict, missing verdict, approve- + # without-pr) end here without merging anything; reporting them + # as COMPLETED makes downstream gates + CLI summaries lie about + # the pipeline outcome. SKIPPED conveys "intentionally did not + # run" — the closer evaluated its preconditions and declined. + merge_actually_happened = verdict == "approve" and pr_number is not None + final_status = ( + TaskStatus.COMPLETED if merge_actually_happened else TaskStatus.SKIPPED + ) return envelope.model_copy( update={ "metadata": new_metadata, - "status": TaskStatus.COMPLETED, + "status": final_status, "result": f"closer: verdict={verdict}, pr={pr_number}", }, ) diff --git a/tests/unit/test_steward_handler_skipped_on_non_approve.py b/tests/unit/test_steward_handler_skipped_on_non_approve.py new file mode 100644 index 0000000..5acce1a --- /dev/null +++ b/tests/unit/test_steward_handler_skipped_on_non_approve.py @@ -0,0 +1,197 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2026 BonfireAI + +"""Knight RED tests — StewardHandler returns SKIPPED when no merge happens. + +Pre-fix: ``StewardHandler.handle`` returned ``TaskStatus.COMPLETED`` +unconditionally — regardless of whether the verdict was ``approve``, +``reject``, or empty, and regardless of whether a PR number was present. +Downstream gates and CLI summaries saw the closer stage as "completed" +when in reality the PR was rejected (verdict != "approve") or never +identifiable (pr_number missing). The closer's contract is to seal the +work; no-op success makes the pipeline lie about outcomes. + +The fix returns ``TaskStatus.COMPLETED`` only when an actual merge +happened (``verdict == "approve" and pr_number is not None``), and +``TaskStatus.SKIPPED`` otherwise — preserving the existing return-an- +envelope path (no exception) while reporting the truthful status. + +Existing ``test_steward_handler.py`` already verifies that ``reject`` +verdict does NOT trigger ``merge_pr`` (a behavioral assertion on the +github_client mock). What it does NOT verify is the resulting +``TaskStatus`` — that's the gap this Knight fills. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pytest + +try: + from bonfire.github.mock import MockGitHubClient # type: ignore[import-not-found] +except ImportError: # pragma: no cover + MockGitHubClient = None # type: ignore[assignment,misc] + +try: + from bonfire.handlers.steward import StewardHandler # type: ignore[import-not-found] +except ImportError: # pragma: no cover + StewardHandler = None # type: ignore[assignment,misc] + +from bonfire.models.envelope import ( + META_PR_NUMBER, + Envelope, + TaskStatus, +) +from bonfire.models.plan import StageSpec + +if TYPE_CHECKING: + pass + + +pytestmark = pytest.mark.skipif( + StewardHandler is None or MockGitHubClient is None, + reason="v0.1 handler not yet ported: StewardHandler / MockGitHubClient missing", +) + + +# --------------------------------------------------------------------------- +# Fixtures (mirror test_steward_handler.py for consistency) +# --------------------------------------------------------------------------- + + +@pytest.fixture() +def github_client(): # noqa: ANN201 + return MockGitHubClient() + + +@pytest.fixture() +def steward_stage() -> StageSpec: + return StageSpec(name="steward", agent_name="pr-merger", role="closer") + + +@pytest.fixture() +def base_envelope() -> Envelope: + return Envelope(task="seal-the-work") + + +@pytest.fixture() +def handler(github_client) -> Any: # noqa: ANN001, ANN201 + return StewardHandler(github_client=github_client) + + +async def _seed_pr(gh: Any) -> int: + pr = await gh.create_pr(title="seed", head="feature", base="master") + return pr.number + + +# --------------------------------------------------------------------------- +# Status contract — three non-merge cases all return SKIPPED. +# --------------------------------------------------------------------------- + + +class TestStatusReflectsActualMerge: + """The returned TaskStatus reflects whether a merge actually happened.""" + + async def test_reject_verdict_returns_skipped( + self, + handler, # noqa: ANN001 + steward_stage: StageSpec, + base_envelope: Envelope, + github_client, # noqa: ANN001 + ) -> None: + """verdict='reject' + pr_number present → SKIPPED (no merge happened).""" + pr_number = await _seed_pr(github_client) + prior = {"wizard": "reject", META_PR_NUMBER: str(pr_number)} + + result = await handler.handle(steward_stage, base_envelope, prior) + + assert result.status == TaskStatus.SKIPPED, ( + f"Expected SKIPPED on reject verdict (no merge happened); " + f"got {result.status}. Pre-fix bug: status was hardcoded to " + f"COMPLETED regardless of verdict, making the pipeline lie about " + f"actual outcomes." + ) + + async def test_empty_verdict_returns_skipped( + self, + handler, # noqa: ANN001 + steward_stage: StageSpec, + base_envelope: Envelope, + github_client, # noqa: ANN001 + ) -> None: + """No verdict key in prior_results → SKIPPED (no merge happened).""" + pr_number = await _seed_pr(github_client) + prior = {META_PR_NUMBER: str(pr_number)} # No verdict key. + + result = await handler.handle(steward_stage, base_envelope, prior) + + assert result.status == TaskStatus.SKIPPED, ( + f"Expected SKIPPED on missing verdict; got {result.status}" + ) + + async def test_approve_without_pr_number_returns_skipped( + self, + handler, # noqa: ANN001 + steward_stage: StageSpec, + base_envelope: Envelope, + ) -> None: + """verdict='approve' but no PR number → SKIPPED (nothing to merge).""" + prior = {"wizard": "approve"} # No pr_number. + + result = await handler.handle(steward_stage, base_envelope, prior) + + assert result.status == TaskStatus.SKIPPED, ( + f"Expected SKIPPED when approve verdict has no PR number " + f"(no merge possible); got {result.status}" + ) + + async def test_approve_with_pr_number_returns_completed( + self, + handler, # noqa: ANN001 + steward_stage: StageSpec, + base_envelope: Envelope, + github_client, # noqa: ANN001 + ) -> None: + """Regression guard: the only TRUE merge path still returns COMPLETED.""" + pr_number = await _seed_pr(github_client) + prior = {"wizard": "approve", META_PR_NUMBER: str(pr_number)} + + result = await handler.handle(steward_stage, base_envelope, prior) + + assert result.status == TaskStatus.COMPLETED, ( + f"approve + pr_number is the actual-merge path — must stay COMPLETED. " + f"Got {result.status}" + ) + # Sanity: the merge actually fired. + assert any(a["type"] == "merge_pr" for a in github_client.actions), ( + "merge_pr should have been invoked on the github client mock" + ) + + +class TestStatusPreservesEnvelopeStructure: + """Returning SKIPPED must not mutate the rest of the envelope contract.""" + + async def test_skipped_path_still_attaches_metadata( + self, + handler, # noqa: ANN001 + steward_stage: StageSpec, + base_envelope: Envelope, + github_client, # noqa: ANN001 + ) -> None: + """The SKIPPED-return path still records steward_verdict + steward_pr metadata. + + Downstream observers (cost rollup, session summary) read these + metadata keys regardless of status. The status change is the only + observable contract delta. + """ + pr_number = await _seed_pr(github_client) + prior = {"wizard": "reject", META_PR_NUMBER: str(pr_number)} + + result = await handler.handle(steward_stage, base_envelope, prior) + + assert result.status == TaskStatus.SKIPPED + assert result.metadata.get("steward_verdict") == "reject" + assert result.metadata.get("steward_pr") == str(pr_number) + # Result string is preserved (it documents what happened, not status). + assert "reject" in result.result From 6756ed65b20b31ff6484a25b370e511e25919e8b Mon Sep 17 00:00:00 2001 From: Antawari Date: Wed, 27 May 2026 17:10:20 -0600 Subject: [PATCH 2/2] ruff format: re-flow steward.py after the SKIPPED-status edit Follow-up to the prior commit. ruff format was reported as needing a re-flow on steward.py after the new merge_actually_happened block; running it now closes the format check that CI would otherwise flag. No behavior change. --- src/bonfire/handlers/steward.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/bonfire/handlers/steward.py b/src/bonfire/handlers/steward.py index b18ff2a..c7102d9 100644 --- a/src/bonfire/handlers/steward.py +++ b/src/bonfire/handlers/steward.py @@ -164,9 +164,7 @@ async def handle( # the pipeline outcome. SKIPPED conveys "intentionally did not # run" — the closer evaluated its preconditions and declined. merge_actually_happened = verdict == "approve" and pr_number is not None - final_status = ( - TaskStatus.COMPLETED if merge_actually_happened else TaskStatus.SKIPPED - ) + final_status = TaskStatus.COMPLETED if merge_actually_happened else TaskStatus.SKIPPED return envelope.model_copy( update={ "metadata": new_metadata,