diff --git a/src/synapt_eval/reviewer/bridge.py b/src/synapt_eval/reviewer/bridge.py
index 759e967..6637789 100644
--- a/src/synapt_eval/reviewer/bridge.py
+++ b/src/synapt_eval/reviewer/bridge.py
@@ -53,10 +53,12 @@ async def review(
             reasoning=response.reasoning,
         )
 
+        score = max(0.0, min(1.0, response.score)) if response.score is not None else None
+
         return Verdict(
             passed=response.passed,
             reasoning=response.reasoning,
             severity=self._severity if not response.passed else SEVERITY_INFO,
             checks=[check],
-            score=response.score,
+            score=score,
         )
diff --git a/src/synapt_eval/suggestion_engine/rules/generation.py b/src/synapt_eval/suggestion_engine/rules/generation.py
index ab6bd19..bf24396 100644
--- a/src/synapt_eval/suggestion_engine/rules/generation.py
+++ b/src/synapt_eval/suggestion_engine/rules/generation.py
@@ -61,7 +61,11 @@ def evaluate(
         if not verdicts:
             return []
 
-        flagged = [v for v in verdicts if not v.passed and v.score < self._score_threshold]
+        flagged = [
+            v
+            for v in verdicts
+            if not v.passed and v.score is not None and v.score < self._score_threshold
+        ]
 
         if not flagged:
             return []
diff --git a/tests/unit/test_judge.py b/tests/unit/test_judge.py
index 9ed1500..f197380 100644
--- a/tests/unit/test_judge.py
+++ b/tests/unit/test_judge.py
@@ -157,6 +157,35 @@ async def judge(self, request: JudgeRequest) -> JudgeResponse:
         assert judge.last_request.rubric == "accuracy"
         assert judge.last_request.context == {"k": "v"}
 
+    @pytest.mark.asyncio
+    async def test_refusal_reasoning_preserved(self):
+        judge = MockJudge(
+            JudgeResponse(
+                passed=False,
+                score=0.0,
+                reasoning="I cannot verify this from the provided context.",
+            )
+        )
+        reviewer = JudgingReviewer(judge)
+        verdict = await reviewer.review("answer", ["expected"], "query")
+        assert not verdict.passed
+        assert "cannot verify" in verdict.reasoning
+        assert verdict.checks[0].reasoning == verdict.reasoning
+
+    @pytest.mark.asyncio
+    async def test_high_score_clamped_at_bridge(self):
+        judge = MockJudge(JudgeResponse(passed=True, score=1.5, reasoning="overconfident"))
+        reviewer = JudgingReviewer(judge)
+        verdict = await reviewer.review("answer", ["expected"], "query")
+        assert verdict.score == 1.0
+
+    @pytest.mark.asyncio
+    async def test_negative_score_clamped_at_bridge(self):
+        judge = MockJudge(JudgeResponse(passed=False, score=-0.25, reasoning="undershot"))
+        reviewer = JudgingReviewer(judge)
+        verdict = await reviewer.review("answer", ["expected"], "query")
+        assert verdict.score == 0.0
+
 
 class TestJudgeInChain:
     @pytest.mark.asyncio
diff --git a/tests/unit/test_reviewer.py b/tests/unit/test_reviewer.py
index c3ee0e4..c7cb2b8 100644
--- a/tests/unit/test_reviewer.py
+++ b/tests/unit/test_reviewer.py
@@ -6,8 +6,10 @@
     CheckResult,
     FrameworkReviewer,
     Predicate,
+    Reviewer,
     ReviewerChain,
     Severity,
+    Verdict,
 )
 from synapt_eval.reviewer.types import (
     SEVERITY_CRITICAL,
@@ -210,3 +212,153 @@ def test_weight_ordering(self):
     def test_frozen(self):
         with pytest.raises(AttributeError):
             SEVERITY_INFO.weight = 99.0  # type: ignore[misc]
+
+
+class CurrentStatePredicate(Predicate):
+    def check(self, output: str, expected: list[str], query: str) -> CheckResult:
+        current_query = "current" in query.lower()
+        history_leak = any(
+            token in output.lower() for token in ["used to", "previously", "last week"]
+        )
+        return CheckResult(
+            name="current_state",
+            passed=not (current_query and history_leak),
+            severity=SEVERITY_ERROR,
+            reasoning="Response should stay grounded in current state",
+        )
+
+
+class TemporalAnchorPredicate(Predicate):
+    def check(self, output: str, expected: list[str], query: str) -> CheckResult:
+        asked_for_today = "today" in query.lower()
+        stale_anchor = any(token in output.lower() for token in ["yesterday", "last week"])
+        return CheckResult(
+            name="temporal_anchor",
+            passed=not (asked_for_today and stale_anchor),
+            severity=SEVERITY_WARNING,
+            reasoning="Temporal reference should match the query anchor",
+        )
+
+
+class ExpectedFacetPredicate(Predicate):
+    def check(self, output: str, expected: list[str], query: str) -> CheckResult:
+        found = any(e.lower() in output.lower() for e in expected)
+        return CheckResult(
+            name="expected_facet",
+            passed=found,
+            severity=SEVERITY_INFO,
+            reasoning="Expected facet should be present in the response",
+        )
+
+
+class EmptyCheckReviewer(Reviewer):
+    async def review(self, output: str, expected: list[str], query: str, **kwargs) -> Verdict:
+        return Verdict(
+            passed=True,
+            reasoning="No signal",
+            severity=SEVERITY_INFO,
+            checks=[],
+            score=1.0,
+        )
+
+
+class CustomTaggedReviewer(Reviewer):
+    def __init__(self, name: str, passed: bool):
+        self._name = name
+        self._passed = passed
+
+    async def review(self, output: str, expected: list[str], query: str, **kwargs) -> Verdict:
+        severity = SEVERITY_INFO if self._passed else SEVERITY_ERROR
+        return Verdict(
+            passed=self._passed,
+            reasoning=f"{self._name} {'passed' if self._passed else 'failed'}",
+            severity=severity,
+            checks=[
+                CheckResult(
+                    name=self._name,
+                    passed=self._passed,
+                    severity=severity,
+                    reasoning="custom reviewer seam",
+                )
+            ],
+            score=1.0 if self._passed else 0.0,
+        )
+
+
+TEMPORAL_CASES = [
+    ("Current plan status today?", "The current status is green.", ["green"], set()),
+    (
+        "Current plan status today?",
+        "It used to be green last week.",
+        ["green"],
+        {"current_state", "temporal_anchor"},
+    ),
+    (
+        "What is the current deployment state?",
+        "Previously stable, now degraded.",
+        ["degraded"],
+        {"current_state"},
+    ),
+    ("What changed today?", "Today the rollout is paused.", ["paused"], set()),
+    (
+        "What changed today?",
+        "Yesterday it was paused and last week it was green.",
+        ["paused"],
+        {"temporal_anchor"},
+    ),
+    ("What is the current owner?", "The current owner is Atlas.", ["Atlas"], set()),
+    (
+        "What is the current owner?",
+        "It used to be Apollo.",
+        ["Apollo"],
+        {"current_state"},
+    ),
+    ("What changed today?", "Current state is stable with no delta.", ["stable"], set()),
+]
+
+
+class TestReviewerAntagonists:
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize(("query", "output", "expected", "failed_names"), TEMPORAL_CASES)
+    async def test_oss_safe_temporal_like_cases(self, query, output, expected, failed_names):
+        reviewer = FrameworkReviewer(
+            [
+                CurrentStatePredicate(),
+                TemporalAnchorPredicate(),
+                ExpectedFacetPredicate(),
+            ]
+        )
+        verdict = await reviewer.review(output, expected, query)
+        observed = {check.name for check in verdict.checks if not check.passed}
+        assert verdict.passed is (len(failed_names) == 0)
+        assert observed == failed_names
+
+    @pytest.mark.asyncio
+    async def test_custom_reviewer_subclass_composes_cleanly(self):
+        chain = ReviewerChain(
+            [
+                FrameworkReviewer([AlwaysPassPredicate("framework_ok")]),
+                CustomTaggedReviewer("plugin_review", passed=False),
+            ],
+            strategy="strictest",
+        )
+        verdict = await chain.review("out", ["exp"], "q")
+        assert not verdict.passed
+        names = {check.name for check in verdict.checks}
+        assert names == {"framework_ok", "plugin_review"}
+
+    @pytest.mark.asyncio
+    async def test_majority_with_empty_check_reviewer_is_deterministic(self):
+        chain = ReviewerChain(
+            [
+                FrameworkReviewer([AlwaysPassPredicate("method_a")]),
+                FrameworkReviewer([AlwaysFailPredicate("method_b")]),
+                EmptyCheckReviewer(),
+            ],
+            strategy="majority",
+        )
+        verdict = await chain.review("out", ["exp"], "q")
+        assert verdict.passed
+        assert verdict.score == pytest.approx((1.0 + 0.0 + 1.0) / 3)
+        names = {check.name for check in verdict.checks}
+        assert names == {"method_a", "method_b"}
diff --git a/tests/unit/test_suggestion_engine.py b/tests/unit/test_suggestion_engine.py
index 5f82a32..6d3d567 100644
--- a/tests/unit/test_suggestion_engine.py
+++ b/tests/unit/test_suggestion_engine.py
@@ -153,6 +153,32 @@ def test_custom_threshold(self):
         )
         assert len(suggestions) == 0
 
+    def test_none_score_ignored(self):
+        verdicts = [
+            Verdict(
+                passed=False,
+                reasoning="judge refused to score",
+                severity=SEVERITY_ERROR,
+                score=None,  # type: ignore[arg-type]
+            ),
+        ]
+        suggestions = HallucinationSignalRule().evaluate(_result(), verdicts=verdicts)
+        assert suggestions == []
+
+    def test_zero_score_flagged(self):
+        verdicts = [
+            Verdict(passed=False, reasoning="fabricated", severity=SEVERITY_ERROR, score=0.0),
+        ]
+        suggestions = HallucinationSignalRule().evaluate(_result(), verdicts=verdicts)
+        assert len(suggestions) == 1
+
+    def test_perfect_score_not_flagged(self):
+        verdicts = [
+            Verdict(passed=False, reasoning="other issue", severity=SEVERITY_WARNING, score=1.0),
+        ]
+        suggestions = HallucinationSignalRule().evaluate(_result(), verdicts=verdicts)
+        assert suggestions == []
+
 
 class TestVerdictFailureRule:
     def test_failed_checks(self):
@@ -194,6 +220,19 @@ def test_no_verdicts(self):
         suggestions = VerdictFailureRule().evaluate(_result())
         assert len(suggestions) == 0
 
+    def test_failed_verdict_without_checks(self):
+        verdicts = [
+            Verdict(
+                passed=False,
+                reasoning="failed without granular checks",
+                severity=SEVERITY_ERROR,
+                checks=[],
+                score=0.0,
+            ),
+        ]
+        suggestions = VerdictFailureRule().evaluate(_result(), verdicts=verdicts)
+        assert suggestions == []
+
 
 # ── Cross-cutting rules ──
 
@@ -302,6 +341,21 @@ def test_different_categories_filtered(self):
         )
         assert len(suggestions) == 0
 
+    def test_empty_history_list(self):
+        suggestions = MonotonicDegradationRule().evaluate(_result("r"), context={"history": []})
+        assert suggestions == []
+
+    def test_missing_metric_values_do_not_count_toward_consecutive_runs(self):
+        history = [
+            _result("r", p5=0.80),
+            _result("r", p5=0.75),
+        ]
+        current = _result("r", p5=0.70)
+        suggestions = MonotonicDegradationRule(metric="tau", consecutive=3).evaluate(
+            current, context={"history": history}
+        )
+        assert suggestions == []
+
 
 class TestStableLowRule:
     def test_consistently_low(self):
@@ -333,6 +387,23 @@ def test_not_enough_runs(self):
         suggestions = StableLowRule(min_runs=3).evaluate(current, context={"history": history})
         assert len(suggestions) == 0
 
+    def test_empty_history_list(self):
+        current = _result("r", p5=0.55)
+        suggestions = StableLowRule(min_runs=3).evaluate(current, context={"history": []})
+        assert suggestions == []
+
+    def test_interleaved_other_categories_do_not_satisfy_min_runs(self):
+        history = [
+            _result("other", p5=0.40),
+            _result("r", p5=0.45),
+            _result("other", p5=0.42),
+        ]
+        current = _result("r", p5=0.44)
+        suggestions = StableLowRule(threshold=0.7, min_runs=3).evaluate(
+            current, context={"history": history}
+        )
+        assert suggestions == []
+
 
 # ── Engine tests ──