4 changes: 3 additions & 1 deletion src/synapt_eval/reviewer/bridge.py
@@ -53,10 +53,12 @@ async def review(
reasoning=response.reasoning,
)

score = max(0.0, min(1.0, response.score)) if response.score is not None else None

return Verdict(
passed=response.passed,
reasoning=response.reasoning,
severity=self._severity if not response.passed else SEVERITY_INFO,
checks=[check],
score=response.score,
score=score,
)
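
For reference, the clamping expression added here keeps any numeric judge score inside [0.0, 1.0] and passes a missing score through untouched. A minimal standalone sketch of that rule (the helper name is illustrative, not part of bridge.py):

def clamp_score(raw: float | None) -> float | None:
    """Clamp a judge score into [0.0, 1.0]; a missing score stays None."""
    return max(0.0, min(1.0, raw)) if raw is not None else None

assert clamp_score(1.5) == 1.0     # overconfident judge is capped
assert clamp_score(-0.25) == 0.0   # undershooting judge is floored
assert clamp_score(0.7) == 0.7     # in-range scores pass through unchanged
assert clamp_score(None) is None   # a refusal carries no score to clamp
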
6 changes: 5 additions & 1 deletion src/synapt_eval/suggestion_engine/rules/generation.py
@@ -61,7 +61,11 @@ def evaluate(
if not verdicts:
return []

flagged = [v for v in verdicts if not v.passed and v.score < self._score_threshold]
flagged = [
v
for v in verdicts
if not v.passed and v.score is not None and v.score < self._score_threshold
]

if not flagged:
return []
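
The None guard added to this filter means a verdict whose judge refused to score (score is None) can never trip the threshold comparison. A rough standalone equivalent, using hypothetical stand-in names rather than the real Verdict type:

from dataclasses import dataclass

@dataclass
class FakeVerdict:
    passed: bool
    score: float | None

def flag_low_scores(verdicts: list[FakeVerdict], threshold: float = 0.5) -> list[FakeVerdict]:
    """Keep only failed verdicts that carry a numeric score below the threshold."""
    return [
        v
        for v in verdicts
        if not v.passed and v.score is not None and v.score < threshold
    ]

assert flag_low_scores([FakeVerdict(passed=False, score=None)]) == []     # refusal: skipped
assert len(flag_low_scores([FakeVerdict(passed=False, score=0.0)])) == 1  # low score: flagged
assert flag_low_scores([FakeVerdict(passed=False, score=1.0)]) == []      # high score: not flagged
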
29 changes: 29 additions & 0 deletions tests/unit/test_judge.py
@@ -157,6 +157,35 @@ async def judge(self, request: JudgeRequest) -> JudgeResponse:
assert judge.last_request.rubric == "accuracy"
assert judge.last_request.context == {"k": "v"}

@pytest.mark.asyncio
async def test_refusal_reasoning_preserved(self):
judge = MockJudge(
JudgeResponse(
passed=False,
score=0.0,
reasoning="I cannot verify this from the provided context.",
)
)
reviewer = JudgingReviewer(judge)
verdict = await reviewer.review("answer", ["expected"], "query")
assert not verdict.passed
assert "cannot verify" in verdict.reasoning
assert verdict.checks[0].reasoning == verdict.reasoning

@pytest.mark.asyncio
async def test_high_score_clamped_at_bridge(self):
judge = MockJudge(JudgeResponse(passed=True, score=1.5, reasoning="overconfident"))
reviewer = JudgingReviewer(judge)
verdict = await reviewer.review("answer", ["expected"], "query")
assert verdict.score == 1.0

@pytest.mark.asyncio
async def test_negative_score_clamped_at_bridge(self):
judge = MockJudge(JudgeResponse(passed=False, score=-0.25, reasoning="undershot"))
reviewer = JudgingReviewer(judge)
verdict = await reviewer.review("answer", ["expected"], "query")
assert verdict.score == 0.0


class TestJudgeInChain:
@pytest.mark.asyncio
152 changes: 152 additions & 0 deletions tests/unit/test_reviewer.py
@@ -6,8 +6,10 @@
CheckResult,
FrameworkReviewer,
Predicate,
Reviewer,
ReviewerChain,
Severity,
Verdict,
)
from synapt_eval.reviewer.types import (
SEVERITY_CRITICAL,
@@ -210,3 +212,153 @@ def test_weight_ordering(self):
def test_frozen(self):
with pytest.raises(AttributeError):
SEVERITY_INFO.weight = 99.0 # type: ignore[misc]


class CurrentStatePredicate(Predicate):
def check(self, output: str, expected: list[str], query: str) -> CheckResult:
current_query = "current" in query.lower()
history_leak = any(
token in output.lower() for token in ["used to", "previously", "last week"]
)
return CheckResult(
name="current_state",
passed=not (current_query and history_leak),
severity=SEVERITY_ERROR,
reasoning="Response should stay grounded in current state",
)


class TemporalAnchorPredicate(Predicate):
def check(self, output: str, expected: list[str], query: str) -> CheckResult:
asked_for_today = "today" in query.lower()
stale_anchor = any(token in output.lower() for token in ["yesterday", "last week"])
return CheckResult(
name="temporal_anchor",
passed=not (asked_for_today and stale_anchor),
severity=SEVERITY_WARNING,
reasoning="Temporal reference should match the query anchor",
)


class ExpectedFacetPredicate(Predicate):
def check(self, output: str, expected: list[str], query: str) -> CheckResult:
found = any(e.lower() in output.lower() for e in expected)
return CheckResult(
name="expected_facet",
passed=found,
severity=SEVERITY_INFO,
reasoning="Expected facet should be present in the response",
)


class EmptyCheckReviewer(Reviewer):
async def review(self, output: str, expected: list[str], query: str, **kwargs) -> Verdict:
return Verdict(
passed=True,
reasoning="No signal",
severity=SEVERITY_INFO,
checks=[],
score=1.0,
)


class CustomTaggedReviewer(Reviewer):
def __init__(self, name: str, passed: bool):
self._name = name
self._passed = passed

async def review(self, output: str, expected: list[str], query: str, **kwargs) -> Verdict:
severity = SEVERITY_INFO if self._passed else SEVERITY_ERROR
return Verdict(
passed=self._passed,
reasoning=f"{self._name} {'passed' if self._passed else 'failed'}",
severity=severity,
checks=[
CheckResult(
name=self._name,
passed=self._passed,
severity=severity,
reasoning="custom reviewer seam",
)
],
score=1.0 if self._passed else 0.0,
)


TEMPORAL_CASES = [
("Current plan status today?", "The current status is green.", ["green"], set()),
(
"Current plan status today?",
"It used to be green last week.",
["green"],
{"current_state", "temporal_anchor"},
),
(
"What is the current deployment state?",
"Previously stable, now degraded.",
["degraded"],
{"current_state"},
),
("What changed today?", "Today the rollout is paused.", ["paused"], set()),
(
"What changed today?",
"Yesterday it was paused and last week it was green.",
["paused"],
{"temporal_anchor"},
),
("What is the current owner?", "The current owner is Atlas.", ["Atlas"], set()),
(
"What is the current owner?",
"It used to be Apollo.",
["Apollo"],
{"current_state"},
),
("What changed today?", "Current state is stable with no delta.", ["stable"], set()),
]


class TestReviewerAntagonists:
@pytest.mark.asyncio
@pytest.mark.parametrize(("query", "output", "expected", "failed_names"), TEMPORAL_CASES)
async def test_oss_safe_temporal_like_cases(self, query, output, expected, failed_names):
reviewer = FrameworkReviewer(
[
CurrentStatePredicate(),
TemporalAnchorPredicate(),
ExpectedFacetPredicate(),
]
)
verdict = await reviewer.review(output, expected, query)
observed = {check.name for check in verdict.checks if not check.passed}
assert verdict.passed is (len(failed_names) == 0)
assert observed == failed_names

@pytest.mark.asyncio
async def test_custom_reviewer_subclass_composes_cleanly(self):
chain = ReviewerChain(
[
FrameworkReviewer([AlwaysPassPredicate("framework_ok")]),
CustomTaggedReviewer("plugin_review", passed=False),
],
strategy="strictest",
)
verdict = await chain.review("out", ["exp"], "q")
assert not verdict.passed
names = {check.name for check in verdict.checks}
assert names == {"framework_ok", "plugin_review"}

@pytest.mark.asyncio
async def test_majority_with_empty_check_reviewer_is_deterministic(self):
chain = ReviewerChain(
[
FrameworkReviewer([AlwaysPassPredicate("method_a")]),
FrameworkReviewer([AlwaysFailPredicate("method_b")]),
EmptyCheckReviewer(),
],
strategy="majority",
)
verdict = await chain.review("out", ["exp"], "q")
assert verdict.passed
assert verdict.score == pytest.approx((1.0 + 0.0 + 1.0) / 3)
names = {check.name for check in verdict.checks}
assert names == {"method_a", "method_b"}
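
The majority-strategy test above expects a pass when two of three reviewers pass, and a chain score equal to the mean of the sub-scores (including the empty-check reviewer's 1.0). A hypothetical combiner consistent with those assertions, not the actual ReviewerChain implementation:

def combine_majority(passed_flags: list[bool], scores: list[float]) -> tuple[bool, float]:
    """Pass when at least half of the sub-verdicts pass; report the mean of their scores."""
    passed = sum(passed_flags) >= len(passed_flags) / 2
    return passed, sum(scores) / len(scores)

passed, score = combine_majority([True, False, True], [1.0, 0.0, 1.0])
assert passed
assert abs(score - (1.0 + 0.0 + 1.0) / 3) < 1e-9
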
71 changes: 71 additions & 0 deletions tests/unit/test_suggestion_engine.py
@@ -153,6 +153,32 @@ def test_custom_threshold(self):
)
assert len(suggestions) == 0

def test_none_score_ignored(self):
verdicts = [
Verdict(
passed=False,
reasoning="judge refused to score",
severity=SEVERITY_ERROR,
score=None, # type: ignore[arg-type]
),
]
suggestions = HallucinationSignalRule().evaluate(_result(), verdicts=verdicts)
assert suggestions == []

def test_zero_score_flagged(self):
verdicts = [
Verdict(passed=False, reasoning="fabricated", severity=SEVERITY_ERROR, score=0.0),
]
suggestions = HallucinationSignalRule().evaluate(_result(), verdicts=verdicts)
assert len(suggestions) == 1

def test_perfect_score_not_flagged(self):
verdicts = [
Verdict(passed=False, reasoning="other issue", severity=SEVERITY_WARNING, score=1.0),
]
suggestions = HallucinationSignalRule().evaluate(_result(), verdicts=verdicts)
assert suggestions == []


class TestVerdictFailureRule:
def test_failed_checks(self):
@@ -194,6 +220,19 @@ def test_no_verdicts(self):
suggestions = VerdictFailureRule().evaluate(_result())
assert len(suggestions) == 0

def test_failed_verdict_without_checks(self):
verdicts = [
Verdict(
passed=False,
reasoning="failed without granular checks",
severity=SEVERITY_ERROR,
checks=[],
score=0.0,
),
]
suggestions = VerdictFailureRule().evaluate(_result(), verdicts=verdicts)
assert suggestions == []


# ── Cross-cutting rules ──

@@ -302,6 +341,21 @@ def test_different_categories_filtered(self):
)
assert len(suggestions) == 0

def test_empty_history_list(self):
suggestions = MonotonicDegradationRule().evaluate(_result("r"), context={"history": []})
assert suggestions == []

def test_missing_metric_values_do_not_count_toward_consecutive_runs(self):
history = [
_result("r", p5=0.80),
_result("r", p5=0.75),
]
current = _result("r", p5=0.70)
suggestions = MonotonicDegradationRule(metric="tau", consecutive=3).evaluate(
current, context={"history": history}
)
assert suggestions == []


class TestStableLowRule:
def test_consistently_low(self):
@@ -333,6 +387,23 @@ def test_not_enough_runs(self):
suggestions = StableLowRule(min_runs=3).evaluate(current, context={"history": history})
assert len(suggestions) == 0

def test_empty_history_list(self):
current = _result("r", p5=0.55)
suggestions = StableLowRule(min_runs=3).evaluate(current, context={"history": []})
assert suggestions == []

def test_interleaved_other_categories_do_not_satisfy_min_runs(self):
history = [
_result("other", p5=0.40),
_result("r", p5=0.45),
_result("other", p5=0.42),
]
current = _result("r", p5=0.44)
suggestions = StableLowRule(threshold=0.7, min_runs=3).evaluate(
current, context={"history": history}
)
assert suggestions == []


# ── Engine tests ──

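
The history-based tests above pin down two edge cases: runs that lack the tracked metric do not count toward MonotonicDegradationRule's consecutive window, and interleaved results from other categories do not satisfy StableLowRule's min_runs. A hypothetical degradation check consistent with the first behavior (illustrative only, not the library's rule):

def is_monotonic_degradation(values: list[float | None], consecutive: int) -> bool:
    """True only when the last `consecutive` values are all present and strictly decreasing."""
    tail = values[-consecutive:]
    if len(tail) < consecutive or any(v is None for v in tail):
        return False
    return all(later < earlier for earlier, later in zip(tail, tail[1:]))

assert is_monotonic_degradation([0.80, 0.75, 0.70], consecutive=3)
assert not is_monotonic_degradation([None, None, 0.70], consecutive=3)  # metric missing from history
assert not is_monotonic_degradation([0.75, 0.70], consecutive=3)        # not enough runs to judge
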