Support named evaluator judge backends#920
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces two new judge selectors, --judge-agent-name and --judge-backend-ref, to the aworld-cli evaluator command, making them mutually exclusive with the existing --judge-agent option. This allows running evaluations using a registered local agent/team by name or importing a custom Python factory/callable as a judge backend. The review feedback highlights several improvement opportunities in evaluator_runtime.py, including catching and wrapping import/attribute errors during dynamic module loading, preventing unbound method errors when a class implements execute, and resolving a concurrency race condition when caching the CLI agent executor.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| def _load_ref(ref: str) -> Any: | ||
| module_name, separator, attr_path = ref.partition(":") | ||
| if not separator or not module_name or not attr_path: | ||
| raise ValueError(f"judge backend ref must use module:callable format: {ref}") | ||
| module = importlib.import_module(module_name) | ||
| value: Any = module | ||
| for attr in attr_path.split("."): | ||
| if not attr: | ||
| raise ValueError(f"judge backend ref has an empty attribute segment: {ref}") | ||
| value = getattr(value, attr) | ||
| return value |
There was a problem hiding this comment.
The importlib.import_module and getattr calls can raise ModuleNotFoundError, ImportError, or AttributeError if the user provides an invalid module or callable name via --judge-backend-ref. Since these exceptions are not caught by the except (FileNotFoundError, ValueError, KeyError) blocks in the CLI command handlers, they will cause the CLI to crash with a traceback.
We should catch these exceptions inside _load_ref and wrap them in a ValueError to ensure a clean error message is displayed to the user.
def _load_ref(ref: str) -> Any:
module_name, separator, attr_path = ref.partition(":")
if not separator or not module_name or not attr_path:
raise ValueError(f"judge backend ref must use module:callable format: {ref}")
try:
module = importlib.import_module(module_name)
except ImportError as exc:
raise ValueError(f"Failed to import module '{module_name}': {exc}") from exc
value: Any = module
for attr in attr_path.split("."):
if not attr:
raise ValueError(f"judge backend ref has an empty attribute segment: {ref}")
try:
value = getattr(value, attr)
except AttributeError as exc:
raise ValueError(f"Attribute '{attr}' not found in '{module_name}': {exc}") from exc
return value| def _load_source_judge_backend_ref(ref: str) -> JudgeBackend: | ||
| value = _load_ref(ref) | ||
| if hasattr(value, "execute"): | ||
| return value | ||
| if callable(value) and _can_call_without_arguments(value): | ||
| produced = value() | ||
| if inspect.isawaitable(produced): | ||
| raise ValueError("judge backend ref factory must be synchronous") | ||
| return _coerce_source_judge_backend(produced, backend_id=f"judge-backend-ref:{ref}") | ||
| return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}") |
There was a problem hiding this comment.
If value is a class that implements execute (e.g., a custom subclass of JudgeBackend), hasattr(value, "execute") will evaluate to True on the class itself. Returning the class directly causes a runtime error when the framework attempts to call execute as an unbound method.
We should ensure we do not return the class directly, allowing it to fall through to the callable check where it can be instantiated.
| def _load_source_judge_backend_ref(ref: str) -> JudgeBackend: | |
| value = _load_ref(ref) | |
| if hasattr(value, "execute"): | |
| return value | |
| if callable(value) and _can_call_without_arguments(value): | |
| produced = value() | |
| if inspect.isawaitable(produced): | |
| raise ValueError("judge backend ref factory must be synchronous") | |
| return _coerce_source_judge_backend(produced, backend_id=f"judge-backend-ref:{ref}") | |
| return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}") | |
| def _load_source_judge_backend_ref(ref: str) -> JudgeBackend: | |
| value = _load_ref(ref) | |
| if hasattr(value, "execute") and not inspect.isclass(value): | |
| return value | |
| if callable(value) and _can_call_without_arguments(value): | |
| produced = value() | |
| if inspect.isawaitable(produced): | |
| raise ValueError("judge backend ref factory must be synchronous") | |
| return _coerce_source_judge_backend(produced, backend_id=f"judge-backend-ref:{ref}") | |
| return _coerce_source_judge_backend(value, backend_id=f"judge-backend-ref:{ref}") |
| def _build_cli_agent_judge_backend(*, agent_name: str, backend_id: str, prompt_builder): | ||
| executor_cache: dict[str, Any] = {} | ||
|
|
||
| async def _executor(prompt, system_prompt): | ||
| if isinstance(prompt, tuple): | ||
| raise ValueError("CLI agent judge backend only supports text prompts") | ||
| executor = executor_cache.get("executor") | ||
| if executor is None: | ||
| executor = await _load_cli_agent_executor(agent_name) | ||
| executor_cache["executor"] = executor | ||
| swarm = getattr(executor, "swarm", None) | ||
| if swarm is not None: | ||
| response = await Runners.run(input=str(prompt), swarm=swarm) | ||
| else: | ||
| response = await executor.chat(str(prompt)) | ||
| return str(getattr(response, "answer", response)) | ||
|
|
There was a problem hiding this comment.
Since _executor is an asynchronous function, concurrent evaluation tasks can trigger _load_cli_agent_executor simultaneously before the first call finishes caching the executor. This leads to redundant loading of agents and creation of multiple CLI runtimes/executors.
Using an asyncio.Lock with a double-checked locking pattern avoids this race condition efficiently.
def _build_cli_agent_judge_backend(*, agent_name: str, backend_id: str, prompt_builder):
executor_cache: dict[str, Any] = {}
lock = asyncio.Lock()
async def _executor(prompt, system_prompt):
if isinstance(prompt, tuple):
raise ValueError("CLI agent judge backend only supports text prompts")
executor = executor_cache.get("executor")
if executor is None:
async with lock:
executor = executor_cache.get("executor")
if executor is None:
executor = await _load_cli_agent_executor(agent_name)
executor_cache["executor"] = executor
swarm = getattr(executor, "swarm", None)
if swarm is not None:
response = await Runners.run(input=str(prompt), swarm=swarm)
else:
response = await executor.chat(str(prompt))
return str(getattr(response, "answer", response))
Summary
--judge-agent-nameand--judge-backend-refvalidation.Test Plan