From dceb6b1f821fc22e798f5c932436840c78855de8 Mon Sep 17 00:00:00 2001 From: Houtan Bastani Date: Wed, 13 May 2026 12:45:58 +0000 Subject: [PATCH] feat(llm): add shared model and run registry Move canonical LLM model metadata and benchmarkable model-run declarations into utils so downstream repos can select shared runs by stable model_run_key. Add Models.dev and Artificial Analysis metadata snapshots and loaders. Resolve release dates from Models.dev with manual fallbacks, separate canonical model_key values from provider_model_id routing strings, and validate model declarations during registry construction. Require every ModelRun to declare an explicit, filename-safe model_run_key. Keep build_model_run_key as a naming helper for option coverage, validate duplicate keys, and expose MODEL_RUNS_BY_KEY/select_model_runs for benchmark selection. Add Model.active and ACTIVE_MODEL_RUNS so historical runs remain in MODEL_RUNS while runs depending on inactive provider routes are excluded from current live-callable benchmark sweeps. Mark the Together deepseek-v3.1 route inactive and replace live smoke tests with the active MiniMax M2.7 route. Add Artificial Analysis-backed model-run declarations as benchmark-selectable runs that are automatically included in MODEL_RUNS, with display names resolved from a minimized checked-in AA snapshot containing only stable IDs and display names. Add third-party notices for Models.dev's MIT license and Artificial Analysis attribution, and include those notices in built wheel license metadata. Move shared LLM provider dependencies into pyproject metadata, make requirements.txt delegate to .[dev], configure the package for Python 3.14, and preserve pytest-xdist for parallel integration tests. Document registry conventions, local dev setup, validation commands, and Claude/agent handoff files. 
Add unit and integration coverage for metadata snapshots, registry validation, provider routing, explicit model-run keys, active model-run filtering, third-party notices, and selectable shared model runs. As a byproduct of using Models.dev, the following model release dates have changed: mistral-large-2411: 2024-11-18 -> 2024-11-01 deepseek-r1: 2025-01-20 -> 2024-12-26 deepseek-v3: 2024-12-25 -> 2025-01-20 glm-4.6: 2025-11-13 -> 2025-09-30 kimi-k2-thinking: 2025-11-05 -> 2025-11-06 kimi-k2.5: 2026-01-30 -> 2026-01-27 glm-5: 2026-02-12 -> 2026-02-11 glm-5.1: 2026-04-07 -> 2026-03-27 kimi-k2.6: 2026-04-20 -> 2026-04-21 claude-3-7-sonnet-20250219: 2025-02-24 -> 2025-02-19 claude-haiku-4-5-20251001: 2025-10-01 -> 2025-10-15 claude-opus-4-5-20251101: 2025-11-24 -> 2025-11-01 grok-4.3: 2026-05-01 -> 2026-04-17 gemini-2.5-flash: 2025-06-17 -> 2025-03-20 gemini-2.5-pro: 2025-06-17 -> 2025-03-20 gemini-3.1-flash-lite: 2026-05-08 -> 2026-05-07 --- AGENTS.md | 138 ++ CLAUDE.md | 1 + Makefile | 10 +- README.md | 13 +- THIRD_PARTY_NOTICES.md | 42 + pyproject.toml | 34 +- requirements.txt | 16 +- scripts/refresh_models_dev_metadata.py | 299 +++ .../llm/providers/test_anthropic.py | 10 +- .../integration/llm/providers/test_google.py | 6 +- .../integration/llm/providers/test_openai.py | 6 +- .../llm/providers/test_together.py | 7 +- tests/integration/llm/providers/test_xai.py | 6 +- tests/integration/llm/test_model_registry.py | 50 +- tests/integration/llm/test_model_runs.py | 30 + .../unit/test_artificial_analysis_metadata.py | 226 ++ tests/unit/test_llm_model_runs.py | 691 ++++++ tests/unit/test_llm_routing.py | 22 +- tests/unit/test_models_dev_metadata.py | 282 +++ tests/unit/test_project_dependencies.py | 66 + utils/helpers/constants.py | 2 + utils/llm/__init__.py | 8 +- utils/llm/artificial_analysis_model_runs.py | 38 + utils/llm/lab_registry.py | 2 + utils/llm/metadata/__init__.py | 1 + utils/llm/metadata/artificial_analysis.py | 63 + .../artificial_analysis_snapshot.json | 
2109 +++++++++++++++++ utils/llm/metadata/models_dev.py | 89 + utils/llm/metadata/models_dev_snapshot.json | 355 +++ utils/llm/model_registry.py | 1000 ++++++-- utils/llm/model_runs.py | 637 +++++ 31 files changed, 5956 insertions(+), 303 deletions(-) create mode 100644 AGENTS.md create mode 100644 CLAUDE.md create mode 100644 THIRD_PARTY_NOTICES.md create mode 100644 scripts/refresh_models_dev_metadata.py create mode 100644 tests/integration/llm/test_model_runs.py create mode 100644 tests/unit/test_artificial_analysis_metadata.py create mode 100644 tests/unit/test_llm_model_runs.py create mode 100644 tests/unit/test_models_dev_metadata.py create mode 100644 tests/unit/test_project_dependencies.py create mode 100644 utils/llm/artificial_analysis_model_runs.py create mode 100644 utils/llm/metadata/__init__.py create mode 100644 utils/llm/metadata/artificial_analysis.py create mode 100644 utils/llm/metadata/artificial_analysis_snapshot.json create mode 100644 utils/llm/metadata/models_dev.py create mode 100644 utils/llm/metadata/models_dev_snapshot.json create mode 100644 utils/llm/model_runs.py diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..6879854 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,138 @@ +# Repository Instructions + +## Shared LLM Registry + +This package targets Python 3.14. Black is configured with +`target-version = ["py314"]`; do not broaden `requires-python` without first +checking that formatted code remains valid for the older target. + +## Local Development Setup + +Use Python 3.14 for local development: + +```bash +python3.14 -m venv .venv +source .venv/bin/activate +python -m pip install --upgrade pip +python -m pip install -r requirements.txt +``` + +`requirements.txt` delegates to `.[dev]`; it installs this package and the dev +tools from `pyproject.toml` without editable mode. 
+ +When another repo needs local utils changes during development, use that repo's +virtual environment and install utils explicitly in editable mode, for example: + +```bash +python -m pip install -e ../utils +``` + +Do not add local relative paths to another repo's requirements files. Those +files should use the deployed git pin when ready to deploy. + +The shared LLM registry has two layers: + +- `utils.llm.model_registry.MODELS` contains canonical provider-callable base models. +- `utils.llm.model_runs.MODEL_RUNS` contains exact benchmarkable model-plus-options runs. + +Benchmarks should choose from `MODEL_RUNS` by `model_run_key`; forecast files should store that exact key. + +When adding a base model: + +- Add provider/lab registry entries first only if the provider or lab is missing. +- Look up the model in Models.dev. Prefer a `ModelsDevReference` when Models.dev + has the provider/model entry. +- In Models.dev source paths, `provider_id` is the folder under `providers/`, + and `model_id` is the TOML filename stem under `models/`, for example + `providers/anthropic/models/claude-opus-4-8.toml` maps to `anthropic` / + `claude-opus-4-8`. +- The checked-in Models.dev snapshot is not a catalog; it contains only + registry-referenced models and only `id`, `name`, and `release_date`. +- Use exact Models.dev `provider_id`/`model_id` values. If a reference is wrong, + refreshing the snapshot should fail and suggest nearby Models.dev entries. +- Use `manual_release_date` when the model is missing from Models.dev, when the + Models.dev entry lacks a usable full release date, or for deliberate + historical/manual entries. +- Put the model in the provider-specific list in `utils/llm/model_registry.py` (`OPENAI_MODELS`, `TOGETHER_MODELS`, `ANTHROPIC_MODELS`, `XAI_MODELS`, or `GOOGLE_MODELS`). +- Insert the model where `(release_date, model_key)` stays ascending within its + provider-specific list. +- Use `provider_model_id` for the exact string sent to the provider API. 
It may differ from `model_key`, especially for routed providers like Together. +- Set `active=False` only when a provider route should remain in registry history + but should be excluded from current live-callable benchmark runs. +- Do not add duplicate `model_key`s. `MODELS = create_models_list(...)` validates uniqueness. + +After changing `ModelsDevReference` values, refresh the Models.dev snapshot from the utils repo: +```bash +python - <<'PY' +from scripts.refresh_models_dev_metadata import write_models_dev_snapshot + +write_models_dev_snapshot() +PY +``` + +When adding a model run: + +- Add it to `utils/llm/model_runs.py` with + `_model_run(model_run_key=..., model_key=..., options=...)`. +- Write `model_run_key` explicitly as the stable benchmark identifier. Do not + rely on implicit generation from model/options. +- Put every runtime call option in the `ModelRun` declaration; do not add hidden defaults elsewhere. +- Use exact provider option names and values as they are passed to `get_response`. +- If an option affects performance and should appear in filenames/forecast keys, add or update a naming rule in `NAME_COMPONENT_RULES`. +- If an option is intentionally name-neutral, add it to `NAME_NEUTRAL_OPTION_PATHS`. +- Unknown option paths should fail loudly rather than silently producing ambiguous model-run keys. +- `build_model_run_key(...)` is a suggested-key helper for consistency checks and + new naming rules; the declared `model_run_key` remains the durable identity. +- Do not add duplicate `model_run_key`s. `MODEL_RUNS = create_model_runs_list(...)` validates uniqueness. +- `MODEL_RUNS` is the historical registry. `ACTIVE_MODEL_RUNS` is derived from + it by dropping runs whose base `Model` has `active=False`. +- Add unit tests for new naming behavior, registry inclusion, and routed provider options when relevant. 
+ +## Artificial Analysis Model Runs + +When adding an Artificial Analysis-backed model run: + +- Use the checked-in Artificial Analysis snapshot as the source for the stable AA model ID and displayed AA name. +- Refresh the snapshot from the AA endpoint; do not hand-edit individual AA models into the JSON file. +- The official AA API key is `API_KEY_ARTIFICIAL_ANALYSIS` in GCP Secret Manager. +- Do not hard-code an AA display name in a `ModelRun`; set `artificial_analysis_id` and let the run read the display name from the snapshot. +- Do not add an `artificial_analysis_model` flag. A non-null `artificial_analysis_id` is the marker that a run is AA-backed. +- Add or update the canonical base `Model` only if the provider-callable model is missing from `utils.llm.model_registry`. +- Add the callable model-plus-options declaration to + `ARTIFICIAL_ANALYSIS_MODEL_RUN_DECLARATIONS` in + `utils/llm/artificial_analysis_model_runs.py`. Every declaration there is + automatically included in `utils.llm.model_runs.MODEL_RUNS`; do not add the + same AA run manually to `MODEL_RUNS`. +- Use the exact provider option names that are passed at runtime. Token suffixes in model-run keys must reflect the actual token cap option used for the call. + +Artificial Analysis token caps should be encoded in the run options this way: + +- Non-reasoning models: use `16_384` output tokens, adjusted downward if the model has a smaller context window or a lower maximum output-token cap. +- Reasoning models: use the maximum output tokens allowed by the model creator for that reasoning configuration. +- If the correct cap is not clear from provider/model documentation or the AA metadata, stop and confirm rather than guessing. + +After adding an AA model run: + +- Add or update unit tests that prove the AA ID resolves from the snapshot and that `display_name` matches the AA leaderboard name. +- Add or update shared registry coverage tests for the new selectable model-run key. 
+- Run the focused model-run and AA metadata tests, then run the full lint/test suite before committing. + +## Validation + +- Run `make lint` before committing. It runs `isort .`, `black .`, `flake8 .`, + and `pydocstyle .`. +- Run `make test` before committing code changes. Use `PYTEST_ARGS=...` for a + focused test pass while iterating. +- Run `make test-integration` or `make test-integration-parallel` only when the + relevant provider/GCP credentials are available. + +## Live Model-Run Smoke Tests + +Integration tests that hit real LLM APIs require provider API keys. + +- `tests/conftest.py` loads `.env`, then `configure_api_keys(from_gcp=True)` when pytest is run with `--integration`. +- `configure_api_keys(from_gcp=True)` reads provider keys from GCP Secret Manager using the secret names in `utils/helpers/constants.py`. +- The standard LLM secret names are `API_KEY_OPENAI`, `API_KEY_ANTHROPIC`, `API_KEY_GEMINI`, `API_KEY_XAI`, and `API_KEY_TOGETHERAI`. +- To test a specific shared model run, set `LLM_MODEL_RUN_KEYS` to one or more comma-separated `model_run_key`s and run `pytest --integration tests/integration/llm/test_model_runs.py`. +- The model-run integration test calls `model_run.get_response`, so it uses the run's declared provider route, provider model ID, and options. +- For a newly added model run, prefer running its exact smoke test before assuming the provider accepts the declared options. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..43c994c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/Makefile b/Makefile index 47870e8..683afe5 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +PYTEST_ARGS ?= + lint: pyproject.toml setup.cfg isort . black . @@ -8,13 +10,13 @@ clean: find . 
-type f -name "*~" -exec rm -f {} + test: - pytest + pytest $(PYTEST_ARGS) test-integration: - pytest --integration + pytest --integration $(PYTEST_ARGS) test-integration-parallel: - pytest --integration -n auto + pytest --integration -n auto $(PYTEST_ARGS) coverage: - pytest --cov=utils --cov-report=term-missing --cov-report=html \ No newline at end of file + pytest --cov=utils --cov-report=term-missing --cov-report=html $(PYTEST_ARGS) diff --git a/README.md b/README.md index 13a8594..c5b3475 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ uv add fri-utils ``` -from utils.llm.model_registry import configure_api_keys, MODELS +from utils.llm.model_registry import configure_api_keys, MODELS_BY_KEY # Input the API key for any model provider you like! configure_api_keys( @@ -40,7 +40,7 @@ configure_api_keys( # Call any model we support! # See the full list of supported models in `utils/llm/model_registry.py` -model = next(m for m in MODELS if m.id == "gemini-2.5-flash") +model = MODELS_BY_KEY["gemini-2.5-pro"] model.get_response("Hello") # > "Hello! How can I help you?" ``` @@ -62,6 +62,12 @@ Use option names supported by the respective provider (`utils/llm/providers`). If you don’t see an option you need, feel free to open a GitHub issue! +### Third-party metadata + +The shared LLM registry includes normalized metadata from Models.dev and +Artificial Analysis. See `THIRD_PARTY_NOTICES.md` for Models.dev license terms +and Artificial Analysis attribution. + ### Configuring keys from GCP Secret Manager @@ -71,7 +77,7 @@ If so, you can use the `from_gcp=True` shortcut to set your keys for all model p ``` configure_api_keys(from_gcp=True) # Configure all provider keys from GCP. 
-model = next(m for m in MODELS if m.id == "gpt-4.1-mini") +model = MODELS_BY_KEY["gpt-5-mini-2025-08-07"] response = model.get_response("Hello") ``` @@ -82,6 +88,7 @@ If you're setting up a Google Cloud Project, the API keys must be stored in Secr - `API_KEY_OPENAI` for OpenAI - `API_KEY_XAI` for xAI - `API_KEY_TOGETHERAI` for Together AI +- `API_KEY_ARTIFICIAL_ANALYSIS` for refreshing the Artificial Analysis metadata snapshot You can also check `utils/helpers/constants.py` for the complete list of secret names. diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md new file mode 100644 index 0000000..23e6c4a --- /dev/null +++ b/THIRD_PARTY_NOTICES.md @@ -0,0 +1,42 @@ +# Third-Party Notices + +This repository includes normalized metadata derived from third-party sources. + +## Models.dev + +The checked-in Models.dev snapshot is derived from https://models.dev/api.json +and the upstream repository https://github.com/anomalyco/models.dev + +Models.dev is licensed under the MIT License: + +```text +MIT License + +Copyright (c) 2025 models.dev + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +``` + +## Artificial Analysis + +The checked-in Artificial Analysis snapshot is derived from the Artificial +Analysis free API and is minimized to the stable model IDs and display names +used by this package. + +Attribution: Artificial Analysis, https://artificialanalysis.ai/. diff --git a/pyproject.toml b/pyproject.toml index 89f69ba..9f4e069 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta" [project] name = "fri-utils" -version = "0.1.0" +version = "0.2.0" description = "Utilities for the Forecasting Research Institute codebase." readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.14" license = { file = "LICENSE" } authors = [{ name = "Forecasting Research Institute" }] dependencies = [ - "google-genai==1.73.1", - "anthropic==0.97.0", - "together==2.11.0", - "openai==2.33.0", + "google-genai==2.7.0", + "anthropic==0.105.2", + "together==2.16.0", + "openai==2.40.0", "google-cloud-secret-manager>=2.20.0", "google-cloud-storage>=2.14.0", "python-dotenv>=1.0.0", @@ -22,13 +22,14 @@ dependencies = [ [project.optional-dependencies] dev = [ - "black", - "flake8", - "flake8-bugbear", - "isort", - "pydocstyle", - "pytest", - "pytest-cov", + "black==26.5.1", + "flake8==7.3.0", + "flake8-bugbear==25.11.29", + "isort==8.0.1", + "pydocstyle==6.3.0", + "pytest==9.0.3", + "pytest-cov==7.1.0", + "pytest-xdist==3.8.0", ] [tool.setuptools.packages.find] @@ -38,8 +39,15 @@ include = [ ] exclude = ["tests*", "htmlcov*", "venv*"] +[tool.setuptools] +license-files = ["LICENSE", "THIRD_PARTY_NOTICES.md"] + +[tool.setuptools.package-data] +"utils.llm.metadata" = ["*.json"] + [tool.black] line-length = 100 +target-version = ["py314"] 
[tool.pytest.ini_options] markers = [ diff --git a/requirements.txt b/requirements.txt index 3bb98ee..e7ab444 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1 @@ -google-genai==1.73.1 -anthropic==0.97.0 -together==2.11.0 -openai==2.33.0 -google-cloud-secret-manager>=2.20.0 -google-cloud-storage>=2.14.0 -python-dotenv>=1.0.0 -isort -black -flake8 -flake8-bugbear -pydocstyle -pytest -pytest-cov -pytest-xdist +.[dev] diff --git a/scripts/refresh_models_dev_metadata.py b/scripts/refresh_models_dev_metadata.py new file mode 100644 index 0000000..eed914c --- /dev/null +++ b/scripts/refresh_models_dev_metadata.py @@ -0,0 +1,299 @@ +"""Refresh the checked-in LLM metadata snapshots.""" + +import ast +import difflib +import json +import urllib.request +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from google.api_core import exceptions + +from utils.gcp.secret_manager import get_secret +from utils.helpers.constants import ARTIFICIAL_ANALYSIS_API_KEY_SECRET_NAME + +MODELS_DEV_URL = "https://models.dev/api.json" +ARTIFICIAL_ANALYSIS_URL = "https://artificialanalysis.ai/api/v2/data/llms/models" +DEFAULT_MODELS_DEV_OUTPUT_PATH = ( + Path(__file__).resolve().parents[1] / "utils" / "llm" / "metadata" / "models_dev_snapshot.json" +) +DEFAULT_MODEL_REGISTRY_PATH = ( + Path(__file__).resolve().parents[1] / "utils" / "llm" / "model_registry.py" +) +DEFAULT_ARTIFICIAL_ANALYSIS_OUTPUT_PATH = ( + Path(__file__).resolve().parents[1] + / "utils" + / "llm" + / "metadata" + / "artificial_analysis_snapshot.json" +) + +MODEL_FIELDS = ( + "id", + "name", + "release_date", +) + + +@dataclass(frozen=True, slots=True, order=True) +class ModelsDevReference: + """A provider/model reference into Models.dev.""" + + provider_id: str + model_id: str + + +def _sorted_dict(data: dict[str, Any]) -> dict[str, Any]: + """Return a copy of a dictionary with keys sorted recursively.""" + return {key: _sort_json_value(value) for key, value in 
sorted(data.items())} + + +def _sort_json_value(value: Any) -> Any: + """Return JSON-like data with dictionaries sorted recursively.""" + if isinstance(value, dict): + return _sorted_dict(value) + if isinstance(value, list): + return [_sort_json_value(item) for item in value] + return value + + +def _literal_string_keyword(call: ast.Call, keyword_name: str) -> str | None: + """Return a string literal keyword argument from an AST call, if present.""" + for keyword in call.keywords: + if keyword.arg == keyword_name and isinstance(keyword.value, ast.Constant): + value = keyword.value.value + if isinstance(value, str): + return value + return None + + +def read_models_dev_references_from_model_registry( + model_registry_path: Path = DEFAULT_MODEL_REGISTRY_PATH, +) -> frozenset[ModelsDevReference]: + """Read Models.dev references from the model registry without importing it.""" + tree = ast.parse(model_registry_path.read_text(), filename=str(model_registry_path)) + references = set() + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + if isinstance(node.func, ast.Name): + function_name = node.func.id + elif isinstance(node.func, ast.Attribute): + function_name = node.func.attr + else: + continue + if function_name != "ModelsDevReference": + continue + + provider_id = _literal_string_keyword(node, "provider_id") + model_id = _literal_string_keyword(node, "model_id") + if provider_id is None or model_id is None: + raise ValueError( + "ModelsDevReference calls in model_registry.py must use literal " + "provider_id and model_id keyword arguments." 
+ ) + references.add(ModelsDevReference(provider_id=provider_id, model_id=model_id)) + return frozenset(references) + + +def _format_model_suggestions( + *, + provider_id: str, + provider_data: dict[str, Any], + missing_model_id: str, +) -> str: + """Format nearby Models.dev model suggestions for an incorrect reference.""" + models = provider_data.get("models", {}) + candidates_by_key = { + model_id: f'{provider_id}/{model_id} name="{model_data.get("name", "")}"' + for model_id, model_data in models.items() + } + search_space = list(candidates_by_key) + search_space.extend( + model_data.get("name", "") for model_data in models.values() if model_data.get("name") + ) + close_values = difflib.get_close_matches( + missing_model_id, + search_space, + n=5, + cutoff=0.35, + ) + suggestions = [] + for value in close_values: + if value in candidates_by_key: + suggestions.append(candidates_by_key[value]) + continue + for model_id, model_data in models.items(): + if model_data.get("name") == value: + suggestions.append(candidates_by_key[model_id]) + break + + # Preserve order while de-duplicating suggestions found by ID and display name. + suggestions = list(dict.fromkeys(suggestions)) + if not suggestions: + return f"No nearby model IDs found for provider {provider_id}." + return "Possible matches:\n " + "\n ".join(suggestions) + + +def _raise_missing_models_dev_reference( + *, + api_response: dict[str, Any], + reference: ModelsDevReference, +) -> None: + """Raise a targeted error for a missing Models.dev reference.""" + provider_data = api_response.get(reference.provider_id) + if provider_data is None: + provider_suggestions = difflib.get_close_matches( + reference.provider_id, + list(api_response), + n=5, + cutoff=0.35, + ) + suffix = ( + "Possible provider IDs:\n " + "\n ".join(provider_suggestions) + if provider_suggestions + else "No nearby provider IDs found." 
+ ) + raise ValueError( + f"Missing Models.dev provider reference: {reference.provider_id}\n{suffix}" + ) + + suggestions = _format_model_suggestions( + provider_id=reference.provider_id, + provider_data=provider_data, + missing_model_id=reference.model_id, + ) + raise ValueError( + f"Missing Models.dev reference: {reference.provider_id}/{reference.model_id}\n" + f"{suggestions}" + ) + + +def normalize_models_dev_api_response( + api_response: dict[str, Any], + *, + models_dev_references: frozenset[ModelsDevReference], +) -> dict[str, Any]: + """Normalize a Models.dev API response into the checked-in snapshot shape.""" + providers = {} + references_by_provider: dict[str, list[ModelsDevReference]] = {} + for reference in sorted(models_dev_references): + references_by_provider.setdefault(reference.provider_id, []).append(reference) + + for provider_id, references in sorted(references_by_provider.items()): + provider_data = api_response.get(provider_id) + if provider_data is None: + _raise_missing_models_dev_reference( + api_response=api_response, + reference=references[0], + ) + + models = {} + provider_models = provider_data.get("models", {}) + for reference in sorted(references): + model_data = provider_models.get(reference.model_id) + if model_data is None: + _raise_missing_models_dev_reference( + api_response=api_response, + reference=reference, + ) + + normalized_model = {} + for field in MODEL_FIELDS: + if field in model_data: + value = model_data[field] + normalized_model[field] = ( + _sorted_dict(value) if isinstance(value, dict) else value + ) + models[reference.model_id] = normalized_model + providers[provider_id] = { + "id": provider_data["id"], + "name": provider_data["name"], + "models": models, + } + return { + "source": MODELS_DEV_URL, + "providers": providers, + } + + +def normalize_artificial_analysis_api_response(api_response: dict[str, Any]) -> dict[str, Any]: + """Normalize Artificial Analysis response fields needed at runtime.""" + return { + 
"source": ARTIFICIAL_ANALYSIS_URL, + "prompt_options": _sorted_dict(api_response.get("prompt_options") or {}), + "data": [ + {"id": model_data["id"], "name": model_data["name"]} + for model_data in sorted(api_response.get("data", []), key=lambda item: item["id"]) + ], + } + + +def fetch_models_dev_api_response() -> dict[str, Any]: + """Fetch the current Models.dev API response.""" + request = urllib.request.Request(MODELS_DEV_URL, headers={"User-Agent": "fri-utils"}) + with urllib.request.urlopen(request, timeout=30) as response: + return json.load(response) + + +def fetch_artificial_analysis_api_response() -> dict[str, Any]: + """Fetch the current Artificial Analysis LLM models API response.""" + try: + api_key = get_secret(ARTIFICIAL_ANALYSIS_API_KEY_SECRET_NAME) + except RuntimeError, exceptions.NotFound: + api_key = None + if not api_key: + raise RuntimeError( + f"Configure {ARTIFICIAL_ANALYSIS_API_KEY_SECRET_NAME} in GCP Secret Manager " + "to refresh the Artificial Analysis snapshot." 
+ ) + request = urllib.request.Request( + ARTIFICIAL_ANALYSIS_URL, + headers={ + "User-Agent": "fri-utils", + "x-api-key": api_key, + }, + ) + with urllib.request.urlopen(request, timeout=30) as response: + return json.load(response) + + +def write_json_snapshot(snapshot: dict[str, Any], output_path: Path) -> None: + """Write a deterministic JSON snapshot.""" + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(snapshot, indent=2, sort_keys=True) + "\n") + + +def write_models_dev_snapshot(output_path: Path = DEFAULT_MODELS_DEV_OUTPUT_PATH) -> None: + """Fetch, normalize, and write the Models.dev snapshot.""" + snapshot = normalize_models_dev_api_response( + fetch_models_dev_api_response(), + models_dev_references=read_models_dev_references_from_model_registry(), + ) + write_json_snapshot(snapshot, output_path) + + +def write_artificial_analysis_snapshot( + output_path: Path = DEFAULT_ARTIFICIAL_ANALYSIS_OUTPUT_PATH, +) -> None: + """Fetch, normalize, and write the Artificial Analysis snapshot.""" + snapshot = normalize_artificial_analysis_api_response(fetch_artificial_analysis_api_response()) + write_json_snapshot(snapshot, output_path) + + +def write_snapshots() -> None: + """Fetch, normalize, and write all checked-in LLM metadata snapshots.""" + models_dev_snapshot = normalize_models_dev_api_response( + fetch_models_dev_api_response(), + models_dev_references=read_models_dev_references_from_model_registry(), + ) + artificial_analysis_snapshot = normalize_artificial_analysis_api_response( + fetch_artificial_analysis_api_response() + ) + write_json_snapshot(models_dev_snapshot, DEFAULT_MODELS_DEV_OUTPUT_PATH) + write_json_snapshot(artificial_analysis_snapshot, DEFAULT_ARTIFICIAL_ANALYSIS_OUTPUT_PATH) + + +if __name__ == "__main__": + write_snapshots() diff --git a/tests/integration/llm/providers/test_anthropic.py b/tests/integration/llm/providers/test_anthropic.py index 4b62795..7510368 100644 --- 
a/tests/integration/llm/providers/test_anthropic.py +++ b/tests/integration/llm/providers/test_anthropic.py @@ -1,17 +1,13 @@ """Integration tests for Anthropic model helpers.""" -from __future__ import annotations - import pytest import utils.llm.providers.anthropic as anthropic_module # type: ignore[import] -from tests.integration.helpers import ( - assert_capital_of_france, -) +from tests.integration.helpers import assert_capital_of_france from utils.llm.model_registry import MODELS, Model # type: ignore[import] ANTHROPIC_MODEL: Model | None = next( - (model for model in MODELS if model.id == "claude-sonnet-4-6"), None + (model for model in MODELS if model.model_key == "claude-sonnet-4-6"), None ) assert ANTHROPIC_MODEL is not None @@ -30,7 +26,7 @@ def test_anthropic_provider_get_response_live_call(): provider = anthropic_module.AnthropicProvider(api_key=api_key) assert_capital_of_france( lambda prompt: provider.get_response( - model_id=ANTHROPIC_MODEL.full_name, + model_id=ANTHROPIC_MODEL.provider_model_id, prompt=prompt, options={"temperature": 0, "max_tokens": 16}, ) diff --git a/tests/integration/llm/providers/test_google.py b/tests/integration/llm/providers/test_google.py index fe87c7d..23b2f90 100644 --- a/tests/integration/llm/providers/test_google.py +++ b/tests/integration/llm/providers/test_google.py @@ -1,7 +1,5 @@ """Integration tests for Google Gemini model helpers.""" -from __future__ import annotations - import pytest import utils.llm.providers.google as google_module # type: ignore[import] @@ -9,7 +7,7 @@ from utils.llm.model_registry import MODELS, Model # type: ignore[import] GOOGLE_MODEL: Model | None = next( - (model for model in MODELS if model.id == "gemini-2.5-flash"), None + (model for model in MODELS if model.model_key == "gemini-2.5-pro"), None ) assert GOOGLE_MODEL is not None @@ -28,7 +26,7 @@ def test_google_provider_get_response_live_call(): provider = google_module.GoogleProvider(api_key=api_key) assert_capital_of_france( lambda 
prompt: provider.get_response( - model_id=GOOGLE_MODEL.full_name, + model_id=GOOGLE_MODEL.provider_model_id, prompt=prompt, options={"temperature": 0}, ) diff --git a/tests/integration/llm/providers/test_openai.py b/tests/integration/llm/providers/test_openai.py index c72059f..8cec41b 100644 --- a/tests/integration/llm/providers/test_openai.py +++ b/tests/integration/llm/providers/test_openai.py @@ -1,7 +1,5 @@ """Integration tests for OpenAI model helpers.""" -from __future__ import annotations - import pytest import utils.llm.providers.openai as openai_module # type: ignore[import] @@ -9,7 +7,7 @@ from utils.llm.model_registry import MODELS, Model # type: ignore[import] OPENAI_MODEL: Model | None = next( - (model for model in MODELS if model.id == "gpt-5-2025-08-07"), None + (model for model in MODELS if model.model_key == "gpt-5-mini-2025-08-07"), None ) assert OPENAI_MODEL is not None @@ -28,7 +26,7 @@ def test_openai_provider_get_response_live_call(): provider = openai_module.OpenAIProvider(api_key=api_key) assert_capital_of_france( lambda prompt: provider.get_response( - model_id=OPENAI_MODEL.full_name, + model_id=OPENAI_MODEL.provider_model_id, prompt=prompt, options={"max_output_tokens": 256}, ) diff --git a/tests/integration/llm/providers/test_together.py b/tests/integration/llm/providers/test_together.py index 092284a..49eb69a 100644 --- a/tests/integration/llm/providers/test_together.py +++ b/tests/integration/llm/providers/test_together.py @@ -1,7 +1,5 @@ """Integration tests for Together AI model helpers.""" -from __future__ import annotations - import pytest import utils.llm.providers.together as together_module # type: ignore[import] @@ -9,9 +7,10 @@ from utils.llm.model_registry import MODELS, Model # type: ignore[import] TOGETHER_MODEL: Model | None = next( - (model for model in MODELS if model.id == "GLM-4.5-Air-FP8"), None + (model for model in MODELS if model.model_key == "minimax-m2.7"), None ) assert TOGETHER_MODEL is not None +assert 
TOGETHER_MODEL.active is True @pytest.mark.integration @@ -28,7 +27,7 @@ def test_together_provider_get_response_live_call(): provider = together_module.TogetherProvider(api_key=api_key) assert_capital_of_france( lambda prompt: provider.get_response( - model_id=TOGETHER_MODEL.full_name, + model_id=TOGETHER_MODEL.provider_model_id, prompt=prompt, options={"temperature": 0, "max_tokens": 256}, ) diff --git a/tests/integration/llm/providers/test_xai.py b/tests/integration/llm/providers/test_xai.py index cbb1f06..393c45a 100644 --- a/tests/integration/llm/providers/test_xai.py +++ b/tests/integration/llm/providers/test_xai.py @@ -1,14 +1,12 @@ """Integration tests for xAI model helpers.""" -from __future__ import annotations - import pytest import utils.llm.providers.xai as xai_module # type: ignore[import] from tests.integration.helpers import assert_capital_of_france # type: ignore[import] from utils.llm.model_registry import MODELS, Model # type: ignore[import] -XAI_MODEL: Model | None = next((model for model in MODELS if model.id == "grok-4-0709"), None) +XAI_MODEL: Model | None = next((model for model in MODELS if model.model_key == "grok-4.3"), None) assert XAI_MODEL is not None @@ -26,7 +24,7 @@ def test_xai_provider_get_response_live_call(): provider = xai_module.XAIProvider(api_key=api_key) assert_capital_of_france( lambda prompt: provider.get_response( - model_id=XAI_MODEL.full_name, + model_id=XAI_MODEL.provider_model_id, prompt=prompt, options={"temperature": 0}, ) diff --git a/tests/integration/llm/test_model_registry.py b/tests/integration/llm/test_model_registry.py index 53e5f80..491a186 100644 --- a/tests/integration/llm/test_model_registry.py +++ b/tests/integration/llm/test_model_registry.py @@ -1,35 +1,51 @@ -"""Integration tests that validate every registry model can be invoked.""" - -from __future__ import annotations +"""Integration tests that validate representative registry models can be invoked.""" import pytest -from utils.llm.model_registry 
import MODELS, Model # type: ignore[import] -from utils.llm.providers.anthropic import AnthropicProvider # type: ignore[import] -from utils.llm.providers.google import GoogleProvider # type: ignore[import] -from utils.llm.providers.openai import OpenAIProvider # type: ignore[import] -from utils.llm.providers.together import TogetherProvider # type: ignore[import] -from utils.llm.providers.xai import XAIProvider # type: ignore[import] +from utils.llm.model_registry import MODELS_BY_KEY, Model # type: ignore[import] +from utils.llm.provider_registry import PROVIDERS # type: ignore[import] from ..helpers import assert_capital_of_france +SMOKE_TEST_MODEL_KEYS = [ + "gpt-5-mini-2025-08-07", + "claude-sonnet-4-6", + "minimax-m2.7", + "grok-4.3", + "gemini-2.5-pro", +] + def _minimal_options_for_model(model: Model) -> dict: - if model.provider_cls is AnthropicProvider: - return {"max_tokens": 16} - if model.provider_cls is OpenAIProvider: - return {"max_output_tokens": 16} - if model.provider_cls in {TogetherProvider, XAIProvider}: + if model.provider == PROVIDERS["Anthropic"]: return {"max_tokens": 16} - if model.provider_cls is GoogleProvider: + if model.provider == PROVIDERS["OpenAI"]: + return {"max_output_tokens": 256} + if model.provider == PROVIDERS["Together"]: + return {"temperature": 0, "max_tokens": 256} + if model.provider == PROVIDERS["xAI"]: + return {"temperature": 0} + if model.provider == PROVIDERS["Google"]: return {} return {} +def test_together_smoke_options_leave_room_for_answer_text(): + """Keep routed Together smoke calls aligned with the provider smoke path.""" + model = MODELS_BY_KEY["minimax-m2.7"] + + assert _minimal_options_for_model(model) == {"temperature": 0, "max_tokens": 256} + + @pytest.mark.integration -@pytest.mark.parametrize("model", MODELS, ids=lambda item: item.id) +@pytest.mark.parametrize( + "model", + [MODELS_BY_KEY[model_key] for model_key in SMOKE_TEST_MODEL_KEYS], + ids=lambda item: item.model_key, +) def 
test_registered_model_live_call(model: Model): - """Each model entry should be callable via its registered provider.""" + """Representative active model entries should be callable via their providers.""" + assert model.active is True assert_capital_of_france( lambda prompt: model.get_response( prompt, diff --git a/tests/integration/llm/test_model_runs.py b/tests/integration/llm/test_model_runs.py new file mode 100644 index 0000000..98f11b8 --- /dev/null +++ b/tests/integration/llm/test_model_runs.py @@ -0,0 +1,30 @@ +"""Integration smoke tests for shared LLM model runs.""" + +import os + +import pytest + +from utils.llm import model_runs + +from ..helpers import assert_capital_of_france + +DEFAULT_SMOKE_MODEL_RUN_KEYS = ("gpt-5-mini-2025-08-07-1024",) + + +def _selected_model_run_keys() -> tuple[str, ...]: + """Return model-run keys selected for live smoke testing.""" + raw_keys = os.getenv("LLM_MODEL_RUN_KEYS") + if not raw_keys: + return DEFAULT_SMOKE_MODEL_RUN_KEYS + return tuple(key.strip() for key in raw_keys.split(",") if key.strip()) + + +@pytest.mark.integration +@pytest.mark.parametrize( + "model_run", + model_runs.select_model_runs(_selected_model_run_keys()), + ids=lambda run: run.model_run_key, +) +def test_model_run_live_call(model_run: model_runs.ModelRun): + """A shared model run should be callable with its declared provider options.""" + assert_capital_of_france(model_run.get_response) diff --git a/tests/unit/test_artificial_analysis_metadata.py b/tests/unit/test_artificial_analysis_metadata.py new file mode 100644 index 0000000..6d71195 --- /dev/null +++ b/tests/unit/test_artificial_analysis_metadata.py @@ -0,0 +1,226 @@ +"""Tests for the checked-in Artificial Analysis metadata snapshot.""" + +import tomllib +from pathlib import Path + +import pytest + +from scripts import refresh_models_dev_metadata +from utils.llm.metadata import artificial_analysis + +ROOT_DIR = Path(__file__).resolve().parents[2] + + +def
test_load_artificial_analysis_snapshot_exposes_model_metadata(): + """Load the snapshot and expose normalized model fields.""" + snapshot = artificial_analysis.load_artificial_analysis_snapshot() + + model = snapshot.get_model("2dad8957-4c16-4e74-bf2d-8b21514e0ae9") + opus_adaptive = snapshot.get_model("e9a09db3-8fd6-41dd-ba2f-20e0a2bff7f2") + opus_non_reasoning = snapshot.get_model("2fa8e143-77a8-4d05-bfa8-d3b54634c00f") + + assert snapshot.source == refresh_models_dev_metadata.ARTIFICIAL_ANALYSIS_URL + assert snapshot.prompt_options == {"parallel_queries": 1, "prompt_length": 1000} + assert model.id == "2dad8957-4c16-4e74-bf2d-8b21514e0ae9" + assert model.name == "o3-mini" + assert opus_adaptive.name == "Claude Opus 4.7 (Adaptive Reasoning, Max Effort)" + assert opus_non_reasoning.name == "Claude Opus 4.7 (Non-reasoning, High Effort)" + + +def test_artificial_analysis_snapshot_rejects_unknown_model(): + """Raise clear lookup errors for unknown Artificial Analysis model IDs.""" + snapshot = artificial_analysis.load_artificial_analysis_snapshot() + + with pytest.raises(KeyError, match="Unknown Artificial Analysis model_id missing-model"): + snapshot.get_model("missing-model") + + +def test_load_artificial_analysis_snapshot_supports_endpoint_dump_shape(tmp_path): + """Load AA model names from a generated full endpoint snapshot.""" + snapshot_path = tmp_path / "artificial_analysis_snapshot.json" + snapshot_path.write_text(""" +{ + "data": [ + { + "id": "opus-aa-id", + "name": "Claude Opus 4.7 (Adaptive Reasoning, Max Effort)", + "slug": "claude-opus-4-7" + } + ], + "prompt_options": { + "parallel_queries": 1, + "prompt_length": "medium" + }, + "source": "https://artificialanalysis.ai/api/v2/data/llms/models", + "status": 200 +} +""".strip()) + + snapshot = artificial_analysis.load_artificial_analysis_snapshot(snapshot_path) + + assert snapshot.get_model("opus-aa-id").name == ( + "Claude Opus 4.7 (Adaptive Reasoning, Max Effort)" + ) + + +def 
test_normalize_artificial_analysis_api_response_keeps_only_runtime_fields(): + """Keep only AA fields needed for stable IDs, display names, and attribution.""" + api_response = { + "status": 200, + "prompt_options": {"prompt_length": "medium", "parallel_queries": 1}, + "data": [ + { + "id": "z-model", + "name": "Z Model", + "slug": "z-model", + "model_creator": {"slug": "z-lab", "name": "Z Lab", "id": "z"}, + "evaluations": {"score_b": 2, "score_a": 1}, + "pricing": {"output": 2.5, "input": 1.5}, + "median_output_tokens_per_second": 12.5, + "median_time_to_first_token_seconds": 3.5, + "median_time_to_first_answer_token": 3.5, + "ignored": "drop me", + }, + { + "id": "a-model", + "name": "A Model", + "slug": "a-model", + "model_creator": {"id": "a", "name": "A Lab", "slug": "a-lab"}, + "evaluations": {}, + "pricing": {}, + }, + ], + } + + normalized = refresh_models_dev_metadata.normalize_artificial_analysis_api_response( + api_response + ) + + assert normalized["source"] == refresh_models_dev_metadata.ARTIFICIAL_ANALYSIS_URL + assert normalized["prompt_options"] == {"parallel_queries": 1, "prompt_length": "medium"} + assert [model["id"] for model in normalized["data"]] == ["a-model", "z-model"] + assert normalized["data"][1] == {"id": "z-model", "name": "Z Model"} + + +def test_checked_in_artificial_analysis_snapshot_is_minimal(): + """Do not redistribute the full AA endpoint response in package data.""" + snapshot = refresh_models_dev_metadata.json.loads(artificial_analysis.SNAPSHOT_PATH.read_text()) + + assert set(snapshot) == {"data", "prompt_options", "source"} + assert all(set(model) == {"id", "name"} for model in snapshot["data"]) + + +def test_artificial_analysis_snapshot_refresh_uses_gcp_secret(monkeypatch): + """Use the official GCP Secret Manager key for AA metadata refreshes.""" + monkeypatch.setattr( + refresh_models_dev_metadata, + "get_secret", + lambda secret_name: "gcp-aa-key", + ) + + request_headers = {} + + class FakeResponse: + def 
__enter__(self): + return self + + def __exit__(self, *args): + return None + + def fake_urlopen(request, timeout): + request_headers.update(request.headers) + assert timeout == 30 + return FakeResponse() + + monkeypatch.setattr(refresh_models_dev_metadata.urllib.request, "urlopen", fake_urlopen) + monkeypatch.setattr(refresh_models_dev_metadata.json, "load", lambda response: {"data": []}) + + assert refresh_models_dev_metadata.fetch_artificial_analysis_api_response() == {"data": []} + assert request_headers["X-api-key"] == "gcp-aa-key" + + +def test_artificial_analysis_snapshot_refresh_requires_api_key(monkeypatch): + """Require the GCP Secret Manager key when it is unavailable.""" + monkeypatch.setattr( + refresh_models_dev_metadata, + "get_secret", + lambda secret_name: (_ for _ in ()).throw(RuntimeError("GCP unavailable")), + ) + + with pytest.raises(RuntimeError, match="API_KEY_ARTIFICIAL_ANALYSIS"): + refresh_models_dev_metadata.fetch_artificial_analysis_api_response() + + +def test_write_snapshots_updates_models_dev_and_artificial_analysis(monkeypatch, tmp_path): + """Write both LLM metadata snapshots from the shared refresh entrypoint.""" + models_dev_output = tmp_path / "models_dev_snapshot.json" + artificial_analysis_output = tmp_path / "artificial_analysis_snapshot.json" + monkeypatch.setattr( + refresh_models_dev_metadata, + "DEFAULT_MODELS_DEV_OUTPUT_PATH", + models_dev_output, + ) + monkeypatch.setattr( + refresh_models_dev_metadata, + "DEFAULT_ARTIFICIAL_ANALYSIS_OUTPUT_PATH", + artificial_analysis_output, + ) + monkeypatch.setattr( + refresh_models_dev_metadata, + "fetch_models_dev_api_response", + lambda: { + "openai": { + "id": "openai", + "name": "OpenAI", + "models": { + "gpt-test": { + "id": "gpt-test", + "name": "GPT Test", + } + }, + } + }, + ) + monkeypatch.setattr( + refresh_models_dev_metadata, + "read_models_dev_references_from_model_registry", + lambda: frozenset( + { + refresh_models_dev_metadata.ModelsDevReference( + 
provider_id="openai", + model_id="gpt-test", + ) + } + ), + ) + monkeypatch.setattr( + refresh_models_dev_metadata, + "fetch_artificial_analysis_api_response", + lambda: { + "status": 200, + "data": [ + { + "id": "aa-test", + "name": "AA Test", + "slug": "aa-test", + "model_creator": {"id": "creator", "name": "Creator"}, + } + ], + }, + ) + + refresh_models_dev_metadata.write_snapshots() + + assert models_dev_output.exists() + assert artificial_analysis_output.exists() + assert "gpt-test" in models_dev_output.read_text() + assert "aa-test" in artificial_analysis_output.read_text() + + +def test_artificial_analysis_snapshot_is_included_as_package_data(): + """Include the JSON snapshot when utils is installed as a package.""" + pyproject = tomllib.loads((ROOT_DIR / "pyproject.toml").read_text()) + + package_data = pyproject["tool"]["setuptools"]["package-data"] + + assert package_data["utils.llm.metadata"] == ["*.json"] diff --git a/tests/unit/test_llm_model_runs.py b/tests/unit/test_llm_model_runs.py new file mode 100644 index 0000000..a247fb3 --- /dev/null +++ b/tests/unit/test_llm_model_runs.py @@ -0,0 +1,691 @@ +"""Unit tests for shared LLM model-run declarations.""" + +import ast +from datetime import date +from pathlib import Path +from unittest.mock import patch + +import pytest + +from utils.llm.lab_registry import LABS +from utils.llm.provider_registry import PROVIDERS + + +def test_model_keys_are_unique_and_file_safe(): + """Keep base model keys unique and safe for downstream identifiers.""" + from utils.llm import model_registry + + model_keys = [model.model_key for model in model_registry.MODELS] + + assert len(model_keys) == len(set(model_keys)) + assert all(key == key.lower() for key in model_keys) + assert all(" " not in key and "/" not in key and "_" not in key for key in model_keys) + + +def test_model_key_and_provider_model_id_are_distinct_for_together_models(): + """Keep canonical model keys separate from routed provider model IDs.""" + from 
utils.llm import model_registry + + model = model_registry.MODELS_BY_KEY["deepseek-v3.1"] + + assert model.model_key == "deepseek-v3.1" + assert model.provider_model_id == "deepseek-ai/DeepSeek-V3.1" + assert model.lab == LABS["DeepSeek"] + assert model.provider == PROVIDERS["Together"] + assert model.release_date == date(2025, 8, 21) + assert model.active is False + assert not hasattr(model, "token_limit") + assert not hasattr(model, "provider_cls") + + +def test_forecastbench_origin_main_models_are_in_canonical_registry(): + """Include recent ForecastBench origin/main models in the shared registry.""" + from utils.llm import model_registry + + expected_models = { + "deepseek-v4-pro": ( + "deepseek-ai/DeepSeek-V4-Pro", + LABS["DeepSeek"], + PROVIDERS["Together"], + date(2026, 4, 24), + ), + "gemini-3.5-flash": ( + "gemini-3.5-flash", + LABS["Google DeepMind"], + PROVIDERS["Google"], + date(2026, 5, 19), + ), + } + + for model_key, ( + provider_model_id, + lab, + provider, + release_date, + ) in expected_models.items(): + model = model_registry.MODELS_BY_KEY[model_key] + assert model.provider_model_id == provider_model_id + assert model.lab == lab + assert model.provider == provider + assert model.release_date == release_date + + +def test_model_release_date_resolves_from_models_dev_metadata(): + """Resolve model release dates from configured Models.dev metadata.""" + from utils.llm import model_registry + + model = model_registry.MODELS_BY_KEY["gpt-4o-2024-11-20"] + + assert "release_date" not in model_registry.Model.__dataclass_fields__ + assert "models_dev_provider_id" not in model_registry.Model.__dataclass_fields__ + assert "models_dev_model_id" not in model_registry.Model.__dataclass_fields__ + assert model.models_dev_reference == model_registry.ModelsDevReference( + provider_id="openai", + model_id="gpt-4o-2024-11-20", + ) + assert model.models_dev_metadata is not None + assert model.release_date == model.models_dev_metadata.release_date + assert 
model.release_date == date(2024, 11, 20) + + +def test_model_api_provider_route_is_independent_from_models_dev_provider(): + """Keep API routing separate from the Models.dev metadata provider.""" + from utils.llm import model_registry + + model = model_registry.MODELS_BY_KEY["glm-5.1"] + + assert model.provider == PROVIDERS["Together"] + assert model.provider_model_id == "zai-org/GLM-5.1" + assert model.models_dev_reference == model_registry.ModelsDevReference( + provider_id="zai", + model_id="glm-5.1", + ) + assert model.release_date == date(2026, 3, 27) + + +def test_models_without_models_dev_metadata_use_manual_release_dates(): + """Use manual release dates only when Models.dev metadata is unavailable.""" + from utils.llm import model_registry + + model = model_registry.MODELS_BY_KEY["gpt-4-0613"] + + assert model.models_dev_metadata is None + assert model.manual_release_date == date(2023, 6, 13) + assert model.release_date == date(2023, 6, 13) + + +def test_provider_specific_model_helpers_default_lab_provider_and_provider_model_id(): + """Use helper constructors to avoid repeated provider and lab boilerplate.""" + from utils.llm import model_registry + + model = model_registry.openai_model( + model_key="gpt-test", + models_dev_reference=model_registry.ModelsDevReference( + provider_id="openai", + model_id="gpt-4o", + ), + ) + + assert model.model_key == "gpt-test" + assert model.provider_model_id == "gpt-test" + assert model.lab == LABS["OpenAI"] + assert model.provider == PROVIDERS["OpenAI"] + assert model.models_dev_reference == model_registry.ModelsDevReference( + provider_id="openai", + model_id="gpt-4o", + ) + assert model.active is True + + +def test_together_model_helper_keeps_lab_and_route_explicit(): + """Keep Together creator lab and provider route explicit while reducing noise.""" + from utils.llm import model_registry + + model = model_registry.together_model( + model_key="glm-5.1", + provider_model_id="zai-org/GLM-5.1", + lab_key="Z.ai", + 
models_dev_reference=model_registry.ModelsDevReference( + provider_id="zai", + model_id="glm-5.1", + ), + ) + + assert model.lab == LABS["Z.ai"] + assert model.provider == PROVIDERS["Together"] + assert model.provider_model_id == "zai-org/GLM-5.1" + assert model.release_date == date(2026, 3, 27) + + +def test_model_without_models_dev_or_manual_release_date_fails_on_initialization(): + """Fail on construction when a model lacks a release date source.""" + from utils.llm import model_registry + + with pytest.raises(ValueError, match="missing-date-model"): + model_registry.Model( + model_key="missing-date-model", + provider_model_id="missing-date-model", + lab=LABS["OpenAI"], + provider=PROVIDERS["OpenAI"], + ) + + +def test_model_rejects_missing_models_dev_reference_on_initialization(): + """Reject model declarations whose Models.dev reference is not in the snapshot.""" + from utils.llm import model_registry + + with pytest.raises(ValueError, match="bad-reference-model"): + model_registry.openai_model( + model_key="bad-reference-model", + models_dev_reference=model_registry.ModelsDevReference( + provider_id="openai", + model_id="missing-model", + ), + ) + + +def test_openai_dated_model_uses_dated_provider_model_id(): + """Use fixed dated OpenAI provider model IDs instead of moving aliases.""" + from utils.llm import model_registry + + model = model_registry.MODELS_BY_KEY["gpt-4o-mini-2024-07-18"] + + assert model.provider_model_id == "gpt-4o-mini-2024-07-18" + + +def test_model_registry_models_are_grouped_by_provider(): + """Build the shared model registry from provider-specific groups.""" + from utils.llm import model_registry + + assert model_registry.MODELS == [ + *model_registry.OPENAI_MODELS, + *model_registry.TOGETHER_MODELS, + *model_registry.ANTHROPIC_MODELS, + *model_registry.XAI_MODELS, + *model_registry.GOOGLE_MODELS, + ] + assert {model.provider.name for model in model_registry.OPENAI_MODELS} == {"OpenAI"} + assert {model.provider.name for model in 
model_registry.TOGETHER_MODELS} == {"Together"} + assert {model.provider.name for model in model_registry.ANTHROPIC_MODELS} == {"Anthropic"} + assert {model.provider.name for model in model_registry.XAI_MODELS} == {"xAI"} + assert {model.provider.name for model in model_registry.GOOGLE_MODELS} == {"Google"} + + +def test_model_registry_provider_groups_are_sorted_by_release_date(): + """Keep provider-specific model groups sorted by release date.""" + from utils.llm import model_registry + + provider_groups = [ + model_registry.OPENAI_MODELS, + model_registry.TOGETHER_MODELS, + model_registry.ANTHROPIC_MODELS, + model_registry.XAI_MODELS, + model_registry.GOOGLE_MODELS, + ] + + for models in provider_groups: + release_order = [(model.release_date, model.model_key) for model in models] + assert release_order == sorted(release_order) + + +def test_create_models_list_rejects_duplicate_model_keys(): + """Reject duplicate model keys when creating the full model registry list.""" + from utils.llm import model_registry + + model = model_registry.MODELS_BY_KEY["gpt-4-0613"] + + with pytest.raises(ValueError, match="Duplicate LLM model_key: gpt-4-0613"): + model_registry.create_models_list([model, model]) + + +def test_model_runs_use_canonical_model_registry_objects(): + """Store canonical model registry objects on shared model runs.""" + from utils.llm import model_registry, model_runs + + run = model_runs.MODEL_RUNS_BY_KEY["deepseek-v3.1"] + + assert run.model is model_registry.MODELS_BY_KEY["deepseek-v3.1"] + + +def test_o3_mini_model_run_is_not_artificial_analysis_backed(): + """Keep o3-mini selectable without treating it as an AA-backed run.""" + from utils.llm import model_runs + + run = model_runs.MODEL_RUNS_BY_KEY["o3-mini-2025-01-31"] + + assert run.model_run_key == "o3-mini-2025-01-31" + assert run.artificial_analysis_id is None + assert run.display_name == "o3-mini-2025-01-31" + + +def test_model_run_constructor_requires_explicit_model_run_key(): + """Do not allow 
ModelRun keys to be generated implicitly.""" + from utils.llm import model_registry, model_runs + + with pytest.raises(TypeError): + model_runs.ModelRun( + model=model_registry.MODELS_BY_KEY["gpt-5.5-2026-04-23"], + ) + + +def test_model_run_declarations_use_literal_model_run_keys(): + """Keep shared model-run keys handwritten at declaration sites.""" + from utils.llm import model_runs + + source = Path(model_runs.__file__).read_text() + tree = ast.parse(source) + + calls = [ + node + for node in ast.walk(tree) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "_model_run" + ] + + assert calls + for call in calls: + keyword = next( + (keyword for keyword in call.keywords if keyword.arg == "model_run_key"), + None, + ) + assert keyword is not None + assert isinstance(keyword.value, ast.Constant) + assert isinstance(keyword.value.value, str) + + +def test_artificial_analysis_model_runs_are_declared_in_dedicated_module(): + """Keep AA-backed model runs separate from the main registry list.""" + from utils.llm import artificial_analysis_model_runs, model_runs + + declaration_keys = [ + declaration["model_run_key"] + for declaration in artificial_analysis_model_runs.ARTIFICIAL_ANALYSIS_MODEL_RUN_DECLARATIONS + ] + registry_keys = [run.model_run_key for run in model_runs.ARTIFICIAL_ANALYSIS_MODEL_RUNS] + aa_keys_in_model_runs = [ + run.model_run_key for run in model_runs.MODEL_RUNS if run.artificial_analysis_id is not None + ] + + assert registry_keys == declaration_keys + assert aa_keys_in_model_runs == declaration_keys + assert "o3-mini-2025-01-31" not in declaration_keys + assert all(isinstance(key, str) for key in declaration_keys) + assert all( + declaration["artificial_analysis_id"] + for declaration in artificial_analysis_model_runs.ARTIFICIAL_ANALYSIS_MODEL_RUN_DECLARATIONS + ) + + +def test_artificial_analysis_opus_runs_use_aa_display_names_and_token_caps(): + """Align Opus AA model runs with AA display names and token 
cap conventions.""" + from utils.llm import model_runs + + non_reasoning = model_runs.MODEL_RUNS_BY_KEY["claude-opus-4-7-high-16384"] + adaptive = model_runs.MODEL_RUNS_BY_KEY["claude-opus-4-7-adaptive-thinking-max-128000"] + + assert non_reasoning.artificial_analysis_id == "2fa8e143-77a8-4d05-bfa8-d3b54634c00f" + assert non_reasoning.display_name == "Claude Opus 4.7 (Non-reasoning, High Effort)" + assert non_reasoning.options == { + "max_tokens": 16384, + "output_config": {"effort": "high"}, + } + assert adaptive.artificial_analysis_id == "e9a09db3-8fd6-41dd-ba2f-20e0a2bff7f2" + assert adaptive.display_name == "Claude Opus 4.7 (Adaptive Reasoning, Max Effort)" + assert adaptive.options == { + "max_tokens": 128000, + "output_config": {"effort": "max"}, + "thinking": {"type": "adaptive"}, + } + + +def test_artificial_analysis_model_runs_require_snapshot_ids(): + """Reject AA-backed model runs that reference missing snapshot IDs.""" + from utils.llm import model_registry, model_runs + + with pytest.raises(ValueError, match="Artificial Analysis"): + model_runs.ModelRun( + model_run_key="o3-mini-2025-01-31", + model=model_registry.MODELS_BY_KEY["o3-mini-2025-01-31"], + artificial_analysis_id="missing-aa-model", + ) + + +@pytest.mark.parametrize( + ("model_key", "options", "expected_key"), + [ + ("gpt-5.5-2026-04-23", {}, "gpt-5.5-2026-04-23"), + ( + "gpt-5.5-2026-04-23", + {"reasoning": {"effort": "high"}}, + "gpt-5.5-2026-04-23-high", + ), + ( + "gpt-5.5-2026-04-23", + {"reasoning": {"effort": "high"}, "tools": [{"type": "web_search"}]}, + "gpt-5.5-2026-04-23-high-web-search", + ), + ( + "claude-opus-4-7", + { + "max_tokens": 64000, + "output_config": {"effort": "high"}, + "thinking": {"type": "adaptive"}, + "tools": [ + { + "type": "web_search_20260209", + "name": "web_search", + "max_uses": 5, + } + ], + }, + "claude-opus-4-7-adaptive-thinking-high-web-search-64000", + ), + ( + "grok-4.20-0309-reasoning", + { + "tools": [{"type": "web_search"}, {"type": 
"x_search"}], + "max_tokens": 10000, + }, + "grok-4.20-0309-reasoning-web-search-x-search-10000", + ), + ("deepseek-v3.1", {"max_tokens": 10000}, "deepseek-v3.1-10000"), + ], +) +def test_model_run_key_is_generated_from_name_relevant_options(model_key, options, expected_key): + """Generate stable model-run keys from name-relevant options.""" + from utils.llm import model_runs + + assert model_runs.build_model_run_key(model_key, options) == expected_key + + +def test_name_neutral_options_do_not_appear_in_model_run_key(): + """Exclude name-neutral options from generated model-run keys.""" + from utils.llm import model_runs + + assert ( + model_runs.build_model_run_key( + "gemini-3.1-pro-preview", + { + "candidate_count": 1, + "temperature": 0, + "automatic_function_calling": {"disable": True}, + }, + ) + == "gemini-3.1-pro-preview" + ) + + +def test_model_run_exposes_explicit_model_run_key_as_name(): + """Use the handwritten model-run key as the compatibility name.""" + from utils.llm import model_registry, model_runs + + run = model_runs.ModelRun( + model_run_key="gemini-3.1-pro-preview", + model=model_registry.MODELS_BY_KEY["gemini-3.1-pro-preview"], + options={ + "candidate_count": 1, + "temperature": 0, + "automatic_function_calling": {"disable": True}, + }, + ) + + assert run.model_run_key == "gemini-3.1-pro-preview" + assert run.name == "gemini-3.1-pro-preview" + + +def test_unknown_option_paths_raise_in_model_run_validation(): + """Reject undeclared model-run options instead of silently naming them.""" + from utils.llm import model_registry, model_runs + + with pytest.raises(ValueError, match="name-relevant or name-neutral"): + model_runs.ModelRun( + model_run_key="gpt-5.5-2026-04-23", + model=model_registry.MODELS_BY_KEY["gpt-5.5-2026-04-23"], + options={"new_performance_option": True}, + ) + + +def test_model_run_routes_provider_model_id_to_get_response(): + """Route model-run calls through the provider model ID and merged options.""" + from utils.llm import 
model_registry, model_runs + + run = model_runs.ModelRun( + model_run_key="deepseek-v3.1", + model=model_registry.MODELS_BY_KEY["deepseek-v3.1"], + options={"temperature": 0}, + ) + + with patch("utils.llm.model_registry.get_response", return_value="forecast") as get_response: + response = run.get_response("prompt", max_tokens=10000) + + assert response == "forecast" + get_response.assert_called_once_with( + provider=PROVIDERS["Together"], + model_id="deepseek-ai/DeepSeek-V3.1", + prompt="prompt", + options={"temperature": 0, "max_tokens": 10000}, + ) + + +def test_shared_model_run_registry_contains_forecastbench_and_timeseriesbench_runs(): + """Expose model runs needed by ForecastBench and TimeSeriesBench.""" + from utils.llm import model_runs + + expected_keys = { + "gpt-4o-mini-2024-07-18", + "gpt-5-nano-2025-08-07", + "gpt-5-mini-2025-08-07", + "gpt-5-mini-2025-08-07-1024", + "gpt-5.2-2025-12-11", + "gpt-5.4-2026-03-05", + "gpt-5.4-2026-03-05-high", + "gpt-5.4-2026-03-05-high-web-search", + "gpt-5.4-mini-2026-03-17", + "gpt-5.4-nano-2026-03-17", + "gpt-5.5-2026-04-23", + "gpt-5.5-2026-04-23-medium", + "gpt-5.5-2026-04-23-high", + "gpt-5.5-2026-04-23-high-web-search", + "deepseek-v3.1", + "deepseek-v4-pro", + "minimax-m2.5", + "minimax-m2.7", + "kimi-k2.5", + "kimi-k2.6", + "glm-5.1", + "gemma-4-31b", + "claude-haiku-4-5-20251001-1024", + "claude-haiku-4-5-20251001-4096", + "claude-sonnet-4-5-20250929-1024", + "claude-sonnet-4-5-20250929-4096", + "claude-sonnet-4-6-1024", + "claude-sonnet-4-6-4096", + "claude-sonnet-4-6-adaptive-thinking-16000", + "claude-opus-4-6-4096", + "claude-opus-4-7-1024", + "claude-opus-4-7-4096", + "claude-opus-4-7-high-16384", + "claude-opus-4-7-adaptive-thinking-high-24000", + "claude-opus-4-7-adaptive-thinking-high-web-search-64000", + "claude-opus-4-7-adaptive-thinking-max-128000", + "claude-opus-4-8-1024", + "claude-opus-4-8-4096", + "claude-opus-4-8-adaptive-thinking-high-24000", + 
"claude-opus-4-8-adaptive-thinking-high-web-search-64000", + "grok-4-1-fast-reasoning", + "grok-4-1-fast-non-reasoning", + "grok-4.20-0309-reasoning", + "grok-4.20-0309-reasoning-web-search-x-search", + "grok-4.20-0309-non-reasoning", + "grok-4.3", + "gemini-2.5-pro", + "gemini-2.5-pro-web-search", + "gemini-3-flash-preview", + "gemini-3.1-flash-lite-preview", + "gemini-3.1-flash-lite", + "gemini-3.1-pro-preview", + "gemini-3.5-flash", + } + + assert expected_keys <= set(model_runs.MODEL_RUNS_BY_KEY) + + +def test_shared_model_run_keys_are_unique_and_file_safe(): + """Keep shared model-run keys unique and safe for filenames.""" + from utils.llm import model_runs + + keys = [run.model_run_key for run in model_runs.MODEL_RUNS] + + assert len(keys) == len(set(keys)) + assert all(key == key.lower() for key in keys) + assert all(" " not in key and "/" not in key and "_" not in key for key in keys) + + +def test_active_model_runs_exclude_runs_for_inactive_models(): + """Keep inactive provider routes in history while excluding them from live runs.""" + from utils.llm import model_runs + + all_keys = {run.model_run_key for run in model_runs.MODEL_RUNS} + active_keys = {run.model_run_key for run in model_runs.ACTIVE_MODEL_RUNS} + + assert "deepseek-v3.1" in all_keys + assert "deepseek-v3.1" not in active_keys + assert all(run.model.active for run in model_runs.ACTIVE_MODEL_RUNS) + assert model_runs.ACTIVE_MODEL_RUNS_BY_KEY == { + run.model_run_key: run for run in model_runs.ACTIVE_MODEL_RUNS + } + + +def test_create_model_runs_list_rejects_duplicate_model_run_keys(): + """Reject duplicate model-run keys when creating the shared run registry list.""" + from utils.llm import model_runs + + run = model_runs.MODEL_RUNS_BY_KEY["gpt-5.5-2026-04-23"] + + with pytest.raises(ValueError, match="Duplicate LLM model_run_key"): + model_runs.create_model_runs_list([run, run]) + + +def test_declared_model_run_options_do_not_share_mutable_objects(): + """Avoid sharing mutable nested option 
objects across declared runs.""" + from utils.llm import model_runs + + preview_run = model_runs.MODEL_RUNS_BY_KEY["gemini-3-flash-preview"] + lite_run = model_runs.MODEL_RUNS_BY_KEY["gemini-3.1-flash-lite-preview"] + + assert preview_run.options is not lite_run.options + assert ( + preview_run.options["automatic_function_calling"] + is not lite_run.options["automatic_function_calling"] + ) + + +def test_release_dates_exist_for_all_shared_models(): + """Expose release dates for every shared canonical model.""" + from utils.llm import model_registry + + release_dates = model_registry.model_release_dates_by_key() + + assert release_dates["gpt-5.5-2026-04-23"] == date(2026, 4, 23) + assert release_dates["deepseek-v3.1"] == date(2025, 8, 21) + assert release_dates["gemini-3.1-flash-lite"] == date(2026, 5, 7) + for model in model_registry.MODELS: + assert release_dates[model.model_key] == model.release_date + + +def test_historical_forecastbench_llm_release_dates_are_available(): + """Keep historical ForecastBench LLM release dates in the model registry.""" + from utils.llm import model_registry + + historical_model_keys = { + "claude-2.1", + "claude-3-5-sonnet-20240620", + "claude-3-5-sonnet-20241022", + "claude-3-7-sonnet-20250219", + "claude-3-haiku-20240307", + "claude-3-opus-20240229", + "claude-opus-4-1-20250805", + "claude-opus-4-20250514", + "claude-opus-4-5-20251101", + "claude-sonnet-4-20250514", + "deepseek-r1", + "deepseek-v3", + "gemini-1.5-flash", + "gemini-1.5-pro", + "gemini-2.0-flash-lite-001", + "gemini-2.5-flash", + "gemini-2.5-flash-preview-04-17", + "gemini-2.5-pro-exp-03-25", + "gemini-2.5-pro-preview-03-25", + "gemini-3-pro-preview", + "glm-4.5-air-fp8", + "glm-4.6", + "glm-4.7", + "glm-5", + "gpt-3.5-turbo-0125", + "gpt-4-0613", + "gpt-4-turbo-2024-04-09", + "gpt-4.1-2025-04-14", + "gpt-4.5-preview-2025-02-27", + "gpt-4o", + "gpt-4o-2024-05-13", + "gpt-4o-2024-11-20", + "gpt-5-2025-08-07", + "gpt-5.1-2025-11-13", + "grok-4-0709", + 
"grok-4-fast-non-reasoning", + "grok-4-fast-reasoning", + "grok-beta", + "kimi-k2-instruct", + "kimi-k2-instruct-0905", + "kimi-k2-thinking", + "llama-2-70b-chat-hf", + "llama-3-70b-chat-hf", + "llama-3-8b-chat-hf", + "llama-3.2-3b-instruct-turbo", + "llama-3.3-70b-instruct-turbo", + "llama-4-maverick-17b-128e-instruct-fp8", + "llama-4-scout-17b-16e-instruct", + "magistral-medium-2506", + "meta-llama-3.1-405b-instruct-turbo", + "mistral-large-2407", + "mistral-large-2411", + "mistral-large-latest", + "mixtral-8x22b-instruct-v0.1", + "mixtral-8x7b-instruct-v0.1", + "o3-2025-04-16", + "o3-mini-2025-01-31", + "o4-mini-2025-04-16", + "qwen1.5-110b-chat", + "qwen2.5-72b-instruct-turbo", + "qwen3-235b-a22b-fp8-tput", + "qwen3-235b-a22b-thinking-2507", + "qwq-32b-preview", + } + release_dates = model_registry.model_release_dates_by_key() + + assert historical_model_keys <= set(model_registry.MODELS_BY_KEY) + assert not hasattr(model_registry, "HISTORICAL_MODEL_RELEASE_DATES_BY_KEY") + assert release_dates["gpt-4-0613"] == date(2023, 6, 13) + assert release_dates["claude-2.1"] == date(2023, 11, 21) + assert release_dates["kimi-k2-instruct"] == date(2025, 7, 12) + assert release_dates["qwen3-235b-a22b-thinking-2507"] == date(2025, 7, 25) + assert all(key in release_dates for key in historical_model_keys) + assert not any(key.startswith("unusedgrok") for key in model_registry.MODELS_BY_KEY) + assert "Always 0" not in release_dates + assert "Naive Forecaster" not in release_dates + + +def test_select_model_runs_preserves_order_and_rejects_unknown_keys(): + """Select shared model runs in requested order and fail on unknown keys.""" + from utils.llm import model_runs + + selected = model_runs.select_model_runs(["gpt-5.4-2026-03-05", "deepseek-v3.1"]) + + assert [run.model_run_key for run in selected] == [ + "gpt-5.4-2026-03-05", + "deepseek-v3.1", + ] + with pytest.raises(KeyError, match="missing-model"): + model_runs.select_model_runs(["missing-model"]) diff --git 
a/tests/unit/test_llm_routing.py b/tests/unit/test_llm_routing.py index eefb7ac..3d0c11e 100644 --- a/tests/unit/test_llm_routing.py +++ b/tests/unit/test_llm_routing.py @@ -1,7 +1,6 @@ """Unit tests for LLM provider routing.""" -from __future__ import annotations - +from datetime import date from types import SimpleNamespace from typing import Any from unittest.mock import MagicMock, patch @@ -18,6 +17,8 @@ def test_labs_have_leaderboard_names(): assert LABS["OpenAI"].leaderboard_name == "OpenAI" assert LABS["Moonshot"].name == "Moonshot" assert LABS["Moonshot"].leaderboard_name == "Moonshot AI" + assert LABS["MiniMax"].name == "MiniMax" + assert LABS["MiniMax"].leaderboard_name == "MiniMax" assert "Google" not in LABS assert LABS["Google DeepMind"].name == "Google DeepMind" assert LABS["Google DeepMind"].leaderboard_name == "Google DeepMind" @@ -73,20 +74,19 @@ def get_response( } -def test_model_get_response_routes_full_name_and_options(): +def test_model_get_response_routes_provider_model_id_and_options(): """Model routing should call providers through the public final interface.""" from utils.llm import model_registry - from utils.llm.model_registry import Model, OpenAIProvider + from utils.llm.model_registry import Model observed: dict[str, Any] = {} options = {"temperature": 0, "max_tokens": 128} model = Model( - id="reasoning-model", - full_name="reasoning-model", - token_limit=128_000, - provider_cls=OpenAIProvider, + model_key="reasoning-model", + provider_model_id="reasoning-model", + provider=PROVIDERS["OpenAI"], lab=LABS["OpenAI"], - reasoning_model=True, + manual_release_date=date(2026, 1, 1), ) class FakeProvider: @@ -107,7 +107,7 @@ def get_response( assert response == "reasoning text" assert observed == { - "model_id": model.full_name, + "model_id": model.provider_model_id, "prompt": "forecast prompt", "options": options, } @@ -248,6 +248,7 @@ def test_anthropic_provider_forwards_options_without_asserting_max_tokens(): prompt="forecast", options={ 
"max_tokens": 16000, + "output_config": {"effort": "max"}, "thinking": {"type": "adaptive"}, "tools": [{"type": "web_search_20250305", "name": "web_search"}], }, @@ -258,6 +259,7 @@ def test_anthropic_provider_forwards_options_without_asserting_max_tokens(): model="claude-opus-4-6", messages=[{"role": "user", "content": "forecast"}], max_tokens=16000, + output_config={"effort": "max"}, thinking={"type": "adaptive"}, tools=[{"type": "web_search_20250305", "name": "web_search"}], ) diff --git a/tests/unit/test_models_dev_metadata.py b/tests/unit/test_models_dev_metadata.py new file mode 100644 index 0000000..236622d --- /dev/null +++ b/tests/unit/test_models_dev_metadata.py @@ -0,0 +1,282 @@ +"""Tests for the checked-in Models.dev metadata snapshot.""" + +import json +import tomllib +from datetime import date +from pathlib import Path + +import pytest + +from scripts import refresh_models_dev_metadata +from utils.llm.metadata import models_dev + +ROOT_DIR = Path(__file__).resolve().parents[2] + + +def test_load_models_dev_snapshot_exposes_provider_and_model_metadata(): + """Load the snapshot and expose normalized provider and model fields.""" + snapshot = models_dev.load_models_dev_snapshot() + + openai = snapshot.providers["openai"] + assert openai.name == "OpenAI" + + gpt_4o = openai.models["gpt-4o"] + assert gpt_4o.id == "gpt-4o" + assert gpt_4o.name == "GPT-4o" + assert gpt_4o.release_date == date(2024, 5, 13) + assert set(gpt_4o.raw) == {"id", "name", "release_date"} + + +def test_models_dev_snapshot_can_lookup_model_by_provider_and_model_id(): + """Look up a normalized model by Models.dev provider and model IDs.""" + snapshot = models_dev.load_models_dev_snapshot() + + model = snapshot.get_model(provider_id="anthropic", model_id="claude-3-haiku-20240307") + + assert model.name == "Claude Haiku 3" + assert model.release_date == date(2024, 3, 13) + + +def test_models_dev_snapshot_rejects_unknown_provider_or_model(): + """Raise clear lookup errors for unknown 
provider or model IDs.""" + snapshot = models_dev.load_models_dev_snapshot() + + try: + snapshot.get_model(provider_id="missing", model_id="gpt-4o") + except KeyError as exc: + assert "Unknown Models.dev provider_id missing" in str(exc) + else: + raise AssertionError("Expected missing provider lookup to fail") + + try: + snapshot.get_model(provider_id="openai", model_id="missing") + except KeyError as exc: + assert "Unknown Models.dev model_id missing for provider_id openai" in str(exc) + else: + raise AssertionError("Expected missing model lookup to fail") + + +def test_models_dev_snapshot_preserves_raw_partial_release_dates(tmp_path): + """Preserve month-only source dates while omitting typed date values.""" + snapshot_path = tmp_path / "models_dev_snapshot.json" + snapshot_path.write_text( + json.dumps( + { + "source": refresh_models_dev_metadata.MODELS_DEV_URL, + "providers": { + "abacus": { + "id": "abacus", + "name": "Abacus", + "models": { + "kimi-k2.5": { + "id": "kimi-k2.5", + "name": "Kimi K2.5", + "release_date": "2026-01", + } + }, + } + }, + } + ) + ) + snapshot = models_dev.load_models_dev_snapshot(snapshot_path) + + model = snapshot.get_model(provider_id="abacus", model_id="kimi-k2.5") + + assert model.release_date is None + assert model.raw["release_date"] == "2026-01" + + +def test_models_dev_snapshot_preserves_raw_invalid_release_dates(tmp_path): + """Preserve invalid source dates while omitting typed date values.""" + snapshot_path = tmp_path / "models_dev_snapshot.json" + snapshot_path.write_text( + json.dumps( + { + "source": refresh_models_dev_metadata.MODELS_DEV_URL, + "providers": { + "scaleway": { + "id": "scaleway", + "name": "Scaleway", + "models": { + "qwen3-embedding-8b": { + "id": "qwen3-embedding-8b", + "name": "Qwen3 Embedding 8B", + "release_date": "2025-25-11", + } + }, + } + }, + } + ) + ) + snapshot = models_dev.load_models_dev_snapshot(snapshot_path) + + model = snapshot.get_model(provider_id="scaleway", 
model_id="qwen3-embedding-8b") + + assert model.release_date is None + assert model.raw["release_date"] == "2025-25-11" + + +def test_read_models_dev_references_from_model_registry_uses_ast(tmp_path): + """Discover Models.dev references from declarations without importing the registry.""" + registry_path = tmp_path / "model_registry.py" + registry_path.write_text(""" +openai_model( + model_key="gpt-test", + models_dev_reference=ModelsDevReference( + provider_id="openai", + model_id="gpt-test", + ), +) +together_model( + model_key="manual-only", + manual_release_date=date(2026, 1, 1), +) +anthropic_model( + model_key="claude-test", + models_dev_reference=ModelsDevReference(provider_id="anthropic", model_id="claude-test"), +) +""") + + references = refresh_models_dev_metadata.read_models_dev_references_from_model_registry( + registry_path + ) + + assert references == frozenset( + { + refresh_models_dev_metadata.ModelsDevReference("anthropic", "claude-test"), + refresh_models_dev_metadata.ModelsDevReference("openai", "gpt-test"), + } + ) + + +def test_normalize_models_dev_api_response_keeps_expected_fields_sorted(): + """Normalize only referenced model release-date fields and sort providers and models.""" + api_response = { + "openai": { + "id": "openai", + "name": "OpenAI", + "models": { + "z-model": { + "id": "z-model", + "name": "Z Model", + "release_date": "2026-01-02", + "last_updated": "2026-01-03", + "limit": {"output": 2, "context": 1}, + "cost": {"input": 1.5}, + "reasoning": True, + "temperature": False, + "tool_call": True, + "ignored": "drop me", + }, + "a-model": { + "id": "a-model", + "name": "A Model", + "release_date": None, + "last_updated": None, + "limit": {}, + "cost": None, + "reasoning": False, + "temperature": True, + "tool_call": False, + }, + "unused-model": { + "id": "unused-model", + "name": "Unused Model", + "release_date": "2026-01-04", + }, + }, + }, + "unused-provider": { + "id": "unused-provider", + "name": "Unused Provider", + 
"models": { + "unused": { + "id": "unused", + "name": "Unused", + "release_date": "2026-01-05", + } + }, + }, + } + + normalized = refresh_models_dev_metadata.normalize_models_dev_api_response( + api_response, + models_dev_references=frozenset( + { + refresh_models_dev_metadata.ModelsDevReference("openai", "z-model"), + refresh_models_dev_metadata.ModelsDevReference("openai", "a-model"), + } + ), + ) + + assert list(normalized["providers"]) == ["openai"] + assert list(normalized["providers"]["openai"]["models"]) == ["a-model", "z-model"] + assert normalized["providers"]["openai"]["models"]["z-model"] == { + "id": "z-model", + "name": "Z Model", + "release_date": "2026-01-02", + } + + +def test_normalize_models_dev_api_response_errors_with_reference_suggestions(): + """Reject incorrect exact references and suggest nearby provider-local candidates.""" + api_response = { + "openai": { + "id": "openai", + "name": "OpenAI", + "models": { + "gpt-5.6-2026-06-01": { + "id": "gpt-5.6-2026-06-01", + "name": "GPT-5.6", + "release_date": "2026-06-01", + }, + "gpt-5.6-chat": { + "id": "gpt-5.6-chat", + "name": "GPT-5.6 Chat", + "release_date": "2026-06-01", + }, + }, + } + } + + with pytest.raises(ValueError) as excinfo: + refresh_models_dev_metadata.normalize_models_dev_api_response( + api_response, + models_dev_references=frozenset( + {refresh_models_dev_metadata.ModelsDevReference("openai", "gpt-5.6")} + ), + ) + + message = str(excinfo.value) + assert "Missing Models.dev reference: openai/gpt-5.6" in message + assert 'openai/gpt-5.6-2026-06-01 name="GPT-5.6"' in message + assert 'openai/gpt-5.6-chat name="GPT-5.6 Chat"' in message + + +def test_checked_in_models_dev_snapshot_contains_only_registry_references(): + """Keep the Models.dev snapshot scoped to the registry references that use it.""" + references = refresh_models_dev_metadata.read_models_dev_references_from_model_registry() + snapshot = json.loads(models_dev.SNAPSHOT_PATH.read_text()) + snapshot_references = 
frozenset( + refresh_models_dev_metadata.ModelsDevReference(provider_id, model_id) + for provider_id, provider_data in snapshot["providers"].items() + for model_id in provider_data["models"] + ) + + assert snapshot_references == references + assert all( + set(model_data) <= {"id", "name", "release_date"} + for provider_data in snapshot["providers"].values() + for model_data in provider_data["models"].values() + ) + + +def test_models_dev_snapshot_is_included_as_package_data(): + """Include the JSON snapshot when utils is installed as a package.""" + pyproject = tomllib.loads((ROOT_DIR / "pyproject.toml").read_text()) + + package_data = pyproject["tool"]["setuptools"]["package-data"] + + assert package_data["utils.llm.metadata"] == ["*.json"] diff --git a/tests/unit/test_project_dependencies.py b/tests/unit/test_project_dependencies.py new file mode 100644 index 0000000..0487de2 --- /dev/null +++ b/tests/unit/test_project_dependencies.py @@ -0,0 +1,66 @@ +"""Tests for package dependency ownership.""" + +import tomllib +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[2] + +SHARED_RUNTIME_DEPENDENCIES = { + "anthropic", + "google-cloud-secret-manager", + "google-cloud-storage", + "google-genai", + "openai", + "python-dotenv", + "together", +} + + +def _requirement_name(requirement: str) -> str: + return ( + requirement.split("==", maxsplit=1)[0] + .split(">=", maxsplit=1)[0] + .split("<", maxsplit=1)[0] + .strip() + ) + + +def test_shared_runtime_dependencies_are_declared_in_pyproject(): + """Keep shared LLM runtime dependencies owned by pyproject metadata.""" + pyproject = tomllib.loads((ROOT / "pyproject.toml").read_text()) + dependency_names = { + _requirement_name(dependency) for dependency in pyproject["project"]["dependencies"] + } + + assert SHARED_RUNTIME_DEPENDENCIES <= dependency_names + + +def test_requirements_txt_delegates_to_pyproject_dev_extra(): + """Keep local requirements install behavior delegated to the dev extra.""" + 
requirements = (ROOT / "requirements.txt").read_text().splitlines() + + assert requirements == [".[dev]"] + + +def test_dev_extra_preserves_requirements_txt_dev_tooling(): + """Keep previous requirements.txt test tooling in the dev extra.""" + pyproject = tomllib.loads((ROOT / "pyproject.toml").read_text()) + dev_dependencies = pyproject["project"]["optional-dependencies"]["dev"] + dev_dependency_names = {_requirement_name(dependency) for dependency in dev_dependencies} + + assert "pytest-xdist" in dev_dependency_names + assert all("==" in dependency for dependency in dev_dependencies) + + +def test_third_party_notices_cover_metadata_sources(): + """Preserve license and attribution notices for checked-in metadata snapshots.""" + notices = (ROOT / "THIRD_PARTY_NOTICES.md").read_text() + pyproject = tomllib.loads((ROOT / "pyproject.toml").read_text()) + + assert "Models.dev" in notices + assert "https://github.com/anomalyco/models.dev" in notices + assert "MIT License" in notices + assert "Copyright (c) 2025 models.dev" in notices + assert "Artificial Analysis" in notices + assert "https://artificialanalysis.ai/" in notices + assert "THIRD_PARTY_NOTICES.md" in pyproject["tool"]["setuptools"]["license-files"] diff --git a/utils/helpers/constants.py b/utils/helpers/constants.py index 5265339..643837f 100644 --- a/utils/helpers/constants.py +++ b/utils/helpers/constants.py @@ -11,6 +11,7 @@ OPENAI_API_KEY_SECRET_NAME: str = "API_KEY_OPENAI" XAI_API_KEY_SECRET_NAME: str = "API_KEY_XAI" TOGETHER_API_KEY_SECRET_NAME: str = "API_KEY_TOGETHERAI" +ARTIFICIAL_ANALYSIS_API_KEY_SECRET_NAME: str = "API_KEY_ARTIFICIAL_ANALYSIS" __all__ = [ "GOOGLE_CLOUD_PROJECT_ENV_VAR", @@ -20,4 +21,5 @@ "OPENAI_API_KEY_SECRET_NAME", "XAI_API_KEY_SECRET_NAME", "TOGETHER_API_KEY_SECRET_NAME", + "ARTIFICIAL_ANALYSIS_API_KEY_SECRET_NAME", ] diff --git a/utils/llm/__init__.py b/utils/llm/__init__.py index 65474d6..e8af157 100644 --- a/utils/llm/__init__.py +++ b/utils/llm/__init__.py @@ -2,7 +2,13 @@ 
from importlib import import_module -__all__ = ["lab_registry", "model_registry", "provider_registry", "providers"] +__all__ = [ + "lab_registry", + "model_registry", + "model_runs", + "provider_registry", + "providers", +] def __getattr__(name: str): diff --git a/utils/llm/artificial_analysis_model_runs.py b/utils/llm/artificial_analysis_model_runs.py new file mode 100644 index 0000000..2dfd09a --- /dev/null +++ b/utils/llm/artificial_analysis_model_runs.py @@ -0,0 +1,38 @@ +"""Artificial Analysis-backed model-run declarations.""" + +from collections.abc import Callable +from typing import Any + +# Every declaration here is intended to be part of MODEL_RUNS. Add a run here +# only after its provider-callable options are ready for benchmark selection. +ARTIFICIAL_ANALYSIS_MODEL_RUN_DECLARATIONS: tuple[dict[str, Any], ...] = ( + { + "model_run_key": "claude-opus-4-7-high-16384", + "model_key": "claude-opus-4-7", + "options": { + "max_tokens": 16384, + "output_config": {"effort": "high"}, + }, + "artificial_analysis_id": "2fa8e143-77a8-4d05-bfa8-d3b54634c00f", + }, + { + "model_run_key": "claude-opus-4-7-adaptive-thinking-max-128000", + "model_key": "claude-opus-4-7", + "options": { + "max_tokens": 128000, + "output_config": {"effort": "max"}, + "thinking": {"type": "adaptive"}, + }, + "artificial_analysis_id": "e9a09db3-8fd6-41dd-ba2f-20e0a2bff7f2", + }, +) + + +def create_artificial_analysis_model_runs( + model_run_factory: Callable[..., Any], +) -> list[Any]: + """Build AA-backed model runs using the main registry's factory.""" + return [ + model_run_factory(**declaration) + for declaration in ARTIFICIAL_ANALYSIS_MODEL_RUN_DECLARATIONS + ] diff --git a/utils/llm/lab_registry.py b/utils/llm/lab_registry.py index dc7bd69..e74c5aa 100644 --- a/utils/llm/lab_registry.py +++ b/utils/llm/lab_registry.py @@ -23,8 +23,10 @@ def leaderboard_name(self) -> str: "Anthropic": Lab(name="Anthropic"), "DeepSeek": Lab(name="DeepSeek"), "Moonshot": Lab(name="Moonshot", 
display_name="Moonshot AI"), + "MiniMax": Lab(name="MiniMax"), "Google DeepMind": Lab(name="Google DeepMind"), "Meta": Lab(name="Meta"), + "Mistral AI": Lab(name="Mistral AI"), "OpenAI": Lab(name="OpenAI"), "Qwen": Lab(name="Qwen"), "xAI": Lab(name="xAI"), diff --git a/utils/llm/metadata/__init__.py b/utils/llm/metadata/__init__.py new file mode 100644 index 0000000..bb5319d --- /dev/null +++ b/utils/llm/metadata/__init__.py @@ -0,0 +1 @@ +"""LLM metadata snapshots and loaders.""" diff --git a/utils/llm/metadata/artificial_analysis.py b/utils/llm/metadata/artificial_analysis.py new file mode 100644 index 0000000..0cdbbcf --- /dev/null +++ b/utils/llm/metadata/artificial_analysis.py @@ -0,0 +1,63 @@ +"""Loader for the checked-in Artificial Analysis metadata snapshot.""" + +import json +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import Any + +SNAPSHOT_PATH = Path(__file__).with_name("artificial_analysis_snapshot.json") + + +@dataclass(frozen=True, slots=True) +class ArtificialAnalysisModel: + """Normalized metadata for one Artificial Analysis LLM model entry.""" + + id: str + name: str + + +@dataclass(frozen=True, slots=True) +class ArtificialAnalysisSnapshot: + """Loaded Artificial Analysis metadata indexed by stable model ID.""" + + models: dict[str, ArtificialAnalysisModel] + source: str + prompt_options: dict[str, Any] + + def get_model(self, model_id: str) -> ArtificialAnalysisModel: + """Return a model by Artificial Analysis stable model ID.""" + try: + return self.models[model_id] + except KeyError as exc: + raise KeyError(f"Unknown Artificial Analysis model_id {model_id}") from exc + + +def _model_from_json(data: dict[str, Any]) -> ArtificialAnalysisModel: + """Build normalized model metadata from snapshot JSON.""" + return ArtificialAnalysisModel( + id=data["id"], + name=data["name"], + ) + + +def _models_from_snapshot_json(data: dict[str, Any]) -> dict[str, ArtificialAnalysisModel]: + """Build 
model metadata from current or legacy snapshot JSON.""" + if "data" in data: + return {model_data["id"]: _model_from_json(model_data) for model_data in data["data"]} + return { + model_id: _model_from_json(model_data) for model_id, model_data in data["models"].items() + } + + +@lru_cache(maxsize=1) +def load_artificial_analysis_snapshot( + path: Path = SNAPSHOT_PATH, +) -> ArtificialAnalysisSnapshot: + """Load the checked-in Artificial Analysis metadata snapshot.""" + data = json.loads(path.read_text()) + return ArtificialAnalysisSnapshot( + models=_models_from_snapshot_json(data), + source=data.get("source", ""), + prompt_options=data.get("prompt_options") or {}, + ) diff --git a/utils/llm/metadata/artificial_analysis_snapshot.json b/utils/llm/metadata/artificial_analysis_snapshot.json new file mode 100644 index 0000000..71b2a1d --- /dev/null +++ b/utils/llm/metadata/artificial_analysis_snapshot.json @@ -0,0 +1,2109 @@ +{ + "data": [ + { + "id": "0081ab31-d10a-44a0-a10d-eee5533fec65", + "name": "GLM-4.5V (Non-reasoning)" + }, + { + "id": "0097ebf5-124f-42f6-9463-33b00e711f03", + "name": "Gemini 3.5 Flash (high)" + }, + { + "id": "00f1248e-78e3-4230-8dc8-5e13ba8645e2", + "name": "MiMo-V2.5-Pro" + }, + { + "id": "0121d27b-5b8a-4901-8684-80589cc6d40d", + "name": "JT-35B-Flash" + }, + { + "id": "016d330a-2141-4afa-b2fc-62b314423dc1", + "name": "Mercury 2" + }, + { + "id": "0179b427-93dc-415c-bb4c-f980ddf8d088", + "name": "Qwen3.5 Omni Plus" + }, + { + "id": "018c60e8-e908-431a-ba57-c840b1df3987", + "name": "Nova 2.0 Omni (medium)" + }, + { + "id": "019e86f6-e66b-42d8-8a50-235a06b53003", + "name": "GPT-5.2 Codex (xhigh)" + }, + { + "id": "021b1b31-d2fc-4653-ab74-c10bd2f41c8e", + "name": "Qwen3.5 122B A10B (Non-reasoning)" + }, + { + "id": "033ade17-d9ec-44e0-b792-b5f1fcd5ab4c", + "name": "Gemini 3.5 Flash (minimal)" + }, + { + "id": "033e4aa9-a556-4224-87b0-341ed1070257", + "name": "Claude 3.5 Haiku" + }, + { + "id": "037dec2f-51e8-4127-a1f1-85155dae7a1d", + "name": 
"GPT-3.5 Turbo" + }, + { + "id": "0399e614-5d46-484f-9183-e4f32d74e1c6", + "name": "Gemini 2.0 Flash Thinking Experimental (Jan '25)" + }, + { + "id": "04586102-6a28-48f8-a82e-85775d7ed779", + "name": "Qwen2.5 Coder Instruct 32B" + }, + { + "id": "04751bd4-0c5d-416b-a5f2-83727c5bfcda", + "name": "Sonar Reasoning" + }, + { + "id": "04781a0e-40f0-4e2a-a4e5-18e389364a79", + "name": "Gemma 3 270M" + }, + { + "id": "04787c2b-0751-4269-8029-075b727d7aed", + "name": "Grok 4.20 0309 (Reasoning)" + }, + { + "id": "04d023f3-025c-4d78-9571-53edda3eaf2a", + "name": "GPT-5.1 Codex (high)" + }, + { + "id": "05a32e26-e609-4377-951b-8fa23d329926", + "name": "Mistral Medium 3.1" + }, + { + "id": "05e45a36-b5c6-47a1-8adb-9ddc19add5b3", + "name": "GPT-5 nano (minimal)" + }, + { + "id": "073b5329-c4b3-4f1f-8f97-4753aadf4398", + "name": "Gemini 2.5 Pro Preview (May' 25)" + }, + { + "id": "076f2674-bc4b-4925-be59-50832eb8c090", + "name": "o3-mini (high)" + }, + { + "id": "078f4dc8-5350-40a2-a5ea-e8359f795b70", + "name": "o1-preview" + }, + { + "id": "07c35e00-2b12-44c8-91cc-408629cd569e", + "name": "DeepSeek V3.2 Exp (Non-reasoning)" + }, + { + "id": "093883ed-f5fc-443b-8e18-afbfb166699e", + "name": "Qwen3 Coder 480B A35B Instruct" + }, + { + "id": "0985ada8-2ed8-404d-bd8b-7357666ce40f", + "name": "Qwen3.5 2B (Non-reasoning)" + }, + { + "id": "09f43999-b67b-4c1b-b050-44df41ed7e62", + "name": "Devstral 2" + }, + { + "id": "0a603978-03b9-4f47-a273-2f7fd969be85", + "name": "Claude 3.5 Sonnet (Oct '24)" + }, + { + "id": "0a7dda4d-cc9c-4a90-abc1-abb5772c901b", + "name": "DeepSeek V3.1 Terminus (Reasoning)" + }, + { + "id": "0b226b82-1462-4860-bf1a-f8aed7024791", + "name": "Qwen3 Omni 30B A3B Instruct" + }, + { + "id": "0d94dc87-12c8-4d4a-8d99-804ce3f17bc2", + "name": "Nova 2.0 Pro Preview (medium)" + }, + { + "id": "0de67206-4d36-4d10-b8f6-cf37fa747a03", + "name": "Kimi K2.6" + }, + { + "id": "0e34f05c-387e-4968-be15-ccec4a55d8c1", + "name": "DeepSeek R1 (Jan '25)" + }, + { + "id": 
"0e49fe2d-dd3c-4ae5-b56f-a1c89e14b89e", + "name": "DeepSeek R1 Distill Llama 8B" + }, + { + "id": "0e5f6140-1154-4583-a3e0-8c032a338892", + "name": "Qwen3 0.6B (Non-reasoning)" + }, + { + "id": "0e66bae9-41f1-42fc-9276-ce8cb6f72919", + "name": "Qwen3.5 397B A17B (Reasoning)" + }, + { + "id": "0faadeeb-320c-45cf-9c76-5f8768f342e6", + "name": "LFM2 1.2B" + }, + { + "id": "0fc6308e-fbd2-42d3-a216-06da3c43e34e", + "name": "Kimi K2.6 (Non-reasoning)" + }, + { + "id": "0fec07d5-a9b2-407a-b5f8-5bf10bd86b59", + "name": "Magistral Small 1" + }, + { + "id": "1299b9a8-af50-4742-a58b-24ff7eb48f9f", + "name": "LFM 40B" + }, + { + "id": "12adec16-19fe-4d92-aeff-5ef3eb7e780a", + "name": "MiniMax-M2.5" + }, + { + "id": "12f6a061-0ab3-4c76-b225-49abee253651", + "name": "Mistral Small 4 (Non-reasoning)" + }, + { + "id": "13358187-4584-479c-ab43-5bcdf8f297a4", + "name": "Claude 3.7 Sonnet (Reasoning)" + }, + { + "id": "1479f50b-d37f-4b55-bb8b-4212a15042eb", + "name": "MiMo-V2-Flash (Feb 2026)" + }, + { + "id": "149096f3-57b7-4413-80c2-a2c010a2995a", + "name": "GLM-5-Turbo" + }, + { + "id": "15b56b8e-7b93-4ed9-ac06-75c922e3b86e", + "name": "Ling-1T" + }, + { + "id": "16149b9c-a1e9-4669-a5cb-ff3c00d78f89", + "name": "gpt-oss-20B (low)" + }, + { + "id": "169e47f5-3d4d-4ad4-8f8b-ab46f0c73f67", + "name": "Qwen3.5 27B (Reasoning)" + }, + { + "id": "16c5b637-8bce-4252-81f2-1b87a36a4e4c", + "name": "o3" + }, + { + "id": "16f2578b-1b28-4be3-b371-700c2677bcd6", + "name": "Gemini 1.5 Pro (May '24)" + }, + { + "id": "191a2097-cce3-49cf-881e-0c790892059f", + "name": "Qwen3 4B (Reasoning)" + }, + { + "id": "198b717f-42c8-4ab7-a699-ae9373d669d3", + "name": "DeepSeek V3.1 (Reasoning)" + }, + { + "id": "1a8ba535-df18-459b-ad40-3199191296d7", + "name": "Llama 3.3 Nemotron Super 49B v1 (Reasoning)" + }, + { + "id": "1aa3694e-b656-4dbe-8f84-0c65d8897abb", + "name": "Step 3.5 Flash 2603" + }, + { + "id": "1b05e346-e86a-4a20-8feb-7da8c65a99aa", + "name": "Mistral Large 2 (Jul '24)" + }, + { + "id": 
"1cb96708-f6c1-4668-9804-5db6ecac01ed", + "name": "DBRX Instruct" + }, + { + "id": "1cf439b8-0cfd-47b2-9de2-9a2157e6762b", + "name": "GLM-4.5 (Reasoning)" + }, + { + "id": "1d0db5a3-3132-4213-a94b-c2e395d08283", + "name": "Qwen2.5 Instruct 32B" + }, + { + "id": "1d81aa1c-64c8-442a-9c41-81b37e407b91", + "name": "Gemini 2.5 Flash-Lite (Non-reasoning)" + }, + { + "id": "1dcea4f7-7e8b-49f8-abe2-5860ff9f349e", + "name": "Hermes 3 - Llama-3.1 70B" + }, + { + "id": "1edc272c-d799-44d6-909a-bf3c1909a3a0", + "name": "Sonar Reasoning Pro" + }, + { + "id": "1f054429-397e-4fdb-9e71-67bc92c1735e", + "name": "GPT-5.5 (xhigh)" + }, + { + "id": "1f05af98-1ec6-4506-a0b8-57a8c9b63878", + "name": "Mistral Medium" + }, + { + "id": "1f6478c9-3e22-4586-adbe-841782859677", + "name": "Nova 2.0 Omni (Non-reasoning)" + }, + { + "id": "1fc32894-1060-493b-af94-62bb1068555e", + "name": "Granite 4.0 Micro" + }, + { + "id": "1fc54cef-d179-48b1-a27d-046874e9b208", + "name": "Claude 3 Haiku" + }, + { + "id": "20da3b31-fc0a-4359-abec-d59367bf1d9f", + "name": "Qwen3.5 9B (Non-reasoning)" + }, + { + "id": "217b34ec-5920-4fc1-8886-6a70a324837d", + "name": "Mistral 7B Instruct" + }, + { + "id": "219ed587-60c5-4a48-9517-8480e08d0ca1", + "name": "Gemini 2.5 Flash (Reasoning)" + }, + { + "id": "222fb320-6e55-4672-846a-b6d5a24a45f4", + "name": "Gemma 3 4B Instruct" + }, + { + "id": "2236df45-0699-40d1-b5cc-69ee345d2257", + "name": "Qwen3.5 122B A10B (Reasoning)" + }, + { + "id": "22d09131-343b-4adf-8760-533e20a2155f", + "name": "MiMo-V2.5" + }, + { + "id": "23149f9b-c904-43e2-9ec4-afa2bf843941", + "name": "Grok 4.1 Fast (Reasoning)" + }, + { + "id": "235060f4-057d-4bd1-8b8e-4a92908c770e", + "name": "Hermes 4 - Llama-3.1 70B (Non-reasoning)" + }, + { + "id": "23b379f7-18df-492a-9fc1-a56c5a5b9cfc", + "name": "NVIDIA Nemotron 3 Nano 30B A3B (Non-reasoning)" + }, + { + "id": "2443ac9e-a3db-423d-accb-8963f6fb0a53", + "name": "Grok 3" + }, + { + "id": "248deb0d-426c-4fa9-86fa-bc60aa9c3719", + "name": "GLM 5V 
Turbo (Reasoning)" + }, + { + "id": "24ac5b00-5f03-4c47-8e37-522d1195383e", + "name": "Mistral Small 3.1" + }, + { + "id": "2660d74f-ce79-48a8-8b53-6e668e2071a2", + "name": "Claude Opus 4.5 (Reasoning)" + }, + { + "id": "2698f6c6-e436-47ce-a583-dbc25596c571", + "name": "Qwen3 Next 80B A3B Instruct" + }, + { + "id": "26c0b5df-efa7-470f-a65e-2d883329e493", + "name": "Llama Nemotron Super 49B v1.5 (Non-reasoning)" + }, + { + "id": "27202e5f-c82d-4710-92e9-4317877d4883", + "name": "Gemini 2.5 Pro" + }, + { + "id": "272ff333-442f-4169-a804-ac9177bc99d7", + "name": "MiniMax-M2.1" + }, + { + "id": "291a510a-dcc0-40df-8a80-b3aa31900a6c", + "name": "Grok 2 (Dec '24)" + }, + { + "id": "296ace9b-0815-43b2-bafa-fd6cec5cce36", + "name": "MiMo-V2-Omni-0327" + }, + { + "id": "29855680-7469-43eb-8b88-cd3fb1d99da3", + "name": "GPT-5 mini (high)" + }, + { + "id": "2aacdc07-5f4e-4ab9-8ea5-5f7ab93f9eeb", + "name": "Qwen3 4B 2507 (Reasoning)" + }, + { + "id": "2ac96b67-f4f8-4c8c-ac08-c7510faa7bb9", + "name": "Gemini 1.5 Pro (Sep '24)" + }, + { + "id": "2ae624ca-25b4-4cc8-8970-cdfdd3320691", + "name": "Gemini 1.0 Ultra" + }, + { + "id": "2bb84433-f38e-4edc-9b65-4d7b1f473db9", + "name": "Qwen3 1.7B (Non-reasoning)" + }, + { + "id": "2bfdd17a-e027-4068-a54e-b0e90a6df118", + "name": "Gemma 3 27B Instruct" + }, + { + "id": "2c4394a2-b443-470a-908e-5c4a271b780c", + "name": "GLM-4.7-Flash (Reasoning)" + }, + { + "id": "2cd04201-2b6e-47ef-853e-7601f705f2a8", + "name": "Phi-4 Multimodal Instruct" + }, + { + "id": "2cff73da-4855-403c-afc9-5540feadcc15", + "name": "Granite 3.3 8B (Non-reasoning)" + }, + { + "id": "2d19c2d1-062d-436e-b2c2-3d3ecad34acc", + "name": "DeepSeek-V2-Chat" + }, + { + "id": "2d28a13a-096e-475a-beb8-26bbd1c7d51c", + "name": "Qwen3.5 9B (Reasoning)" + }, + { + "id": "2dad8957-4c16-4e74-bf2d-8b21514e0ae9", + "name": "o3-mini" + }, + { + "id": "2dbb6dc7-8c40-4b6d-af9c-cf805f83b79a", + "name": "Grok 4 Fast (Non-reasoning)" + }, + { + "id": 
"2e40e695-3cec-43da-83f9-615af30b8e91", + "name": "Claude Sonnet 4.6 (Non-reasoning, High Effort)" + }, + { + "id": "2e46d2fd-eb2b-42b9-9fe4-be50630fe870", + "name": "Ling-2.6-1T" + }, + { + "id": "2e6400f5-85ca-4ebc-ba8f-c2811a631138", + "name": "Gemma 3 12B Instruct" + }, + { + "id": "2e8694f9-7782-47a6-a6ba-fdce89d939c8", + "name": "NVIDIA Nemotron Nano 9B V2 (Non-reasoning)" + }, + { + "id": "2e9ff877-fd2c-4ce7-b631-7ca1bdb6d13e", + "name": "Command A+" + }, + { + "id": "2f60d80d-c3d3-4a43-bded-0557898c4618", + "name": "EXAONE 4.0 32B (Non-reasoning)" + }, + { + "id": "2fa8e143-77a8-4d05-bfa8-d3b54634c00f", + "name": "Claude Opus 4.7 (Non-reasoning, High Effort)" + }, + { + "id": "3068def4-7270-4c06-a320-6f6a5623d564", + "name": "GLM-4.5V (Reasoning)" + }, + { + "id": "30c9ba61-d0a1-4794-938e-35865f379d15", + "name": "Qwen3.5 4B (Non-reasoning)" + }, + { + "id": "30ef2a79-e800-4165-9f13-2a338f120db7", + "name": "Qwen3.5 397B A17B (Non-reasoning)" + }, + { + "id": "3373245b-e6dc-4b66-a7b0-3f06f9b7bd46", + "name": "Qwen3 235B A22B 2507 Instruct" + }, + { + "id": "338216fb-62c2-48f1-898a-9166d12fb35e", + "name": "Olmo 3 7B Think" + }, + { + "id": "339a92c1-8a42-417f-8d1f-cdbc605acd9e", + "name": "HyperCLOVA X SEED Think (32B)" + }, + { + "id": "3435db18-9227-45a5-8e79-9546b14b5aaa", + "name": "Sonar Pro" + }, + { + "id": "344c6718-c573-41d4-9556-10287a3fa1fc", + "name": "Nova Micro" + }, + { + "id": "34ef2b5c-3df3-437e-b9c5-81346f8c14a8", + "name": "Sarvam M (Reasoning)" + }, + { + "id": "352f834f-a03c-4117-8a29-c3ccd8a568ce", + "name": "Qwen2.5 Turbo" + }, + { + "id": "3538d399-1b3f-455d-9b13-1d8f9fee26c8", + "name": "EXAONE 4.5 33B (Non-reasoning)" + }, + { + "id": "35d602fc-b8b8-4698-9f4d-f2ce11ca50e4", + "name": "Mistral Small (Feb '24)" + }, + { + "id": "369329e4-629f-425d-975a-e8980aec2965", + "name": "INTELLECT-3" + }, + { + "id": "36f73aaf-d38a-4b56-a2b3-d04d17186910", + "name": "gpt-oss-20B (high)" + }, + { + "id": "385376b1-9815-47dd-83cc-85aac34f247d", 
+ "name": "MiniMax M1 40k" + }, + { + "id": "392063ba-c3b5-47e8-ba67-a7b0b34f6824", + "name": "GPT-5.4 mini (medium)" + }, + { + "id": "39b64e04-7a69-4aa2-9e2e-fe38c24681ec", + "name": "Olmo 3.1 32B Think" + }, + { + "id": "3b156101-b0d7-4438-b350-2d1f1168f40a", + "name": "Qwen3.6 27B (Non-reasoning)" + }, + { + "id": "3b608b70-6434-4baa-99ad-45d499703c67", + "name": "GPT-4.1" + }, + { + "id": "3bc32f13-5afa-4e28-bce1-10e57376686b", + "name": "Command A" + }, + { + "id": "3c5289e5-1c62-434c-bc44-c51c39f640a1", + "name": "LFM2.5-VL-1.6B" + }, + { + "id": "3cf875b8-b6b5-42c0-ad70-617d5be59d00", + "name": "Qwen3 VL 8B Instruct" + }, + { + "id": "3d4e7366-928c-4eff-a8b0-2919c7d334c9", + "name": "JT-MINI" + }, + { + "id": "3d64bf83-232e-427e-8590-26b478bae4a8", + "name": "Ling-mini-2.0" + }, + { + "id": "3de55b83-e02b-412e-8211-315bbebe3e94", + "name": "Gemini 2.0 Flash-Lite (Preview)" + }, + { + "id": "3e6cf518-a1f4-42d3-8fcf-827c9bd8e6d5", + "name": "Qwen3 30B A3B (Reasoning)" + }, + { + "id": "3edcb2ed-6981-4f88-a556-563f7f8f00aa", + "name": "Mixtral 8x7B Instruct" + }, + { + "id": "3ef6db79-1dfa-4780-8c4b-affe2740d9ac", + "name": "Molmo2-8B" + }, + { + "id": "3fd96175-4ef1-434c-8795-f873aec2abc1", + "name": "Mistral Small 4 (Reasoning)" + }, + { + "id": "405e2235-0925-4634-a3c7-fbd5f6394bc0", + "name": "Qwen3.5 0.8B (Non-reasoning)" + }, + { + "id": "40663ad2-b218-471e-bdd4-a1e0c2360e2b", + "name": "GLM-5 (Reasoning)" + }, + { + "id": "4077490a-bbfb-404e-979a-a97a20e3b5de", + "name": "Claude Opus 4.5 (Non-reasoning)" + }, + { + "id": "41f73c27-880c-4f30-8b07-9999ce89a4ae", + "name": "Qwen Chat 72B" + }, + { + "id": "41faf421-118b-465b-b170-d200776580d1", + "name": "Gemini 1.5 Flash (Sep '24)" + }, + { + "id": "43098bd0-77ca-408b-b698-9d60b1d1c3b8", + "name": "GLM-4.6V (Non-reasoning)" + }, + { + "id": "432d6c36-8825-47f3-b4eb-58529cea346b", + "name": "Solar Pro 2 (Preview) (Non-reasoning)" + }, + { + "id": "433e410e-5170-4f03-b92f-7927c220b2fe", + "name": "MiniCPM-V 
4.6 1.3B" + }, + { + "id": "4343afb1-c928-44c9-92e2-68fa1195b6f5", + "name": "GPT-4o mini Realtime (Dec '24)" + }, + { + "id": "43573c57-2403-46fb-af4b-a93de9a0c3f5", + "name": "Qwen3 235B A22B (Non-reasoning)" + }, + { + "id": "4386585e-71b4-4a0c-8a63-afb333419cd6", + "name": "Claude Opus 4.6 (Non-reasoning, High Effort)" + }, + { + "id": "43da3718-3d6e-40dd-901a-05664179ff7f", + "name": "Mistral Small 3.2" + }, + { + "id": "43fc5506-c5ed-4dee-9b85-962bf7ae3986", + "name": "DeepSeek V3 (Dec '24)" + }, + { + "id": "441734a9-8901-4850-9bae-b474c370291f", + "name": "Kimi K2" + }, + { + "id": "444cdb1e-bab8-42cd-938c-b2d7a93e2da1", + "name": "DeepSeek R1 Distill Qwen 1.5B" + }, + { + "id": "44b19b51-5367-4ef9-a2ff-2f90b89a0867", + "name": "EXAONE 4.0 32B (Reasoning)" + }, + { + "id": "44db6283-aa82-4799-af4a-679fe0530845", + "name": "Solar Pro 2 (Non-reasoning)" + }, + { + "id": "4559e9f0-8aad-4681-89fb-68cb915e0f16", + "name": "Qwen3 14B (Reasoning)" + }, + { + "id": "45790612-02e3-4c42-b5bd-cd7ed2ea1f2f", + "name": "Sarvam 105B (high)" + }, + { + "id": "45c87531-2d57-48e0-8012-202cd636189e", + "name": "Llama 3.1 Instruct 405B" + }, + { + "id": "466aecdb-3d96-4191-bc52-b3366db38851", + "name": "Llama 3.1 Instruct 70B" + }, + { + "id": "46d8315e-1630-463f-ab62-84185fa0faab", + "name": "Qwen3.5 35B A3B (Reasoning)" + }, + { + "id": "4764d31d-f4af-4297-8bd1-e993f26bcb64", + "name": "MiMo-V2.5-Pro (Non-reasoning)" + }, + { + "id": "47b7df55-5804-40de-ba11-317de786710a", + "name": "Ring-flash-2.0" + }, + { + "id": "48194e0f-8226-4c57-8cb2-2a0fb68a84c9", + "name": "LFM2.5-1.2B-Instruct" + }, + { + "id": "48e50f00-1fd1-4acc-b337-61078aa341e6", + "name": "GPT-5 (high)" + }, + { + "id": "4928e950-7f37-4475-b0dc-c5bad781a321", + "name": "Mistral Large 3" + }, + { + "id": "493f6a1e-7717-4e98-9d6f-548b92c4702d", + "name": "GPT-5.4 (Non-reasoning)" + }, + { + "id": "498862c3-f9ac-49d2-852f-16a02bb0c38f", + "name": "GPT-5.2 (xhigh)" + }, + { + "id": 
"49e70a38-4ac1-4659-b490-09b2c7ff21d6", + "name": "Apertus 70B Instruct" + }, + { + "id": "49fd01f9-887d-4479-b8ce-771a81ecef4e", + "name": "Grok 4.1 Fast (Non-reasoning)" + }, + { + "id": "4a845d7b-a52d-43bb-80b7-b58c7a0c155e", + "name": "DeepSeek R1 Distill Llama 70B" + }, + { + "id": "4ae6c88d-9e4a-4850-89fe-18a1c04a66cc", + "name": "Qwen3 0.6B (Reasoning)" + }, + { + "id": "4bbceacb-cf47-464b-b60f-e1d1fe016d67", + "name": "MiniMax-M2.7" + }, + { + "id": "4c111fbc-d13a-42b4-858c-1dc17fe3c1d1", + "name": "Grok-1" + }, + { + "id": "4d6dd5ce-08cb-4e87-9288-1dd2f022aa35", + "name": "Doubao Seed Code" + }, + { + "id": "4dc12a38-b18f-4c43-8e1b-678f8434b5b1", + "name": "GPT-5.1 (high)" + }, + { + "id": "5016ea75-7b0e-4737-a7e6-1062c6d90fd4", + "name": "Gemini 3.5 Flash (medium)" + }, + { + "id": "504412c2-2ada-499b-aebf-7e0a35c9d286", + "name": "Claude 4 Opus (Non-reasoning)" + }, + { + "id": "508943e4-e9e1-4d10-9c0e-a7650e9d7315", + "name": "EXAONE 4.5 33B" + }, + { + "id": "509e94e3-f1cb-43fb-98ff-e0e9872cfd1f", + "name": "Qwen3.5 27B (Non-reasoning)" + }, + { + "id": "50f92d5f-f413-4c97-8dab-331101622a28", + "name": "Mistral Large 2 (Nov '24)" + }, + { + "id": "515852e7-ba9c-4571-8cf9-82ad6b45f22f", + "name": "PALM-2" + }, + { + "id": "51d0b717-953d-4b44-af61-406c6b7dff39", + "name": "Qwen3 VL 30B A3B Instruct" + }, + { + "id": "523125f4-a1da-4990-9abd-dd08a069100e", + "name": "Grok 4.20 0309 (Non-reasoning)" + }, + { + "id": "527e943a-adc6-4e69-93af-d1608e1b5fed", + "name": "DeepSeek V3.2 Speciale" + }, + { + "id": "5303601c-8133-4f52-bc4e-5241ee6b3c10", + "name": "Gemma 4 E4B (Non-reasoning)" + }, + { + "id": "538e945c-6c27-4fd3-995d-ded80a36cd10", + "name": "GPT-5.4 (low)" + }, + { + "id": "53c98840-47af-49aa-94e6-469fb17e9a1b", + "name": "Claude Opus 4.6 (Adaptive Reasoning, Max Effort)" + }, + { + "id": "540ebc58-a2d8-4dc9-ba6b-973efa52fab1", + "name": "Jamba 1.5 Mini" + }, + { + "id": "54442579-2a4d-40cc-b264-bc4ff29e311a", + "name": "OLMo 2 32B" + }, + { + 
"id": "546ec53f-273c-4af7-b13f-b88c41f45905", + "name": "Nova Pro" + }, + { + "id": "54c7f3fc-7078-442a-b472-e8691257a88c", + "name": "Granite 4.0 350M" + }, + { + "id": "55a3ebf6-6117-4cc1-8596-c6de6e552fd4", + "name": "Gemini 2.5 Flash Preview (Non-reasoning)" + }, + { + "id": "573bbd93-114c-4b71-9ede-a73a7d4bdf84", + "name": "Grok 4 Fast (Reasoning)" + }, + { + "id": "575498d6-60ec-466b-9372-fea19911fd07", + "name": "GPT-4o (March 2025, chatgpt-4o-latest)" + }, + { + "id": "5962d643-0a6f-4630-bb08-ab5720d80056", + "name": "Qwen3 1.7B (Reasoning)" + }, + { + "id": "598190f8-dc9c-4fea-a7ea-4b81c402ab18", + "name": "Gemini 3.1 Flash-Lite" + }, + { + "id": "598de97d-029e-47b6-96ec-dbc1e0f9045a", + "name": "Kimi Linear 48B A3B Instruct" + }, + { + "id": "599da8e0-bd9c-4b38-a127-b50e371fbcf8", + "name": "Llama 2 Chat 70B" + }, + { + "id": "59a1bb20-9170-4dc2-ba9c-12d326cf068e", + "name": "Solar Open 100B (Reasoning)" + }, + { + "id": "59b5b14b-5365-4ee7-824a-18a8e6309644", + "name": "GPT-5.3 Codex (xhigh)" + }, + { + "id": "59e22326-1bca-4432-a5fa-147fbe8854e7", + "name": "Mistral Medium 3" + }, + { + "id": "5a088cde-18e2-4dfa-98dd-d283e1c19654", + "name": "LFM2.5-1.2B-Thinking" + }, + { + "id": "5a49ef80-3af5-404b-8ac0-e1b230ae95de", + "name": "K2 Think V2" + }, + { + "id": "5aa1c578-af76-4b91-8699-cdd43582b3af", + "name": "GLM-5.1 (Reasoning)" + }, + { + "id": "5ad2f60f-ee05-49fd-85a0-cef69aa7cb7b", + "name": "o1" + }, + { + "id": "5b2beb12-81a9-47a1-8a2a-d0a727185b50", + "name": "Qwen3 Max (Preview)" + }, + { + "id": "5bb1f426-2d64-4d03-99fb-8041ee85c33b", + "name": "GPT-5.4 Pro (xhigh)" + }, + { + "id": "5c3dd927-48a3-4f3c-8045-9b135f62dfbb", + "name": "Nova Lite" + }, + { + "id": "5c6533f3-75a2-4109-b9a9-3623afc6b86a", + "name": "Seed-OSS-36B-Instruct" + }, + { + "id": "5ce30d25-5353-45bb-bef9-6b87480ba3a2", + "name": "GPT-4.5 (Preview)" + }, + { + "id": "5d11e7a1-4f70-4e5a-9364-e193761d6757", + "name": "GPT-5 Codex (high)" + }, + { + "id": 
"5d303dc9-c027-401f-9803-4e9aa3331007", + "name": "GLM-4.5-Air" + }, + { + "id": "5d4acc80-7a88-4e84-bfe7-99071b84e6a4", + "name": "Qwen3.5 2B (Reasoning)" + }, + { + "id": "5d8183dc-24f4-46c5-a1d0-d937de149364", + "name": "MiMo-V2-Pro" + }, + { + "id": "5d891609-fe1c-4e8d-b9f0-5b0ff6d9f439", + "name": "Tri-21B-Think" + }, + { + "id": "5da3c0e2-65d2-4bff-a410-cb2132ddafb6", + "name": "DeepSeek V4 Pro (Reasoning, High Effort)" + }, + { + "id": "5dba8d07-9992-483c-81db-dac97cb15ba8", + "name": "Granite 4.0 H Small" + }, + { + "id": "5e0164b3-d902-4bcb-a1b2-83b4f4cd6143", + "name": "Qwen3 30B A3B 2507 (Reasoning)" + }, + { + "id": "5e4e4590-a77e-4b66-95f8-f3960a1a7c68", + "name": "Mistral Large (Feb '24)" + }, + { + "id": "5e8b0d98-a3b4-42b5-93d8-ecb748788754", + "name": "Grok 4.3 (medium)" + }, + { + "id": "5e965af0-ca5c-4f47-9ba9-06000508b84a", + "name": "GPT-5 (medium)" + }, + { + "id": "5ea94a4a-55ac-4ea1-8898-2b3971e94af6", + "name": "Grok 4" + }, + { + "id": "5ec4a0db-da66-4e46-9682-fceeed755ef8", + "name": "Sonar" + }, + { + "id": "5fb47ff6-a30e-4c2c-96f2-55e95a13390f", + "name": "Llama 3.2 Instruct 11B (Vision)" + }, + { + "id": "6000145b-0e3d-4fef-a55f-bcaac84803b2", + "name": "DeepSeek R1 0528 Qwen3 8B" + }, + { + "id": "6000692c-f9a6-47f8-a5c0-e0874ac488bb", + "name": "Qwen3.6 Max Preview" + }, + { + "id": "6056731b-c705-455b-aa0d-43cbf29b1054", + "name": "LongCat Flash Lite" + }, + { + "id": "60cff809-05fb-4439-afe7-8b1476439f49", + "name": "Tri-21B-think Preview" + }, + { + "id": "61bd9367-e520-4ec8-989e-5fbf50e61610", + "name": "Nova 2.0 Lite (high)" + }, + { + "id": "62de31e8-a1a3-429c-b634-a2afccfd9363", + "name": "Gemini 2.5 Pro Preview (Mar' 25)" + }, + { + "id": "63872e9c-3377-4a6b-b477-7bba244c38e9", + "name": "NVIDIA Nemotron 3 Super 120B A12B (Reasoning)" + }, + { + "id": "64312f37-3701-4243-a4c9-7c07a58cd6b9", + "name": "Grok 4.20 0309 v2 (Non-reasoning)" + }, + { + "id": "651ef7ae-9a8f-477e-9c8e-460aa156ba02", + "name": "Qwen3.5 4B (Reasoning)" 
+ }, + { + "id": "660965b2-66d2-49ee-a6b9-79a6ac47d3c0", + "name": "Magistral Medium 1" + }, + { + "id": "66445f84-b2e3-4202-afdc-92ba0f0e5f36", + "name": "Kimi K2 0905" + }, + { + "id": "666eb13f-0d22-4438-8eb0-01876e1a8604", + "name": "Motif-2-12.7B-Reasoning" + }, + { + "id": "66938aab-78fa-49d7-8461-b48b6833e837", + "name": "Qwen3.6 35B A3B (Non-reasoning)" + }, + { + "id": "66f4ce73-9a9b-4b49-9c6e-bedb9bfdc720", + "name": "Ministral 3 3B" + }, + { + "id": "686ab020-ee58-4a70-a9ac-24d675a73506", + "name": "LFM2 8B A1B" + }, + { + "id": "68c89ebf-779c-4445-9241-de964cd17355", + "name": "Gemini 2.5 Flash Preview (Reasoning)" + }, + { + "id": "69534bed-2ffd-4235-832b-e20a810333ab", + "name": "Qwen3.7 Max" + }, + { + "id": "6a5d56e1-bb68-4205-8d9b-26b97888bc84", + "name": "GLM-4.6 (Reasoning)" + }, + { + "id": "6a7c0e25-1dcb-4b15-8495-a8536a9da051", + "name": "GPT-4" + }, + { + "id": "6afbfb62-27e4-435e-9c85-d9fe1b92519e", + "name": "Gemini 2.5 Flash (Non-reasoning)" + }, + { + "id": "6b08a75a-19ee-40b4-be33-133b8ef42f92", + "name": "Llama 2 Chat 13B" + }, + { + "id": "6b79f899-e3c0-45f6-923c-243faccdb2fc", + "name": "GPT-5.5 (medium)" + }, + { + "id": "6ba9e8eb-8124-436d-842f-dbe36df80c27", + "name": "Hermes 4 - Llama-3.1 70B (Reasoning)" + }, + { + "id": "6d9a176d-feb8-4dac-8872-afe32b31897f", + "name": "DeepSeek V3.2 (Non-reasoning)" + }, + { + "id": "6da314d3-a984-4734-8f31-47dd32fb4699", + "name": "Qwen3 VL 32B Instruct" + }, + { + "id": "6dd8ba55-5680-44a9-b309-82928165d5f0", + "name": "GPT-5.2 (Non-reasoning)" + }, + { + "id": "6e1b44ff-c227-496b-aef4-19b70cd18c76", + "name": "Gemma 4 26B A4B (Reasoning)" + }, + { + "id": "6e2a2572-ed8d-4616-8d7d-68d125fc8ee7", + "name": "DeepSeek V4 Pro (Non-reasoning)" + }, + { + "id": "6e6e02fd-9cbd-417f-9bfc-673df89c313d", + "name": "NVIDIA Nemotron Nano 12B v2 VL (Reasoning)" + }, + { + "id": "6f1a7562-6e96-46ac-af4f-6ba5a7a3da96", + "name": "GPT-5.5 (Non-reasoning)" + }, + { + "id": 
"6f3534b1-1168-472e-b3e3-23ab521504f5", + "name": "Qwen3.5 Omni Flash" + }, + { + "id": "6fc35842-0165-44cf-8570-c484a92b3d8c", + "name": "GLM-4.7 (Reasoning)" + }, + { + "id": "6fd796d3-f346-4f66-97df-5da81714fc73", + "name": "Nova 2.0 Lite (low)" + }, + { + "id": "70152cb0-fb36-4732-a925-89ef40994be1", + "name": "Magistral Small 1.2" + }, + { + "id": "70882ec6-914c-41f0-9754-5e8f75005f77", + "name": "Gemma 4 26B A4B (Non-reasoning)" + }, + { + "id": "713fae11-c75c-4f10-ae2c-8e4074cd58af", + "name": "Ministral 3 14B" + }, + { + "id": "715e05fb-1313-441c-bf1d-8651c752a841", + "name": "K-EXAONE (Reasoning)" + }, + { + "id": "71e8d48c-1920-4f27-8ea9-1f10becc615a", + "name": "Llama 3.2 Instruct 3B" + }, + { + "id": "71f51ea9-94fe-4635-a80d-4cfffbb685f4", + "name": "Gemini 2.5 Flash-Lite Preview (Sep '25) (Non-reasoning)" + }, + { + "id": "72c358fd-7d45-4d68-89aa-699743710924", + "name": "GPT-4.1 nano" + }, + { + "id": "7393c56a-ec31-48e9-b804-c04f2d2cb641", + "name": "Llama 3.1 Nemotron Instruct 70B" + }, + { + "id": "739684ba-0f63-4e2a-b4ee-30741c9e9320", + "name": "Llama 3.1 Instruct 8B" + }, + { + "id": "739e531a-eb0a-478f-bb67-5845b79ce65d", + "name": "LFM2 2.6B" + }, + { + "id": "755d7281-2ed8-48c7-808f-709ec4cbfb71", + "name": "Gemma 4 E2B (Non-reasoning)" + }, + { + "id": "75e1c197-f239-4361-a9d6-66dccfead236", + "name": "DeepSeek V3 0324" + }, + { + "id": "76361085-f5dc-49ec-b069-fe56ca885933", + "name": "Command-R+ (Apr '24)" + }, + { + "id": "764dd095-6ebd-4760-8eb3-cbc40964db2e", + "name": "Sarvam 30B (high)" + }, + { + "id": "7656c62b-5345-435b-bf12-b6ce2ca0d58d", + "name": "Qwen2 Instruct 72B" + }, + { + "id": "76aa6af5-fdc6-4739-a300-983f14e74a67", + "name": "GPT-4 Turbo" + }, + { + "id": "76bce7fb-3a3f-4b66-a78d-35ccf3edf5d2", + "name": "Nova 2.0 Lite (Non-reasoning)" + }, + { + "id": "76dcf6ef-39ea-4be0-b693-b88da25b4caf", + "name": "NVIDIA Nemotron 3 Nano 30B A3B (Reasoning)" + }, + { + "id": "7764d514-694f-444c-8d60-bdc6e24e223f", + "name": "DeepSeek 
LLM 67B Chat (V1)" + }, + { + "id": "7829427f-f0e3-4f6d-a228-5fbf70dacc02", + "name": "Claude 3 Opus" + }, + { + "id": "783a0ea2-1eef-422a-8c3d-f6d40d943f54", + "name": "Gemini 3 Flash Preview (Non-reasoning)" + }, + { + "id": "78452c64-7303-4192-bca5-2d9ec5c623d4", + "name": "Jamba 1.7 Large" + }, + { + "id": "7a7b52f6-fdef-4dae-9203-58c710ccc81d", + "name": "Mistral Saba" + }, + { + "id": "7ae943a9-9310-4472-a834-c61f0ab68485", + "name": "Qwen3 Max" + }, + { + "id": "7b269763-ecc0-41ef-aa29-47ef632ac065", + "name": "Gemini 2.0 Flash-Lite (Feb '25)" + }, + { + "id": "7c045ca0-b331-488d-af31-df0fd331dfd1", + "name": "Mistral Small (Sep '24)" + }, + { + "id": "7c73c3be-7f51-4d14-bec8-d5789488df25", + "name": "Gemini 3 Flash Preview (Reasoning)" + }, + { + "id": "7d73161f-002f-4c9c-b4f8-6c4d91f2ba8e", + "name": "Nanbeige4.1-3B" + }, + { + "id": "7ec1065a-c90e-41e4-bd17-abb7042eed76", + "name": "Qwen3 30B A3B 2507 Instruct" + }, + { + "id": "7f3c9423-3ee3-4369-a6d9-3f2a40aff00e", + "name": "GPT-5 (low)" + }, + { + "id": "806032ff-6252-4c22-ba99-a126e411b7a4", + "name": "Qwen3 Max Thinking" + }, + { + "id": "80f7860a-7665-4658-9f05-15bccf5f832f", + "name": "Llama 3.2 Instruct 1B" + }, + { + "id": "81444bc8-72f9-4a2d-ad43-27e3f0d2f461", + "name": "Tiny Aya Global" + }, + { + "id": "81b6ddfc-111e-4422-bd44-42ee6165b699", + "name": "GLM-4.7 (Non-reasoning)" + }, + { + "id": "8273650d-40e5-45ee-aeda-df71de784164", + "name": "Exaone 4.0 1.2B (Non-reasoning)" + }, + { + "id": "82879bb8-89fb-4adc-b519-315b8ef30b77", + "name": "Llama 3 Instruct 8B" + }, + { + "id": "82b207dd-d285-4a52-b2fc-2cbd27543899", + "name": "Hermes 4 - Llama-3.1 405B (Reasoning)" + }, + { + "id": "82b36b4d-84dd-4bc0-ad32-e3aee9442789", + "name": "MiMo-V2-Flash (Non-reasoning)" + }, + { + "id": "82ed9bd2-c97b-4c35-9312-94bb72001e36", + "name": "Granite 4.1 8B" + }, + { + "id": "83cb898e-05d9-4e4b-9de3-2d305014d923", + "name": "Claude Instant" + }, + { + "id": "84922739-425f-46e1-87ac-bb4268dcacbb", + 
"name": "Gemini 2.5 Flash-Lite Preview (Sep '25) (Reasoning)" + }, + { + "id": "84b49308-6b93-47aa-a4f6-776ee1a1e8cd", + "name": "o4-mini (high)" + }, + { + "id": "84e3f11e-d659-4941-8988-1dbfabbaf538", + "name": "GPT-5.2 (medium)" + }, + { + "id": "864da2a5-156c-45fd-873c-8923be91914f", + "name": "Magistral Medium 1.2" + }, + { + "id": "8665ca00-c687-44c7-875c-22618cb31c4f", + "name": "Qwen3.6 35B A3B (Reasoning)" + }, + { + "id": "877fdfc9-2026-477a-af96-e4fd602c0131", + "name": "Gemini 2.5 Flash Preview (Sep '25) (Non-reasoning)" + }, + { + "id": "8823351e-8232-4c9c-8a1d-cd2c1d2c1196", + "name": "QwQ 32B-Preview" + }, + { + "id": "882a5da3-94ca-4602-8693-c45970df17e2", + "name": "Ling-flash-2.0" + }, + { + "id": "8869f28a-a6ff-487f-8d32-93fe335fdda5", + "name": "GPT-5.4 nano (medium)" + }, + { + "id": "891bcdf2-8dd2-4dc3-829b-d963fde25876", + "name": "Apertus 8B Instruct" + }, + { + "id": "89a2c945-1fab-4ee4-9f45-83a9f46cb221", + "name": "DeepSeek V4 Flash (Reasoning, High Effort)" + }, + { + "id": "8a24865b-90d9-4e2b-a2fd-6851c2e9d627", + "name": "DeepSeek-V2.5" + }, + { + "id": "8a4a5ead-7789-4389-8400-30e9d20370b7", + "name": "Claude 4 Opus (Reasoning)" + }, + { + "id": "8b1a70d1-e05f-426b-9122-023d4629ab47", + "name": "GPT-3.5 Turbo (0613)" + }, + { + "id": "8c1be908-67b6-4cf4-ba08-83ddbe44fde3", + "name": "GPT-4o (Aug '24)" + }, + { + "id": "8c29d66d-bf98-4ea3-8572-5409353ecc66", + "name": "Qwen3.6 27B (Reasoning)" + }, + { + "id": "8c748e53-61ae-48b8-af8d-eb8298b1e9db", + "name": "Nemotron 3 Nano Omni 30B A3B Reasoning" + }, + { + "id": "8ca48626-ff5e-48b3-8401-38081376d706", + "name": "Hy3-preview (Reasoning)" + }, + { + "id": "8ddacd41-bf43-411b-aa30-43ebf0567dd8", + "name": "Gemini 1.0 Pro" + }, + { + "id": "8e78cf7a-5b76-4beb-beba-b99c6233b208", + "name": "Solar Pro 2 (Preview) (Reasoning)" + }, + { + "id": "8eb02396-f231-4189-ae15-05f7facebd9b", + "name": "GPT-5 nano (medium)" + }, + { + "id": "8f0a75d6-8d00-4c2e-bcd4-8e88a570a93c", + "name": 
"DeepSeek-Coder-V2" + }, + { + "id": "8f74a6ed-f82f-4a2f-a96b-4914993e47da", + "name": "Olmo 3 7B Instruct" + }, + { + "id": "90c2a9cf-ad7e-4332-9be2-2fd1309833e2", + "name": "Grok 4.3 (Non-reasoning)" + }, + { + "id": "90e078f2-051b-4c63-8919-76618971cb3f", + "name": "Claude 4.5 Sonnet (Reasoning)" + }, + { + "id": "91cb6144-4937-4e4e-aeda-b4341d355c10", + "name": "Claude 4.5 Sonnet (Non-reasoning)" + }, + { + "id": "91e3b45f-3f52-4511-8c15-8948854bebc5", + "name": "Granite 4.1 3B" + }, + { + "id": "91f3a4c8-b000-4513-942c-bfe283375c35", + "name": "Ling 2.6 Flash" + }, + { + "id": "922c69c7-9037-43c6-8bcf-a1c555e7f3eb", + "name": "Llama 4 Maverick" + }, + { + "id": "92b19c88-fa87-4595-957e-fe9aa5fa5ad4", + "name": "Jamba 1.6 Large" + }, + { + "id": "92f245a7-43b4-4ffd-8bfb-866746bf824d", + "name": "GLM-5.1 (Non-reasoning)" + }, + { + "id": "94229066-9381-4ee1-bf70-a16d63756a6e", + "name": "Gemini 1.5 Flash-8B" + }, + { + "id": "946e7aab-db1c-4c3f-b0b3-7720d0cff187", + "name": "GLM-4.6 (Non-reasoning)" + }, + { + "id": "948892b5-db03-4118-a4a8-ccd51ed871ea", + "name": "Grok 4.3 (high)" + }, + { + "id": "94a6d26e-a903-47f3-8323-ae422d237bb9", + "name": "Olmo 3.1 32B Instruct" + }, + { + "id": "94d09368-9035-47cf-963a-b4310b433a16", + "name": "MiMo-V2-Omni" + }, + { + "id": "9741f3c2-cbb1-4a3f-99ee-7bd7384d9038", + "name": "Ministral 3 8B" + }, + { + "id": "976cc8ad-7904-4056-83c5-960181f47d5f", + "name": "Llama 3.3 Instruct 70B" + }, + { + "id": "9815da7d-70f4-44d6-b539-9ffef0faa152", + "name": "Nemotron Cascade 2 30B A3B" + }, + { + "id": "98e3230e-cee1-4c19-b9f8-b6b6a826ca93", + "name": "R1 1776" + }, + { + "id": "9ca246a7-cf13-42c9-9182-5b5ad6b79026", + "name": "MiniMax M1 80k" + }, + { + "id": "9ca71ac4-41c8-42c0-87dd-5704a9e5b94d", + "name": "Llama 3.2 Instruct 90B (Vision)" + }, + { + "id": "9cc377dc-67ae-4042-bafc-0466b5f05089", + "name": "K-EXAONE (Non-reasoning)" + }, + { + "id": "9dba61f5-78ee-4190-8d1d-8e7063ffd386", + "name": "Qwen3 8B (Reasoning)" + }, 
+ { + "id": "9e141c0d-fc82-4e07-bb2e-fe0003bc030b", + "name": "Claude 2.1" + }, + { + "id": "9eae4ec4-61b8-48bc-9843-3edd506ae933", + "name": "Devstral Small (Jul '25)" + }, + { + "id": "9ee13921-62a4-425a-a22b-df3302198d93", + "name": "NVIDIA Nemotron 3 Nano 4B" + }, + { + "id": "9f7c7566-a704-49a2-a383-cb3181da33a4", + "name": "GPT-4.1 mini" + }, + { + "id": "9f873c2f-2c2d-4ccb-9e1b-71bf61b052be", + "name": "Phi-4 Mini Instruct" + }, + { + "id": "a04f5b78-f397-4fd8-a2b1-00dcab50324c", + "name": "Grok Beta" + }, + { + "id": "a06bd3fc-86db-4a8e-ae6d-7459444d08c9", + "name": "Grok Code Fast 1" + }, + { + "id": "a20ae33a-46e1-41e6-81a0-fe8b00d2e538", + "name": "Nova 2.0 Pro Preview (Non-reasoning)" + }, + { + "id": "a29e66d6-1c3c-456a-8770-59ee3845b35d", + "name": "Ring-1T" + }, + { + "id": "a2c8e7b2-57bf-4d1e-96ea-7944d786d94d", + "name": "Granite 4.0 1B" + }, + { + "id": "a38d719a-709c-4983-b3e7-7090389ae9a6", + "name": "K2-V2 (high)" + }, + { + "id": "a5092ece-d5a7-461f-b036-3faef262423f", + "name": "Gemini 3 Deep Think" + }, + { + "id": "a518a64b-e337-48f3-85a1-ba7dc0e8f961", + "name": "ERNIE 5.0 Thinking Preview" + }, + { + "id": "a550ffca-f89e-4381-ade6-a85dc6a1fb4c", + "name": "Kimi K2.5 (Reasoning)" + }, + { + "id": "a6340098-d7ae-462d-b372-0a0a67fc44b4", + "name": "Claude 4.5 Haiku (Reasoning)" + }, + { + "id": "a68afa0b-7fe2-4e9d-bf3e-741cce3c6aeb", + "name": "Granite 4.0 H 350M" + }, + { + "id": "a6ea7ec0-0aca-4442-98fb-4296c6d18b31", + "name": "Qwen3.5 0.8B (Reasoning)" + }, + { + "id": "a71c1a35-ccc8-43f0-a5a2-070a690b9a00", + "name": "Apriel-v1.6-15B-Thinker" + }, + { + "id": "a7564055-f8ba-4c4b-9e2d-060f61263645", + "name": "Claude 4 Sonnet (Reasoning)" + }, + { + "id": "a797eaf3-6d75-4f29-86a8-e1243ce52d43", + "name": "Gemma 3n E4B Instruct" + }, + { + "id": "a803d3d0-d22e-49a0-ac2c-b9c6f1141065", + "name": "Qwen3 VL 235B A22B (Reasoning)" + }, + { + "id": "a83f84b3-473a-4276-9ae1-8909da723159", + "name": "DeepSeek R1 0528 (May '25)" + }, + { + "id": 
"a89c4b28-2d8c-456e-88ea-255fb51fd2b6", + "name": "GPT-5.4 (xhigh)" + }, + { + "id": "a8c67863-9d66-44dd-8d27-f58654ecde03", + "name": "Gemma 3n E2B Instruct" + }, + { + "id": "a8efb564-9d17-4d7f-8f43-e9110657ce21", + "name": "DeepHermes 3 - Mistral 24B Preview (Non-reasoning)" + }, + { + "id": "a971b0c0-4c0f-484a-b018-e36b5be3409e", + "name": "Reka Flash (Sep '24)" + }, + { + "id": "aa83359a-d804-4f0b-b5bf-dc637711c26f", + "name": "Llama 3 Instruct 70B" + }, + { + "id": "ab7f016c-a29b-4710-bdf6-6a5cd96aacca", + "name": "NVIDIA Nemotron Nano 12B v2 VL (Non-reasoning)" + }, + { + "id": "aba82268-2bb7-4a0f-80be-9b7722e2145b", + "name": "Devstral Medium" + }, + { + "id": "abe9f0c7-f4f6-430d-ba42-f45afdd4841b", + "name": "Command-R (Mar '24)" + }, + { + "id": "ac1031bc-c53e-4af7-9c6e-2005e0ff44fa", + "name": "Apriel-v1.5-15B-Thinker" + }, + { + "id": "ac48c49d-9e77-4394-ac4e-d1ee51fd5fee", + "name": "Gemini 1.5 Flash (May '24)" + }, + { + "id": "aca9c1ad-fc86-49f3-a312-b1e517ea100c", + "name": "Claude 3.5 Sonnet (June '24)" + }, + { + "id": "acad0665-9457-4531-abd5-b59efd7a89ea", + "name": "Step3 VL 10B" + }, + { + "id": "ad173c8d-f14f-4230-90e9-60979b7720e7", + "name": "DeepSeek V4 Flash (Non-reasoning)" + }, + { + "id": "adf9a85e-abc3-4f28-937b-db6655cc5238", + "name": "Llama 4 Scout" + }, + { + "id": "ae447455-940d-4d30-9139-a664fa896eaf", + "name": "GPT-5.4 mini (Non-Reasoning)" + }, + { + "id": "ae4fe623-80ab-4ea3-8921-70a18ea0fc7e", + "name": "ERNIE 4.5 300B A47B" + }, + { + "id": "af134350-8ba3-4629-b56b-00bd6dcf60c4", + "name": "DeepSeek V3.2 Exp (Reasoning)" + }, + { + "id": "af74f222-05c7-422d-a653-b9c0707c9c72", + "name": "Molmo 7B-D" + }, + { + "id": "b00ecd62-a53f-4aed-b833-3e9d6b0170ba", + "name": "Qwen3 32B (Reasoning)" + }, + { + "id": "b0249961-b8b2-479d-8325-a29ea17c7b89", + "name": "Qwen3 4B 2507 Instruct" + }, + { + "id": "b07aef0a-b192-46a1-b1c9-40b06d1b9061", + "name": "Gemma 4 E2B (Reasoning)" + }, + { + "id": 
"b13c1257-d746-4027-8fc8-4892dc14701c", + "name": "GPT-5.5 (high)" + }, + { + "id": "b1fa84f8-1ed3-4124-b403-4655dafa4267", + "name": "Llama 3.1 Nemotron Nano 4B v1.1 (Reasoning)" + }, + { + "id": "b26ff709-1773-4595-ae44-78e0a5bac29c", + "name": "DeepSeek R1 Distill Qwen 14B" + }, + { + "id": "b2dd592a-fbc5-458a-b26d-f3964cbab82f", + "name": "Qwen3 8B (Non-reasoning)" + }, + { + "id": "b2e68f0a-8f66-4e4c-9821-2b786cea601b", + "name": "Claude 3 Sonnet" + }, + { + "id": "b2f3191f-77d6-4155-8be6-330f0baa1ae5", + "name": "Gemini 3 Pro Preview (low)" + }, + { + "id": "b36ff8f3-0323-49d1-a063-ab09704fdb0c", + "name": "Nova 2.0 Omni (low)" + }, + { + "id": "b3735511-c6ff-4928-8d72-2181444a4eb3", + "name": "Arctic Instruct" + }, + { + "id": "b4784397-aa28-411b-b011-9c4331bfa9c8", + "name": "GPT-4o (ChatGPT)" + }, + { + "id": "b4ddb4c8-1400-44ab-8c2b-e2472088e7ff", + "name": "Jamba 1.5 Large" + }, + { + "id": "b4f14013-37dd-4c75-bd8a-378365d9ed77", + "name": "Jamba 1.7 Mini" + }, + { + "id": "b4f7d7a4-869a-4ee7-b17a-4046cd1e79fd", + "name": "Qwen2.5 Instruct 72B" + }, + { + "id": "b515503d-4d65-4a3f-8a4a-6c731e2b079f", + "name": "o1-mini" + }, + { + "id": "b58b8272-cd3f-44b9-9b68-612f40779ce2", + "name": "Hy3-preview (Non-reasoning)" + }, + { + "id": "b5c1c91a-7474-4409-9a9c-9c2ac45d9eb6", + "name": "GPT-4o mini" + }, + { + "id": "b6d2e43d-3082-43f5-9318-0f4dbcb54163", + "name": "Trinity Large Thinking" + }, + { + "id": "b7726745-9c77-40c3-8452-974cb53d6fbc", + "name": "DeepHermes 3 - Llama-3.1 8B Preview (Non-reasoning)" + }, + { + "id": "b89c4faf-219e-4171-a1aa-e3bd2fd0a924", + "name": "Solar Pro 3" + }, + { + "id": "b97ef678-2d31-4375-9416-67ea97f87204", + "name": "Qwen3 Omni 30B A3B (Reasoning)" + }, + { + "id": "b9dc72c6-7bea-4936-a55a-4b0c835fc755", + "name": "Qwen2.5 Max" + }, + { + "id": "ba04694d-326a-4a6a-8f1b-46316f872a7f", + "name": "Kimi K2.5 (Non-reasoning)" + }, + { + "id": "ba242e40-83b7-4cd3-a0e0-b56237984914", + "name": "GPT-5.4 mini (xhigh)" + }, + { + 
"id": "bbd93ebe-80da-4594-bb19-61e69d0331df", + "name": "Gemini 3.1 Pro Preview" + }, + { + "id": "bbe6d782-e630-48d5-b11c-3ce37f373f1e", + "name": "Qwen3 235B A22B (Reasoning)" + }, + { + "id": "bc26bfdb-4923-4442-a6ca-e77392923581", + "name": "GPT-5 mini (minimal)" + }, + { + "id": "bc4579d2-9c46-46c3-ace0-454039bf21bb", + "name": "DeepSeek-V2.5 (Dec '24)" + }, + { + "id": "bcca0e70-7e80-4c07-b1fa-b33bcfb19e51", + "name": "Gemini 2.0 Flash (Feb '25)" + }, + { + "id": "bd2c3517-00d8-4ba5-a989-1f1e52f3ffab", + "name": "Gemma 3n E4B Instruct Preview (May '25)" + }, + { + "id": "bddebfd3-0a8d-47f5-b722-bc4c2ca5a5dc", + "name": "Kimi K2 Thinking" + }, + { + "id": "be185709-ddb4-4268-9597-856464359b25", + "name": "MiMo-V2-Flash (Reasoning)" + }, + { + "id": "bf220674-68bd-43cc-a1b8-ce5ed4d2f18d", + "name": "DeepSeek V4 Pro (Reasoning, Max Effort)" + }, + { + "id": "bf60740e-6aa5-422f-ba49-ef6e9d171205", + "name": "Qwen3 32B (Non-reasoning)" + }, + { + "id": "c1045dc0-4fd3-4adb-9548-18763e0d051f", + "name": "GPT-4o (Nov '24)" + }, + { + "id": "c298d1a8-606c-4971-8613-ccdaaf941043", + "name": "GPT-5.4 nano (Non-Reasoning)" + }, + { + "id": "c2b1e769-7aee-4669-8076-73918bdebf6c", + "name": "Claude 4.5 Haiku (Non-reasoning)" + }, + { + "id": "c3274a19-6d3c-4d01-ab9b-5055a0a40429", + "name": "GPT-5 mini (medium)" + }, + { + "id": "c3738fb0-3408-4430-a699-760ae4b70c93", + "name": "GPT-5 (minimal)" + }, + { + "id": "c43aa1f9-31bd-4a99-be70-84c5e6bd2e75", + "name": "Qwen Chat 14B" + }, + { + "id": "c4c3b42f-e0f0-48ca-b6f9-b296e7697806", + "name": "Llama 3.3 Nemotron Super 49B v1 (Non-reasoning)" + }, + { + "id": "c6a47d8a-7517-46e2-8383-329fe7241725", + "name": "Gemini 2.0 Pro Experimental (Feb '25)" + }, + { + "id": "c72cb85a-18a4-4235-b455-77dff2f16c50", + "name": "Grok 4.20 0309 v2 (Reasoning)" + }, + { + "id": "c7327e6e-b27f-4b1b-859d-159a34e0ba1c", + "name": "DeepSeek Coder V2 Lite Instruct" + }, + { + "id": "c7667559-d9b6-43f1-8cd8-8bdbc78d190b", + "name": "Gemini 2.5 
Flash Preview (Sep '25) (Reasoning)" + }, + { + "id": "c76e0ae8-0fd2-45c0-a39d-d398fce9b128", + "name": "Falcon-H1R-7B" + }, + { + "id": "c77cfe51-f4a0-4692-9dee-5061ef667f23", + "name": "GPT-5.5 (low)" + }, + { + "id": "c8158c23-6fff-4c31-911d-954c32d80c28", + "name": "Step 3.5 Flash" + }, + { + "id": "c8673741-5e1a-46a1-9e4f-710a5c920982", + "name": "GLM-4.7-Flash (Non-reasoning)" + }, + { + "id": "c8a3fa87-735e-49a9-afb1-270c5e9f53f7", + "name": "Olmo 3 32B Think" + }, + { + "id": "c8a79180-7d16-4474-8701-9a77c0baa56a", + "name": "Qwen3 Next 80B A3B (Reasoning)" + }, + { + "id": "c99f3bde-7c08-4de8-bd5c-8ee9123ebffa", + "name": "gpt-oss-120b (low)" + }, + { + "id": "ca04852c-eaae-4881-a208-f9b2ca3b7cd6", + "name": "o3-pro" + }, + { + "id": "ca6c1412-f3c1-4391-9231-f83a702aa7af", + "name": "Mixtral 8x22B Instruct" + }, + { + "id": "cbac8c35-e069-4c73-823e-0953e6ed0e85", + "name": "Qwen3 Max Thinking (Preview)" + }, + { + "id": "cc1fa238-1a76-486d-a997-22309275eadd", + "name": "Devstral Small (May '25)" + }, + { + "id": "ccbfa8c3-a762-480b-aade-34fb9697f98c", + "name": "Claude 4.1 Opus (Reasoning)" + }, + { + "id": "cd26a386-4873-46ff-b853-d239050025a2", + "name": "Gemma 4 31B (Reasoning)" + }, + { + "id": "ce3d286e-093d-413d-a81a-0270309f039e", + "name": "Qwen3 VL 30B A3B (Reasoning)" + }, + { + "id": "ce819310-af7c-49d3-9a02-6845111e1788", + "name": "Devstral Small 2" + }, + { + "id": "ceb4d610-d0a4-48c1-bea0-80ed76f1e5ca", + "name": "QwQ 32B" + }, + { + "id": "cf095603-72b6-47f8-8ee1-09a42890f92a", + "name": "Llama 3.1 Nemotron Ultra 253B v1 (Reasoning)" + }, + { + "id": "d034dafe-463d-4c50-956f-84fca657b26f", + "name": "Claude 4 Sonnet (Non-reasoning)" + }, + { + "id": "d0aa27aa-4705-4184-9a1d-483b78c9331c", + "name": "Gemma 4 31B (Non-reasoning)" + }, + { + "id": "d0b3d47e-aec6-425e-9de7-168dcc6d1e28", + "name": "GPT-5.1 (Non-reasoning)" + }, + { + "id": "d1122eff-ee85-4fdc-8a9f-23bee6590667", + "name": "Gemini 3 Pro Preview (high)" + }, + { + "id": 
"d1720545-d0a8-4c15-a53e-ef5ca99ac7ea", + "name": "Gemma 3 1B Instruct" + }, + { + "id": "d1768b3a-0a21-4e08-b3f6-56a9ab6cfbf3", + "name": "Qwen1.5 Chat 110B" + }, + { + "id": "d2d7dd95-770f-4cb0-9bbc-d275ac19c265", + "name": "GLM-4.6V (Reasoning)" + }, + { + "id": "d306cccd-0085-4b2f-8aa0-ffcdbb434695", + "name": "MiniCPM5-1B (Non-reasoning)" + }, + { + "id": "d370fcbf-c4a1-41a2-abc4-d204fcc3fcbf", + "name": "Qwen3 VL 32B (Reasoning)" + }, + { + "id": "d3968fd3-97d8-4693-8d26-19cefc6f5d5f", + "name": "Hermes 4 - Llama-3.1 405B (Non-reasoning)" + }, + { + "id": "d4be6393-8915-436c-a3a8-4e59bd5c89a9", + "name": "Granite 4.1 30B" + }, + { + "id": "d4fc3f33-f2b0-4da1-88ee-f1f82bd4de31", + "name": "GPT-5.4 nano (xhigh)" + }, + { + "id": "d58cf573-1bd3-4d1f-9182-5482a460f570", + "name": "Qwen3 VL 235B A22B Instruct" + }, + { + "id": "d621247c-d47e-458c-82cb-a166bc3b37e5", + "name": "DeepSeek V3.2 (Reasoning)" + }, + { + "id": "d734e2ce-5cf8-467f-8148-586d02671333", + "name": "Exaone 4.0 1.2B (Reasoning)" + }, + { + "id": "d80eb0f1-f62e-4d31-99d2-7a925eb126b0", + "name": "Gemma 4 E4B (Reasoning)" + }, + { + "id": "d8ddb241-b3e4-4c25-a6a3-72eb1b30c541", + "name": "DeepSeek V4 Flash (Reasoning, Max Effort)" + }, + { + "id": "d925845d-39ad-4de3-8495-f176b79828c0", + "name": "Claude 3.7 Sonnet (Non-reasoning)" + }, + { + "id": "d97713f2-afa6-4f8d-b2f3-ac89a24c4d6c", + "name": "Solar Mini" + }, + { + "id": "da9fe224-8af3-46d7-a8c4-6220779c3f35", + "name": "Qwen3 Coder 30B A3B Instruct" + }, + { + "id": "dae31abc-0587-44d0-ba53-f78e96b6e486", + "name": "K2-V2 (low)" + }, + { + "id": "dafbb6d2-4825-43d1-a927-feedcfd2e998", + "name": "Granite 4.0 H 1B" + }, + { + "id": "dbbcf240-a69e-4078-8f16-7c94c7a8c514", + "name": "Reka Flash 3" + }, + { + "id": "dd059b25-d82a-4ead-82a4-4adceaaec48b", + "name": "Mistral Medium 3.5" + }, + { + "id": "dd738be7-2b69-4775-91a5-8851d3341c2d", + "name": "K2-V2 (medium)" + }, + { + "id": "ddc748d0-6a9b-466b-8d6c-68417980d56d", + "name": "Claude 
2.0" + }, + { + "id": "de0beaf0-c951-487a-8eb4-3dd12e74122c", + "name": "Pixtral Large" + }, + { + "id": "dec8073c-57e2-41c0-b1aa-7a62960f103f", + "name": "Qwen3 VL 8B (Reasoning)" + }, + { + "id": "ded8d96e-835f-4359-947a-a4c3bb78e983", + "name": "Phi-3 Mini Instruct 3.8B" + }, + { + "id": "df4c5a29-4b5c-4fef-9f7f-5e24e118ab65", + "name": "Mi:dm K 2.5 Pro" + }, + { + "id": "df8d14e0-3997-4e4d-b4ad-9c047acc9c69", + "name": "Claude Sonnet 4.6 (Adaptive Reasoning, Max Effort)" + }, + { + "id": "df95f83f-5ebb-466a-9d2d-b95efc8c012c", + "name": "DeepSeek R1 Distill Qwen 32B" + }, + { + "id": "dfb9292d-bc7c-4425-a260-4256217e709f", + "name": "DeepSeek V3.1 Terminus (Non-reasoning)" + }, + { + "id": "dfeeb904-e784-4d5c-ad66-9400146b150b", + "name": "Gemini 2.0 Flash Thinking Experimental (Dec '24)" + }, + { + "id": "e0099b99-d368-4562-b0de-4016ea58af54", + "name": "Mi:dm K 2.5 Pro Preview" + }, + { + "id": "e18e5e6a-5a31-4c0b-b80b-ac401392f446", + "name": "GPT-5 nano (high)" + }, + { + "id": "e1cfa926-9e2b-4a0d-8c31-48366a5041c5", + "name": "Llama Nemotron Super 49B v1.5 (Reasoning)" + }, + { + "id": "e2e9ddc3-8c2d-4bf5-a60a-83a1afe61034", + "name": "GPT-4o Realtime (Dec '24)" + }, + { + "id": "e3396f8f-7994-4df5-bdab-43745681ef0a", + "name": "GPT-5.5 Pro (xhigh)" + }, + { + "id": "e410e854-104d-4b35-a171-899ff9d974bb", + "name": "Qwen2.5 Coder Instruct 7B " + }, + { + "id": "e46198a7-cd29-4afd-933d-cdf180f0f305", + "name": "Qwen3 4B (Non-reasoning)" + }, + { + "id": "e58bbffd-fdc2-412a-b6d7-ca0e3f5d611a", + "name": "Nova Premier" + }, + { + "id": "e5dd499f-c330-45ec-9ff0-a99209c82af7", + "name": "Mistral Small 3" + }, + { + "id": "e8d4100e-165b-4c5d-ac11-ac553590a334", + "name": "o1-pro" + }, + { + "id": "e8ffd75b-766f-4551-8c52-6e54706220eb", + "name": "Jamba 1.6 Mini" + }, + { + "id": "e98e911e-9fb2-4a9a-826e-3d681d0cdca8", + "name": "GPT-4o (May '24)" + }, + { + "id": "e9a09db3-8fd6-41dd-ba2f-20e0a2bff7f2", + "name": "Claude Opus 4.7 (Adaptive Reasoning, Max Effort)" 
+ }, + { + "id": "ea5d2c10-1051-437d-95c2-18d5e4d14ff3", + "name": "Cogito v2.1 (Reasoning)" + }, + { + "id": "eab1492c-b853-4852-aa71-06b0ec2481c1", + "name": "GPT-5 (ChatGPT)" + }, + { + "id": "eb4ba465-3fcd-4065-9fe2-e8225e7b2c6c", + "name": "Ring-2.6-1T" + }, + { + "id": "eb689f7a-f210-4a87-b407-f249897f2764", + "name": "Solar Pro 2 (Reasoning)" + }, + { + "id": "ebf3b39f-0be6-43a1-a37a-f9b2978c9916", + "name": "Muse Spark" + }, + { + "id": "ec3b22e6-48ac-416a-b4ae-55565a4f3046", + "name": "Grok 3 Reasoning Beta" + }, + { + "id": "ec60e57e-76d3-42e3-a0e3-80662225a639", + "name": "OpenChat 3.5 (1210)" + }, + { + "id": "ecc6524a-d521-458a-8327-5009e8ce6549", + "name": "Qwen3 14B (Non-reasoning)" + }, + { + "id": "ee708f92-374e-4123-b900-e22d7b2afc19", + "name": "Phi-4" + }, + { + "id": "eebfef01-709e-4ffe-b72f-0db75ef2434b", + "name": "Qwen3.5 35B A3B (Non-reasoning)" + }, + { + "id": "f0083258-8646-45b8-8082-7aaf6c2ea82a", + "name": "gpt-oss-120b (high)" + }, + { + "id": "f164b41f-44c5-4675-bca3-fea1db4bd9ae", + "name": "GLM-5 (Non-reasoning)" + }, + { + "id": "f1d52583-9d20-4099-99ac-b5df9430c3b6", + "name": "NVIDIA Nemotron Nano 9B V2 (Reasoning)" + }, + { + "id": "f2e21112-192e-4aed-ae82-68ca3b38e667", + "name": "Claude Sonnet 4.6 (Non-reasoning, Low Effort)" + }, + { + "id": "f2f60e3a-e5f5-4471-acd2-9f2f29c76007", + "name": "Claude 4.1 Opus (Non-reasoning)" + }, + { + "id": "f3169f25-8c6f-48e4-ae87-0cf872dc0ec1", + "name": "Qwen3 30B A3B (Non-reasoning)" + }, + { + "id": "f371ad68-6947-4767-a78f-1f6c81f96b93", + "name": "Qwen3.6 Plus" + }, + { + "id": "f4274721-ef28-4121-aa88-8e97267a5a82", + "name": "Nova 2.0 Pro Preview (low)" + }, + { + "id": "f4e8194a-d0e6-48eb-92be-4307de5aeeec", + "name": "Gemini 2.5 Flash-Lite (Reasoning)" + }, + { + "id": "f5d83128-047f-496d-ba49-8a428abe8345", + "name": "Qwen3 VL 4B Instruct" + }, + { + "id": "f6ccbe1d-bd7e-484b-9795-18cc9f91552d", + "name": "Qwen3 235B A22B 2507 (Reasoning)" + }, + { + "id": 
"f73f4711-9c61-40a7-a258-b71c14727f53", + "name": "Llama 3.1 Tulu3 405B" + }, + { + "id": "f74ea286-cd29-4eb4-af14-1389b19c21e5", + "name": "MiniMax-M2" + }, + { + "id": "f78138d6-2e04-4a84-919a-20d177cb6ff1", + "name": "Jamba Reasoning 3B" + }, + { + "id": "f818a7bb-6f23-4b24-8d52-6b9c1a5ca628", + "name": "OLMo 2 7B" + }, + { + "id": "f93d0750-b659-4ceb-a123-7e657904ef2b", + "name": "Qwen3 VL 4B (Reasoning)" + }, + { + "id": "fb112343-c82c-4b43-afea-996bd5101d62", + "name": "KAT-Coder-Pro V1" + }, + { + "id": "fb65266f-5a7d-403c-85d5-ccdf0d1ca838", + "name": "DeepSeek V3.1 (Non-reasoning)" + }, + { + "id": "fbc58677-e324-4b45-a979-7fd8eec555cd", + "name": "LFM2 24B A2B" + }, + { + "id": "fbdf8da1-b341-448c-b3cb-8aff1d8f70b9", + "name": "Nova 2.0 Lite (medium)" + }, + { + "id": "fc4223e8-4586-4ca1-97ca-bb55ff586947", + "name": "KAT Coder Pro V2" + }, + { + "id": "fc92f822-04b7-420d-9c07-a21af5e9aac7", + "name": "Qwen3 Coder Next" + }, + { + "id": "fd4454ff-e703-46c0-a7f5-fa69af09486d", + "name": "GPT-5.1 Codex mini (high)" + }, + { + "id": "fddb72bd-60d3-41af-acc5-3df9a290eb8e", + "name": "Gemini 2.0 Flash (experimental)" + }, + { + "id": "fe11ab6c-a4dd-4c28-9fef-07da76d5ed14", + "name": "Llama 65B" + }, + { + "id": "fe2c2289-d261-4433-8681-46448372c1f6", + "name": "Grok 4.3 (low)" + }, + { + "id": "ff9bc5e5-a02f-4270-983e-4b3f834f3363", + "name": "Grok 3 mini Reasoning (high)" + }, + { + "id": "ffd65ef7-fbdb-4145-98ae-b5d01cda770b", + "name": "Llama 2 Chat 7B" + } + ], + "prompt_options": { + "parallel_queries": 1, + "prompt_length": 1000 + }, + "source": "https://artificialanalysis.ai/api/v2/data/llms/models" +} diff --git a/utils/llm/metadata/models_dev.py b/utils/llm/metadata/models_dev.py new file mode 100644 index 0000000..275c849 --- /dev/null +++ b/utils/llm/metadata/models_dev.py @@ -0,0 +1,89 @@ +"""Loader for the checked-in Models.dev metadata snapshot.""" + +import json +from dataclasses import dataclass +from datetime import date +from functools import 
lru_cache +from pathlib import Path +from typing import Any + +SNAPSHOT_PATH = Path(__file__).with_name("models_dev_snapshot.json") + + +@dataclass(frozen=True, slots=True) +class ModelsDevModel: + """Normalized metadata for one Models.dev model.""" + + id: str + name: str + release_date: date | None + raw: dict[str, Any] + + +@dataclass(frozen=True, slots=True) +class ModelsDevProvider: + """Normalized metadata for one Models.dev provider.""" + + id: str + name: str + models: dict[str, ModelsDevModel] + + +@dataclass(frozen=True, slots=True) +class ModelsDevSnapshot: + """Loaded Models.dev metadata indexed by provider and model ID.""" + + providers: dict[str, ModelsDevProvider] + + def get_model(self, *, provider_id: str, model_id: str) -> ModelsDevModel: + """Return a model by Models.dev provider and model IDs.""" + try: + provider = self.providers[provider_id] + except KeyError as exc: + raise KeyError(f"Unknown Models.dev provider_id {provider_id}") from exc + try: + return provider.models[model_id] + except KeyError as exc: + raise KeyError( + f"Unknown Models.dev model_id {model_id} for provider_id {provider_id}" + ) from exc + + +def _parse_date(value: str | None) -> date | None: + """Parse an ISO date value from the snapshot.""" + if value is None: + return None + if len(value) != len("YYYY-MM-DD"): + return None + try: + return date.fromisoformat(value) + except ValueError: + return None + + +def _model_from_json(data: dict[str, Any]) -> ModelsDevModel: + """Build normalized model metadata from snapshot JSON.""" + return ModelsDevModel( + id=data["id"], + name=data["name"], + release_date=_parse_date(data.get("release_date")), + raw=data, + ) + + +@lru_cache(maxsize=1) +def load_models_dev_snapshot(path: Path = SNAPSHOT_PATH) -> ModelsDevSnapshot: + """Load the checked-in Models.dev metadata snapshot.""" + data = json.loads(path.read_text()) + providers = {} + for provider_id, provider_data in data["providers"].items(): + models = { + model_id: 
_model_from_json(model_data) + for model_id, model_data in provider_data["models"].items() + } + providers[provider_id] = ModelsDevProvider( + id=provider_data["id"], + name=provider_data["name"], + models=models, + ) + return ModelsDevSnapshot(providers=providers) diff --git a/utils/llm/metadata/models_dev_snapshot.json b/utils/llm/metadata/models_dev_snapshot.json new file mode 100644 index 0000000..f490257 --- /dev/null +++ b/utils/llm/metadata/models_dev_snapshot.json @@ -0,0 +1,355 @@ +{ + "providers": { + "anthropic": { + "id": "anthropic", + "models": { + "claude-3-5-sonnet-20240620": { + "id": "claude-3-5-sonnet-20240620", + "name": "Claude Sonnet 3.5", + "release_date": "2024-06-20" + }, + "claude-3-5-sonnet-20241022": { + "id": "claude-3-5-sonnet-20241022", + "name": "Claude Sonnet 3.5 v2", + "release_date": "2024-10-22" + }, + "claude-3-7-sonnet-20250219": { + "id": "claude-3-7-sonnet-20250219", + "name": "Claude Sonnet 3.7", + "release_date": "2025-02-19" + }, + "claude-3-haiku-20240307": { + "id": "claude-3-haiku-20240307", + "name": "Claude Haiku 3", + "release_date": "2024-03-13" + }, + "claude-3-opus-20240229": { + "id": "claude-3-opus-20240229", + "name": "Claude Opus 3", + "release_date": "2024-02-29" + }, + "claude-haiku-4-5-20251001": { + "id": "claude-haiku-4-5-20251001", + "name": "Claude Haiku 4.5", + "release_date": "2025-10-15" + }, + "claude-opus-4-1-20250805": { + "id": "claude-opus-4-1-20250805", + "name": "Claude Opus 4.1", + "release_date": "2025-08-05" + }, + "claude-opus-4-20250514": { + "id": "claude-opus-4-20250514", + "name": "Claude Opus 4", + "release_date": "2025-05-22" + }, + "claude-opus-4-5-20251101": { + "id": "claude-opus-4-5-20251101", + "name": "Claude Opus 4.5", + "release_date": "2025-11-01" + }, + "claude-opus-4-6": { + "id": "claude-opus-4-6", + "name": "Claude Opus 4.6", + "release_date": "2026-02-05" + }, + "claude-opus-4-7": { + "id": "claude-opus-4-7", + "name": "Claude Opus 4.7", + "release_date": "2026-04-16" + 
}, + "claude-opus-4-8": { + "id": "claude-opus-4-8", + "name": "Claude Opus 4.8", + "release_date": "2026-05-28" + }, + "claude-sonnet-4-20250514": { + "id": "claude-sonnet-4-20250514", + "name": "Claude Sonnet 4", + "release_date": "2025-05-22" + }, + "claude-sonnet-4-5-20250929": { + "id": "claude-sonnet-4-5-20250929", + "name": "Claude Sonnet 4.5", + "release_date": "2025-09-29" + }, + "claude-sonnet-4-6": { + "id": "claude-sonnet-4-6", + "name": "Claude Sonnet 4.6", + "release_date": "2026-02-17" + } + }, + "name": "Anthropic" + }, + "deepseek": { + "id": "deepseek", + "models": { + "deepseek-v4-pro": { + "id": "deepseek-v4-pro", + "name": "DeepSeek V4 Pro", + "release_date": "2026-04-24" + } + }, + "name": "DeepSeek" + }, + "google": { + "id": "google", + "models": { + "gemini-2.5-flash": { + "id": "gemini-2.5-flash", + "name": "Gemini 2.5 Flash", + "release_date": "2025-03-20" + }, + "gemini-2.5-pro": { + "id": "gemini-2.5-pro", + "name": "Gemini 2.5 Pro", + "release_date": "2025-03-20" + }, + "gemini-3-flash-preview": { + "id": "gemini-3-flash-preview", + "name": "Gemini 3 Flash Preview", + "release_date": "2025-12-17" + }, + "gemini-3-pro-preview": { + "id": "gemini-3-pro-preview", + "name": "Gemini 3 Pro Preview", + "release_date": "2025-11-18" + }, + "gemini-3.1-flash-lite": { + "id": "gemini-3.1-flash-lite", + "name": "Gemini 3.1 Flash Lite", + "release_date": "2026-05-07" + }, + "gemini-3.1-flash-lite-preview": { + "id": "gemini-3.1-flash-lite-preview", + "name": "Gemini 3.1 Flash Lite Preview", + "release_date": "2026-03-03" + }, + "gemini-3.1-pro-preview": { + "id": "gemini-3.1-pro-preview", + "name": "Gemini 3.1 Pro Preview", + "release_date": "2026-02-19" + }, + "gemini-3.5-flash": { + "id": "gemini-3.5-flash", + "name": "Gemini 3.5 Flash", + "release_date": "2026-05-19" + }, + "gemma-4-31b-it": { + "id": "gemma-4-31b-it", + "name": "Gemma 4 31B IT", + "release_date": "2026-04-02" + } + }, + "name": "Google" + }, + "minimax": { + "id": "minimax", + 
"models": { + "MiniMax-M2.5": { + "id": "MiniMax-M2.5", + "name": "MiniMax-M2.5", + "release_date": "2026-02-12" + }, + "MiniMax-M2.7": { + "id": "MiniMax-M2.7", + "name": "MiniMax-M2.7", + "release_date": "2026-03-18" + } + }, + "name": "MiniMax (minimax.io)" + }, + "mistral": { + "id": "mistral", + "models": { + "mistral-large-2411": { + "id": "mistral-large-2411", + "name": "Mistral Large 2.1", + "release_date": "2024-11-01" + } + }, + "name": "Mistral" + }, + "moonshotai": { + "id": "moonshotai", + "models": { + "kimi-k2-thinking": { + "id": "kimi-k2-thinking", + "name": "Kimi K2 Thinking", + "release_date": "2025-11-06" + }, + "kimi-k2.6": { + "id": "kimi-k2.6", + "name": "Kimi K2.6", + "release_date": "2026-04-21" + } + }, + "name": "Moonshot AI" + }, + "openai": { + "id": "openai", + "models": { + "gpt-4.1": { + "id": "gpt-4.1", + "name": "GPT-4.1", + "release_date": "2025-04-14" + }, + "gpt-4o": { + "id": "gpt-4o", + "name": "GPT-4o", + "release_date": "2024-05-13" + }, + "gpt-4o-2024-05-13": { + "id": "gpt-4o-2024-05-13", + "name": "GPT-4o (2024-05-13)", + "release_date": "2024-05-13" + }, + "gpt-4o-2024-11-20": { + "id": "gpt-4o-2024-11-20", + "name": "GPT-4o (2024-11-20)", + "release_date": "2024-11-20" + }, + "gpt-4o-mini": { + "id": "gpt-4o-mini", + "name": "GPT-4o mini", + "release_date": "2024-07-18" + }, + "gpt-5": { + "id": "gpt-5", + "name": "GPT-5", + "release_date": "2025-08-07" + }, + "gpt-5-mini": { + "id": "gpt-5-mini", + "name": "GPT-5 Mini", + "release_date": "2025-08-07" + }, + "gpt-5-nano": { + "id": "gpt-5-nano", + "name": "GPT-5 Nano", + "release_date": "2025-08-07" + }, + "gpt-5.1": { + "id": "gpt-5.1", + "name": "GPT-5.1", + "release_date": "2025-11-13" + }, + "gpt-5.2": { + "id": "gpt-5.2", + "name": "GPT-5.2", + "release_date": "2025-12-11" + }, + "gpt-5.4": { + "id": "gpt-5.4", + "name": "GPT-5.4", + "release_date": "2026-03-05" + }, + "gpt-5.4-mini": { + "id": "gpt-5.4-mini", + "name": "GPT-5.4 mini", + "release_date": 
"2026-03-17" + }, + "gpt-5.4-nano": { + "id": "gpt-5.4-nano", + "name": "GPT-5.4 nano", + "release_date": "2026-03-17" + }, + "gpt-5.5": { + "id": "gpt-5.5", + "name": "GPT-5.5", + "release_date": "2026-04-23" + }, + "o3": { + "id": "o3", + "name": "o3", + "release_date": "2025-04-16" + }, + "o4-mini": { + "id": "o4-mini", + "name": "o4-mini", + "release_date": "2025-04-16" + } + }, + "name": "OpenAI" + }, + "togetherai": { + "id": "togetherai", + "models": { + "deepseek-ai/DeepSeek-R1": { + "id": "deepseek-ai/DeepSeek-R1", + "name": "DeepSeek R1", + "release_date": "2024-12-26" + }, + "deepseek-ai/DeepSeek-V3": { + "id": "deepseek-ai/DeepSeek-V3", + "name": "DeepSeek V3", + "release_date": "2025-01-20" + }, + "deepseek-ai/DeepSeek-V3-1": { + "id": "deepseek-ai/DeepSeek-V3-1", + "name": "DeepSeek V3.1", + "release_date": "2025-08-21" + }, + "meta-llama/Llama-3.3-70B-Instruct-Turbo": { + "id": "meta-llama/Llama-3.3-70B-Instruct-Turbo", + "name": "Llama 3.3 70B", + "release_date": "2024-12-06" + }, + "moonshotai/Kimi-K2.5": { + "id": "moonshotai/Kimi-K2.5", + "name": "Kimi K2.5", + "release_date": "2026-01-27" + } + }, + "name": "Together AI" + }, + "xai": { + "id": "xai", + "models": { + "grok-4.20-0309-non-reasoning": { + "id": "grok-4.20-0309-non-reasoning", + "name": "Grok 4.20 (Non-Reasoning)", + "release_date": "2026-03-09" + }, + "grok-4.20-0309-reasoning": { + "id": "grok-4.20-0309-reasoning", + "name": "Grok 4.20 (Reasoning)", + "release_date": "2026-03-09" + }, + "grok-4.3": { + "id": "grok-4.3", + "name": "Grok 4.3", + "release_date": "2026-04-17" + } + }, + "name": "xAI" + }, + "zai": { + "id": "zai", + "models": { + "glm-4.6": { + "id": "glm-4.6", + "name": "GLM-4.6", + "release_date": "2025-09-30" + }, + "glm-4.7": { + "id": "glm-4.7", + "name": "GLM-4.7", + "release_date": "2025-12-22" + }, + "glm-5": { + "id": "glm-5", + "name": "GLM-5", + "release_date": "2026-02-11" + }, + "glm-5.1": { + "id": "glm-5.1", + "name": "GLM-5.1", + "release_date": 
"2026-03-27" + } + }, + "name": "Z.AI" + } + }, + "source": "https://models.dev/api.json" +} diff --git a/utils/llm/model_registry.py b/utils/llm/model_registry.py index e28725b..5734e47 100644 --- a/utils/llm/model_registry.py +++ b/utils/llm/model_registry.py @@ -1,8 +1,53 @@ -"""Central model registry for LLM providers.""" - -from __future__ import annotations - +"""Central model registry for LLM providers. + +Adding a base model: + +1. Look up the model in Models.dev. If present, copy its exact `provider_id` & `model_id` into + `ModelsDevReference`; the checked-in snapshot is only a generated subset, not the catalog. + In Models.dev source paths, `provider_id` is the folder under `providers/`, and + `model_id` is the TOML filename stem under `models/`, e.g. + `providers/anthropic/models/claude-opus-4-8.toml` -> `anthropic` / `claude-opus-4-8`. + +2. Add the model to the provider-specific list below with the provider helper. `model_key` is our + stable key; set `provider_model_id` only when the provider API ID differs. For routed + providers like Together, set `lab_key`. + + Example: + ``` + openai_model( + model_key="gpt-5.5-2026-04-23", + models_dev_reference=ModelsDevReference( + provider_id="openai", + model_id="gpt-5.5", + ), + ) + ``` + +3. If Models.dev is missing the model or a full release date, set `manual_release_date`. + It can also be a fallback alongside `ModelsDevReference` when Models.dev has the + entry but not the date. + +4. Insert the entry where `(release_date, model_key)` stays ascending within that provider + list. Use `active=False` only for historical routes that should stay registered but + leave `ACTIVE_MODEL_RUNS`. + +5. Add benchmark call configs in `model_runs.py` with explicit `model_run_key` values. 
+ +After changing `ModelsDevReference` values, refresh the Models.dev snapshot from the repo's root +directory: +``` +python - <<'PY' +from scripts.refresh_models_dev_metadata import write_models_dev_snapshot + +write_models_dev_snapshot() +PY +``` +Incorrect exact references fail with nearby Models.dev suggestions. +""" + +from collections.abc import Sequence from dataclasses import dataclass +from datetime import date from functools import lru_cache from typing import Any, Final, Type @@ -17,6 +62,7 @@ XAI_API_KEY_SECRET_NAME, ) from .lab_registry import LABS, Lab +from .metadata.models_dev import ModelsDevModel, load_models_dev_snapshot from .provider_registry import PROVIDERS, Provider from .providers.anthropic import AnthropicProvider from .providers.base import BaseLLMProvider @@ -37,10 +83,6 @@ PROVIDERS["Together"]: TogetherProvider, } -_PROVIDER_CLASS_TO_PROVIDER: dict[Type[BaseLLMProvider], Provider] = { - provider_cls: provider for provider, provider_cls in _PROVIDER_TO_CLASS.items() -} - # Mapping from provider classes to GCP secret names _PROVIDER_CLASS_TO_SECRET_NAME: dict[Type[BaseLLMProvider], str] = { OpenAIProvider: OPENAI_API_KEY_SECRET_NAME, @@ -51,16 +93,80 @@ } +@dataclass(frozen=True, slots=True) +class ModelsDevReference: + """Reference to an underlying model entry in Models.dev.""" + + provider_id: str + model_id: str + + @dataclass(frozen=True, slots=True) class Model: - """Registered LLM model metadata.""" + """Canonical LLM model metadata.""" - id: str - full_name: str - token_limit: int - provider_cls: Type[BaseLLMProvider] + model_key: str + provider_model_id: str lab: Lab - reasoning_model: bool = False + provider: Provider + models_dev_reference: ModelsDevReference | None = None + manual_release_date: date | None = None + active: bool = True + + def __post_init__(self) -> None: + """Validate the model declaration against configured metadata.""" + if self.models_dev_reference is None: + if self.manual_release_date is None: + raise 
ValueError(f"Model {self.model_key} is missing release date source") + return + + try: + metadata = self.models_dev_metadata + except KeyError as exc: + reference = self.models_dev_reference + raise ValueError( + f"Model {self.model_key} has invalid Models.dev reference " + f"{reference.provider_id}/{reference.model_id}: {exc}" + ) from exc + + if metadata.release_date is not None: + return + if self.manual_release_date is None: + raise ValueError(f"Model {self.model_key} is missing release date metadata") + + @property + def models_dev_provider_id(self) -> str | None: + """Return the Models.dev provider ID for compatibility and debugging.""" + if self.models_dev_reference is None: + return None + return self.models_dev_reference.provider_id + + @property + def models_dev_model_id(self) -> str | None: + """Return the Models.dev model ID for compatibility and debugging.""" + if self.models_dev_reference is None: + return None + return self.models_dev_reference.model_id + + @property + def models_dev_metadata(self) -> ModelsDevModel | None: + """Return Models.dev metadata for this model when a lookup is configured.""" + if self.models_dev_reference is None: + return None + return load_models_dev_snapshot().get_model( + provider_id=self.models_dev_reference.provider_id, + model_id=self.models_dev_reference.model_id, + ) + + @property + def release_date(self) -> date: + """Return this model's release date from Models.dev or a manual fallback.""" + metadata = self.models_dev_metadata + if metadata is not None and metadata.release_date is not None: + return metadata.release_date + if self.manual_release_date is not None: + return self.manual_release_date + raise ValueError(f"Model {self.model_key} is missing release date metadata") def get_response( self, @@ -68,15 +174,137 @@ def get_response( options: dict[str, Any] | None = None, ) -> str: """Request a response from the model's provider.""" - provider = _PROVIDER_CLASS_TO_PROVIDER[self.provider_cls] return 
get_response( - provider, - self.full_name, + self.provider, + self.provider_model_id, prompt=prompt, options=options, ) +def provider_model( + *, + model_key: str, + lab_key: str, + provider_key: str, + provider_model_id: str | None = None, + models_dev_reference: ModelsDevReference | None = None, + manual_release_date: date | None = None, + active: bool = True, +) -> Model: + """Create a model declaration for a provider route.""" + return Model( + model_key=model_key, + provider_model_id=provider_model_id or model_key, + lab=LABS[lab_key], + provider=PROVIDERS[provider_key], + models_dev_reference=models_dev_reference, + manual_release_date=manual_release_date, + active=active, + ) + + +def openai_model( + *, + model_key: str, + provider_model_id: str | None = None, + models_dev_reference: ModelsDevReference | None = None, + manual_release_date: date | None = None, + active: bool = True, +) -> Model: + """Create an OpenAI model declaration.""" + return provider_model( + model_key=model_key, + provider_model_id=provider_model_id, + lab_key="OpenAI", + provider_key="OpenAI", + models_dev_reference=models_dev_reference, + manual_release_date=manual_release_date, + active=active, + ) + + +def anthropic_model( + *, + model_key: str, + provider_model_id: str | None = None, + models_dev_reference: ModelsDevReference | None = None, + manual_release_date: date | None = None, + active: bool = True, +) -> Model: + """Create an Anthropic model declaration.""" + return provider_model( + model_key=model_key, + provider_model_id=provider_model_id, + lab_key="Anthropic", + provider_key="Anthropic", + models_dev_reference=models_dev_reference, + manual_release_date=manual_release_date, + active=active, + ) + + +def xai_model( + *, + model_key: str, + provider_model_id: str | None = None, + models_dev_reference: ModelsDevReference | None = None, + manual_release_date: date | None = None, + active: bool = True, +) -> Model: + """Create an xAI model declaration.""" + return 
provider_model( + model_key=model_key, + provider_model_id=provider_model_id, + lab_key="xAI", + provider_key="xAI", + models_dev_reference=models_dev_reference, + manual_release_date=manual_release_date, + active=active, + ) + + +def google_model( + *, + model_key: str, + provider_model_id: str | None = None, + models_dev_reference: ModelsDevReference | None = None, + manual_release_date: date | None = None, + active: bool = True, +) -> Model: + """Create a Google model declaration.""" + return provider_model( + model_key=model_key, + provider_model_id=provider_model_id, + lab_key="Google DeepMind", + provider_key="Google", + models_dev_reference=models_dev_reference, + manual_release_date=manual_release_date, + active=active, + ) + + +def together_model( + *, + model_key: str, + lab_key: str, + provider_model_id: str | None = None, + models_dev_reference: ModelsDevReference | None = None, + manual_release_date: date | None = None, + active: bool = True, +) -> Model: + """Create a Together-routed model declaration.""" + return provider_model( + model_key=model_key, + provider_model_id=provider_model_id, + lab_key=lab_key, + provider_key="Together", + models_dev_reference=models_dev_reference, + manual_release_date=manual_release_date, + active=active, + ) + + def _get_api_key_for_provider(provider_cls: Type[BaseLLMProvider]) -> str | None: """Look up API key for a provider from the registry configuration. 
@@ -135,7 +363,7 @@ def configure_api_keys( try: api_key = get_secret(secret_name) _PROVIDER_API_KEYS[provider_cls] = api_key - except (RuntimeError, exceptions.NotFound): + except RuntimeError, exceptions.NotFound: # GCP not configured or secret doesn't exist, skip this provider pass @@ -204,204 +432,542 @@ def validate_provider_keys(providers: list[Provider]) -> None: ) -MODELS: Final[list[Model]] = [ - Model( - id="gpt-4.1-mini", - full_name="gpt-4.1-mini", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - ), - Model( - id="gpt-4o-mini", - full_name="gpt-4o-mini", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - ), - Model( - id="gpt-5-2025-08-07", - full_name="gpt-5-2025-08-07", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - reasoning_model=True, - ), - Model( - id="gpt-5-mini-2025-08-07", - full_name="gpt-5-mini-2025-08-07", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - reasoning_model=True, - ), - Model( - id="gpt-5-nano-2025-08-07", - full_name="gpt-5-nano-2025-08-07", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - reasoning_model=True, - ), - Model( - id="gpt-5.1-2025-11-13", - full_name="gpt-5.1-2025-11-13", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - reasoning_model=True, - ), - Model( - id="gpt-5.2-2025-12-11", - full_name="gpt-5.2-2025-12-11", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - reasoning_model=True, - ), - Model( - id="o3-2025-04-16", - full_name="o3-2025-04-16", - token_limit=200_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - reasoning_model=True, - ), - Model( - id="gpt-4.1-2025-04-14", - full_name="gpt-4.1-2025-04-14", - token_limit=128_000, - provider_cls=OpenAIProvider, - lab=LABS["OpenAI"], - ), - Model( - id="DeepSeek-V3.1", - full_name="deepseek-ai/DeepSeek-V3.1", - token_limit=128_000, - 
provider_cls=TogetherProvider, - lab=LABS["DeepSeek"], - ), - Model( - id="Qwen3-235B-A22B-Thinking-2507", - full_name="Qwen/Qwen3-235B-A22B-Thinking-2507", - token_limit=262_144, - provider_cls=TogetherProvider, - lab=LABS["Qwen"], - ), - Model( - id="GLM-4.5-Air-FP8", - full_name="zai-org/GLM-4.5-Air-FP8", - token_limit=131_072, - provider_cls=TogetherProvider, - lab=LABS["Z.ai"], - ), - Model( - id="GLM-4.6", - full_name="zai-org/GLM-4.6", - token_limit=202_752, - provider_cls=TogetherProvider, - lab=LABS["Z.ai"], - reasoning_model=False, - ), - Model( - id="claude-sonnet-4-5-20250929", - full_name="claude-sonnet-4-5-20250929", - token_limit=200_000, - provider_cls=AnthropicProvider, - lab=LABS["Anthropic"], - ), - Model( - id="claude-haiku-4-5-20251001", - full_name="claude-haiku-4-5-20251001", - token_limit=200_000, - provider_cls=AnthropicProvider, - lab=LABS["Anthropic"], - ), - Model( - id="claude-opus-4-1-20250805", - full_name="claude-opus-4-1-20250805", - token_limit=200_000, - provider_cls=AnthropicProvider, - lab=LABS["Anthropic"], - ), - Model( - id="claude-opus-4-5-20251101", - full_name="claude-opus-4-5-20251101", - token_limit=200_000, - provider_cls=AnthropicProvider, - lab=LABS["Anthropic"], - ), - Model( - id="claude-sonnet-4-6", - full_name="claude-sonnet-4-6", - token_limit=200_000, - provider_cls=AnthropicProvider, - lab=LABS["Anthropic"], - ), - Model( - id="claude-sonnet-4-20250514", - full_name="claude-sonnet-4-20250514", - token_limit=200_000, - provider_cls=AnthropicProvider, - lab=LABS["Anthropic"], - ), - Model( - id="grok-4-fast-reasoning", - full_name="grok-4-fast-reasoning", - token_limit=2_000_000, - provider_cls=XAIProvider, - lab=LABS["xAI"], - ), - Model( - id="grok-4-fast-non-reasoning", - full_name="grok-4-fast-non-reasoning", - token_limit=2_000_000, - provider_cls=XAIProvider, - lab=LABS["xAI"], - ), - Model( - id="grok-4-0709", - full_name="grok-4-0709", - token_limit=256_000, - provider_cls=XAIProvider, - lab=LABS["xAI"], 
- ), - Model( - id="grok-4-1-fast-reasoning", - full_name="grok-4-1-fast-reasoning", - token_limit=2_000_000, - provider_cls=XAIProvider, - lab=LABS["xAI"], - reasoning_model=True, - ), - Model( - id="grok-4-1-fast-non-reasoning", - full_name="grok-4-1-fast-non-reasoning", - token_limit=2_000_000, - provider_cls=XAIProvider, - lab=LABS["xAI"], - reasoning_model=False, - ), - Model( - id="gemini-2.5-pro", - full_name="gemini-2.5-pro", - token_limit=1_048_576, - provider_cls=GoogleProvider, - lab=LABS["Google DeepMind"], - ), - Model( - id="gemini-2.5-flash", - full_name="models/gemini-2.5-flash", - token_limit=1_048_576, - provider_cls=GoogleProvider, - lab=LABS["Google DeepMind"], - ), - Model( - id="gemini-3-pro-preview", - full_name="gemini-3-pro-preview", - token_limit=1_048_576, - provider_cls=GoogleProvider, - lab=LABS["Google DeepMind"], - reasoning_model=False, +# OpenAI models: https://developers.openai.com/api/docs/models +OPENAI_MODELS: Final[list[Model]] = [ + openai_model( + model_key="gpt-4-0613", + manual_release_date=date(2023, 6, 13), + ), + openai_model( + model_key="gpt-3.5-turbo-0125", + manual_release_date=date(2024, 1, 25), + ), + openai_model( + model_key="gpt-4-turbo-2024-04-09", + manual_release_date=date(2024, 4, 9), + ), + openai_model( + model_key="gpt-4o", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-4o"), + ), + openai_model( + model_key="gpt-4o-2024-05-13", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-4o-2024-05-13"), + ), + openai_model( + model_key="gpt-4o-mini-2024-07-18", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-4o-mini"), + ), + openai_model( + model_key="gpt-4o-2024-11-20", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-4o-2024-11-20"), + ), + openai_model( + model_key="o3-mini-2025-01-31", + manual_release_date=date(2025, 1, 31), + ), + openai_model( + model_key="gpt-4.5-preview-2025-02-27", + 
manual_release_date=date(2025, 2, 27), + ), + openai_model( + model_key="gpt-4.1-2025-04-14", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-4.1"), + ), + openai_model( + model_key="o3-2025-04-16", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="o3"), + ), + openai_model( + model_key="o4-mini-2025-04-16", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="o4-mini"), + ), + openai_model( + model_key="gpt-5-2025-08-07", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5"), + ), + openai_model( + model_key="gpt-5-mini-2025-08-07", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5-mini"), + ), + openai_model( + model_key="gpt-5-nano-2025-08-07", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5-nano"), + ), + openai_model( + model_key="gpt-5.1-2025-11-13", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5.1"), + ), + openai_model( + model_key="gpt-5.2-2025-12-11", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5.2"), + ), + openai_model( + model_key="gpt-5.4-2026-03-05", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5.4"), + ), + openai_model( + model_key="gpt-5.4-mini-2026-03-17", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5.4-mini"), + ), + openai_model( + model_key="gpt-5.4-nano-2026-03-17", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5.4-nano"), + ), + openai_model( + model_key="gpt-5.5-2026-04-23", + models_dev_reference=ModelsDevReference(provider_id="openai", model_id="gpt-5.5"), + ), +] + +# Together models: https://docs.together.ai/docs/serverless-models +TOGETHER_MODELS: Final[list[Model]] = [ + together_model( + model_key="llama-2-70b-chat-hf", + lab_key="Meta", + manual_release_date=date(2023, 7, 18), + ), + together_model( + 
model_key="mixtral-8x7b-instruct-v0.1", + lab_key="Mistral AI", + manual_release_date=date(2023, 12, 11), + ), + together_model( + model_key="mistral-large-latest", + lab_key="Mistral AI", + manual_release_date=date(2024, 2, 26), + ), + together_model( + model_key="mixtral-8x22b-instruct-v0.1", + lab_key="Mistral AI", + manual_release_date=date(2024, 4, 17), + ), + together_model( + model_key="llama-3-70b-chat-hf", + lab_key="Meta", + manual_release_date=date(2024, 4, 18), + ), + together_model( + model_key="llama-3-8b-chat-hf", + lab_key="Meta", + manual_release_date=date(2024, 4, 18), + ), + together_model( + model_key="qwen1.5-110b-chat", + lab_key="Qwen", + manual_release_date=date(2024, 4, 25), + ), + together_model( + model_key="meta-llama-3.1-405b-instruct-turbo", + lab_key="Meta", + manual_release_date=date(2024, 7, 23), + ), + together_model( + model_key="mistral-large-2407", + lab_key="Mistral AI", + manual_release_date=date(2024, 7, 24), + ), + together_model( + model_key="qwen2.5-72b-instruct-turbo", + lab_key="Qwen", + manual_release_date=date(2024, 9, 19), + ), + together_model( + model_key="llama-3.2-3b-instruct-turbo", + lab_key="Meta", + manual_release_date=date(2024, 9, 25), + ), + together_model( + model_key="mistral-large-2411", + lab_key="Mistral AI", + models_dev_reference=ModelsDevReference( + provider_id="mistral", model_id="mistral-large-2411" + ), + ), + together_model( + model_key="qwq-32b-preview", + lab_key="Qwen", + manual_release_date=date(2024, 11, 28), + ), + together_model( + model_key="llama-3.3-70b-instruct-turbo", + lab_key="Meta", + models_dev_reference=ModelsDevReference( + provider_id="togetherai", model_id="meta-llama/Llama-3.3-70B-Instruct-Turbo" + ), + ), + together_model( + model_key="deepseek-r1", + lab_key="DeepSeek", + models_dev_reference=ModelsDevReference( + provider_id="togetherai", model_id="deepseek-ai/DeepSeek-R1" + ), + ), + together_model( + model_key="deepseek-v3", + lab_key="DeepSeek", + 
models_dev_reference=ModelsDevReference( + provider_id="togetherai", model_id="deepseek-ai/DeepSeek-V3" + ), + ), + together_model( + model_key="llama-4-maverick-17b-128e-instruct-fp8", + lab_key="Meta", + manual_release_date=date(2025, 4, 5), + ), + together_model( + model_key="llama-4-scout-17b-16e-instruct", + lab_key="Meta", + manual_release_date=date(2025, 4, 5), + ), + together_model( + model_key="qwen3-235b-a22b-fp8-tput", + lab_key="Qwen", + manual_release_date=date(2025, 4, 29), + ), + together_model( + model_key="magistral-medium-2506", + lab_key="Mistral AI", + manual_release_date=date(2025, 5, 28), + ), + together_model( + model_key="kimi-k2-instruct", + lab_key="Moonshot", + manual_release_date=date(2025, 7, 12), + ), + together_model( + model_key="qwen3-235b-a22b-thinking-2507", + lab_key="Qwen", + manual_release_date=date(2025, 7, 25), + ), + together_model( + model_key="glm-4.5-air-fp8", + lab_key="Z.ai", + manual_release_date=date(2025, 7, 28), + ), + together_model( + model_key="deepseek-v3.1", + provider_model_id="deepseek-ai/DeepSeek-V3.1", + lab_key="DeepSeek", + active=False, + models_dev_reference=ModelsDevReference( + provider_id="togetherai", model_id="deepseek-ai/DeepSeek-V3-1" + ), + ), + together_model( + model_key="kimi-k2-instruct-0905", + lab_key="Moonshot", + manual_release_date=date(2025, 9, 5), + ), + together_model( + model_key="glm-4.6", + lab_key="Z.ai", + models_dev_reference=ModelsDevReference(provider_id="zai", model_id="glm-4.6"), + ), + together_model( + model_key="kimi-k2-thinking", + lab_key="Moonshot", + models_dev_reference=ModelsDevReference( + provider_id="moonshotai", model_id="kimi-k2-thinking" + ), + ), + together_model( + model_key="glm-4.7", + lab_key="Z.ai", + models_dev_reference=ModelsDevReference(provider_id="zai", model_id="glm-4.7"), + ), + together_model( + model_key="kimi-k2.5", + provider_model_id="moonshotai/Kimi-K2.5", + lab_key="Moonshot", + models_dev_reference=ModelsDevReference( + 
provider_id="togetherai", model_id="moonshotai/Kimi-K2.5" + ), + ), + together_model( + model_key="glm-5", + lab_key="Z.ai", + models_dev_reference=ModelsDevReference(provider_id="zai", model_id="glm-5"), + ), + together_model( + model_key="minimax-m2.5", + provider_model_id="MiniMaxAI/MiniMax-M2.5", + lab_key="MiniMax", + models_dev_reference=ModelsDevReference(provider_id="minimax", model_id="MiniMax-M2.5"), + ), + together_model( + model_key="minimax-m2.7", + provider_model_id="MiniMaxAI/MiniMax-M2.7", + lab_key="MiniMax", + models_dev_reference=ModelsDevReference(provider_id="minimax", model_id="MiniMax-M2.7"), + ), + together_model( + model_key="glm-5.1", + provider_model_id="zai-org/GLM-5.1", + lab_key="Z.ai", + models_dev_reference=ModelsDevReference(provider_id="zai", model_id="glm-5.1"), + ), + together_model( + model_key="gemma-4-31b", + provider_model_id="google/gemma-4-31B-it", + lab_key="Google DeepMind", + models_dev_reference=ModelsDevReference(provider_id="google", model_id="gemma-4-31b-it"), + ), + together_model( + model_key="kimi-k2.6", + provider_model_id="moonshotai/Kimi-K2.6", + lab_key="Moonshot", + models_dev_reference=ModelsDevReference(provider_id="moonshotai", model_id="kimi-k2.6"), + ), + together_model( + model_key="deepseek-v4-pro", + provider_model_id="deepseek-ai/DeepSeek-V4-Pro", + lab_key="DeepSeek", + models_dev_reference=ModelsDevReference(provider_id="deepseek", model_id="deepseek-v4-pro"), + ), +] + +# Anthropic models: https://platform.claude.com/docs/en/about-claude/models/overview +ANTHROPIC_MODELS: Final[list[Model]] = [ + anthropic_model( + model_key="claude-2.1", + manual_release_date=date(2023, 11, 21), + ), + anthropic_model( + model_key="claude-3-opus-20240229", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-3-opus-20240229" + ), + ), + anthropic_model( + model_key="claude-3-haiku-20240307", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", 
model_id="claude-3-haiku-20240307" + ), + ), + anthropic_model( + model_key="claude-3-5-sonnet-20240620", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-3-5-sonnet-20240620" + ), + ), + anthropic_model( + model_key="claude-3-5-sonnet-20241022", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-3-5-sonnet-20241022" + ), + ), + anthropic_model( + model_key="claude-3-7-sonnet-20250219", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-3-7-sonnet-20250219" + ), + ), + anthropic_model( + model_key="claude-opus-4-20250514", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-opus-4-20250514" + ), + ), + anthropic_model( + model_key="claude-sonnet-4-20250514", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-sonnet-4-20250514" + ), + ), + anthropic_model( + model_key="claude-opus-4-1-20250805", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-opus-4-1-20250805" + ), + ), + anthropic_model( + model_key="claude-sonnet-4-5-20250929", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-sonnet-4-5-20250929" + ), + ), + anthropic_model( + model_key="claude-haiku-4-5-20251001", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-haiku-4-5-20251001" + ), + ), + anthropic_model( + model_key="claude-opus-4-5-20251101", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-opus-4-5-20251101" + ), + ), + anthropic_model( + model_key="claude-opus-4-6", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-opus-4-6" + ), + ), + anthropic_model( + model_key="claude-sonnet-4-6", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-sonnet-4-6" + ), + ), + anthropic_model( + 
model_key="claude-opus-4-7", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", model_id="claude-opus-4-7" + ), + ), + anthropic_model( + model_key="claude-opus-4-8", + models_dev_reference=ModelsDevReference( + provider_id="anthropic", + model_id="claude-opus-4-8", + ), + ), +] + +# xAI models: https://console.x.ai/ -> API Models +XAI_MODELS: Final[list[Model]] = [ + xai_model( + model_key="grok-beta", + manual_release_date=date(2024, 11, 4), + ), + xai_model( + model_key="grok-4-0709", + manual_release_date=date(2025, 7, 9), + ), + xai_model( + model_key="grok-4-fast-non-reasoning", + manual_release_date=date(2025, 9, 19), + ), + xai_model( + model_key="grok-4-fast-reasoning", + manual_release_date=date(2025, 9, 19), + ), + xai_model( + model_key="grok-4-1-fast-non-reasoning", + manual_release_date=date(2025, 11, 17), + ), + xai_model( + model_key="grok-4-1-fast-reasoning", + manual_release_date=date(2025, 11, 17), + ), + xai_model( + model_key="grok-4.20-0309-non-reasoning", + models_dev_reference=ModelsDevReference( + provider_id="xai", model_id="grok-4.20-0309-non-reasoning" + ), + ), + xai_model( + model_key="grok-4.20-0309-reasoning", + models_dev_reference=ModelsDevReference( + provider_id="xai", model_id="grok-4.20-0309-reasoning" + ), + ), + xai_model( + model_key="grok-4.3", + models_dev_reference=ModelsDevReference(provider_id="xai", model_id="grok-4.3"), + ), +] + +# Google models: https://ai.google.dev/gemini-api/docs/models +GOOGLE_MODELS: Final[list[Model]] = [ + google_model( + model_key="gemini-1.5-flash", + manual_release_date=date(2024, 5, 1), + ), + google_model( + model_key="gemini-1.5-pro", + manual_release_date=date(2024, 5, 1), + ), + google_model( + model_key="gemini-2.0-flash-lite-001", + manual_release_date=date(2025, 2, 5), + ), + google_model( + model_key="gemini-2.5-flash", + models_dev_reference=ModelsDevReference(provider_id="google", model_id="gemini-2.5-flash"), + ), + google_model( + model_key="gemini-2.5-pro", 
def _validate_unique_model_keys(models: Sequence[Model]) -> None:
    """Ensure no two registry entries share a ``model_key``.

    Args:
        models: Registry entries to check, in declaration order.

    Raises:
        ValueError: At the first entry whose ``model_key`` already appeared
            earlier in ``models``.
    """
    registered_keys: set[str] = set()
    for entry in models:
        key = entry.model_key
        if key not in registered_keys:
            registered_keys.add(key)
            continue
        raise ValueError(f"Duplicate LLM model_key: {key}")


def create_models_list(models: Sequence[Model]) -> list[Model]:
    """Build a model registry list from ``models``.

    The input is checked for duplicate keys first; the returned list is a new
    list object holding the same entries in the same order.

    Raises:
        ValueError: If two entries share a ``model_key``.
    """
    _validate_unique_model_keys(models)
    validated_models = list(models)
    return validated_models
*ANTHROPIC_MODELS, + *XAI_MODELS, + *GOOGLE_MODELS, + ] +) +MODELS_BY_KEY: Final[dict[str, Model]] = {model.model_key: model for model in MODELS} + + +def model_release_dates_by_key() -> dict[str, date]: + """Return release dates keyed by canonical model_key.""" + return {model.model_key: model.release_date for model in MODELS} diff --git a/utils/llm/model_runs.py b/utils/llm/model_runs.py new file mode 100644 index 0000000..ce9af43 --- /dev/null +++ b/utils/llm/model_runs.py @@ -0,0 +1,637 @@ +"""Shared LLM model-run registry. + +``model_run_key`` is handwritten at each declaration site and is the stable +identifier used by benchmark files. ``build_model_run_key`` remains a helper for +checking naming conventions and adding option-name rules, but it must not be +used to silently derive a run key. +""" + +import logging +from collections.abc import Iterable, Sequence +from copy import deepcopy +from dataclasses import dataclass, field +from datetime import date +from typing import Any + +from .artificial_analysis_model_runs import create_artificial_analysis_model_runs +from .lab_registry import Lab +from .metadata.artificial_analysis import load_artificial_analysis_snapshot +from .model_registry import MODELS_BY_KEY, Model +from .provider_registry import Provider + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class ModelRun: + """Concrete LLM run with provider options.""" + + model_run_key: str + model: Model + options: dict[str, Any] = field(default_factory=dict) + artificial_analysis_id: str | None = None + + def __post_init__(self) -> None: + """Validate model-run metadata.""" + _validate_model_run_key(self.model_run_key) + build_model_run_key(self.model.model_key, self.options) + + if self.artificial_analysis_id is None: + return + + try: + load_artificial_analysis_snapshot().get_model(self.artificial_analysis_id) + except KeyError as exc: + raise ValueError( + "Artificial Analysis model runs must reference a valid " + 
f"artificial_analysis_id: {self.artificial_analysis_id}" + ) from exc + + @property + def name(self) -> str: + """Return the model-run key for compatibility with benchmark code.""" + return self.model_run_key + + @property + def display_name(self) -> str: + """Return the display name for leaderboards and reports.""" + if self.artificial_analysis_id is None: + return self.model_run_key + + return load_artificial_analysis_snapshot().get_model(self.artificial_analysis_id).name + + @property + def id(self) -> str: + """Return the model-run identifier.""" + return self.model_run_key + + @property + def model_key(self) -> str: + """Return the canonical base model key.""" + return self.model.model_key + + @property + def provider_model_id(self) -> str: + """Return the provider API model identifier.""" + return self.model.provider_model_id + + @property + def lab(self) -> Lab: + """Return the model-making lab.""" + return self.model.lab + + @property + def provider(self) -> Provider: + """Return the API provider route.""" + return self.model.provider + + @property + def model_organization(self) -> str: + """Return the model lab display name.""" + return self.model.lab.leaderboard_name + + @property + def release_date(self) -> date: + """Return the underlying model release date.""" + return self.model.release_date + + def __repr__(self) -> str: + """Return a concise model-run representation.""" + if self.options: + return f"" + return f"" + + def get_response(self, prompt: str, **kwargs: Any) -> str: + """Request a response from the configured provider and model.""" + from utils.llm.model_registry import get_response + + merged_options = {**self.options, **kwargs} + logger.info( + "Requesting LLM response provider=%s provider_model_id=%s options=%s", + self.provider.name, + self.provider_model_id, + merged_options, + ) + return get_response( + provider=self.provider, + model_id=self.provider_model_id, + prompt=prompt, + options=merged_options, + ) + + 
# Option paths that may appear in ModelRun options without contributing a
# suffix to the suggested model-run key.
NAME_NEUTRAL_OPTION_PATHS = {
    ("temperature",),
    ("candidate_count",),
    ("automatic_function_calling",),
}


def _iter_leaf_paths(value: Any, prefix: tuple[str, ...] = ()) -> Iterable[tuple[str, ...]]:
    """Yield leaf paths for nested option data.

    Non-empty dicts are descended key by key. Every other value, lists
    included, is a leaf reported at ``prefix``. An empty dict nested under a
    key is reported as a leaf too, so an unrecognised option such as
    ``{"unknown": {}}`` cannot slip past the unknown-option check. The
    top-level empty options dict yields nothing.
    """
    if isinstance(value, dict):
        if value:
            for key, nested in value.items():
                yield from _iter_leaf_paths(nested, (*prefix, str(key)))
        elif prefix:
            # Empty nested dict: still an option the caller supplied.
            yield prefix
        return
    yield prefix


def _is_path_covered(path: tuple[str, ...], covered_prefixes: set[tuple[str, ...]]) -> bool:
    """Return whether path is covered by a consumed or neutral prefix."""
    return any(path[: len(prefix)] == prefix for prefix in covered_prefixes)


def _thinking_suffixes(
    options: dict[str, Any],
) -> tuple[list[str], set[tuple[str, ...]]]:
    """Return suffixes and consumed paths for model thinking options.

    Raises:
        ValueError: If ``thinking`` is a dict whose ``type`` is not
            ``"adaptive"``.
    """
    thinking = options.get("thinking")
    if not isinstance(thinking, dict):
        # Absent or non-dict values are not consumed here; a present non-dict
        # value is later reported as an unknown path by build_model_run_key.
        return [], set()
    if thinking.get("type") == "adaptive":
        return ["adaptive-thinking"], {("thinking",)}
    raise ValueError(f"Unsupported thinking option for model-run naming: {thinking}")


def _effort_suffixes(options: dict[str, Any]) -> tuple[list[str], set[tuple[str, ...]]]:
    """Return suffixes and consumed paths for effort options.

    ``reasoning.effort`` is inspected before ``output_config.effort``; each one
    found contributes a lowercase, hyphenated suffix and consumes its whole
    parent option.
    """
    suffixes: list[str] = []
    consumed: set[tuple[str, ...]] = set()

    for container_key in ("reasoning", "output_config"):
        container = options.get(container_key)
        if isinstance(container, dict) and "effort" in container:
            suffixes.append(str(container["effort"]).replace("_", "-").lower())
            consumed.add((container_key,))

    return suffixes, consumed


def _tool_suffixes(options: dict[str, Any]) -> tuple[list[str], set[tuple[str, ...]]]:
    """Return suffixes and consumed paths for tool options.

    Each distinct tool kind contributes one suffix, ordered web search before
    X search regardless of declaration order.

    Raises:
        ValueError: If ``tools`` is truthy but not a list, or contains an
            entry that is not a recognised tool dict.
    """
    tools = options.get("tools")
    if not tools:
        # Nothing is consumed for a missing or falsy value, so a present but
        # empty ``tools`` entry is left to the unknown-path check.
        return [], set()
    if not isinstance(tools, list):
        raise ValueError("tools option must be a list for model-run naming")

    suffixes: list[str] = []
    for tool in tools:
        if not isinstance(tool, dict):
            raise ValueError(f"Unsupported tool option for model-run naming: {tool}")
        tool_type = tool.get("type")
        if tool_type in {"web_search", "web_search_20260209"} or "googleSearch" in tool:
            suffix = "web-search"
        elif tool_type == "x_search":
            suffix = "x-search"
        else:
            raise ValueError(f"Unsupported tool option for model-run naming: {tool}")
        if suffix not in suffixes:
            suffixes.append(suffix)

    tool_order = {"web-search": 0, "x-search": 1}
    return sorted(suffixes, key=tool_order.__getitem__), {("tools",)}


def _token_suffixes(options: dict[str, Any]) -> tuple[list[str], set[tuple[str, ...]]]:
    """Return suffixes and consumed paths for token cap options."""
    suffixes: list[str] = []
    consumed: set[tuple[str, ...]] = set()

    for key in ("max_tokens", "max_output_tokens"):
        if key in options:
            suffixes.append(str(options[key]))
            consumed.add((key,))

    return suffixes, consumed


# Applied in order; the order fixes the order of suffixes in the built key.
NAME_COMPONENT_RULES = (
    _thinking_suffixes,
    _effort_suffixes,
    _tool_suffixes,
    _token_suffixes,
)


def build_model_run_key(model_key: str, options: dict[str, Any]) -> str:
    """Build a suggested model-run key from a base model key and options.

    Args:
        model_key: Canonical base model key used as the key prefix.
        options: Provider options of the run.

    Returns:
        ``model_key`` followed by the hyphen-joined suffixes produced by
        ``NAME_COMPONENT_RULES``, or ``model_key`` alone when no rule
        contributes a suffix.

    Raises:
        ValueError: If a rule rejects its option, or if any option path is
            neither consumed by a rule nor listed as name-neutral.
    """
    suffixes: list[str] = []
    consumed_prefixes = set(NAME_NEUTRAL_OPTION_PATHS)

    for rule in NAME_COMPONENT_RULES:
        rule_suffixes, rule_consumed = rule(options)
        suffixes.extend(rule_suffixes)
        consumed_prefixes.update(rule_consumed)

    unknown_paths = sorted(
        path for path in _iter_leaf_paths(options) if not _is_path_covered(path, consumed_prefixes)
    )
    if unknown_paths:
        raise ValueError(
            "ModelRun options must be name-relevant or name-neutral. "
            f"Unknown option paths: {unknown_paths}"
        )

    if suffixes:
        return "-".join([model_key, *suffixes])
    return model_key


def _validate_model_run_key(model_run_key: str) -> None:
    """Reject model-run keys that are unsafe for downstream filenames.

    Raises:
        TypeError: If the key is not a string.
        ValueError: If the key is empty, contains uppercase characters, or
            contains whitespace, a path separator (``/`` or ``\\``) or an
            underscore.
    """
    if not isinstance(model_run_key, str):
        raise TypeError("ModelRun model_run_key must be a string")
    if not model_run_key:
        raise ValueError("ModelRun model_run_key must be non-empty")
    if model_run_key != model_run_key.lower():
        raise ValueError(f"ModelRun model_run_key must be lowercase: {model_run_key}")
    # Any whitespace (not only the space character) and both path separators
    # make a key unusable as a filename component.
    if any(char.isspace() or char in "/\\_" for char in model_run_key):
        raise ValueError(f"ModelRun model_run_key is not filename-safe: {model_run_key}")


def _model_run(
    *,
    model_run_key: str,
    model_key: str,
    options: dict[str, Any] | None = None,
    artificial_analysis_id: str | None = None,
) -> ModelRun:
    """Create a model run from a canonical model key.

    The options are deep-copied so the created run never shares nested option
    data with the dict passed by the caller.

    Raises:
        KeyError: If ``model_key`` is not present in ``MODELS_BY_KEY``.
    """
    return ModelRun(
        model_run_key=model_run_key,
        model=MODELS_BY_KEY[model_key],
        options=deepcopy(options) if options is not None else {},
        artificial_analysis_id=artificial_analysis_id,
    )


def _validate_unique_model_run_keys(runs: Sequence[ModelRun]) -> None:
    """Reject duplicate model-run keys in a model-run registry list.

    Raises:
        ValueError: At the first run whose key already appeared earlier.
    """
    seen_model_run_keys: set[str] = set()
    for run in runs:
        if run.model_run_key in seen_model_run_keys:
            raise ValueError(f"Duplicate LLM model_run_key: {run.model_run_key}")
        seen_model_run_keys.add(run.model_run_key)


def create_model_runs_list(runs: Sequence[ModelRun]) -> list[ModelRun]:
    """Create a validated model-run registry list.

    Raises:
        ValueError: If two runs share a ``model_run_key``.
    """
    _validate_unique_model_run_keys(runs)
    return list(runs)
+ *ARTIFICIAL_ANALYSIS_MODEL_RUNS, + _model_run( + model_run_key="gpt-4o-mini-2024-07-18", + model_key="gpt-4o-mini-2024-07-18", + options={"temperature": 0}, + ), + _model_run( + model_run_key="o3-mini-2025-01-31", + model_key="o3-mini-2025-01-31", + ), + _model_run( + model_run_key="gpt-5-nano-2025-08-07", + model_key="gpt-5-nano-2025-08-07", + ), + _model_run( + model_run_key="gpt-5-mini-2025-08-07", + model_key="gpt-5-mini-2025-08-07", + ), + _model_run( + model_run_key="gpt-5-mini-2025-08-07-1024", + model_key="gpt-5-mini-2025-08-07", + options={"max_output_tokens": 1024}, + ), + _model_run( + model_run_key="gpt-5.2-2025-12-11", + model_key="gpt-5.2-2025-12-11", + ), + _model_run( + model_run_key="gpt-5.4-2026-03-05", + model_key="gpt-5.4-2026-03-05", + ), + _model_run( + model_run_key="gpt-5.4-2026-03-05-high", + model_key="gpt-5.4-2026-03-05", + options={"reasoning": {"effort": "high"}}, + ), + _model_run( + model_run_key="gpt-5.4-2026-03-05-high-web-search", + model_key="gpt-5.4-2026-03-05", + options={ + "reasoning": {"effort": "high"}, + "tools": [{"type": "web_search"}], + }, + ), + _model_run( + model_run_key="gpt-5.4-mini-2026-03-17", + model_key="gpt-5.4-mini-2026-03-17", + ), + _model_run( + model_run_key="gpt-5.4-nano-2026-03-17", + model_key="gpt-5.4-nano-2026-03-17", + ), + _model_run( + model_run_key="gpt-5.5-2026-04-23", + model_key="gpt-5.5-2026-04-23", + ), + _model_run( + model_run_key="gpt-5.5-2026-04-23-medium", + model_key="gpt-5.5-2026-04-23", + options={"reasoning": {"effort": "medium"}}, + ), + _model_run( + model_run_key="gpt-5.5-2026-04-23-high", + model_key="gpt-5.5-2026-04-23", + options={"reasoning": {"effort": "high"}}, + ), + _model_run( + model_run_key="gpt-5.5-2026-04-23-high-web-search", + model_key="gpt-5.5-2026-04-23", + options={ + "reasoning": {"effort": "high"}, + "tools": [{"type": "web_search"}], + }, + ), + _model_run( + model_run_key="deepseek-v3.1", + model_key="deepseek-v3.1", + options={"temperature": 0}, + ), + 
_model_run( + model_run_key="deepseek-v4-pro", + model_key="deepseek-v4-pro", + options={"temperature": 0}, + ), + _model_run( + model_run_key="minimax-m2.5", + model_key="minimax-m2.5", + options={"temperature": 0}, + ), + _model_run( + model_run_key="minimax-m2.7", + model_key="minimax-m2.7", + options={"temperature": 0}, + ), + _model_run( + model_run_key="kimi-k2.5", + model_key="kimi-k2.5", + options={"temperature": 0}, + ), + _model_run( + model_run_key="kimi-k2.6", + model_key="kimi-k2.6", + options={"temperature": 0}, + ), + _model_run( + model_run_key="glm-5.1", + model_key="glm-5.1", + options={"temperature": 0}, + ), + _model_run( + model_run_key="gemma-4-31b", + model_key="gemma-4-31b", + options={"temperature": 0}, + ), + _model_run( + model_run_key="claude-haiku-4-5-20251001-1024", + model_key="claude-haiku-4-5-20251001", + options={"max_tokens": 1024, "temperature": 0}, + ), + _model_run( + model_run_key="claude-haiku-4-5-20251001-4096", + model_key="claude-haiku-4-5-20251001", + options={"max_tokens": 4096}, + ), + _model_run( + model_run_key="claude-sonnet-4-5-20250929-1024", + model_key="claude-sonnet-4-5-20250929", + options={"max_tokens": 1024, "temperature": 0}, + ), + _model_run( + model_run_key="claude-sonnet-4-5-20250929-4096", + model_key="claude-sonnet-4-5-20250929", + options={"max_tokens": 4096}, + ), + _model_run( + model_run_key="claude-sonnet-4-6-1024", + model_key="claude-sonnet-4-6", + options={"max_tokens": 1024, "temperature": 0}, + ), + _model_run( + model_run_key="claude-sonnet-4-6-4096", + model_key="claude-sonnet-4-6", + options={"max_tokens": 4096}, + ), + _model_run( + model_run_key="claude-sonnet-4-6-adaptive-thinking-16000", + model_key="claude-sonnet-4-6", + options={ + "max_tokens": 16000, + "thinking": {"type": "adaptive"}, + }, + ), + _model_run( + model_run_key="claude-opus-4-6-4096", + model_key="claude-opus-4-6", + options={"max_tokens": 4096}, + ), + _model_run( + model_run_key="claude-opus-4-7-1024", + 
model_key="claude-opus-4-7", + options={"max_tokens": 1024}, + ), + _model_run( + model_run_key="claude-opus-4-7-4096", + model_key="claude-opus-4-7", + options={"max_tokens": 4096}, + ), + _model_run( + model_run_key="claude-opus-4-7-adaptive-thinking-high-24000", + model_key="claude-opus-4-7", + options={ + "max_tokens": 24000, + "output_config": {"effort": "high"}, + "thinking": {"type": "adaptive"}, + }, + ), + _model_run( + model_run_key="claude-opus-4-7-adaptive-thinking-high-web-search-64000", + model_key="claude-opus-4-7", + options={ + "max_tokens": 64000, + "output_config": {"effort": "high"}, + "thinking": {"type": "adaptive"}, + "tools": [ + { + "type": "web_search_20260209", + "name": "web_search", + "max_uses": 5, + } + ], + }, + ), + _model_run( + model_run_key="claude-opus-4-8-1024", + model_key="claude-opus-4-8", + options={"max_tokens": 1024}, + ), + _model_run( + model_run_key="claude-opus-4-8-4096", + model_key="claude-opus-4-8", + options={"max_tokens": 4096}, + ), + _model_run( + model_run_key="claude-opus-4-8-adaptive-thinking-high-24000", + model_key="claude-opus-4-8", + options={ + "max_tokens": 24000, + "output_config": {"effort": "high"}, + "thinking": {"type": "adaptive"}, + }, + ), + _model_run( + model_run_key="claude-opus-4-8-adaptive-thinking-high-web-search-64000", + model_key="claude-opus-4-8", + options={ + "max_tokens": 64000, + "output_config": {"effort": "high"}, + "thinking": {"type": "adaptive"}, + "tools": [ + { + "type": "web_search_20260209", + "name": "web_search", + "max_uses": 5, + } + ], + }, + ), + _model_run( + model_run_key="grok-4-1-fast-reasoning", + model_key="grok-4-1-fast-reasoning", + ), + _model_run( + model_run_key="grok-4-1-fast-non-reasoning", + model_key="grok-4-1-fast-non-reasoning", + ), + _model_run( + model_run_key="grok-4.20-0309-reasoning", + model_key="grok-4.20-0309-reasoning", + options={"temperature": 0}, + ), + _model_run( + model_run_key="grok-4.20-0309-reasoning-web-search-x-search", + 
model_key="grok-4.20-0309-reasoning", + options={ + "tools": [{"type": "web_search"}, {"type": "x_search"}], + }, + ), + _model_run( + model_run_key="grok-4.20-0309-non-reasoning", + model_key="grok-4.20-0309-non-reasoning", + options={"temperature": 0}, + ), + _model_run( + model_run_key="grok-4.3", + model_key="grok-4.3", + options={"temperature": 0}, + ), + _model_run( + model_run_key="gemini-2.5-pro", + model_key="gemini-2.5-pro", + options={"temperature": 0}, + ), + _model_run( + model_run_key="gemini-2.5-pro-web-search", + model_key="gemini-2.5-pro", + options={ + "temperature": 0, + "tools": [{"googleSearch": {}}], + }, + ), + _model_run( + model_run_key="gemini-3-flash-preview", + model_key="gemini-3-flash-preview", + options={ + "candidate_count": 1, + "temperature": 0, + "automatic_function_calling": {"disable": True}, + }, + ), + _model_run( + model_run_key="gemini-3.1-flash-lite-preview", + model_key="gemini-3.1-flash-lite-preview", + options={ + "candidate_count": 1, + "temperature": 0, + "automatic_function_calling": {"disable": True}, + }, + ), + _model_run( + model_run_key="gemini-3.1-flash-lite", + model_key="gemini-3.1-flash-lite", + options={ + "candidate_count": 1, + "temperature": 0, + "automatic_function_calling": {"disable": True}, + }, + ), + _model_run( + model_run_key="gemini-3.1-pro-preview", + model_key="gemini-3.1-pro-preview", + options={ + "candidate_count": 1, + "temperature": 0, + "automatic_function_calling": {"disable": True}, + }, + ), + _model_run( + model_run_key="gemini-3.5-flash", + model_key="gemini-3.5-flash", + options={ + "candidate_count": 1, + "temperature": 0, + "automatic_function_calling": {"disable": True}, + }, + ), + ] +) +MODEL_RUNS_BY_KEY: dict[str, ModelRun] = {run.model_run_key: run for run in MODEL_RUNS} + +# MODEL_RUNS is historical. ACTIVE_MODEL_RUNS is the current live-callable +# subset for benchmarks and integration sweeps. 
ACTIVE_MODEL_RUNS: list[ModelRun] = [
    candidate for candidate in MODEL_RUNS if candidate.model.active
]
# Key lookup restricted to the runs whose model is flagged active.
ACTIVE_MODEL_RUNS_BY_KEY: dict[str, ModelRun] = {
    active_run.model_run_key: active_run for active_run in ACTIVE_MODEL_RUNS
}


def get_model_run(model_run_key: str) -> ModelRun:
    """Look up a shared model run in ``MODEL_RUNS_BY_KEY``.

    Raises:
        KeyError: If the key is not registered; the message lists every
            registered key in sorted order and chains the original lookup
            error.
    """
    try:
        found = MODEL_RUNS_BY_KEY[model_run_key]
    except KeyError as missing:
        known_keys = ", ".join(sorted(MODEL_RUNS_BY_KEY))
        message = f"Unknown LLM model_run_key {model_run_key}. Available: {known_keys}"
        raise KeyError(message) from missing
    return found


def select_model_runs(model_run_keys: Sequence[str]) -> list[ModelRun]:
    """Resolve each requested key to its model run, preserving request order.

    Raises:
        KeyError: Propagated from ``get_model_run`` for the first unknown key.
    """
    selected: list[ModelRun] = []
    for requested_key in model_run_keys:
        selected.append(get_model_run(requested_key))
    return selected