diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e2fc866..c61009c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -427,9 +427,33 @@ jobs: - name: Install uv uses: astral-sh/setup-uv@v4 + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry & target + uses: Swatinem/rust-cache@v2 + + - name: Install build deps for samael + run: | + sudo apt-get update + sudo apt-get install -y libxml2-dev libxslt1-dev libxmlsec1-dev pkg-config libssl-dev + + - name: Regenerate Hadrian OpenAPI spec + run: cargo run --release -- openapi --output openapi/hadrian.openapi.json + + - name: Verify checked-in spec matches generated + run: | + if ! git diff --exit-code -- openapi/hadrian.openapi.json; then + echo "::error::openapi/hadrian.openapi.json is out of date. Run ./scripts/generate-openapi.sh and commit the result." >&2 + exit 1 + fi + - name: Fetch reference specs run: ./scripts/fetch-openapi-specs.sh openai + - name: Test conformance script + run: ./scripts/test_openapi_conformance.py + - name: Run conformance check run: ./scripts/openapi-conformance.py diff --git a/.github/workflows/helm.yml b/.github/workflows/helm.yml index cb97ad6..d1cbf64 100644 --- a/.github/workflows/helm.yml +++ b/.github/workflows/helm.yml @@ -311,6 +311,17 @@ jobs: cluster_name: helm-test wait: 120s + # `helm/kind-action`'s `wait` only waits for the control plane to be + # Ready. CoreDNS / kube-proxy can still be coming up at that point, and + # kicking off `helm install` against a half-warm API server has shown up + # as `client rate limiter Wait returned an error: context deadline + # exceeded` — helm's discovery calls saturate client-go's QPS budget + # while the apiserver is sluggish, then time out. 
+ - name: Wait for cluster system pods + run: | + kubectl wait --for=condition=Ready pods --all -n kube-system --timeout=180s + kubectl get pods -A + - name: Add Bitnami repo run: helm repo add bitnami https://charts.bitnami.com/bitnami @@ -368,10 +379,15 @@ jobs: - name: Install chart run: | + # `--burst-limit` raises client-go's burst from the default 100; the + # kind apiserver in CI is slow enough that helm's discovery + readiness + # polling can otherwise hit the limiter and fail with + # `client rate limiter Wait returned an error: context deadline exceeded`. helm install hadrian helm/hadrian \ -f /tmp/kind-test-values.yaml \ --wait \ - --timeout 5m + --timeout 5m \ + --burst-limit 300 - name: Check deployment status run: | diff --git a/Cargo.lock b/Cargo.lock index 2f325a4..6e7b31f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -129,7 +129,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -140,7 +140,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -265,9 +265,9 @@ dependencies = [ [[package]] name = "astral-tokio-tar" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c23f3af104b40a3430ccb90ed5f7bd877a8dc5c26fc92fde51a22b40890dcf9" +checksum = "4ce73b17c62717c4b6a9af10b43e87c578b0cac27e00666d48304d3b7d2c0693" dependencies = [ "filetime", "futures-core", @@ -2344,7 +2344,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -2527,7 +2527,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" 
dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -3399,6 +3399,7 @@ dependencies = [ "google-cloud-token", "hex", "hickory-resolver", + "hmac", "hostname", "http 1.4.0", "http-body-util", @@ -4225,7 +4226,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -5019,7 +5020,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5855,7 +5856,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -6593,7 +6594,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -6652,7 +6653,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -7220,7 +7221,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -7678,7 +7679,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -8045,6 +8046,7 @@ dependencies = [ "axum", "base64 0.22.1", "bytes", + "flate2", "h2", "http 1.4.0", "http-body 1.0.1", @@ -9024,15 +9026,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-sys" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets 0.53.5", -] - [[package]] name = "windows-sys" version = "0.61.2" @@ -9081,30 
+9074,13 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", + "windows_i686_gnullvm", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] -[[package]] -name = "windows-targets" -version = "0.53.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" -dependencies = [ - "windows-link 0.2.1", - "windows_aarch64_gnullvm 0.53.1", - "windows_aarch64_msvc 0.53.1", - "windows_i686_gnu 0.53.1", - "windows_i686_gnullvm 0.53.1", - "windows_i686_msvc 0.53.1", - "windows_x86_64_gnu 0.53.1", - "windows_x86_64_gnullvm 0.53.1", - "windows_x86_64_msvc 0.53.1", -] - [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -9123,12 +9099,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" - [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -9147,12 +9117,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" - [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -9171,24 +9135,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" -[[package]] -name = 
"windows_i686_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" - [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" - [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -9207,12 +9159,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_i686_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" - [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -9231,12 +9177,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" - [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -9255,12 +9195,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" - [[package]] name = 
"windows_x86_64_msvc" version = "0.42.2" @@ -9279,12 +9213,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" - [[package]] name = "winnow" version = "0.7.15" diff --git a/Cargo.toml b/Cargo.toml index ac3ba58..c2fb446 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -282,6 +282,7 @@ rust_decimal = { version = "1.40.0", features = ["macros"] } serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.145" sha2 = "0.10" +hmac = "0.12" subtle = "2.6.1" thiserror = "2.0.17" tokio = { version = "1.48.0", features = [ @@ -356,7 +357,7 @@ metrics-exporter-prometheus = { version = "0.16", optional = true } open = { version = "5.3.3", optional = true } openssl = { version = "0.10", optional = true } opentelemetry = { version = "0.31", optional = true } -opentelemetry-otlp = { version = "0.31", features = ["trace", "logs", "grpc-tonic", "http-proto"], optional = true } +opentelemetry-otlp = { version = "0.31", features = ["trace", "logs", "grpc-tonic", "gzip-tonic", "http-proto"], optional = true } opentelemetry-semantic-conventions = { version = "0.31", optional = true } opentelemetry_sdk = { version = "0.31", features = ["rt-tokio", "logs"], optional = true } redis = { version = "0.32.7", features = ["aio", "tokio-comp", "cluster-async"], optional = true } diff --git a/Dockerfile b/Dockerfile index 46361a6..971360e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,8 @@ WORKDIR /app/docs RUN pnpm build # Stage 2: Build Rust application -FROM rustlang/rust:nightly-slim AS builder +# Pinned to the latest stable Rust toolchain. 
+FROM rust:1.95.0-slim AS builder # Install build dependencies # Includes SAML libraries (libxml2, libxslt, xmlsec1) for samael crate @@ -90,19 +91,19 @@ COPY --from=frontend-builder /app/docs/out ./docs/out/ # Fetch model catalog (embedded at compile time via include_str!) RUN mkdir -p data && curl -sSL https://models.dev/api.json -o data/models-dev-catalog.json -# Force fresh build of the main crate by removing cached artifacts. -# The --mount=type=cache for target/ persists across builds, but fingerprints -# may not detect all source changes. Removing the crate's artifacts ensures -# a full recompile of application code (dependencies remain cached). -RUN touch src/main.rs && \ +# Build the actual application. +# The --mount=type=cache for target/ persists across builds, but the dummy-source +# fingerprints from the dependency-build layer can survive even after the real +# sources are copied in, causing the bin to link against stale rmeta that lacks +# modules from the real lib.rs. Wipe the hadrian crate's artifacts inside the +# same RUN as the build (so the cache mount is actually active) to force a full +# recompile of application code while keeping dependency caches intact. +RUN --mount=type=cache,target=/usr/local/cargo/registry \ + --mount=type=cache,target=/usr/src/hadrian/target \ rm -rf target/release/.fingerprint/hadrian-* \ target/release/deps/hadrian-* \ target/release/deps/libhadrian-* \ - target/release/hadrian - -# Build the actual application -RUN --mount=type=cache,target=/usr/local/cargo/registry \ - --mount=type=cache,target=/usr/src/hadrian/target \ + target/release/hadrian && \ cargo build --release && \ cp target/release/hadrian /usr/src/hadrian/hadrian-bin @@ -110,11 +111,12 @@ RUN --mount=type=cache,target=/usr/local/cargo/registry \ FROM debian:trixie-slim # Install runtime dependencies -# Includes SAML libraries for XML signature verification +# Includes SAML libraries for XML signature verification. 
+# `curl` was previously required for the HEALTHCHECK; the binary now ships +# with a `hadrian healthcheck` subcommand so curl is no longer needed. RUN apt-get update && apt-get install -y \ ca-certificates \ libssl3 \ - curl \ libxml2 \ libxslt1.1 \ libxmlsec1 \ @@ -156,8 +158,9 @@ EOF # Expose port EXPOSE 8080 -# Health check +# Health check (uses the built-in `hadrian healthcheck` subcommand so the +# runtime image doesn't need to ship `curl`). HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:8080/health || exit 1 + CMD ["/app/hadrian", "--config", "/app/config/hadrian.toml", "healthcheck"] CMD ["/app/hadrian", "--config", "/app/config/hadrian.toml"] diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..874fd1c --- /dev/null +++ b/TODO.md @@ -0,0 +1,187 @@ +# Code Review TODO + +Findings from a whole-repo review. Grouped by severity. Each item is independently actionable. + +## Critical + +- [x] **Cross-tenant cache leak** — `src/cache/keys.rs:142-373` builds response/embeddings/completions cache keys from prompt content alone. New `CacheTenantScope { org_id, project_id, api_key_id, user_id }` is folded into every response/responses/completions/embeddings cache hash, threaded through `ResponseCache`/`SemanticCache` and the gateway routes via `tenant_scope_from_auth`. Vector search trait gains a `VectorTenantFilter`; pgvector enforces it in SQL (with `IS NULL` for unscoped requests), qdrant adds equality must-clauses, and `SemanticCache::lookup` re-applies the filter post-hoc so backends without server-side support still cannot return cross-tenant matches. +- [x] **Admin endpoints call `authz.require` with all-None scope** — `routes/admin/{skills,templates,conversations,audit_logs,model_pricing,api_keys,usage}.rs`. 
Each get/update/delete now pre-fetches the row and routes the owner-derived `(resource_id, org_id, team_id, project_id)` tuple through helper scope-mappers (`skill_authz_scope`, `template_authz_scope`, `conversation_authz_scope`, `pricing_authz_scope`); list-by-user / list-by-provider variants pass the path id via `resource_id`; `audit_logs::list/get` re-evaluates authz against the resolved org-id (after the existing membership constraint) instead of all-None; `api_keys::create` for User owners now surfaces `user_id` via `resource_id`; `model_pricing::create/upsert/bulk_upsert` authorise per-input owner so a single tenant can't smuggle Global/cross-tenant rows. Truly global endpoints (`/admin/v1/usage/...`, `/admin/v1/usage/logs[/export]`, `model_pricing::list_global`) remain all-None — those are intentionally platform-scoped and gated by system policy. +- [x] **IAP/proxy-auth headers trusted when `trusted_proxies` unset** — `src/middleware/layers/admin.rs:1079-1107`. Config validation now refuses startup unconditionally when IAP is enabled without `server.trusted_proxies`; the proxy-auth middleware fail-closes (drops headers) instead of trusting all sources. +- [x] **Reserved `_emergency_admin`/`_system_bootstrap` roles passthrough** — bearer + proxy paths now strip them; OIDC and SAML session paths now strip reserved roles from `claims.roles`/`claims.groups`/`assertion.groups` when building the session. +- [x] **OIDC discovery missing SSRF + issuer pinning** — `src/auth/oidc.rs:170-254`. Validate+IP-pin the discovery URL; assert `discovery.issuer == config.issuer`. +- [x] **Image fetch missing SSRF check** — `src/providers/image.rs:164-261`. Run `validate_base_url` and pin the resolved IP; enforce body-size cutoff while streaming. +- [x] **SQLite pool never sets `PRAGMA foreign_keys = ON`** — `src/db/mod.rs`. Enable on pool construction; add startup self-test. 
+- [x] **SQLite repos mix `datetime('now')` with bound RFC-3339** — `db/sqlite/api_keys.rs:790-820,907,943` and `db/sqlite/domain_verifications.rs:259,285,305`. Replaced inline `datetime('now')` with bound `truncate_to_millis(Utc::now())` for the api_keys revoke/last_used/rotate paths and the domain_verifications expiry comparisons; `scripts/ci-backend.sh` now greps `src/db/sqlite` and fails on `datetime('now')` outside `DEFAULT (...)` clauses so future regressions are caught in CI. +- [x] **`axum::serve` missing `into_make_service_with_connect_info`** — `src/cli/server.rs:404`. Without it, `ConnectInfo` is never inserted; per-IP rate limits, IP allowlists, and audit IPs all degrade. +- [x] **Anthropic stream rewriters byte-slice IDs unchecked** — `providers/anthropic/stream.rs:825-1218` and `convert.rs:957,999`. Use `strip_prefix("msg_")`/`strip_prefix("toolu_")` with a fallback. +- [-] **`TlsConfig` parsed but never honored** — `src/cli/server.rs:368-407`. Now logs a startup error when `[server.tls]` is set, explaining that the gateway listens on plain HTTP and that TLS must be terminated upstream. Native TLS termination still pending. +- [x] **Helm liveness probe targets `/health`** — `helm/hadrian/values.yaml:271-289`. Switch liveness to `/health/live` and readiness to `/health/ready`; same in `Dockerfile:161`. +- [x] **Conversation switch mid-stream commits to wrong conversation** — `ui/src/hooks/useConversationSync.ts:103-131`. Fixed in `useChat.ts`: a new effect aborts in-flight controllers + `stopStreaming()`/`clearStreams()` whenever `conversationId` changes, so a switch immediately tears down whatever was streaming. As a belt-and-braces guard against any racing completion that slipped through (e.g. 
a final SSE chunk that landed before the abort propagated), `sendMessage`, `regenerateResponse`, and `editAndRerun` now snapshot `conversationIdRef.current` at the top and skip the `addAssistantMessages` / `replaceAssistantMessage` commit if the snapshot no longer matches at completion time. +- [x] **`useChat` subscribes to entire streaming/debug stores** — `ui/src/pages/chat/useChat.ts:286-289`. Replaced the bare `useStreamingStore()` / `useDebugStore()` calls (which subscribed to the entire stores) with `useStreamingStore.getState()` / `useDebugStore.getState()` so the hook captures stable action handles without subscribing. The reactive selectors (`useAllStreams`, `useIsStreaming`, plus the surgical `useStream*` selectors that components already use) remain the only resubscribers; every per-token streaming/debug update no longer re-renders the chat root. +- [x] **HTML artifact "Open in new tab" escapes iframe sandbox** — `ui/src/components/Artifact/HtmlArtifact.tsx:90-96`. Replaced the blob-URL `window.open` (which inherited our origin) with an `about:blank` host whose body holds a sandboxed `