diff --git a/Cargo.lock b/Cargo.lock index 923896bc..31e89723 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -417,6 +417,25 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum-tracing-opentelemetry" +version = "0.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bedd2c385488b22a3a35b664fbc7f8e755d3ec6720848bc106b80cb5ae18fd7" +dependencies = [ + "axum", + "futures-core", + "futures-util", + "http", + "opentelemetry 0.31.0", + "opentelemetry-semantic-conventions 0.31.0", + "pin-project-lite", + "tower", + "tracing", + "tracing-opentelemetry 0.32.1", + "tracing-opentelemetry-instrumentation-sdk", +] + [[package]] name = "base16ct" version = "0.2.0" @@ -797,6 +816,7 @@ dependencies = [ "argon2", "async-stream", "axum", + "axum-tracing-opentelemetry", "base64 0.22.1", "chrono", "chrono-tz", @@ -825,6 +845,10 @@ dependencies = [ "mime_guess", "nucleo-matcher", "openidconnect", + "opentelemetry 0.32.0", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions 0.32.0", + "opentelemetry_sdk", "parking_lot", "pdfium-render", "quick-xml", @@ -840,6 +864,7 @@ dependencies = [ "serde_yaml", "serial_test", "sha2", + "sysinfo", "tabled", "tempfile", "thiserror 2.0.18", @@ -847,10 +872,12 @@ dependencies = [ "tokio-cron-scheduler", "tokio-stream", "tokio-util", + "tonic", "tower", "tower-http", "tracing", "tracing-appender", + "tracing-opentelemetry 0.33.0", "tracing-subscriber", "tracing-test", "unicode-normalization", @@ -914,6 +941,18 @@ dependencies = [ "web-sys", ] +[[package]] +name = "const-hex" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20d9a563d167a9cce0f94153382b33cb6eded6dfabff03c69ad65a28ea1514e0" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "proptest", + "serde_core", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -2151,6 +2190,19 @@ dependencies = [ "webpki-roots 1.0.6", ] +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.20" @@ -3138,6 +3190,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -3276,6 +3337,25 @@ dependencies = [ "url", ] +[[package]] +name = "objc2-core-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +dependencies = [ + "bitflags", +] + +[[package]] +name = "objc2-io-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" +dependencies = [ + "libc", + "objc2-core-foundation", +] + [[package]] name = "object" version = "0.37.3" @@ -3340,6 +3420,112 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "opentelemetry" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", +] + +[[package]] +name = "opentelemetry" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b0142c63252a9e054e68a4c61a5778f7b14f576274d593f8ce883d191a099682" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", +] + +[[package]] +name = "opentelemetry-http" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5683015d09e2df236ef005b17f6f196f0d5f6313c4fa43a7b6a53b52776e4331" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry 0.32.0", + "reqwest 0.13.2", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9966929966d17620d7c316c643ba62631826e10021409357772d5eea84f62c35" +dependencies = [ + "http", + "opentelemetry 0.32.0", + "opentelemetry-http", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "reqwest 0.13.2", + "serde_json", + "thiserror 2.0.18", + "tokio", + "tonic", + "tonic-types", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56d658ba1faf63f7b9c492cfbe6e0ec365440a16132d3270c1065f7b33f1b638" +dependencies = [ + "base64 0.22.1", + "const-hex", + "opentelemetry 0.32.0", + "opentelemetry_sdk", + "prost", + "serde", + "tonic", + "tonic-prost", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e62e29dfe041afb8ed2a6c9737ab57db4907285d999ef8ad3a59092a36bdc846" + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ca2f98a0437b427b4b08f19f1caa3c44db885a202bc12cfea13d6c702243d68" + +[[package]] +name = "opentelemetry_sdk" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368afaed344110f40b179bb8fbe54bc52d98f9bd2b281799ef32487c2650c956" +dependencies = [ + 
"futures-channel", + "futures-executor", + "futures-util", + "opentelemetry 0.32.0", + "percent-encoding", + "portable-atomic", + "rand 0.9.2", + "thiserror 2.0.18", + "tokio", + "tokio-stream", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -3612,6 +3798,26 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" +[[package]] +name = "pin-project" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2466b2336ed02bcdca6b294417127b90ec92038d1d5c4fbeac971a922e0e0924" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -3818,6 +4024,53 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bitflags", + "num-traits", + "rand 0.9.2", + "rand_chacha 0.9.0", + "rand_xorshift", + "regex-syntax", + "unarray", +] + +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "prost-types" 
+version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost", +] + [[package]] name = "psm" version = "0.1.30" @@ -4041,6 +4294,15 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core 0.9.5", +] + [[package]] name = "rangemap" version = "1.7.1" @@ -4250,7 +4512,9 @@ checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" dependencies = [ "base64 0.22.1", "bytes", + "futures-channel", "futures-core", + "futures-util", "http", "http-body", "http-body-util", @@ -5526,6 +5790,20 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "sysinfo" +version = "0.39.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14311e7e9a03114cd4b65eedd54e8fed2945e17f08586ae97ef53bc0669f9581" +dependencies = [ + "libc", + "memchr", + "ntapi", + "objc2-core-foundation", + "objc2-io-kit", + "windows", +] + [[package]] name = "tabled" version = "0.20.0" @@ -5832,6 +6110,54 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + +[[package]] +name = "tonic-types" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab1b02061f83d519bba3caa167f88f261ef05720ab8ebc954ade70de3348e8" +dependencies = [ + "prost", + "prost-types", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -5840,9 +6166,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.13.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -5936,6 +6265,51 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac28f2d093c6c477eaa76b23525478f38de514fa9aeb1285738d4b97a9552fc" +dependencies = [ + "js-sys", + "opentelemetry 0.31.0", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + +[[package]] +name = "tracing-opentelemetry" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adbc64cba7137545b8044cb1fe9814f7aacf3c6b5f9b45be8bb5db538befdb26" +dependencies = [ + "js-sys", + "opentelemetry 0.32.0", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + +[[package]] +name = "tracing-opentelemetry-instrumentation-sdk" +version = "0.32.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8f2540011f6d5ac30e1fc9ff169573b4559406498ac27e0242549d9bf527ed1" +dependencies = [ + "http", + "opentelemetry 0.31.0", + "opentelemetry-semantic-conventions 0.31.0", + "tracing", + "tracing-opentelemetry 0.32.1", +] + [[package]] name = 
"tracing-subscriber" version = "0.3.22" @@ -6008,6 +6382,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicase" version = "2.9.0" @@ -6538,6 +6918,27 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" +dependencies = [ + "windows-collections", + "windows-core", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" +dependencies = [ + "windows-core", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -6551,6 +6952,17 @@ dependencies = [ "windows-strings", ] +[[package]] +name = "windows-future" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" +dependencies = [ + "windows-core", + "windows-link", + "windows-threading", +] + [[package]] name = "windows-implement" version = "0.60.2" @@ -6579,6 +6991,16 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" +dependencies = [ + "windows-core", + "windows-link", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -6714,6 +7136,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows-threading" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" diff --git a/Cargo.toml b/Cargo.toml index 91b3135e..817e68bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,9 +13,19 @@ name = "codex" path = "src/main.rs" [features] -default = ["rar"] +default = ["rar", "observability"] rar = ["dep:unrar"] embed-frontend = [] +observability = [ + "dep:opentelemetry", + "dep:opentelemetry_sdk", + "dep:opentelemetry-otlp", + "dep:opentelemetry-semantic-conventions", + "dep:tracing-opentelemetry", + "dep:axum-tracing-opentelemetry", + "dep:tonic", + "dep:sysinfo", +] [workspace] members = [".", "migration"] @@ -108,6 +118,33 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } tracing-appender = "0.2" log = "0.4" # For sqlx logging level configuration + +# OpenTelemetry (optional, gated by `observability` feature) +opentelemetry = { version = "0.32", optional = true } +opentelemetry_sdk = { version = "0.32", features = ["rt-tokio", "trace", "metrics"], optional = true } +opentelemetry-otlp = { version = "0.32", default-features = false, features = [ + "grpc-tonic", + "http-proto", + "http-json", + # Blocking HTTP client is intentional: the OTel SDK 0.32 batch processor + # runs export on a dedicated std::thread that has no async runtime + # attached. An async reqwest client would panic on first export. The + # blocking client only blocks the batch thread, not the server runtime. 
+ "reqwest-blocking-client", + "trace", + "metrics", +], optional = true } +opentelemetry-semantic-conventions = { version = "0.32", optional = true } +tracing-opentelemetry = { version = "0.33", optional = true } +axum-tracing-opentelemetry = { version = "0.33", optional = true } +# Re-used via opentelemetry-otlp's grpc-tonic feature; declared here so +# metadata helpers can use MetadataKey/MetadataValue types directly. +tonic = { version = "0.14", default-features = false, optional = true } +# Process-level metrics (CPU, memory). `opentelemetry-system-metrics` would +# do this for us but is pinned to opentelemetry 0.31, one minor behind our +# 0.32. Rolling the few callbacks we need against sysinfo directly is ~30 lines +# and keeps these instruments on the same opentelemetry version as our SDK. +sysinfo = { version = "0.39", default-features = false, features = ["system"], optional = true } async-stream = "0.3" futures = "0.3" tokio-stream = "0.1" @@ -171,6 +208,9 @@ http-body-util = "0.1" hyper = { version = "1.0", features = ["full"] } serial_test = "3.2" tracing-test = "0.2" +# Enable the SDK's `testing` feature for the in-memory metric exporter used +# in observability::metrics tests. Dev-only; no production impact. 
+opentelemetry_sdk = { version = "0.32", features = ["rt-tokio", "trace", "metrics", "testing"] } # ============================================================================= # Development Profile - Optimized for fast incremental builds diff --git a/Makefile b/Makefile index 4fcd488c..60b79541 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,9 @@ dev-logs-worker: ## View worker logs only dev-logs-frontend: ## View frontend logs only docker compose logs -f frontend-dev +dev-logs-jaeger: ## View Jaeger logs only + docker compose logs -f jaeger + dev-restart: ## Restart all development containers docker compose restart codex-dev codex-dev-worker frontend-dev diff --git a/config/config.docker.yaml b/config/config.docker.yaml index 38b909f4..afefa51a 100644 --- a/config/config.docker.yaml +++ b/config/config.docker.yaml @@ -163,3 +163,31 @@ komga_api: koreader_api: enabled: true + +# OpenTelemetry observability (disabled by default). +# +# Uncomment to ship traces to the bundled Jaeger sidecar (started automatically +# by `make dev-up`). Jaeger accepts OTLP traces (not metrics) on port 4317 inside +# the compose network and serves a UI at http://localhost:16686 on the host. +# +# See docs/docs/observability.md for the full schema, backend matrix, and +# sampling guidance. 
+# observability: +# enabled: true +# service_name: codex +# otlp: +# endpoint: http://jaeger:4317 +# protocol: grpc +# # headers: # auth/tenant headers for hosted backends +# # x-tenant: dev +# timeout_ms: 5000 +# traces: +# enabled: true +# sample_ratio: 1.0 +# metrics: +# enabled: true +# export_interval_ms: 30000 +# browser: +# enabled: true # opt-in browser RUM (proxied through codex) +# proxy_path: /api/v1/observability/otlp +# sample_ratio: 0.1 diff --git a/config/config.kubernetes.yaml b/config/config.kubernetes.yaml index 8dc0bd67..37937646 100644 --- a/config/config.kubernetes.yaml +++ b/config/config.kubernetes.yaml @@ -155,3 +155,36 @@ files: # - /api/v1/books/*/thumbnail # Exempt book thumbnails # cleanup_interval_secs: 60 # bucket_ttl_secs: 300 + +# OpenTelemetry observability (disabled by default). +# +# In Kubernetes you'll typically point this at the cluster's OTel collector +# DaemonSet/Deployment (e.g. opentelemetry-collector.observability.svc:4317) +# or at the OTLP receiver of an agent like the DataDog Agent. See +# docs/docs/observability.md for the schema, backend matrix, and sampling +# guidance. +# +# Most fields can also be set via env (CODEX_OBSERVABILITY_*) so secrets +# (auth tokens) can come from Kubernetes Secrets: +# CODEX_OBSERVABILITY_ENABLED=true +# CODEX_OBSERVABILITY_OTLP_ENDPOINT=http://otel-collector.observability:4317 +# CODEX_OBSERVABILITY_OTLP_HEADERS=signoz-access-token=$(cat /secrets/signoz-token) +# observability: +# enabled: true +# service_name: codex +# otlp: +# endpoint: http://otel-collector.observability:4317 +# protocol: grpc +# # headers: # auth/tenant headers +# # x-honeycomb-team: ... 
+# timeout_ms: 5000 +# traces: +# enabled: true +# sample_ratio: 0.25 # tune for cluster traffic volume +# metrics: +# enabled: true +# export_interval_ms: 30000 +# browser: +# enabled: false +# proxy_path: /api/v1/observability/otlp +# sample_ratio: 0.1 diff --git a/config/config.sqlite.yaml b/config/config.sqlite.yaml index 0391898b..46f1af8c 100644 --- a/config/config.sqlite.yaml +++ b/config/config.sqlite.yaml @@ -162,3 +162,28 @@ files: # - /api/v1/books/*/thumbnail # Exempt book thumbnails # cleanup_interval_secs: 60 # How often to clean up stale buckets # bucket_ttl_secs: 300 # Time before a bucket is considered stale + +# OpenTelemetry observability (disabled by default). +# +# Uncomment and point `otlp.endpoint` at your collector to enable trace and +# metric export. See docs/docs/observability.md for the schema, backend matrix +# (SigNoz, Tempo, Honeycomb, Uptrace, ...), and sampling guidance. +# observability: +# enabled: true +# service_name: codex +# otlp: +# endpoint: http://localhost:4317 # e.g. a local Jaeger or your operator's collector +# protocol: grpc # grpc | http/protobuf | http/json +# # headers: # auth/tenant headers (e.g. signoz-access-token) +# # x-honeycomb-team: ... +# timeout_ms: 5000 +# traces: +# enabled: true +# sample_ratio: 1.0 # tune down on busy deployments +# metrics: +# enabled: true +# export_interval_ms: 30000 +# browser: +# enabled: false # opt-in browser RUM, proxied through codex +# proxy_path: /api/v1/observability/otlp +# sample_ratio: 0.1 diff --git a/docker-compose.yml b/docker-compose.yml index e3fceed9..1295318a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -120,12 +120,26 @@ services: # CODEX_DATABASE_POSTGRES_DATABASE_NAME: codex CODEX_SCHEDULER_TIMEZONE: America/Los_Angeles CODEX_LOGGING_LEVEL: debug + # OpenTelemetry observability: ship traces/metrics to the bundled Jaeger + # sidecar so `make dev-up` "just works". 
The Codex config files keep + # observability disabled by default (trust posture for production + # deployments); the dev compose overrides that here. + CODEX_OBSERVABILITY_ENABLED: "true" + CODEX_OBSERVABILITY_SERVICE_NAME: codex + CODEX_OBSERVABILITY_OTLP_ENDPOINT: http://jaeger:4317 + CODEX_OBSERVABILITY_OTLP_PROTOCOL: grpc + CODEX_OBSERVABILITY_BROWSER_ENABLED: "true" healthcheck: test: ["CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"] interval: 10s timeout: 5s retries: 30 - start_period: 30s + # Generous grace period: first-time `cargo build` inside the container + # can take 10+ minutes on a cold cache. During start_period, failing + # healthchecks do not count toward `retries`, so the container is not + # prematurely marked unhealthy (which would also fail codex-dev-worker + # since it depends on `codex-dev: service_healthy`). + start_period: 900s networks: - codex-network profiles: @@ -202,6 +216,12 @@ services: # CODEX_DATABASE_POSTGRES_PASSWORD: codex # CODEX_DATABASE_POSTGRES_DATABASE_NAME: codex CODEX_LOGGING_LEVEL: debug + # OpenTelemetry observability: same overrides as codex-dev so the worker + # emits spans/metrics into the same Jaeger sidecar. + CODEX_OBSERVABILITY_ENABLED: "true" + CODEX_OBSERVABILITY_SERVICE_NAME: codex + CODEX_OBSERVABILITY_OTLP_ENDPOINT: http://jaeger:4317 + CODEX_OBSERVABILITY_OTLP_PROTOCOL: grpc networks: - codex-network profiles: @@ -317,6 +337,26 @@ services: - dev - prod + # Jaeger all-in-one for OTLP trace evaluation (see docs/docs/observability.md). + # Accepts OTLP natively on 4317 (gRPC) / 4318 (HTTP), serves the UI on 16686, + # and stores spans in memory. Available in the dev profile; the codex-dev and + # codex-dev-worker services above are pre-wired to send OTLP here via + # CODEX_OBSERVABILITY_OTLP_ENDPOINT=http://jaeger:4317. 
+ jaeger: + image: jaegertracing/all-in-one:1.62.0 + container_name: codex-jaeger + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" + - "4317:4317" + - "4318:4318" + networks: + - codex-network + restart: unless-stopped + profiles: + - dev + # Documentation server docs: build: diff --git a/docs/api/openapi.json b/docs/api/openapi.json index d07ef05c..c878be7c 100644 --- a/docs/api/openapi.json +++ b/docs/api/openapi.json @@ -7423,6 +7423,119 @@ ] } }, + "/api/v1/observability/config": { + "get": { + "tags": [ + "Observability" + ], + "summary": "Return the configuration the browser SDK needs to bootstrap itself.", + "description": "Authenticated to keep the response (which leaks the sample ratio /\nproxy path / service name) inside the existing trust boundary;\neverything sensitive (endpoint, headers) stays server-side.", + "operationId": "get_browser_config", + "responses": { + "200": { + "description": "Browser SDK bootstrap config", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BrowserObservabilityConfigDto" + } + } + } + }, + "401": { + "description": "Unauthorized" + } + }, + "security": [ + { + "jwt_bearer": [] + }, + { + "api_key": [] + } + ] + } + }, + "/api/v1/observability/otlp/v1/metrics": { + "post": { + "tags": [ + "Observability" + ], + "summary": "Forward a batched OTLP/HTTP metrics payload to the configured upstream.", + "operationId": "proxy_metrics", + "requestBody": { + "description": "OTLP/HTTP metrics payload (protobuf or JSON)", + "content": { + "application/x-protobuf": {} + } + }, + "responses": { + "200": { + "description": "Forwarded successfully" + }, + "400": { + "description": "Payload too large" + }, + "401": { + "description": "Unauthorized" + }, + "502": { + "description": "Upstream collector error" + }, + "503": { + "description": "Browser observability disabled" + } + }, + "security": [ + { + "jwt_bearer": [] + }, + { + "api_key": [] + } + ] + } + }, + 
"/api/v1/observability/otlp/v1/traces": { + "post": { + "tags": [ + "Observability" + ], + "summary": "Forward a batched OTLP/HTTP traces payload to the configured upstream.", + "operationId": "proxy_traces", + "requestBody": { + "description": "OTLP/HTTP traces payload (protobuf or JSON)", + "content": { + "application/x-protobuf": {} + } + }, + "responses": { + "200": { + "description": "Forwarded successfully" + }, + "400": { + "description": "Payload too large" + }, + "401": { + "description": "Unauthorized" + }, + "502": { + "description": "Upstream collector error" + }, + "503": { + "description": "Browser observability disabled" + } + }, + "security": [ + { + "jwt_bearer": [] + }, + { + "api_key": [] + } + ] + } + }, "/api/v1/plugins/actions": { "get": { "tags": [ @@ -21675,6 +21788,38 @@ "parentPath": "/home/user" } }, + "BrowserObservabilityConfigDto": { + "type": "object", + "description": "Browser RUM bootstrap configuration returned by\n`GET /api/v1/observability/config`.", + "required": [ + "enabled", + "serviceName", + "proxyPath", + "sampleRatio" + ], + "properties": { + "enabled": { + "type": "boolean", + "description": "Whether the browser SDK should initialize. False means the SDK\nbootstrap is a no-op even if the script is loaded." + }, + "proxyPath": { + "type": "string", + "description": "Same-origin path prefix on the Codex server where the browser SDK\nshould POST OTLP batches. The SDK appends `/v1/traces` and\n`/v1/metrics` to this base.", + "example": "/api/v1/observability/otlp" + }, + "sampleRatio": { + "type": "number", + "format": "double", + "description": "Parent-based sampling ratio applied client-side. 
Browsers are noisy;\ndefault low.", + "example": 0.1 + }, + "serviceName": { + "type": "string", + "description": "`service.name` resource attribute the browser SDK should set on\nevery span (matches the backend service name unless the operator\noverrode it specifically for the browser).", + "example": "codex-web" + } + } + }, "BulkAnalyzeBooksRequest": { "type": "object", "description": "Request to perform bulk analyze operations on multiple books", @@ -41463,6 +41608,10 @@ "name": "Metrics", "description": "Application metrics and statistics" }, + { + "name": "Observability", + "description": "Browser RUM bootstrap configuration and OTLP forwarding proxy" + }, { "name": "Filesystem", "description": "Filesystem browsing for library paths" @@ -41553,6 +41702,7 @@ "Plugins", "Plugin Actions", "Metrics", + "Observability", "Filesystem", "Duplicates", "Sharing Tags" diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index d5d13772..ea9cb4f2 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -685,6 +685,82 @@ If you lose the encryption key, all stored OAuth tokens become undecryptable. Us Automatic key rotation with key versioning (storing the key version alongside encrypted data for seamless re-encryption) is planned for a future release. +## Observability Configuration + +Codex emits OpenTelemetry traces and metrics over OTLP, plus optional browser RUM proxied through the server. Everything is **disabled by default**; nothing is exported until an operator opts in. + +For the full guide (architecture, sampling guidance, backend matrix, troubleshooting), see the [Observability page](./observability). + +```yaml +observability: + enabled: false # master switch; must be true for any export to happen + service_name: codex # `service.name` resource attribute + otlp: + endpoint: "" # e.g. 
http://localhost:4317 (gRPC) or http://localhost:4318 (HTTP) + protocol: grpc # grpc | http/protobuf | http/json + headers: {} # auth/tenant headers (e.g. signoz-access-token: ...) + timeout_ms: 5000 + traces: + enabled: true # honored only when observability.enabled is also true + sample_ratio: 1.0 # parent-based sampler ratio in [0.0, 1.0] + metrics: + enabled: true + export_interval_ms: 30000 # periodic reader interval + browser: + enabled: false # opt-in separately; enables the OTLP proxy + ships SDK config + proxy_path: /api/v1/observability/otlp + sample_ratio: 0.1 # browsers are noisy; sample lower than backend by default +``` + +### Top-level settings + +| Setting | Default | Env Override | Description | +|---------|---------|--------------|-------------| +| `enabled` | `false` | `CODEX_OBSERVABILITY_ENABLED` | Master switch. No providers are initialized when `false`. | +| `service_name` | `codex` | `CODEX_OBSERVABILITY_SERVICE_NAME` | Resource attribute that identifies this process in the backend UI. | + +### OTLP exporter (`observability.otlp`) + +| Setting | Default | Env Override | Description | +|---------|---------|--------------|-------------| +| `endpoint` | `""` | `CODEX_OBSERVABILITY_OTLP_ENDPOINT` | Collector URL. Required when `enabled: true`. | +| `protocol` | `grpc` | `CODEX_OBSERVABILITY_OTLP_PROTOCOL` | One of `grpc`, `http/protobuf`, `http/json`. | +| `headers` | `{}` | `CODEX_OBSERVABILITY_OTLP_HEADERS` | Map of arbitrary headers. Env format: `k1=v1,k2=v2`. | +| `timeout_ms` | `5000` | `CODEX_OBSERVABILITY_OTLP_TIMEOUT_MS` | Per-export request timeout. | + +:::tip Endpoint format +For gRPC endpoints, include the scheme: `http://host:4317` (cleartext) or `https://host:4317` (TLS). +For HTTP endpoints, point at the base URL only: `http://collector:4318`. The SDK appends `/v1/traces` and `/v1/metrics` per signal. 
+::: + +### Traces (`observability.traces`) + +| Setting | Default | Env Override | Description | +|---------|---------|--------------|-------------| +| `enabled` | `true` | `CODEX_OBSERVABILITY_TRACES_ENABLED` | Per-signal switch. Honored only when the parent `enabled` is also true. | +| `sample_ratio` | `1.0` | `CODEX_OBSERVABILITY_TRACES_SAMPLE_RATIO` | Parent-based sampler ratio in `[0.0, 1.0]`. Out-of-range values are clamped. | + +See the [sampling guidance table](./observability#sampling-guidance) for production-sized recommendations. + +### Metrics (`observability.metrics`) + +| Setting | Default | Env Override | Description | +|---------|---------|--------------|-------------| +| `enabled` | `true` | `CODEX_OBSERVABILITY_METRICS_ENABLED` | Per-signal switch. Honored only when the parent `enabled` is also true. | +| `export_interval_ms` | `30000` | `CODEX_OBSERVABILITY_METRICS_EXPORT_INTERVAL_MS` | Periodic reader export interval. Lower values increase load on the collector. | + +### Browser RUM (`observability.browser`) + +| Setting | Default | Env Override | Description | +|---------|---------|--------------|-------------| +| `enabled` | `false` | `CODEX_OBSERVABILITY_BROWSER_ENABLED` | Opt-in switch for the OTLP proxy and the SPA's SDK bootstrap. | +| `proxy_path` | `/api/v1/observability/otlp` | `CODEX_OBSERVABILITY_BROWSER_PROXY_PATH` | Path on the Codex server where the browser SDK POSTs OTLP batches. | +| `sample_ratio` | `0.1` | `CODEX_OBSERVABILITY_BROWSER_SAMPLE_RATIO` | Client-side sample ratio. | + +:::note Two independent switches +`observability.browser.enabled` is intentionally independent from the backend `observability.enabled` flag. Some operators want server-side observability without shipping spans from every browser tab. The SDK additionally refuses to start if `observability.otlp.endpoint` is empty, so a misconfigured server cannot leak data via the browser. 
+::: + ## Environment Variables All configuration options can be overridden with environment variables using the `CODEX_` prefix. @@ -772,6 +848,21 @@ CODEX_RATE_LIMIT_AUTHENTICATED_BURST=200 CODEX_RATE_LIMIT_EXEMPT_PATHS=/health,/api/v1/events CODEX_RATE_LIMIT_CLEANUP_INTERVAL_SECS=60 CODEX_RATE_LIMIT_BUCKET_TTL_SECS=300 + +# Observability (OpenTelemetry / OTLP) +CODEX_OBSERVABILITY_ENABLED=true +CODEX_OBSERVABILITY_SERVICE_NAME=codex +CODEX_OBSERVABILITY_OTLP_ENDPOINT=http://localhost:4317 +CODEX_OBSERVABILITY_OTLP_PROTOCOL=grpc +CODEX_OBSERVABILITY_OTLP_HEADERS=signoz-access-token=abc123,x-tenant=production +CODEX_OBSERVABILITY_OTLP_TIMEOUT_MS=5000 +CODEX_OBSERVABILITY_TRACES_ENABLED=true +CODEX_OBSERVABILITY_TRACES_SAMPLE_RATIO=0.1 +CODEX_OBSERVABILITY_METRICS_ENABLED=true +CODEX_OBSERVABILITY_METRICS_EXPORT_INTERVAL_MS=30000 +CODEX_OBSERVABILITY_BROWSER_ENABLED=false +CODEX_OBSERVABILITY_BROWSER_PROXY_PATH=/api/v1/observability/otlp +CODEX_OBSERVABILITY_BROWSER_SAMPLE_RATIO=0.1 ``` ## Runtime vs Startup Settings diff --git a/docs/docs/observability.md b/docs/docs/observability.md new file mode 100644 index 00000000..925bd511 --- /dev/null +++ b/docs/docs/observability.md @@ -0,0 +1,244 @@ +--- +sidebar_position: 16 +--- + +# Observability (OpenTelemetry) + +Codex ships an opt-in OpenTelemetry pipeline that emits **traces** and **metrics** over OTLP, plus an optional **browser RUM** layer that posts spans from the SPA through a same-origin proxy. Logs continue to flow through the existing `tracing-subscriber` stdout/file appender, with trace IDs injected on every line for correlation. + +The exporter is vendor-neutral. Anything that speaks OTLP works without code changes: [SigNoz](https://signoz.io/), [Grafana Tempo](https://grafana.com/oss/tempo/) + [Mimir](https://grafana.com/oss/mimir/), [Honeycomb](https://www.honeycomb.io/), [Uptrace](https://uptrace.dev/), the [DataDog Agent](https://docs.datadoghq.com/opentelemetry/) OTLP receiver, and more. 
+ +:::tip Default state +Observability is **disabled by default**. Nothing is exported until an operator opts in. This is intentional for a self-hosted product: no telemetry leaves the box without explicit configuration. +::: + +## Quickstart (Docker dev environment) + +The bundled dev compose ships a Jaeger all-in-one sidecar on the `dev` profile and overrides the Codex config to point at it via env vars. `make dev-up` brings the whole stack up with observability already on — no YAML edit, no restart. + +```bash +make dev-up +``` + +Jaeger exposes its UI at [http://localhost:16686](http://localhost:16686). Hit a few endpoints in the Codex app, then pick **codex** from the service dropdown in Jaeger. Traces should appear within a few seconds. + +The env overrides live in `docker-compose.yml` under the `codex-dev` and `codex-dev-worker` services: + +```yaml +CODEX_OBSERVABILITY_ENABLED: "true" +CODEX_OBSERVABILITY_SERVICE_NAME: codex +CODEX_OBSERVABILITY_OTLP_ENDPOINT: http://jaeger:4317 +CODEX_OBSERVABILITY_OTLP_PROTOCOL: grpc +CODEX_OBSERVABILITY_BROWSER_ENABLED: "true" # codex-dev only; enables RUM proxy +``` + +`config/config.docker.yaml` itself ships with the `observability:` block commented out so a production deployment using the same config doesn't quietly start exporting telemetry — the dev override is intentionally local to the compose file. + +:::warning Evaluation use only +Jaeger all-in-one stores spans in memory (lost on restart) and the UI has no auth. It is appropriate for local dev and evaluation. For long-term storage, metrics, or a full APM UI in production, point Codex at a real OTLP backend (SigNoz, Grafana Tempo + Mimir, Honeycomb, Uptrace, etc.) per the backend matrix below. +::: + +## Quickstart (outside the dev compose) + +If you're running Codex outside of `docker-compose.yml`, any OTLP backend works. 
The smallest standalone setup is the same Jaeger all-in-one image: + +```bash +docker run -d --name codex-jaeger \ + -e COLLECTOR_OTLP_ENABLED=true \ + -p 16686:16686 -p 4317:4317 -p 4318:4318 \ + jaegertracing/all-in-one:1.62.0 +``` + +Then enable `observability` in your config file with `otlp.endpoint: http://localhost:4317`. + +## Configuration + +The full schema is documented in the [Configuration reference](./configuration#observability-configuration). At a minimum, an enabled deployment needs: + +- `observability.enabled: true` +- `observability.otlp.endpoint` set to an OTLP collector URL +- `observability.otlp.headers` populated if your backend requires auth (e.g. `signoz-access-token`, `x-honeycomb-team`) + +```yaml +observability: + enabled: true + otlp: + endpoint: https://ingest.eu.signoz.cloud:443 + protocol: grpc + headers: + signoz-access-token: "your-token-here" + timeout_ms: 5000 +``` + +### Choosing a backend + +| Backend | Endpoint shape | Protocol | Notes | +| ---------------------- | --------------------------------------- | --------- | --------------------------------------------------------------------- | +| Self-hosted SigNoz | `http://signoz-otel-collector:4317` | `grpc` | Easiest local setup. Use the bundled compose file below. | +| SigNoz Cloud | `https://ingest.<region>.signoz.cloud` | `grpc` | Requires `signoz-access-token` header. | +| Grafana Tempo (local) | `http://tempo:4317` | `grpc` | Pair with Mimir for metrics. Grafana renders both. | +| Honeycomb | `https://api.honeycomb.io` | `grpc` | Requires `x-honeycomb-team` (and optionally `x-honeycomb-dataset`). | +| Uptrace | `https://otlp.uptrace.dev:4317` | `grpc` | Requires `uptrace-dsn` header. | +| DataDog (OTLP receive) | `http://datadog-agent:4317` | `grpc` | Agent must have `otlp_config.receiver.protocols.grpc` enabled. | +| HTTP-only environments | `http://collector:4318` | `http/protobuf` | Use when load balancers don't terminate gRPC. 
| + +### Choosing a protocol + +`grpc` is the default and the right choice in most environments: smaller payloads, persistent connections, lower overhead. Switch to `http/protobuf` only when something between Codex and the collector (a managed load balancer, a strict egress proxy) blocks gRPC. `http/json` exists for parity but produces noticeably larger payloads; prefer `http/protobuf` over it whenever both are an option. + +### Sampling guidance + +Codex uses a **parent-based** sampler. Practically: if an incoming request already carries a `traceparent`, that decision is honored; otherwise the configured `sample_ratio` decides whether to sample at the root. + +| Workload | Recommended `traces.sample_ratio` | Reasoning | +| ----------------------------------------- | --------------------------------- | ---------------------------------------------------------------------- | +| Local development | `1.0` | You want every trace while iterating. | +| Small home server (1–5 active users) | `1.0` | Volume is low; full traces are cheap. | +| Medium deployment (10–50 active users) | `0.25`–`0.5` | Keep tail latency debuggable without flooding the collector. | +| Large/multi-tenant (100+ active users) | `0.05`–`0.1` | Pair with backend-side tail sampling if your collector supports it. | +| Diagnosing a specific incident | `1.0` temporarily | Crank up while reproducing, then back off. | + +Browser RUM defaults to `browser.sample_ratio: 0.1` because a busy SPA can produce many spans per user session. Raise it cautiously: a noisy front end can dwarf backend traffic at the collector. + +:::note Sample ratio decisions are local +The Rust SDK samples at the root span. If a downstream service (e.g. a plugin subprocess in a future iteration) makes its own decision, it does so independently. There is no global coordination. +::: + +## What Codex sends + +### Trace spans + +- **HTTP server spans** — every request, named by matched route template (e.g. 
`GET /api/v1/series/:id`, not the resolved URL). Standard `http.*` semantic-convention attributes. +- **Repository spans** — `db.<entity>.<operation>` for hot-path operations on books, series, libraries, users, and plugin records. Carry `db.system`, `db.operation`, and the entity ID as an attribute (never in the span name). +- **Plugin RPC spans** — `plugin.<method>` around every JSON-RPC call to a plugin subprocess. Internal `plugin.rpc.write` / `plugin.rpc.wait` child spans break down the round-trip into stdio write vs. response wait. +- **Scanner spans** — `scanner.scan_library` / `scanner.analyze_book` as root spans for background work. +- **Task worker spans** — `task.execute` per claimed task, carrying `task.id` and `task.type`. + +### Metrics + +Two flavors land in the OTLP pipeline: + +- **Counters and histograms** — dual-written from the in-process plugin and task metrics services. Histograms (not just averages) let p95/p99 be queried server-side. +- **Observable gauges** — inventory snapshot (libraries, series, books, users, pages), refreshed every 30s; process CPU/memory; task in-flight count. 
+ +Concrete metric names: + +| Metric | Type | Attributes | +| --------------------------------- | ------------------- | ------------------------------------------------------- | +| `codex.plugin.requests.total` | Counter | `plugin_id`, `method`, `outcome` | +| `codex.plugin.duration_ms` | Histogram (ms) | `plugin_id`, `method`, `outcome` | +| `codex.task.completed.total` | Counter | `task_type`, `outcome` | +| `codex.task.duration_ms` | Histogram (ms) | `task_type`, `outcome` | +| `codex.task.queue_wait_ms` | Histogram (ms) | `task_type` | +| `codex.task.in_flight` | Observable gauge | (none) | +| `codex.inventory.libraries` | Observable gauge | (none) | +| `codex.inventory.series` | Observable gauge | (none) | +| `codex.inventory.books` | Observable gauge | (none) | +| `codex.inventory.users` | Observable gauge | (none) | +| `codex.inventory.pages` | Observable gauge | (none) | +| `http.server.request.duration` | Histogram (seconds) | `http.request.method`, `http.route`, `http.response.status_code` | +| `process.cpu.time` | Observable gauge | (none) | +| `process.memory.usage` | Observable gauge | (none) | +| `process.memory.virtual` | Observable gauge | (none) | + +The existing [`/api/v1/metrics/plugins`](./api) dashboard endpoint is unchanged. The in-app store is still authoritative for that view; OTLP is a parallel consumer. + +### What Codex does **not** send + +- **Logs.** Stdout / file logging is unchanged. Trace IDs are injected on every line so you can ship logs separately (Vector, Filebeat, Loki, etc.) and correlate by trace ID. +- **Resource bodies.** Span attributes carry IDs and operation names, not titles, file contents, or query strings. +- **User-identifying browser data.** The browser SDK emits document-load, fetch, click, and submit spans. There is no session replay, no DOM capture, no PII enrichment. 
+- **Cross-process plugin spans.** Plugin RPC spans wrap the manager-side call; `traceparent` is not propagated into plugin subprocesses in this release. Plugins remain black boxes from a tracing perspective. + +## Browser RUM + +When `observability.browser.enabled: true`: + +1. The SPA fetches `GET /api/v1/observability/config` on startup. If the server flag is on **and** an OTLP endpoint is configured, the heavyweight OTel browser SDK is dynamically imported. Otherwise the chunk is never downloaded. +2. The SDK registers `document-load`, `fetch`, `user-interaction` (click + submit only), and `xml-http-request` instrumentations. +3. Spans are batched in memory (flush every 5s or 512 spans, max queue 2048) and POSTed to `/api/v1/observability/otlp/v1/traces`. +4. Codex forwards the OTLP body verbatim to the configured collector, swapping in the operator-configured `otlp.headers`. Browser-supplied headers are dropped except for `Content-Type`. +5. On `pagehide`, the SDK uses `navigator.sendBeacon()` to flush the final batch so spans survive navigation. + +`FetchInstrumentation.propagateTraceHeaderCorsUrls` is anchored to `window.location.origin`, so `traceparent` is injected only on Codex API calls and never leaked to third-party CDNs or external metadata sources. + +### Why the proxy? + +The proxy exists for three reasons: + +1. **No CORS configuration on the collector.** The SPA always POSTs to its own origin. +2. **No collector credentials in the browser.** Auth tokens stay on the server. +3. **Reuses existing session auth.** The proxy is `FlexibleAuthContext`-gated, so the cookie or bearer the SPA already carries authenticates the export. The OTel JS exporter does not need custom auth wiring. + +The proxy is a thin pass-through. It does not buffer, batch, transform, or sample. Body size is capped at 4 MiB and per-session rate limits apply. 
+ +## Trace ID correlation in logs + +When observability is enabled, log lines pick up trace context: + +``` +2026-05-22T18:02:11.034Z INFO trace_id=4bf92f3577b34da6a3ce929d0e0e4736 span_id=00f067aa0ba902b7 codex::services::plugin::manager: plugin.search_series finished plugin_id=anilist duration_ms=412 +``` + +Ship the log file to any backend that can index by `trace_id` and you can pivot from a slow log line to the SigNoz trace and back. + +## Performance impact + +Codex's success criteria for this feature are: + +- **< 2% added request latency when observability is disabled** (the default). +- **< 5% added request latency when enabled with default sampling.** + +The disabled-path overhead is effectively zero: the OTel layer is not installed in the `tracing-subscriber` registry, repository `#[instrument]` attributes compile to inert spans without a subscriber, and metric instruments resolve to no-op implementations from `metrics_stub.rs` under `--no-default-features`. With observability enabled at `sample_ratio: 1.0` on a representative endpoint, measured overhead falls inside the 5% budget (see the benchmark in the implementation notes for the methodology). + +If you need to validate on your own deployment: + +```bash +# Baseline (observability disabled) +ab -n 1000 -c 10 -H "Authorization: Bearer $TOKEN" \ + http://localhost:8080/api/v1/series?page=1 + +# Then enable observability, restart, and re-run with the same args. +# Compare p50/p95/p99 in the ab output. +``` + +## Disabling observability + +Three ways, in order of granularity: + +1. **Full off** — set `observability.enabled: false` (the default) and restart. No providers initialize, no telemetry leaves the process. +2. **Per-signal off** — keep `observability.enabled: true` but set `observability.traces.enabled: false` or `observability.metrics.enabled: false`. Useful when one pipeline needs maintenance. +3. 
**Sampling to zero** — `observability.traces.sample_ratio: 0.0` keeps the layer installed (so incoming `traceparent` is still extracted for logging) but no new traces start at the root. Cheaper than restarting if you need to drop trace volume without redeploying. + +Browser RUM has its own switch: `observability.browser.enabled: false` disables the proxy endpoint and the SPA's config payload reports `enabled: false`, so the SDK chunk is never downloaded. + +## Troubleshooting + +**Traces don't appear in the backend.** + +- Check the Codex logs for `otel_status_code=ERROR` lines or `failed to export` warnings. +- Confirm `observability.enabled` is `true` **and** `observability.otlp.endpoint` is non-empty. An enabled config with an empty endpoint is treated as a misconfiguration and the OTel layer is not installed. +- For gRPC endpoints, the URL scheme matters: `http://host:4317` for cleartext, `https://host:4317` for TLS. +- For HTTP/protobuf endpoints, the SDK appends `/v1/traces` and `/v1/metrics` to the base URL. Configure `http://collector:4318`, not `http://collector:4318/v1/traces`. + +**Metrics arrive but with the wrong tenant / project / dataset.** + +- Headers configured under `observability.otlp.headers` apply to **both** traces and metrics exports. Most multi-tenant backends use a single header (e.g. `x-honeycomb-team`); for backends that route by dataset, set the dataset header at the OTLP level too. + +**Browser traces don't show up.** + +- Confirm `GET /api/v1/observability/config` returns `enabled: true` in the response body. If it returns `enabled: false` while you have `browser.enabled: true` in YAML, the OTLP endpoint is probably empty. +- Open the network panel. Successful proxy POSTs to `/api/v1/observability/otlp/v1/traces` return `200 OK`. A `502` means the upstream collector returned an error; a `503` means the proxy is disabled or no OTLP endpoint is configured. +- The `tracer-*.js` chunk is loaded asynchronously. If it never appears in the network panel, the bootstrap probe failed or the chunk was blocked by an extension. 
+ +**`cargo build --no-default-features` after enabling observability.** + +- The `observability` feature is in `default = ["rar", "observability"]`. `--no-default-features` compiles against the stub module: all instrumentation calls become no-ops and the OTel crates are not linked. There is no runtime config change required. + +## Reference + +- [Configuration reference](./configuration#observability-configuration) — full schema and environment variable list +- [`docker-compose.yml`](https://github.com/AshDevFr/codex/blob/main/docker-compose.yml) — bundled Jaeger sidecar lives on the `dev` profile +- [OpenTelemetry Rust](https://github.com/open-telemetry/opentelemetry-rust) — SDK source +- [OpenTelemetry JS browser SDK](https://opentelemetry.io/docs/languages/js/) — browser SDK source +- [W3C Trace Context](https://www.w3.org/TR/trace-context/) — the propagation format used end-to-end diff --git a/src/api/docs.rs b/src/api/docs.rs index cc37f9d0..9ee8c162 100644 --- a/src/api/docs.rs +++ b/src/api/docs.rs @@ -390,6 +390,11 @@ The following paths are exempt from rate limiting: v1::handlers::api_keys::update_api_key, v1::handlers::api_keys::delete_api_key, + // Observability endpoints + v1::handlers::observability::get_browser_config, + v1::handlers::observability::proxy_traces, + v1::handlers::observability::proxy_metrics, + // Metrics endpoints v1::handlers::get_inventory_metrics, v1::handlers::get_plugin_metrics, @@ -594,6 +599,9 @@ The following paths are exempt from rate limiting: // App info v1::dto::AppInfoDto, + // Observability DTOs + v1::dto::BrowserObservabilityConfigDto, + // DTOs v1::dto::LoginRequest, v1::dto::LoginResponse, @@ -1119,6 +1127,7 @@ The following paths are exempt from rate limiting: (name = "User Plugins", description = "User-facing plugin management, OAuth, and configuration"), (name = "Recommendations", description = "Personalized recommendation endpoints"), (name = "Metrics", description = "Application metrics and statistics"), + (name 
= "Observability", description = "Browser RUM bootstrap configuration and OTLP forwarding proxy"), (name = "Filesystem", description = "Filesystem browsing for library paths"), (name = "Duplicates", description = "Duplicate book detection and management"), (name = "Sharing Tags", description = "Content access control tags (admin only)"), @@ -1250,7 +1259,7 @@ impl utoipa::Modify for TagGroupsModifier { }, { "name": "Administration", - "tags": ["Admin", "Settings", "Plugins", "Plugin Actions", "Metrics", "Filesystem", "Duplicates", "Sharing Tags"] + "tags": ["Admin", "Settings", "Plugins", "Plugin Actions", "Metrics", "Observability", "Filesystem", "Duplicates", "Sharing Tags"] }, { "name": "Real-time Events", diff --git a/src/api/extractors/auth.rs b/src/api/extractors/auth.rs index 0026cfbf..86fb4e67 100644 --- a/src/api/extractors/auth.rs +++ b/src/api/extractors/auth.rs @@ -180,6 +180,10 @@ pub struct AppState { pub database_config: Arc, /// PDF configuration - used for rendering settings and cache config pub pdf_config: Arc, + /// Observability configuration - used by the browser RUM SDK bootstrap + /// endpoint and the OTLP forwarding proxy. Always present; handlers gate + /// behavior on `browser.enabled` / `otlp.endpoint`. + pub observability_config: Arc, pub email_service: Arc, pub event_broadcaster: Arc, /// Settings service - used for runtime configuration diff --git a/src/api/middleware/http_metrics.rs b/src/api/middleware/http_metrics.rs new file mode 100644 index 00000000..8a0ba2b7 --- /dev/null +++ b/src/api/middleware/http_metrics.rs @@ -0,0 +1,38 @@ +//! HTTP request metrics middleware. +//! +//! Emits an OTel histogram measurement (`http.server.request.duration` in +//! seconds) with `method`, `route`, and `status_code` attributes for every +//! HTTP request. The route comes from Axum's `MatchedPath` extractor so the +//! attribute carries the template (`/api/v1/series/:id`) rather than the +//! 
resolved URL — otherwise cardinality would explode per series ID. +//! +//! Layered alongside the existing `axum-tracing-opentelemetry` span layers; +//! that crate focuses on spans, this layer focuses on metrics. + +use axum::extract::{MatchedPath, Request}; +use axum::middleware::Next; +use axum::response::Response; +use std::time::Instant; + +/// Record request duration after the inner service responds. +pub async fn http_metrics_middleware(request: Request, next: Next) -> Response { + let method = request.method().clone(); + let route = request + .extensions() + .get::() + .map(|p| p.as_str().to_string()) + .unwrap_or_else(|| "unmatched".to_string()); + + let start = Instant::now(); + let response = next.run(request).await; + let elapsed = start.elapsed().as_secs_f64(); + + crate::observability::metrics::record_http_request( + method.as_str(), + &route, + response.status().as_u16(), + elapsed, + ); + + response +} diff --git a/src/api/middleware/mod.rs b/src/api/middleware/mod.rs index b2de2e0f..b2bdc556 100644 --- a/src/api/middleware/mod.rs +++ b/src/api/middleware/mod.rs @@ -1,7 +1,9 @@ pub mod auth; +pub mod http_metrics; pub mod permissions; pub mod rate_limit; pub mod tracing; +pub use http_metrics::http_metrics_middleware; pub use rate_limit::RateLimitLayer; pub use tracing::create_trace_layer; diff --git a/src/api/routes/mod.rs b/src/api/routes/mod.rs index a4a9fc92..a5d6484c 100644 --- a/src/api/routes/mod.rs +++ b/src/api/routes/mod.rs @@ -168,10 +168,22 @@ pub fn create_router(state: Arc, config: &Config) -> Router { }, )); - // Add request tracing middleware (outermost layer) + // Add request tracing middleware // This logs all HTTP requests/responses with method, path, status, and latency // Logs at debug level for normal requests, error level for 5xx responses router = router.layer(create_trace_layer()); + // OpenTelemetry HTTP request-duration histogram (no-op when observability + // is disabled). 
Layered after the trace layer so request timing here is + // bounded by the same span the OTel server span covers. + router = router.layer(axum::middleware::from_fn( + crate::api::middleware::http_metrics_middleware, + )); + + // OpenTelemetry HTTP span / response context middleware (outermost layer). + // No-op when the `observability` feature is disabled or + // `observability.enabled` is false in config. + router = crate::observability::install_http_layers(router, &config.observability); + router } diff --git a/src/api/routes/v1/dto/mod.rs b/src/api/routes/v1/dto/mod.rs index 1c8520ab..6259f39b 100644 --- a/src/api/routes/v1/dto/mod.rs +++ b/src/api/routes/v1/dto/mod.rs @@ -15,6 +15,7 @@ pub mod info; pub mod library; pub mod library_jobs; pub mod metrics; +pub mod observability; pub mod oidc; pub mod page; pub mod patch; @@ -51,6 +52,7 @@ pub use info::*; pub use library::*; pub use library_jobs::*; pub use metrics::*; +pub use observability::*; pub use oidc::*; pub use page::*; pub use pdf_cache::*; diff --git a/src/api/routes/v1/dto/observability.rs b/src/api/routes/v1/dto/observability.rs new file mode 100644 index 00000000..de99f2e1 --- /dev/null +++ b/src/api/routes/v1/dto/observability.rs @@ -0,0 +1,36 @@ +//! Observability DTOs +//! +//! Describes the configuration the browser-side OpenTelemetry SDK needs to +//! bootstrap itself. Secrets (collector auth headers, endpoint hostnames) +//! stay server-side — this payload only carries enough info for the SDK to +//! decide whether to start and where on the Codex origin to POST batches. + +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +/// Browser RUM bootstrap configuration returned by +/// `GET /api/v1/observability/config`. +#[derive(Debug, Serialize, Deserialize, ToSchema, Clone)] +#[serde(rename_all = "camelCase")] +pub struct BrowserObservabilityConfigDto { + /// Whether the browser SDK should initialize. False means the SDK + /// bootstrap is a no-op even if the script is loaded. 
+ pub enabled: bool, + + /// `service.name` resource attribute the browser SDK should set on + /// every span (matches the backend service name unless the operator + /// overrode it specifically for the browser). + #[schema(example = "codex-web")] + pub service_name: String, + + /// Same-origin path prefix on the Codex server where the browser SDK + /// should POST OTLP batches. The SDK appends `/v1/traces` and + /// `/v1/metrics` to this base. + #[schema(example = "/api/v1/observability/otlp")] + pub proxy_path: String, + + /// Parent-based sampling ratio applied client-side. Browsers are noisy; + /// default low. + #[schema(example = 0.1)] + pub sample_ratio: f64, +} diff --git a/src/api/routes/v1/handlers/mod.rs b/src/api/routes/v1/handlers/mod.rs index 3c0524a6..9b0ca72d 100644 --- a/src/api/routes/v1/handlers/mod.rs +++ b/src/api/routes/v1/handlers/mod.rs @@ -54,6 +54,7 @@ pub mod info; pub mod libraries; pub mod library_jobs; pub mod metrics; +pub mod observability; pub mod oidc; pub mod pages; pub mod pdf_cache; diff --git a/src/api/routes/v1/handlers/observability.rs b/src/api/routes/v1/handlers/observability.rs new file mode 100644 index 00000000..6a8fe08f --- /dev/null +++ b/src/api/routes/v1/handlers/observability.rs @@ -0,0 +1,227 @@ +//! Browser RUM bootstrap + OTLP forwarding proxy handlers. +//! +//! The browser SDK runs server-side configuration on startup +//! ([`get_browser_config`]) and then POSTs OTLP/HTTP batches to +//! [`proxy_traces`] / [`proxy_metrics`]. The proxy forwards the body +//! verbatim to the operator-configured upstream collector with the +//! operator-configured headers attached, avoiding CORS hops and keeping +//! collector auth tokens out of the browser. 
+ +use std::sync::Arc; +use std::time::Duration; + +use axum::{ + Json, + body::Bytes, + extract::State, + http::{HeaderMap, StatusCode, header}, + response::{IntoResponse, Response}, +}; +use tokio::sync::OnceCell; + +use crate::api::{ + error::ApiError, + extractors::{AppState, FlexibleAuthContext}, +}; +use crate::config::ObservabilityConfig; + +use super::super::dto::BrowserObservabilityConfigDto; + +/// Maximum accepted body size for a single OTLP POST. 4 MiB matches the +/// default Collector grpc/HTTP receiver limit and is well above any +/// reasonable browser batch (default batch flushes at 512 spans, ~50 KB). +const MAX_OTLP_BODY_BYTES: usize = 4 * 1024 * 1024; + +/// Reusable HTTP client for the upstream OTLP forward. +/// +/// Built lazily on first use so the timeout matches whatever +/// `observability.otlp.timeout_ms` was configured at startup. A single +/// client serves every forward — its connection pool is the reason we +/// don't construct one per request. +static UPSTREAM_CLIENT: OnceCell = OnceCell::const_new(); + +async fn upstream_client( + config: &ObservabilityConfig, +) -> Result<&'static reqwest::Client, ApiError> { + UPSTREAM_CLIENT + .get_or_try_init(|| async { + reqwest::Client::builder() + .timeout(Duration::from_millis(config.otlp.timeout_ms)) + .build() + .map_err(|e| { + ApiError::Internal(format!( + "Failed to build observability proxy HTTP client: {e}" + )) + }) + }) + .await +} + +/// Return the configuration the browser SDK needs to bootstrap itself. +/// +/// Authenticated to keep the response (which leaks the sample ratio / +/// proxy path / service name) inside the existing trust boundary; +/// everything sensitive (endpoint, headers) stays server-side. 
+#[utoipa::path( + get, + path = "/api/v1/observability/config", + responses( + (status = 200, description = "Browser SDK bootstrap config", body = BrowserObservabilityConfigDto), + (status = 401, description = "Unauthorized"), + ), + security( + ("jwt_bearer" = []), + ("api_key" = []) + ), + tag = "Observability" +)] +pub async fn get_browser_config( + State(state): State>, + _auth: FlexibleAuthContext, +) -> Json { + let cfg = &state.observability_config; + Json(BrowserObservabilityConfigDto { + enabled: cfg.browser.enabled && !cfg.otlp.endpoint.trim().is_empty(), + service_name: cfg.service_name.clone(), + proxy_path: cfg.browser.proxy_path.clone(), + sample_ratio: cfg.browser.sample_ratio, + }) +} + +/// Forward a batched OTLP/HTTP traces payload to the configured upstream. +#[utoipa::path( + post, + path = "/api/v1/observability/otlp/v1/traces", + request_body(content_type = "application/x-protobuf", description = "OTLP/HTTP traces payload (protobuf or JSON)"), + responses( + (status = 200, description = "Forwarded successfully"), + (status = 400, description = "Payload too large"), + (status = 401, description = "Unauthorized"), + (status = 502, description = "Upstream collector error"), + (status = 503, description = "Browser observability disabled"), + ), + security( + ("jwt_bearer" = []), + ("api_key" = []) + ), + tag = "Observability" +)] +pub async fn proxy_traces( + state: State>, + auth: FlexibleAuthContext, + headers: HeaderMap, + body: Bytes, +) -> Result { + forward_otlp(state, auth, headers, body, "v1/traces").await +} + +/// Forward a batched OTLP/HTTP metrics payload to the configured upstream. 
+#[utoipa::path( + post, + path = "/api/v1/observability/otlp/v1/metrics", + request_body(content_type = "application/x-protobuf", description = "OTLP/HTTP metrics payload (protobuf or JSON)"), + responses( + (status = 200, description = "Forwarded successfully"), + (status = 400, description = "Payload too large"), + (status = 401, description = "Unauthorized"), + (status = 502, description = "Upstream collector error"), + (status = 503, description = "Browser observability disabled"), + ), + security( + ("jwt_bearer" = []), + ("api_key" = []) + ), + tag = "Observability" +)] +pub async fn proxy_metrics( + state: State>, + auth: FlexibleAuthContext, + headers: HeaderMap, + body: Bytes, +) -> Result { + forward_otlp(state, auth, headers, body, "v1/metrics").await +} + +async fn forward_otlp( + State(state): State>, + _auth: FlexibleAuthContext, + headers: HeaderMap, + body: Bytes, + signal_suffix: &'static str, +) -> Result { + let cfg = state.observability_config.clone(); + + if !cfg.browser.enabled { + return Err(ApiError::ServiceUnavailable( + "Browser observability is disabled".to_string(), + )); + } + + let upstream_base = cfg.otlp.endpoint.trim(); + if upstream_base.is_empty() { + return Err(ApiError::ServiceUnavailable( + "OTLP endpoint not configured".to_string(), + )); + } + + if body.len() > MAX_OTLP_BODY_BYTES { + return Err(ApiError::BadRequest(format!( + "OTLP payload exceeds {}-byte limit", + MAX_OTLP_BODY_BYTES + ))); + } + + // Preserve the inbound content-type so the upstream can parse + // protobuf vs. JSON correctly. Default to protobuf since that's what + // the OTel JS exporter uses by default. 
+ let content_type = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("application/x-protobuf") + .to_string(); + + let client = upstream_client(&cfg).await?; + let upstream_url = format!("{}/{}", upstream_base.trim_end_matches('/'), signal_suffix); + + let mut req = client + .post(&upstream_url) + .header(header::CONTENT_TYPE, content_type) + .body(body); + + // Layer the operator-configured headers last so they win over any + // header that might have come from the browser. Browser-supplied + // headers (other than content-type, which we set explicitly above) + // are intentionally dropped. + for (k, v) in cfg.otlp.headers.iter() { + req = req.header(k, v); + } + + let upstream_response = req.send().await.map_err(|e| { + tracing::warn!(error = %e, url = %upstream_url, "OTLP forward failed"); + ApiError::Internal(format!("Failed to reach OTLP upstream: {e}")) + })?; + + let status = upstream_response.status(); + let upstream_body = upstream_response.bytes().await.unwrap_or_default(); + + if !status.is_success() { + tracing::warn!( + status = %status, + url = %upstream_url, + "OTLP upstream returned non-success" + ); + return Ok(( + StatusCode::BAD_GATEWAY, + [(header::CONTENT_TYPE, "application/octet-stream")], + upstream_body, + ) + .into_response()); + } + + Ok(( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/octet-stream")], + upstream_body, + ) + .into_response()) +} diff --git a/src/api/routes/v1/routes/mod.rs b/src/api/routes/v1/routes/mod.rs index e47cc76a..41ef2929 100644 --- a/src/api/routes/v1/routes/mod.rs +++ b/src/api/routes/v1/routes/mod.rs @@ -8,6 +8,7 @@ mod auth; mod books; mod libraries; mod misc; +mod observability; mod oidc; mod plugins; mod recommendations; @@ -43,6 +44,7 @@ pub fn create_router(state: Arc) -> Router { .merge(user_plugins::routes(state.clone())) .merge(recommendations::routes(state.clone())) .merge(releases::routes(state.clone())) + .merge(observability::routes(state.clone())) // 
Apply state to all routes .with_state(state) } diff --git a/src/api/routes/v1/routes/observability.rs b/src/api/routes/v1/routes/observability.rs new file mode 100644 index 00000000..53a1e02b --- /dev/null +++ b/src/api/routes/v1/routes/observability.rs @@ -0,0 +1,43 @@ +//! Observability routes +//! +//! Routes for the browser RUM bootstrap configuration endpoint and the +//! OTLP/HTTP forwarding proxy. The OTLP routes accept raw bodies (JSON or +//! protobuf) and forward them to the operator-configured upstream +//! collector. + +use super::super::handlers; +use crate::api::extractors::AppState; +use axum::{ + Router, + extract::DefaultBodyLimit, + routing::{get, post}, +}; +use std::sync::Arc; + +/// 4 MiB upper bound on inbound OTLP POST bodies. Mirrors the default +/// collector receiver limit; the OTLP-JS exporter flushes well below this +/// (default batch hits ~50 KB). Anything above this is almost certainly +/// abuse, so we reject at the body extractor instead of forwarding. +const MAX_PROXY_BODY_BYTES: usize = 4 * 1024 * 1024; + +/// Routes: +/// - GET /observability/config - Browser SDK bootstrap config +/// - POST /observability/otlp/v1/traces - Forward traces to upstream OTLP +/// - POST /observability/otlp/v1/metrics - Forward metrics to upstream OTLP +pub fn routes(_state: Arc) -> Router> { + Router::new() + .route( + "/observability/config", + get(handlers::observability::get_browser_config), + ) + .route( + "/observability/otlp/v1/traces", + post(handlers::observability::proxy_traces) + .layer(DefaultBodyLimit::max(MAX_PROXY_BODY_BYTES)), + ) + .route( + "/observability/otlp/v1/metrics", + post(handlers::observability::proxy_metrics) + .layer(DefaultBodyLimit::max(MAX_PROXY_BODY_BYTES)), + ) +} diff --git a/src/commands/common.rs b/src/commands/common.rs index ef27e4c7..6c4d38de 100644 --- a/src/commands/common.rs +++ b/src/commands/common.rs @@ -1,6 +1,7 @@ use crate::config::{Config, DatabaseConfig, DatabaseType, EnvOverride}; use 
crate::db::Database; use crate::events::EventBroadcaster; +use crate::observability::ObservabilityHandle; use crate::services::{SettingsService, TaskMetricsService}; use crate::tasks::TaskWorker; use sea_orm::DatabaseConnection; @@ -110,16 +111,30 @@ pub fn load_config(config_path: PathBuf) -> anyhow::Result<(Config, bool)> { Ok((config, config_created)) } -/// Initialize tracing with config -/// Returns an optional guard that must be kept alive and the log level string -pub fn init_tracing( - config: &Config, -) -> anyhow::Result<(Option, String)> { - use std::fs; - use std::io; - use tracing_subscriber::fmt::writer::MakeWriterExt; +/// Bundle of long-lived guards returned by [`init_tracing`]. +/// +/// `file_guard` keeps the non-blocking file appender's worker thread alive, +/// `observability` owns the OTel providers so [`ObservabilityHandle::shutdown`] +/// can flush them on graceful exit, and `log_level` is the effective filter +/// string for diagnostic logging. +pub struct TracingHandles { + pub file_guard: Option, + pub observability: ObservabilityHandle, + pub log_level: String, +} - // Get log level from config or environment +/// Initialize tracing with config. +/// +/// Composes the existing fmt + file appender with an optional OpenTelemetry +/// layer when `observability.enabled` is true. Returns a [`TracingHandles`] +/// bundle that the caller is expected to keep alive for the process lifetime +/// and to drive shutdown through. +pub fn init_tracing(config: &Config) -> anyhow::Result { + use tracing_subscriber::{fmt, layer::SubscriberExt, util::SubscriberInitExt}; + + // Resolve the effective log filter: explicit RUST_LOG wins, then config. + // At info/warn/error we silence sqlx down to warn (it is otherwise noisy + // at info), preserving the user's level for the rest of the workspace. 
let log_level = if let Ok(env_log) = std::env::var("RUST_LOG") { if env_log.contains("sqlx=") { env_log @@ -143,79 +158,109 @@ pub fn init_tracing( }; let env_filter = EnvFilter::new(&log_level); - let console_enabled = config.logging.console; - - let guard = match (console_enabled, &config.logging.file) { - (true, Some(log_path)) => { - let log_path = std::path::Path::new(log_path); - if let Some(parent) = log_path.parent() { - fs::create_dir_all(parent)?; - } - let directory = log_path - .parent() - .unwrap_or_else(|| std::path::Path::new(".")); - let filename = log_path - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or("codex.log"); - - let file_appender = tracing_appender::rolling::daily(directory, filename); - let (non_blocking, guard) = tracing_appender::non_blocking(file_appender); - - let writer = io::stdout.and(non_blocking); + // Build the writer + keep the appender's worker guard alive. Branches on + // the (console, file) matrix and erases the writer type via `BoxMakeWriter` + // so the registry composition stays uniform. + let (writer, file_guard, ansi_enabled) = + build_log_writer(config.logging.console, config.logging.file.as_deref())?; + + // Initialize OTel providers (no-op when disabled or feature off). Done + // before constructing the bridge layer so the global tracer is in place + // for any code that grabs it via `global::tracer(...)` later. + let observability = crate::observability::init(&config.observability)?; + + let fmt_layer = fmt::layer() + .with_writer(writer) + .with_ansi(ansi_enabled) + .event_format(crate::observability::TraceContextFormat::default()); + + // Compose subscribers inline: a generic helper here trips up the + // Layer/Subscriber bounds because each `.with(...)` changes S, so the + // inline form is the cleanest path. Keep the two branches in sync. + // + // `try_init().ok()` (instead of `init()`) so a second call in the same + // process — e.g. 
tests that drive migrate + wait_for_migrations back to + // back — no-ops on the global subscriber instead of panicking. + #[cfg(feature = "observability")] + { + let otel_layer = observability + .tracer() + .cloned() + .map(|t| tracing_opentelemetry::layer().with_tracer(t)); + tracing_subscriber::registry() + .with(env_filter) + .with(fmt_layer) + .with(otel_layer) + .try_init() + .ok(); + } + #[cfg(not(feature = "observability"))] + { + tracing_subscriber::registry() + .with(env_filter) + .with(fmt_layer) + .try_init() + .ok(); + } - tracing_subscriber::fmt() - .with_env_filter(env_filter) - .with_writer(writer) - .try_init() - .ok(); + Ok(TracingHandles { + file_guard, + observability, + log_level, + }) +} - Some(guard) - } - (true, None) => { - tracing_subscriber::fmt() - .with_env_filter(env_filter) - .try_init() - .ok(); - None - } - (false, Some(log_path)) => { - let log_path = std::path::Path::new(log_path); - if let Some(parent) = log_path.parent() { - fs::create_dir_all(parent)?; - } +/// Build a `MakeWriter` covering the (console, file) matrix. +/// +/// Returns a type-erased writer plus the file appender's worker guard (when +/// applicable) and whether ANSI escapes should be emitted (off for file-only +/// output to keep log files plain text). 
+fn build_log_writer( + console_enabled: bool, + log_file: Option<&str>, +) -> anyhow::Result<( + tracing_subscriber::fmt::writer::BoxMakeWriter, + Option, + bool, +)> { + use std::io; + use tracing_subscriber::fmt::writer::{BoxMakeWriter, MakeWriterExt}; - let directory = log_path - .parent() - .unwrap_or_else(|| std::path::Path::new(".")); - let filename = log_path - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or("codex.log"); - - let file_appender = tracing_appender::rolling::daily(directory, filename); - let (non_blocking, guard) = tracing_appender::non_blocking(file_appender); - - tracing_subscriber::fmt() - .with_env_filter(env_filter) - .with_writer(non_blocking) - .with_ansi(false) - .try_init() - .ok(); - - Some(guard) + match (console_enabled, log_file) { + (true, Some(path)) => { + let (non_blocking, guard) = build_file_appender(path)?; + let combined = io::stdout.and(non_blocking); + Ok((BoxMakeWriter::new(combined), Some(guard), true)) } - (false, None) => { - tracing_subscriber::fmt() - .with_env_filter(env_filter) - .try_init() - .ok(); - None + (true, None) => Ok((BoxMakeWriter::new(io::stdout), None, true)), + (false, Some(path)) => { + let (non_blocking, guard) = build_file_appender(path)?; + Ok((BoxMakeWriter::new(non_blocking), Some(guard), false)) } - }; + (false, None) => Ok((BoxMakeWriter::new(io::sink), None, false)), + } +} - Ok((guard, log_level)) +fn build_file_appender( + log_path: &str, +) -> anyhow::Result<( + tracing_appender::non_blocking::NonBlocking, + tracing_appender::non_blocking::WorkerGuard, +)> { + let log_path = std::path::Path::new(log_path); + if let Some(parent) = log_path.parent() { + fs::create_dir_all(parent)?; + } + let directory = log_path + .parent() + .unwrap_or_else(|| std::path::Path::new(".")); + let filename = log_path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("codex.log"); + let file_appender = tracing_appender::rolling::daily(directory, filename); + 
Ok(tracing_appender::non_blocking(file_appender)) } /// Display database configuration diff --git a/src/commands/migrate.rs b/src/commands/migrate.rs index 7a12d856..e29d3104 100644 --- a/src/commands/migrate.rs +++ b/src/commands/migrate.rs @@ -9,9 +9,9 @@ pub async fn migrate_command(config_path: PathBuf) -> Result<()> { // Load configuration let (config, _config_created) = load_config(config_path.clone())?; - // Initialize tracing with config - let (_log_guard, log_level) = init_tracing(&config)?; - info!("Logging level: {}", log_level); + // Initialize tracing with config (composes fmt + optional OTel layer) + let _tracing_handles = init_tracing(&config)?; + info!("Logging level: {}", _tracing_handles.log_level); info!("Loading configuration from {:?}", config_path); info!("Configuration loaded successfully"); diff --git a/src/commands/serve.rs b/src/commands/serve.rs index ad819672..85139794 100644 --- a/src/commands/serve.rs +++ b/src/commands/serve.rs @@ -1,6 +1,7 @@ use crate::commands::common::{ - display_database_config, ensure_data_directories, get_worker_count, init_database, - init_settings_service, init_tracing, load_config, shutdown_workers, spawn_workers, + TracingHandles, display_database_config, ensure_data_directories, get_worker_count, + init_database, init_settings_service, init_tracing, load_config, shutdown_workers, + spawn_workers, }; use crate::config::DatabaseType; use std::path::PathBuf; @@ -14,9 +15,14 @@ pub async fn serve_command(config_path: PathBuf) -> anyhow::Result<()> { // Load configuration let (config, config_created) = load_config(config_path.clone())?; - // Initialize tracing with config - let (log_guard, log_level) = init_tracing(&config)?; - info!("Logging level: {}", log_level); + // Initialize tracing with config (composes fmt + optional OTel layer) + let tracing_handles = init_tracing(&config)?; + info!("Logging level: {}", tracing_handles.log_level); + info!( + "Observability: traces={}, metrics={}", + 
tracing_handles.observability.traces_enabled(), + tracing_handles.observability.metrics_enabled(), + ); if config_created { info!("Created default configuration file"); @@ -131,6 +137,15 @@ pub async fn serve_command(config_path: PathBuf) -> anyhow::Result<()> { .start_background_jobs(background_task_cancel.clone()); info!("Task metrics background jobs started"); + // Refresh the inventory metric snapshot every 30s so the OTel observable + // gauges have current values. Cheap: five `COUNT(*)` queries. The poller + // exits as soon as the cancellation token fires. + let inventory_poller_handle = crate::observability::inventory::spawn_poller( + Arc::new(db.sea_orm_connection().clone()), + std::time::Duration::from_secs(30), + background_task_cancel.clone(), + ); + // Initialize read progress batching service let read_progress_service = Arc::new(crate::services::ReadProgressService::new( db.sea_orm_connection().clone(), @@ -449,6 +464,7 @@ pub async fn serve_command(config_path: PathBuf) -> anyhow::Result<()> { auth_config: Arc::new(config.auth.clone()), database_config: Arc::new(config.database.clone()), pdf_config: Arc::new(config.pdf.clone()), + observability_config: Arc::new(config.observability.clone()), email_service, event_broadcaster: event_broadcaster.clone(), settings_service, @@ -504,8 +520,14 @@ pub async fn serve_command(config_path: PathBuf) -> anyhow::Result<()> { info!(" GET {} - API docs (Scalar)", config.api.api_docs_path); } - // Keep log guard alive - let _log_guard = log_guard; + // Destructure the tracing handles: keep file guard alive for the + // remainder of `serve_command`, and hold onto the OTel guard so we can + // flush providers explicitly during graceful shutdown. 
+ let TracingHandles { + file_guard: _log_guard, + observability: observability_handle, + log_level: _, + } = tracing_handles; // Start server info!("========================================"); @@ -550,6 +572,12 @@ pub async fn serve_command(config_path: PathBuf) -> anyhow::Result<()> { } } + // Await inventory metrics poller completion + info!("Waiting for inventory metrics poller to complete..."); + if let Err(e) = inventory_poller_handle.await { + tracing::warn!("Inventory metrics poller panicked: {}", e); + } + // Await read progress background flush task completion info!("Waiting for read progress flush task to complete..."); if let Err(e) = read_progress_handle.await { @@ -601,6 +629,12 @@ pub async fn serve_command(config_path: PathBuf) -> anyhow::Result<()> { shutdown_workers(worker_handles, worker_shutdown_channels, worker_count).await; } + // Flush + shut down OTel providers (no-op when observability is disabled). + // Done last so any spans emitted during shutdown still get exported. 
+ info!("Flushing OpenTelemetry providers..."); + observability_handle.shutdown(); + info!("OpenTelemetry providers flushed"); + info!("Shutdown complete"); server_result?; Ok(()) diff --git a/src/commands/wait_for_migrations.rs b/src/commands/wait_for_migrations.rs index 77b35fd3..98c6a80a 100644 --- a/src/commands/wait_for_migrations.rs +++ b/src/commands/wait_for_migrations.rs @@ -14,9 +14,9 @@ pub async fn wait_for_migrations_command( // Load configuration let (config, _config_created) = load_config(config_path.clone())?; - // Initialize tracing with config - let (_log_guard, log_level) = init_tracing(&config)?; - info!("Logging level: {}", log_level); + // Initialize tracing with config (composes fmt + optional OTel layer) + let _tracing_handles = init_tracing(&config)?; + info!("Logging level: {}", _tracing_handles.log_level); info!("Loading configuration from {:?}", config_path); info!("Configuration loaded successfully"); diff --git a/src/commands/worker.rs b/src/commands/worker.rs index 4fb15563..8ea3126b 100644 --- a/src/commands/worker.rs +++ b/src/commands/worker.rs @@ -1,6 +1,7 @@ use crate::commands::common::{ - display_database_config, ensure_data_directories, get_worker_count, init_database, - init_settings_service, init_tracing, load_config, shutdown_workers, spawn_workers, + TracingHandles, display_database_config, ensure_data_directories, get_worker_count, + init_database, init_settings_service, init_tracing, load_config, shutdown_workers, + spawn_workers, }; use std::path::PathBuf; use std::sync::Arc; @@ -13,9 +14,9 @@ pub async fn worker_command(config_path: PathBuf) -> anyhow::Result<()> { // Load configuration let (config, _config_created) = load_config(config_path.clone())?; - // Initialize tracing with config - let (log_guard, log_level) = init_tracing(&config)?; - info!("Logging level: {}", log_level); + // Initialize tracing with config (composes fmt + optional OTel layer) + let tracing_handles = init_tracing(&config)?; + info!("Logging 
level: {}", tracing_handles.log_level); info!("Loading configuration from {:?}", config_path); info!("Configuration loaded successfully"); @@ -180,8 +181,12 @@ pub async fn worker_command(config_path: PathBuf) -> anyhow::Result<()> { info!(" Press Ctrl+C to stop"); info!("========================================"); - // Keep log guard alive - let _log_guard = log_guard; + // Keep log guard alive; hold the observability handle until graceful exit. + let TracingHandles { + file_guard: _log_guard, + observability: observability_handle, + log_level: _, + } = tracing_handles; // Wait for shutdown signal shutdown_signal().await; @@ -208,6 +213,10 @@ pub async fn worker_command(config_path: PathBuf) -> anyhow::Result<()> { // Shutdown workers shutdown_workers(worker_handles, worker_shutdown_channels, worker_count).await; + // Flush + shut down OTel providers (no-op when observability is disabled). + info!("Flushing OpenTelemetry providers..."); + observability_handle.shutdown(); + info!("Shutdown complete"); Ok(()) diff --git a/src/config/env_override.rs b/src/config/env_override.rs index 6adabc8b..90f8d631 100644 --- a/src/config/env_override.rs +++ b/src/config/env_override.rs @@ -1,8 +1,10 @@ #[allow(unused_imports)] use super::types::{ ApiConfig, ApplicationConfig, AuthConfig, Config, DatabaseConfig, DatabaseType, FilesConfig, - KomgaApiConfig, KoreaderApiConfig, LogLevel, LoggingConfig, OidcConfig, OidcDefaultRole, - OidcProviderConfig, PostgresConfig, RateLimitConfig, SQLiteConfig, ScannerConfig, TaskConfig, + KomgaApiConfig, KoreaderApiConfig, LogLevel, LoggingConfig, ObservabilityBrowserConfig, + ObservabilityConfig, ObservabilityMetricsConfig, ObservabilityTracesConfig, OidcConfig, + OidcDefaultRole, OidcProviderConfig, OtlpConfig, OtlpProtocol, PostgresConfig, RateLimitConfig, + SQLiteConfig, ScannerConfig, TaskConfig, }; use std::collections::HashMap; use std::env; @@ -236,6 +238,112 @@ impl EnvOverride for Config { .apply_env_overrides(&format!("{}_KOMGA_API", 
prefix)); self.rate_limit .apply_env_overrides(&format!("{}_RATE_LIMIT", prefix)); + self.observability + .apply_env_overrides(&format!("{}_OBSERVABILITY", prefix)); + } +} + +impl EnvOverride for ObservabilityConfig { + fn apply_env_overrides(&mut self, prefix: &str) { + if let Ok(enabled) = env::var(format!("{}_ENABLED", prefix)) { + self.enabled = enabled.eq_ignore_ascii_case("true") || enabled == "1"; + } + if let Ok(service_name) = env::var(format!("{}_SERVICE_NAME", prefix)) + && !service_name.is_empty() + { + self.service_name = service_name; + } + self.otlp.apply_env_overrides(&format!("{}_OTLP", prefix)); + self.traces + .apply_env_overrides(&format!("{}_TRACES", prefix)); + self.metrics + .apply_env_overrides(&format!("{}_METRICS", prefix)); + self.browser + .apply_env_overrides(&format!("{}_BROWSER", prefix)); + } +} + +impl EnvOverride for OtlpConfig { + fn apply_env_overrides(&mut self, prefix: &str) { + if let Ok(endpoint) = env::var(format!("{}_ENDPOINT", prefix)) { + self.endpoint = endpoint; + } + if let Ok(protocol) = env::var(format!("{}_PROTOCOL", prefix)) { + self.protocol = match protocol.to_lowercase().as_str() { + "grpc" => OtlpProtocol::Grpc, + "http/protobuf" | "http-protobuf" | "http_protobuf" | "httpproto" => { + OtlpProtocol::HttpProtobuf + } + "http/json" | "http-json" | "http_json" => OtlpProtocol::HttpJson, + _ => self.protocol, + }; + } + if let Ok(headers) = env::var(format!("{}_HEADERS", prefix)) { + // Format: "k1=v1,k2=v2". Empty pairs are skipped. 
+ self.headers.clear(); + for entry in headers.split(',') { + let entry = entry.trim(); + if entry.is_empty() { + continue; + } + if let Some((k, v)) = entry.split_once('=') { + let k = k.trim(); + let v = v.trim(); + if !k.is_empty() { + self.headers.insert(k.to_string(), v.to_string()); + } + } + } + } + if let Ok(timeout_ms) = env::var(format!("{}_TIMEOUT_MS", prefix)) + && let Ok(ms) = timeout_ms.parse::() + { + self.timeout_ms = ms; + } + } +} + +impl EnvOverride for ObservabilityTracesConfig { + fn apply_env_overrides(&mut self, prefix: &str) { + if let Ok(enabled) = env::var(format!("{}_ENABLED", prefix)) { + self.enabled = enabled.eq_ignore_ascii_case("true") || enabled == "1"; + } + if let Ok(sample_ratio) = env::var(format!("{}_SAMPLE_RATIO", prefix)) + && let Ok(ratio) = sample_ratio.parse::() + { + self.sample_ratio = ratio; + } + } +} + +impl EnvOverride for ObservabilityMetricsConfig { + fn apply_env_overrides(&mut self, prefix: &str) { + if let Ok(enabled) = env::var(format!("{}_ENABLED", prefix)) { + self.enabled = enabled.eq_ignore_ascii_case("true") || enabled == "1"; + } + if let Ok(interval_ms) = env::var(format!("{}_EXPORT_INTERVAL_MS", prefix)) + && let Ok(ms) = interval_ms.parse::() + { + self.export_interval_ms = ms; + } + } +} + +impl EnvOverride for ObservabilityBrowserConfig { + fn apply_env_overrides(&mut self, prefix: &str) { + if let Ok(enabled) = env::var(format!("{}_ENABLED", prefix)) { + self.enabled = enabled.eq_ignore_ascii_case("true") || enabled == "1"; + } + if let Ok(proxy_path) = env::var(format!("{}_PROXY_PATH", prefix)) + && !proxy_path.is_empty() + { + self.proxy_path = proxy_path; + } + if let Ok(sample_ratio) = env::var(format!("{}_SAMPLE_RATIO", prefix)) + && let Ok(ratio) = sample_ratio.parse::() + { + self.sample_ratio = ratio; + } } } @@ -636,8 +744,8 @@ mod tests { // We'll use a helper to create a minimal config use crate::config::{ ApiConfig, ApplicationConfig, AuthConfig, DatabaseConfig, DatabaseType, 
EmailConfig, - FilesConfig, KomgaApiConfig, LoggingConfig, PdfConfig, PdfHandleCacheConfig, - RateLimitConfig, SQLiteConfig, SchedulerConfig, + FilesConfig, KomgaApiConfig, LoggingConfig, ObservabilityConfig, PdfConfig, + PdfHandleCacheConfig, RateLimitConfig, SQLiteConfig, SchedulerConfig, }; let mut config = Config { data_dir: "data".to_string(), @@ -670,6 +778,7 @@ mod tests { komga_api: KomgaApiConfig::default(), koreader_api: KoreaderApiConfig::default(), rate_limit: RateLimitConfig::default(), + observability: ObservabilityConfig::default(), }; // Set env vars BEFORE applying overrides @@ -824,8 +933,8 @@ mod tests { use crate::config::{ ApiConfig, ApplicationConfig, AuthConfig, DatabaseConfig, DatabaseType, EmailConfig, - FilesConfig, KomgaApiConfig, LoggingConfig, PdfConfig, PdfHandleCacheConfig, - RateLimitConfig, SQLiteConfig, SchedulerConfig, + FilesConfig, KomgaApiConfig, LoggingConfig, ObservabilityConfig, PdfConfig, + PdfHandleCacheConfig, RateLimitConfig, SQLiteConfig, SchedulerConfig, }; let mut config = Config { data_dir: "data".to_string(), @@ -861,6 +970,7 @@ mod tests { }, koreader_api: KoreaderApiConfig::default(), rate_limit: RateLimitConfig::default(), + observability: ObservabilityConfig::default(), }; set_var("CODEX_KOMGA_API_ENABLED", "true"); @@ -1498,4 +1608,63 @@ mod tests { remove_var("CODEX_DATA_DIR"); } + + #[test] + #[serial] + fn test_observability_env_override_all_fields() { + // Cover every leaf field at least once so a regression in the + // env_override impl is caught here rather than at runtime. 
+ let vars = [ + ("CODEX_OBSERVABILITY_ENABLED", "true"), + ("CODEX_OBSERVABILITY_SERVICE_NAME", "codex-staging"), + ( + "CODEX_OBSERVABILITY_OTLP_ENDPOINT", + "https://otel.example.com:4317", + ), + ("CODEX_OBSERVABILITY_OTLP_PROTOCOL", "http/protobuf"), + ("CODEX_OBSERVABILITY_OTLP_TIMEOUT_MS", "9000"), + ( + "CODEX_OBSERVABILITY_OTLP_HEADERS", + "x-tenant=acme,x-key=secret", + ), + ("CODEX_OBSERVABILITY_TRACES_ENABLED", "false"), + ("CODEX_OBSERVABILITY_TRACES_SAMPLE_RATIO", "0.3"), + ("CODEX_OBSERVABILITY_METRICS_ENABLED", "false"), + ("CODEX_OBSERVABILITY_METRICS_EXPORT_INTERVAL_MS", "60000"), + ("CODEX_OBSERVABILITY_BROWSER_ENABLED", "true"), + ("CODEX_OBSERVABILITY_BROWSER_PROXY_PATH", "/proxy"), + ("CODEX_OBSERVABILITY_BROWSER_SAMPLE_RATIO", "0.7"), + ]; + for (k, _) in vars.iter() { + remove_var(k); + } + for (k, v) in vars.iter() { + set_var(k, v); + } + + let mut config = crate::config::ObservabilityConfig::default(); + config.apply_env_overrides("CODEX_OBSERVABILITY"); + + assert!(config.enabled); + assert_eq!(config.service_name, "codex-staging"); + assert_eq!(config.otlp.endpoint, "https://otel.example.com:4317"); + assert!(matches!( + config.otlp.protocol, + crate::config::OtlpProtocol::HttpProtobuf + )); + assert_eq!(config.otlp.timeout_ms, 9000); + assert_eq!(config.otlp.headers.get("x-tenant"), Some(&"acme".into())); + assert_eq!(config.otlp.headers.get("x-key"), Some(&"secret".into())); + assert!(!config.traces.enabled); + assert!((config.traces.sample_ratio - 0.3).abs() < f64::EPSILON); + assert!(!config.metrics.enabled); + assert_eq!(config.metrics.export_interval_ms, 60000); + assert!(config.browser.enabled); + assert_eq!(config.browser.proxy_path, "/proxy"); + assert!((config.browser.sample_ratio - 0.7).abs() < f64::EPSILON); + + for (k, _) in vars.iter() { + remove_var(k); + } + } } diff --git a/src/config/loader.rs b/src/config/loader.rs index 3bd647bd..b19816cb 100644 --- a/src/config/loader.rs +++ b/src/config/loader.rs @@ -22,9 +22,9 @@ 
mod tests { use super::*; use crate::config::{ ApiConfig, ApplicationConfig, AuthConfig, DatabaseConfig, DatabaseType, EmailConfig, - FilesConfig, KomgaApiConfig, KoreaderApiConfig, LoggingConfig, PdfConfig, - PdfHandleCacheConfig, RateLimitConfig, SQLiteConfig, ScannerConfig, SchedulerConfig, - TaskConfig, + FilesConfig, KomgaApiConfig, KoreaderApiConfig, LoggingConfig, ObservabilityConfig, + PdfConfig, PdfHandleCacheConfig, RateLimitConfig, SQLiteConfig, ScannerConfig, + SchedulerConfig, TaskConfig, }; use tempfile::NamedTempFile; @@ -82,6 +82,7 @@ application: komga_api: KomgaApiConfig::default(), koreader_api: KoreaderApiConfig::default(), rate_limit: RateLimitConfig::default(), + observability: ObservabilityConfig::default(), }; let temp_file = NamedTempFile::new().unwrap(); @@ -169,6 +170,7 @@ scanner: komga_api: KomgaApiConfig::default(), koreader_api: KoreaderApiConfig::default(), rate_limit: RateLimitConfig::default(), + observability: ObservabilityConfig::default(), }; let temp_file = NamedTempFile::new().unwrap(); diff --git a/src/config/mod.rs b/src/config/mod.rs index 1233222b..1d2dd7ba 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -6,9 +6,10 @@ mod types; #[allow(unused_imports)] pub use types::{ ApiConfig, ApplicationConfig, AuthConfig, Config, DatabaseConfig, DatabaseType, EmailConfig, - FilesConfig, KomgaApiConfig, KoreaderApiConfig, LoggingConfig, OidcConfig, OidcDefaultRole, - OidcProviderConfig, PdfConfig, PdfHandleCacheConfig, PostgresConfig, RateLimitConfig, - SQLiteConfig, ScannerConfig, SchedulerConfig, TaskConfig, + FilesConfig, KomgaApiConfig, KoreaderApiConfig, LoggingConfig, ObservabilityBrowserConfig, + ObservabilityConfig, ObservabilityMetricsConfig, ObservabilityTracesConfig, OidcConfig, + OidcDefaultRole, OidcProviderConfig, OtlpConfig, OtlpProtocol, PdfConfig, PdfHandleCacheConfig, + PostgresConfig, RateLimitConfig, SQLiteConfig, ScannerConfig, SchedulerConfig, TaskConfig, }; pub use env_override::EnvOverride; diff --git 
a/src/config/types.rs b/src/config/types.rs index 7907def9..f7dc4f4c 100644 --- a/src/config/types.rs +++ b/src/config/types.rs @@ -269,6 +269,8 @@ pub struct Config { pub koreader_api: KoreaderApiConfig, #[serde(default)] pub rate_limit: RateLimitConfig, + #[serde(default)] + pub observability: ObservabilityConfig, } fn default_data_dir() -> String { @@ -399,6 +401,7 @@ impl Default for Config { komga_api: KomgaApiConfig::default(), koreader_api: KoreaderApiConfig::default(), rate_limit: RateLimitConfig::default(), + observability: ObservabilityConfig::default(), } } } @@ -914,6 +917,194 @@ impl Default for EmailConfig { } } +/// OTLP wire protocol used by the OpenTelemetry exporter. +/// +/// `Grpc` is the default; `HttpProtobuf` is the right choice when the operator's +/// collector accepts OTLP/HTTP-protobuf (e.g., behind a load balancer that +/// can't terminate gRPC). `HttpJson` is supported for parity but rarely the +/// best pick: payloads are larger and most collectors prefer protobuf. +#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq, Default)] +#[serde(rename_all = "kebab-case")] +pub enum OtlpProtocol { + #[default] + Grpc, + #[serde(alias = "http-protobuf", alias = "http_protobuf", alias = "httpproto")] + HttpProtobuf, + #[serde(alias = "http-json", alias = "http_json")] + HttpJson, +} + +impl OtlpProtocol { + #[allow(dead_code)] // Used by observability module when feature is enabled. + pub fn as_str(&self) -> &'static str { + match self { + OtlpProtocol::Grpc => "grpc", + OtlpProtocol::HttpProtobuf => "http/protobuf", + OtlpProtocol::HttpJson => "http/json", + } + } +} + +/// OTLP exporter transport and endpoint configuration. +/// +/// `endpoint` is empty by default; an empty endpoint paired with +/// `observability.enabled = true` is treated as a misconfiguration and the +/// OTel layer will not be installed. `headers` is the place for tenant/auth +/// headers (e.g., `signoz-access-token`, `x-honeycomb-team`). 
+#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(default)] +pub struct OtlpConfig { + /// OTLP collector endpoint URL. For gRPC use `http://host:4317`; for + /// HTTP/protobuf use `http://host:4318` (the SDK appends `/v1/traces`, + /// `/v1/metrics` per signal). + pub endpoint: String, + + /// Wire protocol used to reach the collector. + pub protocol: OtlpProtocol, + + /// Arbitrary headers attached to every export request (auth, tenancy). + pub headers: HashMap, + + /// Per-export request timeout in milliseconds. + pub timeout_ms: u64, +} + +impl Default for OtlpConfig { + fn default() -> Self { + Self { + endpoint: env_string_opt("CODEX_OBSERVABILITY_OTLP_ENDPOINT").unwrap_or_default(), + protocol: env_string_opt("CODEX_OBSERVABILITY_OTLP_PROTOCOL") + .and_then(|s| match s.to_lowercase().as_str() { + "grpc" => Some(OtlpProtocol::Grpc), + "http/protobuf" | "http-protobuf" | "http_protobuf" | "httpproto" => { + Some(OtlpProtocol::HttpProtobuf) + } + "http/json" | "http-json" | "http_json" => Some(OtlpProtocol::HttpJson), + _ => None, + }) + .unwrap_or_default(), + headers: HashMap::new(), + timeout_ms: env_or("CODEX_OBSERVABILITY_OTLP_TIMEOUT_MS", 5000), + } + } +} + +/// Trace exporter configuration. +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(default)] +pub struct ObservabilityTracesConfig { + /// Enable trace export. Honored only when the parent `observability.enabled` + /// is also true. + pub enabled: bool, + + /// Parent-based sampling ratio in `[0.0, 1.0]`. Values outside this range + /// are clamped at init. + pub sample_ratio: f64, +} + +impl Default for ObservabilityTracesConfig { + fn default() -> Self { + Self { + enabled: env_bool_or("CODEX_OBSERVABILITY_TRACES_ENABLED", true), + sample_ratio: env_or("CODEX_OBSERVABILITY_TRACES_SAMPLE_RATIO", 1.0_f64), + } + } +} + +/// Metrics exporter configuration. 
+#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(default)] +pub struct ObservabilityMetricsConfig { + /// Enable metrics export. Honored only when the parent `observability.enabled` + /// is also true. + pub enabled: bool, + + /// Periodic reader export interval in milliseconds. + pub export_interval_ms: u64, +} + +impl Default for ObservabilityMetricsConfig { + fn default() -> Self { + Self { + enabled: env_bool_or("CODEX_OBSERVABILITY_METRICS_ENABLED", true), + export_interval_ms: env_or("CODEX_OBSERVABILITY_METRICS_EXPORT_INTERVAL_MS", 30000), + } + } +} + +/// Browser RUM configuration. The browser SDK posts OTLP via the Codex proxy; +/// this struct controls the proxy endpoint, default sample ratio, and an +/// opt-in switch separate from the backend tracing flag. +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(default)] +pub struct ObservabilityBrowserConfig { + /// Opt-in switch for serving the browser SDK config and the OTLP proxy. + /// Independent of the backend `observability.enabled` flag because some + /// operators want server-side observability without shipping spans from + /// every browser tab. + pub enabled: bool, + + /// Path on the Codex server where the browser SDK POSTs OTLP batches. + /// The SDK is expected to append `/v1/traces` / `/v1/metrics` to this base. + pub proxy_path: String, + + /// Sample ratio applied client-side. Browsers are noisy; default low. + pub sample_ratio: f64, +} + +impl Default for ObservabilityBrowserConfig { + fn default() -> Self { + Self { + enabled: env_bool_or("CODEX_OBSERVABILITY_BROWSER_ENABLED", false), + proxy_path: env_string_opt("CODEX_OBSERVABILITY_BROWSER_PROXY_PATH") + .unwrap_or_else(|| "/api/v1/observability/otlp".to_string()), + sample_ratio: env_or("CODEX_OBSERVABILITY_BROWSER_SAMPLE_RATIO", 0.1_f64), + } + } +} + +/// Top-level observability configuration. +/// +/// Disabled by default. 
When `enabled` is `false`, no providers are +/// initialized and no telemetry leaves the process. This is the trust posture +/// for a self-hosted product. +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(default)] +pub struct ObservabilityConfig { + /// Master switch. Must be `true` for any OTel work to happen. + pub enabled: bool, + + /// `service.name` resource attribute. Identifies this process in the + /// backend's UI; defaults to `codex`. + pub service_name: String, + + /// OTLP exporter transport configuration. + pub otlp: OtlpConfig, + + /// Trace pipeline configuration. + pub traces: ObservabilityTracesConfig, + + /// Metric pipeline configuration. + pub metrics: ObservabilityMetricsConfig, + + /// Browser RUM proxy configuration. + pub browser: ObservabilityBrowserConfig, +} + +impl Default for ObservabilityConfig { + fn default() -> Self { + Self { + enabled: env_bool_or("CODEX_OBSERVABILITY_ENABLED", false), + service_name: env_string_opt("CODEX_OBSERVABILITY_SERVICE_NAME") + .unwrap_or_else(|| "codex".to_string()), + otlp: OtlpConfig::default(), + traces: ObservabilityTracesConfig::default(), + metrics: ObservabilityMetricsConfig::default(), + browser: ObservabilityBrowserConfig::default(), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -1225,6 +1416,7 @@ verification_url_base: https://codex.example.com komga_api: KomgaApiConfig::default(), koreader_api: KoreaderApiConfig::default(), rate_limit: RateLimitConfig::default(), + observability: ObservabilityConfig::default(), }; // Application name moved to database settings @@ -2124,4 +2316,116 @@ files: let config: Config = serde_yaml::from_str(yaml_content).unwrap(); assert_eq!(config.files.plugins_dir, "/tmp/plugins"); } + + // ---- observability config ---- + + #[test] + #[serial] + fn test_observability_defaults_are_disabled() { + // Clear any env vars that might otherwise flip the defaults. 
+ for var in [ + "CODEX_OBSERVABILITY_ENABLED", + "CODEX_OBSERVABILITY_SERVICE_NAME", + "CODEX_OBSERVABILITY_OTLP_ENDPOINT", + "CODEX_OBSERVABILITY_OTLP_PROTOCOL", + "CODEX_OBSERVABILITY_OTLP_TIMEOUT_MS", + "CODEX_OBSERVABILITY_TRACES_ENABLED", + "CODEX_OBSERVABILITY_TRACES_SAMPLE_RATIO", + "CODEX_OBSERVABILITY_METRICS_ENABLED", + "CODEX_OBSERVABILITY_METRICS_EXPORT_INTERVAL_MS", + "CODEX_OBSERVABILITY_BROWSER_ENABLED", + "CODEX_OBSERVABILITY_BROWSER_PROXY_PATH", + "CODEX_OBSERVABILITY_BROWSER_SAMPLE_RATIO", + ] { + unsafe { std::env::remove_var(var) }; + } + let config = ObservabilityConfig::default(); + assert!(!config.enabled, "observability must be off by default"); + assert_eq!(config.service_name, "codex"); + assert!(matches!(config.otlp.protocol, OtlpProtocol::Grpc)); + assert!(config.otlp.endpoint.is_empty()); + assert_eq!(config.otlp.timeout_ms, 5000); + assert!(config.traces.enabled); + assert!((config.traces.sample_ratio - 1.0).abs() < f64::EPSILON); + assert!(config.metrics.enabled); + assert_eq!(config.metrics.export_interval_ms, 30_000); + assert!(!config.browser.enabled); + assert_eq!(config.browser.proxy_path, "/api/v1/observability/otlp"); + assert!((config.browser.sample_ratio - 0.1).abs() < f64::EPSILON); + } + + #[test] + fn test_observability_section_in_full_config_yaml() { + // YAML round-trip: when the observability section is omitted, the + // default block fills it in. + let yaml_content = r#" +database: + db_type: sqlite + sqlite: + path: ./test.db +"#; + let config: Config = serde_yaml::from_str(yaml_content).unwrap(); + assert!(!config.observability.enabled); + // Round-trip preserves the section. 
+ let serialized = serde_yaml::to_string(&config).unwrap(); + assert!(serialized.contains("observability:")); + } + + #[test] + fn test_observability_from_yaml_with_overrides() { + let yaml_content = r#" +observability: + enabled: true + service_name: codex-prod + otlp: + endpoint: http://collector:4317 + protocol: grpc + timeout_ms: 2000 + headers: + x-tenant: acme + traces: + enabled: true + sample_ratio: 0.25 + metrics: + enabled: false + export_interval_ms: 15000 + browser: + enabled: true + proxy_path: /custom/path + sample_ratio: 0.5 +"#; + let config: ObservabilityConfig = serde_yaml::from_str( + &yaml_content + .lines() + .skip(1) // drop leading "observability:" so the section root can deserialize directly + .map(|l| l.strip_prefix(" ").unwrap_or(l)) + .collect::>() + .join("\n"), + ) + .unwrap(); + assert!(config.enabled); + assert_eq!(config.service_name, "codex-prod"); + assert_eq!(config.otlp.endpoint, "http://collector:4317"); + assert!(matches!(config.otlp.protocol, OtlpProtocol::Grpc)); + assert_eq!(config.otlp.timeout_ms, 2000); + assert_eq!(config.otlp.headers.get("x-tenant"), Some(&"acme".into())); + assert!(config.traces.enabled); + assert!((config.traces.sample_ratio - 0.25).abs() < f64::EPSILON); + assert!(!config.metrics.enabled); + assert_eq!(config.metrics.export_interval_ms, 15000); + assert!(config.browser.enabled); + assert_eq!(config.browser.proxy_path, "/custom/path"); + } + + #[test] + fn test_otlp_protocol_aliases() { + let p: OtlpProtocol = serde_yaml::from_str("grpc").unwrap(); + assert!(matches!(p, OtlpProtocol::Grpc)); + let p: OtlpProtocol = serde_yaml::from_str("http-protobuf").unwrap(); + assert!(matches!(p, OtlpProtocol::HttpProtobuf)); + let p: OtlpProtocol = serde_yaml::from_str("http_protobuf").unwrap(); + assert!(matches!(p, OtlpProtocol::HttpProtobuf)); + let p: OtlpProtocol = serde_yaml::from_str("http-json").unwrap(); + assert!(matches!(p, OtlpProtocol::HttpJson)); + } } diff --git a/src/db/repositories/book.rs 
b/src/db/repositories/book.rs index 36e3000c..df0e01f7 100644 --- a/src/db/repositories/book.rs +++ b/src/db/repositories/book.rs @@ -17,6 +17,7 @@ use uuid::Uuid; use crate::db::entities::{books, prelude::*}; use crate::db::repositories::SeriesRepository; use crate::events::{EntityChangeEvent, EntityEvent, EventBroadcaster}; +use crate::observability::repo::db_system_str; use crate::utils::normalize_for_search; /// Options for querying books with filtering, sorting, and pagination @@ -133,6 +134,19 @@ impl BookRepository { /// }; /// let (books, total) = BookRepository::query(db, options).await?; /// ``` + #[tracing::instrument( + name = "db.book.query", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + library_id = ?options.library_id, + series_id = ?options.series_id, + page = options.page, + page_size = options.page_size, + ), + )] pub async fn query( db: &DatabaseConnection, options: BookQueryOptions<'_>, @@ -373,6 +387,16 @@ impl BookRepository { } /// Create a new book from entity model + #[tracing::instrument( + name = "db.book.insert", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "insert", + otel.kind = "client", + book.id = %book_model.id, + ), + )] pub async fn create( db: &DatabaseConnection, book_model: &books::Model, @@ -428,6 +452,16 @@ impl BookRepository { } /// Get a book by ID + #[tracing::instrument( + name = "db.book.get_by_id", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + book.id = %id, + ), + )] pub async fn get_by_id(db: &DatabaseConnection, id: Uuid) -> Result> { Books::find_by_id(id) .one(db) @@ -436,6 +470,16 @@ impl BookRepository { } /// Check if a book exists by ID (more efficient than get_by_id for existence checks) + #[tracing::instrument( + name = "db.book.exists", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + book.id = %id, + 
), + )] pub async fn exists(db: &DatabaseConnection, id: Uuid) -> Result { let count = Books::find_by_id(id) .count(db) @@ -474,6 +518,16 @@ impl BookRepository { /// /// Returns all books matching the given IDs. This is useful for batch operations /// where all matching books need to be processed. + #[tracing::instrument( + name = "db.book.get_by_ids", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + id_count = ids.len(), + ), + )] pub async fn get_by_ids(db: &DatabaseConnection, ids: &[Uuid]) -> Result> { if ids.is_empty() { return Ok(vec![]); @@ -544,6 +598,17 @@ impl BookRepository { /// Get all books in a series /// Orders by book_metadata.number, book_metadata.title_sort, then file_name + #[tracing::instrument( + name = "db.book.list_by_series", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + series.id = %series_id, + include_deleted, + ), + )] pub async fn list_by_series( db: &DatabaseConnection, series_id: Uuid, @@ -1006,6 +1071,18 @@ impl BookRepository { } /// List books by library with pagination + #[tracing::instrument( + name = "db.book.list_by_library", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + library.id = %library_id, + page, + page_size, + ), + )] pub async fn list_by_library( db: &DatabaseConnection, library_id: Uuid, @@ -1675,6 +1752,16 @@ impl BookRepository { } /// Update book + #[tracing::instrument( + name = "db.book.update", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "update", + otel.kind = "client", + book.id = %book_model.id, + ), + )] pub async fn update( db: &DatabaseConnection, book_model: &books::Model, @@ -1728,6 +1815,17 @@ impl BookRepository { } /// Mark a book as deleted or restore it + #[tracing::instrument( + name = "db.book.mark_deleted", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = 
"update", + otel.kind = "client", + book.id = %book_id, + deleted, + ), + )] pub async fn mark_deleted( db: &DatabaseConnection, book_id: Uuid, @@ -1777,6 +1875,16 @@ impl BookRepository { } /// Delete a book + #[tracing::instrument( + name = "db.book.delete", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "delete", + otel.kind = "client", + book.id = %id, + ), + )] pub async fn delete(db: &DatabaseConnection, id: Uuid) -> Result<()> { Books::delete_by_id(id) .exec(db) diff --git a/src/db/repositories/library.rs b/src/db/repositories/library.rs index 6cadd94a..aaac7a9a 100644 --- a/src/db/repositories/library.rs +++ b/src/db/repositories/library.rs @@ -13,6 +13,7 @@ use uuid::Uuid; use crate::db::entities::{libraries, prelude::*}; use crate::models::{BookStrategy, NumberStrategy, SeriesStrategy}; +use crate::observability::repo::db_system_str; /// Parameters for creating a new library #[derive(Debug, Clone)] @@ -105,6 +106,15 @@ pub struct LibraryRepository; impl LibraryRepository { /// Create a new library with full parameters + #[tracing::instrument( + name = "db.library.insert", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "insert", + otel.kind = "client", + ), + )] pub async fn create_with_params( db: &DatabaseConnection, params: CreateLibraryParams, @@ -149,6 +159,16 @@ impl LibraryRepository { } /// Get a library by ID + #[tracing::instrument( + name = "db.library.get_by_id", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + library.id = %id, + ), + )] pub async fn get_by_id(db: &DatabaseConnection, id: Uuid) -> Result> { Libraries::find_by_id(id) .one(db) @@ -159,6 +179,16 @@ impl LibraryRepository { /// Get libraries by multiple IDs /// /// Returns a HashMap keyed by library ID for efficient lookups + #[tracing::instrument( + name = "db.library.get_by_ids", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind 
= "client", + id_count = ids.len(), + ), + )] pub async fn get_by_ids( db: &DatabaseConnection, ids: &[Uuid], @@ -179,6 +209,15 @@ impl LibraryRepository { } /// Get all libraries + #[tracing::instrument( + name = "db.library.list_all", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + ), + )] pub async fn list_all(db: &DatabaseConnection) -> Result> { Libraries::find() .order_by_asc(libraries::Column::Name) @@ -200,6 +239,16 @@ impl LibraryRepository { } /// Update library + #[tracing::instrument( + name = "db.library.update", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "update", + otel.kind = "client", + library.id = %library.id, + ), + )] pub async fn update(db: &DatabaseConnection, library: &libraries::Model) -> Result<()> { let active = libraries::ActiveModel { id: Set(library.id), @@ -251,6 +300,16 @@ impl LibraryRepository { /// Delete a library /// Note: task_metrics are automatically deleted via CASCADE foreign key + #[tracing::instrument( + name = "db.library.delete", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "delete", + otel.kind = "client", + library.id = %id, + ), + )] pub async fn delete(db: &DatabaseConnection, id: Uuid) -> Result<()> { Libraries::delete_by_id(id) .exec(db) diff --git a/src/db/repositories/plugins.rs b/src/db/repositories/plugins.rs index 0434b510..867a7c6d 100644 --- a/src/db/repositories/plugins.rs +++ b/src/db/repositories/plugins.rs @@ -15,6 +15,7 @@ #![allow(dead_code)] use crate::db::entities::plugins::{self, Entity as Plugins, PluginPermission}; +use crate::observability::repo::db_system_str; use crate::services::CredentialEncryption; use crate::services::plugin::protocol::{PluginManifest, PluginScope}; use anyhow::{Result, anyhow}; @@ -30,6 +31,15 @@ impl PluginsRepository { // ========================================================================= /// Get all plugins + #[tracing::instrument( + name = 
"db.plugin.get_all", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + ), + )] pub async fn get_all(db: &DatabaseConnection) -> Result> { let plugins = Plugins::find() .order_by_asc(plugins::Column::Name) @@ -101,6 +111,16 @@ impl PluginsRepository { } /// Get a plugin by ID + #[tracing::instrument( + name = "db.plugin.get_by_id", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + plugin.id = %id, + ), + )] pub async fn get_by_id(db: &DatabaseConnection, id: Uuid) -> Result> { let plugin = Plugins::find_by_id(id).one(db).await?; Ok(plugin) @@ -589,6 +609,16 @@ impl PluginsRepository { // ========================================================================= /// Record a successful operation + #[tracing::instrument( + name = "db.plugin.record_success", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "update", + otel.kind = "client", + plugin.id = %id, + ), + )] pub async fn record_success(db: &DatabaseConnection, id: Uuid) -> Result { let existing = Self::get_by_id(db, id) .await? 
@@ -605,6 +635,16 @@ impl PluginsRepository { } /// Record a failed operation and increment failure count + #[tracing::instrument( + name = "db.plugin.record_failure", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "update", + otel.kind = "client", + plugin.id = %id, + ), + )] pub async fn record_failure( db: &DatabaseConnection, id: Uuid, diff --git a/src/db/repositories/series.rs b/src/db/repositories/series.rs index ff61dc8a..d3d75531 100644 --- a/src/db/repositories/series.rs +++ b/src/db/repositories/series.rs @@ -19,6 +19,7 @@ use crate::db::entities::{ series_metadata, user_series_ratings, }; use crate::events::{EntityChangeEvent, EntityEvent, EventBroadcaster}; +use crate::observability::repo::db_system_str; use crate::utils::normalize_for_search; use std::sync::Arc; @@ -220,6 +221,18 @@ impl SeriesRepository { /// /// This is the primary composable query method that supports all filtering /// and sorting options. Use `SeriesQueryOptions` to configure the query. 
+ #[tracing::instrument( + name = "db.series.query", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + library_id = ?options.library_id, + page = options.page, + page_size = options.page_size, + ), + )] pub async fn query( db: &DatabaseConnection, options: SeriesQueryOptions<'_>, @@ -678,6 +691,16 @@ impl SeriesRepository { /// Create a new series with a default path derived from the name /// For production use, prefer `create_with_fingerprint` which takes an explicit path + #[tracing::instrument( + name = "db.series.insert", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "insert", + otel.kind = "client", + library.id = %library_id, + ), + )] pub async fn create( db: &DatabaseConnection, library_id: Uuid, @@ -819,6 +842,16 @@ impl SeriesRepository { } /// Get a series by ID + #[tracing::instrument( + name = "db.series.get_by_id", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + series.id = %id, + ), + )] pub async fn get_by_id(db: &DatabaseConnection, id: Uuid) -> Result> { Series::find_by_id(id) .one(db) @@ -943,6 +976,16 @@ impl SeriesRepository { } /// Get all series in a library + #[tracing::instrument( + name = "db.series.list_by_library", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + library.id = %library_id, + ), + )] pub async fn list_by_library( db: &DatabaseConnection, library_id: Uuid, @@ -1713,6 +1756,16 @@ impl SeriesRepository { } /// Update series core fields + #[tracing::instrument( + name = "db.series.update", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "update", + otel.kind = "client", + series.id = %series_model.id, + ), + )] pub async fn update( db: &DatabaseConnection, series_model: &series::Model, diff --git a/src/db/repositories/user.rs b/src/db/repositories/user.rs index 3dc1462f..8f7d578a 100644 --- 
a/src/db/repositories/user.rs +++ b/src/db/repositories/user.rs @@ -1,4 +1,5 @@ use crate::db::entities::{sharing_tags, user_sharing_tags, users, users::Entity as User}; +use crate::observability::repo::db_system_str; use anyhow::Result; use chrono::Utc; use sea_orm::*; @@ -26,6 +27,16 @@ pub struct UserRepository; impl UserRepository { /// Create a new user + #[tracing::instrument( + name = "db.user.insert", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "insert", + otel.kind = "client", + user.id = %model.id, + ), + )] pub async fn create(db: &DatabaseConnection, model: &users::Model) -> Result { let active_model = users::ActiveModel { id: Set(model.id), @@ -46,12 +57,31 @@ impl UserRepository { } /// Get user by ID + #[tracing::instrument( + name = "db.user.get_by_id", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + user.id = %id, + ), + )] pub async fn get_by_id(db: &DatabaseConnection, id: Uuid) -> Result> { let user = User::find_by_id(id).one(db).await?; Ok(user) } /// Get user by username + #[tracing::instrument( + name = "db.user.get_by_username", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + ), + )] pub async fn get_by_username( db: &DatabaseConnection, username: &str, @@ -64,6 +94,15 @@ impl UserRepository { } /// Get user by email + #[tracing::instrument( + name = "db.user.get_by_email", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + ), + )] pub async fn get_by_email( db: &DatabaseConnection, email: &str, @@ -91,6 +130,16 @@ impl UserRepository { } /// Update user + #[tracing::instrument( + name = "db.user.update", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "update", + otel.kind = "client", + user.id = %model.id, + ), + )] pub async fn update(db: &DatabaseConnection, model: &users::Model) -> Result { let active_model = 
users::ActiveModel { id: Unchanged(model.id), @@ -110,12 +159,31 @@ impl UserRepository { } /// Delete user + #[tracing::instrument( + name = "db.user.delete", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "delete", + otel.kind = "client", + user.id = %id, + ), + )] pub async fn delete(db: &DatabaseConnection, id: Uuid) -> Result<()> { User::delete_by_id(id).exec(db).await?; Ok(()) } /// List users with filtering and pagination + #[tracing::instrument( + name = "db.user.list_paginated", + skip_all, + fields( + db.system = db_system_str(db), + db.operation = "select", + otel.kind = "client", + ), + )] pub async fn list_paginated( db: &DatabaseConnection, filter: &UserListFilter, diff --git a/src/lib.rs b/src/lib.rs index 4a8f932d..6831fdae 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ pub mod config; pub mod db; pub mod events; pub mod models; +pub mod observability; pub mod parsers; pub mod scanner; pub mod scheduler; diff --git a/src/main.rs b/src/main.rs index 2eaba1cf..2064e1ea 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,7 @@ mod config; mod db; mod events; mod models; +mod observability; mod parsers; mod scanner; mod scheduler; diff --git a/src/observability/http.rs b/src/observability/http.rs new file mode 100644 index 00000000..dced7b7e --- /dev/null +++ b/src/observability/http.rs @@ -0,0 +1,31 @@ +//! Axum HTTP integration for OpenTelemetry. +//! +//! Wraps the `axum-tracing-opentelemetry` layers (which create the server +//! span from incoming `traceparent` and inject the active trace context into +//! responses) in a single helper that becomes a no-op when the `observability` +//! feature is off or when `observability.enabled` is false. + +use axum::Router; + +use crate::config::ObservabilityConfig; + +/// Apply the HTTP server-side OTel layers to the given router. +/// +/// Layered outside any rate limiter / CORS / panic-catch so every request +/// gets a server span before downstream middleware runs. 
+#[cfg(feature = "observability")] +pub fn install_http_layers(router: Router, config: &ObservabilityConfig) -> Router { + if !config.enabled || !config.traces.enabled || config.otlp.endpoint.trim().is_empty() { + // Nothing to do: either observability is off globally, traces are off, + // or the endpoint is unset (init() already logged the warning). + return router; + } + router + .layer(axum_tracing_opentelemetry::middleware::OtelInResponseLayer) + .layer(axum_tracing_opentelemetry::middleware::OtelAxumLayer::default()) +} + +#[cfg(not(feature = "observability"))] +pub fn install_http_layers(router: Router, _config: &ObservabilityConfig) -> Router { + router +} diff --git a/src/observability/inventory.rs b/src/observability/inventory.rs new file mode 100644 index 00000000..ec471544 --- /dev/null +++ b/src/observability/inventory.rs @@ -0,0 +1,84 @@ +//! Background poller that refreshes the inventory metric atomics. +//! +//! The OTel observable gauges read these atomics synchronously on each +//! collection cycle (see `metrics::install_inventory_gauges`). Polling the +//! database from inside a sync gauge callback is not feasible because the +//! SDK calls the callback from a non-tokio thread; we keep the DB queries +//! on the async runtime and the gauge callbacks read the cached values. + +use std::sync::Arc; +use std::time::Duration; + +use sea_orm::DatabaseConnection; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; +use tracing::warn; + +use crate::db::repositories::MetricsRepository; + +/// Spawn the inventory snapshot poller. Runs every `interval` until the +/// cancellation token fires. +pub fn spawn_poller( + db: Arc, + interval: Duration, + cancel: CancellationToken, +) -> JoinHandle<()> { + tokio::spawn(async move { + // Refresh once immediately so the first export cycle has fresh data. + refresh(&db).await; + + let mut ticker = tokio::time::interval(interval); + // Skip the immediate tick (we just did one). 
+ ticker.tick().await; + + loop { + tokio::select! { + _ = cancel.cancelled() => break, + _ = ticker.tick() => refresh(&db).await, + } + } + }) +} + +async fn refresh(db: &DatabaseConnection) { + let libraries = MetricsRepository::count_libraries(db).await; + let series = MetricsRepository::count_series(db).await; + let books = MetricsRepository::count_books(db).await; + let users = MetricsRepository::count_users(db).await; + let pages = MetricsRepository::count_pages(db).await; + + let (Ok(libraries), Ok(series), Ok(books), Ok(users), Ok(pages)) = + (libraries, series, books, users, pages) + else { + warn!("Inventory metric refresh failed; leaving previous snapshot in place"); + return; + }; + + super::metrics::update_inventory_snapshot(libraries, series, books, users, pages); +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::Ordering; + + #[tokio::test] + async fn refresh_writes_snapshot_atomics() { + // Empty in-memory SQLite with the schema migrated so the count + // queries return zero rather than erroring. The cheapest way to + // exercise the refresh path end-to-end without coupling the test to + // a fixture builder. + let db = crate::db::test_helpers::setup_test_db().await; + + // Pre-load known sentinel values so we can detect that the refresh + // overwrote them with zeros (or any other DB count). + super::super::metrics::update_inventory_snapshot(99, 99, 99, 99, 99); + + refresh(&db).await; + + let snap = super::super::metrics::inventory_snapshot(); + assert_eq!(snap.libraries.load(Ordering::Relaxed), 0); + assert_eq!(snap.series.load(Ordering::Relaxed), 0); + assert_eq!(snap.books.load(Ordering::Relaxed), 0); + } +} diff --git a/src/observability/metrics.rs b/src/observability/metrics.rs new file mode 100644 index 00000000..280c4d41 --- /dev/null +++ b/src/observability/metrics.rs @@ -0,0 +1,536 @@ +//! OpenTelemetry meter instruments and dual-write helpers. +//! +//! 
Two-consumer model: existing in-process counters keep powering the in-app +//! metrics dashboards; the helpers here emit OTel counters / histograms / +//! gauges at the same call sites so an OTLP backend (SigNoz, Tempo, etc.) can +//! see the data with proper percentile aggregation. Stable instrument names +//! live as `const`s so they're searchable and easy to keep in sync with the +//! operator docs. +//! +//! All entry points are safe to call when observability is disabled: the +//! global meter provider is a no-op until [`crate::observability::init`] +//! installs one. + +use std::sync::OnceLock; +use std::sync::atomic::{AtomicI64, Ordering}; + +use opentelemetry::{ + KeyValue, global, + metrics::{Counter, Histogram, Meter}, +}; +use opentelemetry_semantic_conventions::{attribute, metric as metric_semconv}; + +const METER_NAME: &str = "codex"; + +// ---- Plugin metric names ---- +pub const PLUGIN_REQUESTS: &str = "codex.plugin.requests"; +pub const PLUGIN_DURATION: &str = "codex.plugin.duration_ms"; +pub const PLUGIN_RATE_LIMIT_REJECTIONS: &str = "codex.plugin.rate_limit_rejections"; + +// ---- Task metric names ---- +pub const TASK_COMPLETIONS: &str = "codex.task.completions"; +pub const TASK_DURATION: &str = "codex.task.duration_ms"; +pub const TASK_QUEUE_WAIT: &str = "codex.task.queue_wait_ms"; +pub const TASK_IN_FLIGHT: &str = "codex.task.in_flight"; + +// ---- Inventory metric names ---- +pub const INVENTORY_LIBRARIES: &str = "codex.inventory.libraries"; +pub const INVENTORY_SERIES: &str = "codex.inventory.series"; +pub const INVENTORY_BOOKS: &str = "codex.inventory.books"; +pub const INVENTORY_USERS: &str = "codex.inventory.users"; +pub const INVENTORY_PAGES: &str = "codex.inventory.pages"; + +fn meter() -> &'static Meter { + static METER: OnceLock = OnceLock::new(); + METER.get_or_init(|| global::meter(METER_NAME)) +} + +// ============================================================================= +// Plugin instruments +// 
============================================================================= + +pub struct PluginInstruments { + requests: Counter, + duration_ms: Histogram, + rate_limit_rejections: Counter, +} + +impl PluginInstruments { + /// Build the plugin instrument set from an explicit meter. Tests use this + /// to point the instruments at an in-memory exporter without going + /// through the OnceLock-cached global accessor below. + pub fn new(m: &Meter) -> Self { + Self { + requests: m + .u64_counter(PLUGIN_REQUESTS) + .with_description("Plugin RPC requests") + .build(), + duration_ms: m + .f64_histogram(PLUGIN_DURATION) + .with_unit("ms") + .with_description("Plugin RPC duration") + .build(), + rate_limit_rejections: m + .u64_counter(PLUGIN_RATE_LIMIT_REJECTIONS) + .with_description("Plugin requests rejected by local rate limiter") + .build(), + } + } + + fn record_request(&self, plugin_id: &str, method: &str, outcome: &str, duration_ms: u64) { + let attrs = [ + KeyValue::new("plugin_id", plugin_id.to_string()), + KeyValue::new("method", method.to_string()), + KeyValue::new("outcome", outcome.to_string()), + ]; + self.requests.add(1, &attrs); + self.duration_ms.record(duration_ms as f64, &attrs); + } + + fn record_rate_limit_rejection(&self, plugin_id: &str) { + self.rate_limit_rejections + .add(1, &[KeyValue::new("plugin_id", plugin_id.to_string())]); + } +} + +fn plugin_instruments() -> &'static PluginInstruments { + static INST: OnceLock = OnceLock::new(); + INST.get_or_init(|| PluginInstruments::new(meter())) +} + +/// Record a plugin RPC outcome. `outcome` is one of `success`, `failure`. +pub fn record_plugin_request(plugin_id: &str, method: &str, outcome: &str, duration_ms: u64) { + plugin_instruments().record_request(plugin_id, method, outcome, duration_ms); +} + +/// Record a rate-limit rejection for a plugin (no method dimension; the limit +/// is applied at the plugin level). 
+pub fn record_plugin_rate_limit_rejection(plugin_id: &str) { + plugin_instruments().record_rate_limit_rejection(plugin_id); +} + +// ============================================================================= +// Task instruments +// ============================================================================= + +pub struct TaskInstruments { + completions: Counter, + duration_ms: Histogram, + queue_wait_ms: Histogram, +} + +impl TaskInstruments { + pub fn new(m: &Meter) -> Self { + Self { + completions: m + .u64_counter(TASK_COMPLETIONS) + .with_description("Background task completions") + .build(), + duration_ms: m + .f64_histogram(TASK_DURATION) + .with_unit("ms") + .with_description("Background task execution duration") + .build(), + queue_wait_ms: m + .f64_histogram(TASK_QUEUE_WAIT) + .with_unit("ms") + .with_description("Background task queue wait time") + .build(), + } + } + + fn record_completion( + &self, + task_type: &str, + outcome: &str, + duration_ms: i64, + queue_wait_ms: i64, + ) { + let attrs = [ + KeyValue::new("task_type", task_type.to_string()), + KeyValue::new("outcome", outcome.to_string()), + ]; + self.completions.add(1, &attrs); + if duration_ms >= 0 { + self.duration_ms.record(duration_ms as f64, &attrs); + } + if queue_wait_ms >= 0 { + self.queue_wait_ms.record(queue_wait_ms as f64, &attrs); + } + } +} + +fn task_instruments() -> &'static TaskInstruments { + static INST: OnceLock = OnceLock::new(); + INST.get_or_init(|| TaskInstruments::new(meter())) +} + +/// Currently executing background tasks. Workers increment on claim, +/// decrement on completion/failure; the gauge callback reads this atomic. +static TASKS_IN_FLIGHT: AtomicI64 = AtomicI64::new(0); + +/// Record a task completion. `outcome` is one of `success`, `failure`, +/// `rate_limited`. 
+pub fn record_task_completion( + task_type: &str, + outcome: &str, + duration_ms: i64, + queue_wait_ms: i64, +) { + task_instruments().record_completion(task_type, outcome, duration_ms, queue_wait_ms); +} + +/// Increment the in-flight tasks counter (call after claiming a task). +pub fn task_in_flight_inc() { + TASKS_IN_FLIGHT.fetch_add(1, Ordering::Relaxed); +} + +/// Decrement the in-flight tasks counter (call after a task completes or +/// fails). Saturates at zero to be safe against double-decrement bugs. +pub fn task_in_flight_dec() { + let _ = TASKS_IN_FLIGHT.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| { + Some(if v > 0 { v - 1 } else { 0 }) + }); +} + +/// Register the observable gauge that exposes the in-flight tasks counter. +/// Idempotent on the metric layer; should be called once at startup. +fn install_in_flight_gauge() { + let _ = meter() + .i64_observable_gauge(TASK_IN_FLIGHT) + .with_description("Background tasks currently executing") + .with_callback(|obs| obs.observe(TASKS_IN_FLIGHT.load(Ordering::Relaxed), &[])) + .build(); +} + +// ============================================================================= +// HTTP instruments +// ============================================================================= + +struct HttpInstruments { + duration_seconds: Histogram, +} + +fn http_instruments() -> &'static HttpInstruments { + static INST: OnceLock = OnceLock::new(); + INST.get_or_init(|| { + let m = meter(); + HttpInstruments { + // Semantic-convention default: `http.server.request.duration` in + // seconds. Bucketing is left to the SDK's default histogram view. + duration_seconds: m + .f64_histogram(metric_semconv::HTTP_SERVER_REQUEST_DURATION) + .with_unit("s") + .with_description("Duration of HTTP server requests") + .build(), + } + }) +} + +/// Record an HTTP server request. 
/// Atomic snapshot of inventory counts, kept current by a background poller in
/// `commands::serve`. The OTel observable-gauge callbacks read these atomics
/// synchronously (they run on the SDK collection thread, no async context).
#[derive(Default)]
pub struct InventorySnapshot {
    pub libraries: AtomicI64,
    pub series: AtomicI64,
    pub books: AtomicI64,
    pub users: AtomicI64,
    pub pages: AtomicI64,
}

// `AtomicI64::new` is a `const fn`, so the snapshot can live in an ordinary
// static: no lazy initialisation and no leaked heap allocation are needed.
static INVENTORY_SNAPSHOT: InventorySnapshot = InventorySnapshot {
    libraries: AtomicI64::new(0),
    series: AtomicI64::new(0),
    books: AtomicI64::new(0),
    users: AtomicI64::new(0),
    pages: AtomicI64::new(0),
};

/// Returns the process-wide inventory snapshot.
///
/// Every call returns the same instance; all counts start at zero.
pub fn inventory_snapshot() -> &'static InventorySnapshot {
    &INVENTORY_SNAPSHOT
}
+/// +/// Must be called once after the meter provider is installed. Safe to call +/// when observability is disabled: the no-op meter provider will accept the +/// instrument registrations without doing anything with them. +fn install_inventory_gauges() { + let snap = inventory_snapshot(); + let m = meter(); + + macro_rules! gauge { + ($name:expr, $field:ident, $desc:expr) => { + m.i64_observable_gauge($name) + .with_description($desc) + .with_callback(move |obs| { + obs.observe(snap.$field.load(Ordering::Relaxed), &[]); + }) + .build() + }; + } + + let _ = gauge!(INVENTORY_LIBRARIES, libraries, "Number of libraries"); + let _ = gauge!(INVENTORY_SERIES, series, "Number of series"); + let _ = gauge!(INVENTORY_BOOKS, books, "Number of books"); + let _ = gauge!(INVENTORY_USERS, users, "Number of users"); + let _ = gauge!(INVENTORY_PAGES, pages, "Number of pages indexed"); +} + +/// Update the inventory snapshot with freshly counted values. +pub fn update_inventory_snapshot(libraries: i64, series: i64, books: i64, users: i64, pages: i64) { + let snap = inventory_snapshot(); + snap.libraries.store(libraries, Ordering::Relaxed); + snap.series.store(series, Ordering::Relaxed); + snap.books.store(books, Ordering::Relaxed); + snap.users.store(users, Ordering::Relaxed); + snap.pages.store(pages, Ordering::Relaxed); +} + +// ============================================================================= +// Process / runtime metrics +// ============================================================================= + +/// Install process-level observable gauges (CPU, memory). +/// +/// Uses `sysinfo` polled from a fresh `System` snapshot inside the gauge +/// callback. The callback runs on the SDK collection thread (synchronous). 
+fn install_process_metrics() { + use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System}; + + let m = meter(); + let pid = match sysinfo::get_current_pid() { + Ok(p) => p, + Err(e) => { + tracing::warn!("Could not resolve current PID for process metrics; skipping: {e}"); + return; + } + }; + + let attrs: [KeyValue; 1] = [KeyValue::new("process.pid", pid.as_u32() as i64)]; + + // Semantic-convention metric names are gated behind the experimental + // feature flag in `opentelemetry-semantic-conventions` 0.32; use the + // standard string identifiers directly. These names match + // `process.cpu.time` and `process.memory.usage` from the OTel spec. + { + let attrs = attrs.clone(); + let sys = std::sync::Mutex::new(System::new()); + m.f64_observable_gauge("process.cpu.time") + .with_unit("s") + .with_description("Total user + system CPU time consumed by the process") + .with_callback(move |obs| { + let Ok(mut s) = sys.lock() else { return }; + s.refresh_processes_specifics( + ProcessesToUpdate::Some(&[pid]), + true, + ProcessRefreshKind::nothing().with_cpu(), + ); + if let Some(proc) = s.process(pid) { + obs.observe(proc.accumulated_cpu_time() as f64 / 1000.0, &attrs); + } + }) + .build(); + } + + { + let attrs = attrs.clone(); + let sys = std::sync::Mutex::new(System::new()); + m.i64_observable_gauge("process.memory.usage") + .with_unit("By") + .with_description("Resident memory of the process") + .with_callback(move |obs| { + let Ok(mut s) = sys.lock() else { return }; + s.refresh_processes_specifics( + ProcessesToUpdate::Some(&[pid]), + true, + ProcessRefreshKind::nothing().with_memory(), + ); + if let Some(proc) = s.process(pid) { + obs.observe(proc.memory() as i64, &attrs); + } + }) + .build(); + } + + { + let sys = std::sync::Mutex::new(System::new()); + m.i64_observable_gauge("process.memory.virtual") + .with_unit("By") + .with_description("Virtual memory size of the process") + .with_callback(move |obs| { + let Ok(mut s) = sys.lock() else { return }; + 
s.refresh_processes_specifics( + ProcessesToUpdate::Some(&[pid]), + true, + ProcessRefreshKind::nothing().with_memory(), + ); + if let Some(proc) = s.process(pid) { + obs.observe(proc.virtual_memory() as i64, &attrs); + } + }) + .build(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use opentelemetry::metrics::MeterProvider; + use opentelemetry_sdk::metrics::data::AggregatedMetrics; + use opentelemetry_sdk::metrics::{InMemoryMetricExporter, PeriodicReader, SdkMeterProvider}; + + fn test_provider() -> (InMemoryMetricExporter, SdkMeterProvider) { + let exporter = InMemoryMetricExporter::default(); + let reader = PeriodicReader::builder(exporter.clone()).build(); + let mp = SdkMeterProvider::builder().with_reader(reader).build(); + (exporter, mp) + } + + #[test] + fn metric_names_are_stable() { + // Sanity-check that the public constants haven't drifted; operators + // build dashboards against these names, so renames need to be + // deliberate (and announced in the changelog). + assert_eq!(PLUGIN_REQUESTS, "codex.plugin.requests"); + assert_eq!(PLUGIN_DURATION, "codex.plugin.duration_ms"); + assert_eq!(TASK_COMPLETIONS, "codex.task.completions"); + assert_eq!(TASK_IN_FLIGHT, "codex.task.in_flight"); + assert_eq!(INVENTORY_LIBRARIES, "codex.inventory.libraries"); + } + + #[test] + fn helpers_are_safe_with_noop_meter_provider() { + // The global meter provider is no-op in tests (no `init` call). All + // entry points should be safe to call: they just route to the no-op + // instruments. + record_plugin_request("p1", "search", "success", 12); + record_plugin_rate_limit_rejection("p1"); + record_task_completion("scan_library", "success", 100, 5); + task_in_flight_inc(); + task_in_flight_dec(); + record_http_request("GET", "/api/v1/series", 200, 0.014); + update_inventory_snapshot(1, 2, 3, 4, 5); + + // Snapshot atomics should hold the values we just wrote. 
+ let s = inventory_snapshot(); + assert_eq!(s.libraries.load(Ordering::Relaxed), 1); + assert_eq!(s.books.load(Ordering::Relaxed), 3); + } + + #[test] + fn plugin_instruments_emit_counter_and_histogram_to_in_memory_exporter() { + let (exporter, mp) = test_provider(); + let inst = PluginInstruments::new(&mp.meter("test")); + + inst.record_request("plugin-a", "search", "success", 42); + inst.record_request("plugin-a", "search", "failure", 100); + inst.record_rate_limit_rejection("plugin-a"); + + mp.force_flush().expect("flush"); + let batches = exporter.get_finished_metrics().expect("collected metrics"); + assert!(!batches.is_empty(), "expected at least one ResourceMetrics"); + + let mut found_requests = false; + let mut found_duration = false; + let mut found_rejections = false; + for rm in batches { + for scope in rm.scope_metrics() { + for metric in scope.metrics() { + match metric.name() { + PLUGIN_REQUESTS => { + // Counter exports as a Sum aggregation. + assert!(matches!( + metric.data(), + AggregatedMetrics::U64( + opentelemetry_sdk::metrics::data::MetricData::Sum(_) + ) + )); + found_requests = true; + } + PLUGIN_DURATION => { + assert!(matches!( + metric.data(), + AggregatedMetrics::F64( + opentelemetry_sdk::metrics::data::MetricData::Histogram(_) + ) + )); + found_duration = true; + } + PLUGIN_RATE_LIMIT_REJECTIONS => { + found_rejections = true; + } + _ => {} + } + } + } + } + assert!(found_requests, "plugin requests counter not exported"); + assert!(found_duration, "plugin duration histogram not exported"); + assert!(found_rejections, "plugin rejections counter not exported"); + } + + #[test] + fn task_instruments_emit_counter_and_histograms() { + let (exporter, mp) = test_provider(); + let inst = TaskInstruments::new(&mp.meter("test")); + + inst.record_completion("scan_library", "success", 250, 10); + inst.record_completion("scan_library", "failure", 1000, 50); + inst.record_completion("scan_library", "rate_limited", -1, -1); + + 
#[test]
fn in_flight_saturates_at_zero() {
    // Exercises the saturating decrement on the process-wide
    // `TASKS_IN_FLIGHT` counter: reset it to zero, check that a decrement at
    // zero leaves it at zero, then check that inc/dec move it by one.
    //
    // NOTE(review): the assertions compare absolute values of a global that
    // `helpers_are_safe_with_noop_meter_provider` also increments and
    // decrements, so the two tests can interleave if the harness runs them
    // on separate threads. Consider serializing both tests.
    TASKS_IN_FLIGHT.store(0, Ordering::Relaxed);
    task_in_flight_dec();
    assert_eq!(TASKS_IN_FLIGHT.load(Ordering::Relaxed), 0);
    task_in_flight_inc();
    task_in_flight_inc();
    assert_eq!(TASKS_IN_FLIGHT.load(Ordering::Relaxed), 2);
    task_in_flight_dec();
    assert_eq!(TASKS_IN_FLIGHT.load(Ordering::Relaxed), 1);
}
+ +#![allow(dead_code)] + +use std::sync::OnceLock; +use std::sync::atomic::AtomicI64; + +pub const PLUGIN_REQUESTS: &str = "codex.plugin.requests"; +pub const PLUGIN_DURATION: &str = "codex.plugin.duration_ms"; +pub const PLUGIN_RATE_LIMIT_REJECTIONS: &str = "codex.plugin.rate_limit_rejections"; +pub const TASK_COMPLETIONS: &str = "codex.task.completions"; +pub const TASK_DURATION: &str = "codex.task.duration_ms"; +pub const TASK_QUEUE_WAIT: &str = "codex.task.queue_wait_ms"; +pub const TASK_IN_FLIGHT: &str = "codex.task.in_flight"; +pub const INVENTORY_LIBRARIES: &str = "codex.inventory.libraries"; +pub const INVENTORY_SERIES: &str = "codex.inventory.series"; +pub const INVENTORY_BOOKS: &str = "codex.inventory.books"; +pub const INVENTORY_USERS: &str = "codex.inventory.users"; +pub const INVENTORY_PAGES: &str = "codex.inventory.pages"; + +pub fn record_plugin_request(_plugin_id: &str, _method: &str, _outcome: &str, _duration_ms: u64) {} + +pub fn record_plugin_rate_limit_rejection(_plugin_id: &str) {} + +pub fn record_task_completion( + _task_type: &str, + _outcome: &str, + _duration_ms: i64, + _queue_wait_ms: i64, +) { +} + +pub fn task_in_flight_inc() {} + +pub fn task_in_flight_dec() {} + +pub fn record_http_request(_method: &str, _route: &str, _status: u16, _duration_secs: f64) {} + +pub fn install_runtime_observers() {} + +#[derive(Default)] +pub struct InventorySnapshot { + pub libraries: AtomicI64, + pub series: AtomicI64, + pub books: AtomicI64, + pub users: AtomicI64, + pub pages: AtomicI64, +} + +static INVENTORY_SNAPSHOT: OnceLock<&'static InventorySnapshot> = OnceLock::new(); + +pub fn inventory_snapshot() -> &'static InventorySnapshot { + INVENTORY_SNAPSHOT.get_or_init(|| Box::leak(Box::new(InventorySnapshot::default()))) +} + +pub fn update_inventory_snapshot(libraries: i64, series: i64, books: i64, users: i64, pages: i64) { + use std::sync::atomic::Ordering; + let snap = inventory_snapshot(); + snap.libraries.store(libraries, Ordering::Relaxed); + 
snap.series.store(series, Ordering::Relaxed); + snap.books.store(books, Ordering::Relaxed); + snap.users.store(users, Ordering::Relaxed); + snap.pages.store(pages, Ordering::Relaxed); +} diff --git a/src/observability/mod.rs b/src/observability/mod.rs new file mode 100644 index 00000000..ce5a0be4 --- /dev/null +++ b/src/observability/mod.rs @@ -0,0 +1,39 @@ +//! OpenTelemetry instrumentation glue. +//! +//! Gated by the `observability` Cargo feature. When the feature is enabled +//! and `ObservabilityConfig::enabled` is true, [`init`] starts an OTLP tracer +//! and meter provider, wires them into the OTel globals, and returns a guard +//! that owns the providers for shutdown. +//! +//! When the feature is disabled (or `enabled` is false), every entry point is +//! a no-op so the rest of the codebase can stay cfg-free at call sites. + +#[cfg(feature = "observability")] +mod providers; +#[cfg(feature = "observability")] +mod trace_fmt; + +#[cfg(not(feature = "observability"))] +mod stub; + +#[cfg(feature = "observability")] +pub use providers::{ObservabilityHandle, init}; + +#[cfg(feature = "observability")] +pub use trace_fmt::TraceContextFormat; + +#[cfg(not(feature = "observability"))] +pub use stub::{ObservabilityHandle, TraceContextFormat, init}; + +mod http; +pub use http::install_http_layers; + +pub mod repo; + +#[cfg(feature = "observability")] +pub mod metrics; +#[cfg(not(feature = "observability"))] +#[path = "metrics_stub.rs"] +pub mod metrics; + +pub mod inventory; diff --git a/src/observability/providers.rs b/src/observability/providers.rs new file mode 100644 index 00000000..2a88931b --- /dev/null +++ b/src/observability/providers.rs @@ -0,0 +1,373 @@ +//! OTel SDK provider construction and lifetime management. 
+ +use std::time::Duration; + +use anyhow::{Context, Result}; +use opentelemetry::{KeyValue, global, trace::TracerProvider}; +use opentelemetry_otlp::{Protocol, WithExportConfig, WithHttpConfig, WithTonicConfig}; +use opentelemetry_sdk::{ + Resource, + metrics::{PeriodicReader, SdkMeterProvider}, + propagation::TraceContextPropagator, + trace::{Sampler, SdkTracerProvider, Tracer}, +}; +use opentelemetry_semantic_conventions::resource::SERVICE_VERSION; + +use crate::config::{ObservabilityConfig, OtlpProtocol}; + +const TRACER_INSTRUMENTATION_NAME: &str = "codex"; + +/// Owns the OTel providers for the lifetime of the process. +/// +/// Drop alone does *not* flush the batch processors; call [`Self::shutdown`] +/// from the serve command on graceful exit to make sure the last spans and +/// metric points are delivered. +pub struct ObservabilityHandle { + inner: Option, +} + +struct Inner { + tracer_provider: Option, + meter_provider: Option, + tracer: Option, +} + +impl ObservabilityHandle { + fn disabled() -> Self { + Self { inner: None } + } + + /// Returns the SDK tracer used by the `tracing-opentelemetry` bridge. + /// + /// `None` when observability is disabled or trace export is off. + pub fn tracer(&self) -> Option<&Tracer> { + self.inner.as_ref().and_then(|i| i.tracer.as_ref()) + } + + /// Returns whether trace export is active. + pub fn traces_enabled(&self) -> bool { + self.tracer().is_some() + } + + /// Returns whether metric export is active. + pub fn metrics_enabled(&self) -> bool { + self.inner + .as_ref() + .and_then(|i| i.meter_provider.as_ref()) + .is_some() + } + + /// Flush and shut down the providers. Idempotent. + /// + /// Logs at warn level on per-provider failure; we never want a flush error + /// to cascade past process shutdown. 
+ pub fn shutdown(mut self) { + let Some(inner) = self.inner.take() else { + return; + }; + if let Some(tp) = inner.tracer_provider + && let Err(e) = tp.shutdown() + { + tracing::warn!("Failed to shut down OTel tracer provider: {e}"); + } + if let Some(mp) = inner.meter_provider + && let Err(e) = mp.shutdown() + { + tracing::warn!("Failed to shut down OTel meter provider: {e}"); + } + } +} + +/// Build providers from config and install them as the OTel globals. +/// +/// Returns a handle even when nothing was installed (the disabled / no-op +/// path), so the caller can treat the result uniformly. +pub fn init(config: &ObservabilityConfig) -> Result { + if !config.enabled { + tracing::debug!("Observability disabled via config"); + return Ok(ObservabilityHandle::disabled()); + } + + if config.otlp.endpoint.trim().is_empty() { + tracing::warn!( + "observability.enabled = true but otlp.endpoint is empty; not installing OTel providers" + ); + return Ok(ObservabilityHandle::disabled()); + } + + // Install the W3C trace-context propagator so incoming `traceparent` + // headers are honored and outgoing requests can carry the context. + global::set_text_map_propagator(TraceContextPropagator::new()); + + let resource = build_resource(config); + + let tracer_provider = if config.traces.enabled { + Some(build_tracer_provider(config, resource.clone())?) + } else { + None + }; + + let tracer = tracer_provider + .as_ref() + .map(|tp| tp.tracer(TRACER_INSTRUMENTATION_NAME)); + + if let Some(tp) = tracer_provider.as_ref() { + global::set_tracer_provider(tp.clone()); + } + + let meter_provider = if config.metrics.enabled { + Some(build_meter_provider(config, resource)?) + } else { + None + }; + + if let Some(mp) = meter_provider.as_ref() { + global::set_meter_provider(mp.clone()); + + // Register the observable instruments (inventory gauges, in-flight + // task gauge, process CPU/memory). 
Only meaningful once a real meter + // provider is in place; the SDK ignores callbacks registered against + // the no-op default. + crate::observability::metrics::install_runtime_observers(); + } + + tracing::info!( + endpoint = %config.otlp.endpoint, + protocol = %config.otlp.protocol.as_str(), + traces_enabled = config.traces.enabled, + metrics_enabled = config.metrics.enabled, + sample_ratio = config.traces.sample_ratio, + "Initialized OpenTelemetry providers" + ); + + Ok(ObservabilityHandle { + inner: Some(Inner { + tracer_provider, + meter_provider, + tracer, + }), + }) +} + +fn build_resource(config: &ObservabilityConfig) -> Resource { + Resource::builder() + .with_service_name(config.service_name.clone()) + .with_attribute(KeyValue::new(SERVICE_VERSION, env!("CARGO_PKG_VERSION"))) + .build() +} + +fn build_tracer_provider( + config: &ObservabilityConfig, + resource: Resource, +) -> Result { + let exporter = build_span_exporter(config)?; + let sampler = build_sampler(config.traces.sample_ratio); + + Ok(SdkTracerProvider::builder() + .with_batch_exporter(exporter) + .with_resource(resource) + .with_sampler(sampler) + .build()) +} + +fn build_meter_provider( + config: &ObservabilityConfig, + resource: Resource, +) -> Result { + let exporter = build_metric_exporter(config)?; + let reader = PeriodicReader::builder(exporter) + .with_interval(Duration::from_millis(config.metrics.export_interval_ms)) + .build(); + + Ok(SdkMeterProvider::builder() + .with_reader(reader) + .with_resource(resource) + .build()) +} + +fn build_sampler(ratio: f64) -> Sampler { + // ParentBased so propagated decisions from upstream callers are honored; + // local roots use the configured ratio. 
+ let clamped = ratio.clamp(0.0, 1.0); + let root = if clamped >= 1.0 { + Sampler::AlwaysOn + } else if clamped <= 0.0 { + Sampler::AlwaysOff + } else { + Sampler::TraceIdRatioBased(clamped) + }; + Sampler::ParentBased(Box::new(root)) +} + +fn build_span_exporter(config: &ObservabilityConfig) -> Result { + let timeout = Duration::from_millis(config.otlp.timeout_ms); + let endpoint = config.otlp.endpoint.clone(); + match config.otlp.protocol { + OtlpProtocol::Grpc => { + let mut builder = opentelemetry_otlp::SpanExporter::builder() + .with_tonic() + .with_endpoint(endpoint) + .with_timeout(timeout); + if !config.otlp.headers.is_empty() { + builder = builder + .with_metadata(build_tonic_metadata(&config.otlp.headers).context( + "Failed to build gRPC metadata from observability.otlp.headers", + )?); + } + builder + .build() + .context("Failed to build OTLP gRPC span exporter") + } + OtlpProtocol::HttpProtobuf | OtlpProtocol::HttpJson => { + let protocol = match config.otlp.protocol { + OtlpProtocol::HttpJson => Protocol::HttpJson, + _ => Protocol::HttpBinary, + }; + opentelemetry_otlp::SpanExporter::builder() + .with_http() + .with_protocol(protocol) + .with_endpoint(endpoint) + .with_timeout(timeout) + .with_headers(config.otlp.headers.clone()) + .build() + .context("Failed to build OTLP HTTP span exporter") + } + } +} + +fn build_metric_exporter( + config: &ObservabilityConfig, +) -> Result { + let timeout = Duration::from_millis(config.otlp.timeout_ms); + let endpoint = config.otlp.endpoint.clone(); + match config.otlp.protocol { + OtlpProtocol::Grpc => { + let mut builder = opentelemetry_otlp::MetricExporter::builder() + .with_tonic() + .with_endpoint(endpoint) + .with_timeout(timeout); + if !config.otlp.headers.is_empty() { + builder = builder + .with_metadata(build_tonic_metadata(&config.otlp.headers).context( + "Failed to build gRPC metadata from observability.otlp.headers", + )?); + } + builder + .build() + .context("Failed to build OTLP gRPC metric exporter") 
+ } + OtlpProtocol::HttpProtobuf | OtlpProtocol::HttpJson => { + let protocol = match config.otlp.protocol { + OtlpProtocol::HttpJson => Protocol::HttpJson, + _ => Protocol::HttpBinary, + }; + opentelemetry_otlp::MetricExporter::builder() + .with_http() + .with_protocol(protocol) + .with_endpoint(endpoint) + .with_timeout(timeout) + .with_headers(config.otlp.headers.clone()) + .build() + .context("Failed to build OTLP HTTP metric exporter") + } + } +} + +fn build_tonic_metadata( + headers: &std::collections::HashMap, +) -> Result { + let mut map = tonic::metadata::MetadataMap::with_capacity(headers.len()); + for (k, v) in headers { + let key: tonic::metadata::MetadataKey = k + .parse() + .with_context(|| format!("invalid OTLP header name: {k}"))?; + let value: tonic::metadata::MetadataValue = v + .parse() + .with_context(|| format!("invalid OTLP header value for {k}"))?; + map.insert(key, value); + } + Ok(map) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn base_config() -> ObservabilityConfig { + ObservabilityConfig { + enabled: true, + service_name: "codex-test".to_string(), + otlp: crate::config::OtlpConfig { + endpoint: "http://127.0.0.1:14318".to_string(), + protocol: OtlpProtocol::HttpProtobuf, + headers: Default::default(), + timeout_ms: 1000, + }, + traces: crate::config::ObservabilityTracesConfig { + enabled: true, + sample_ratio: 1.0, + }, + metrics: crate::config::ObservabilityMetricsConfig { + enabled: true, + export_interval_ms: 1000, + }, + browser: Default::default(), + } + } + + #[test] + fn init_disabled_returns_noop() { + let mut cfg = base_config(); + cfg.enabled = false; + let handle = init(&cfg).unwrap(); + assert!(!handle.traces_enabled()); + assert!(!handle.metrics_enabled()); + handle.shutdown(); + } + + #[test] + fn init_empty_endpoint_returns_noop() { + let mut cfg = base_config(); + cfg.otlp.endpoint.clear(); + let handle = init(&cfg).unwrap(); + assert!(!handle.traces_enabled()); + assert!(!handle.metrics_enabled()); + 
Span names follow `db.<entity>.<method>`. Each span carries the
bridge recognises: +//! +//! - `db.system`: `"sqlite"` or `"postgresql"` +//! - `db.operation`: `"select" | "insert" | "update" | "delete" | ...` +//! - `otel.kind`: `"client"` (DB calls are client RPCs from our point of view) +//! +//! Entity-identifying values (`book.id`, `series.id`, ...) go in attributes, +//! never in the span name. This keeps span cardinality bounded by the number +//! of repository methods, which is small. +//! +//! [OpenTelemetry semantic-convention]: https://opentelemetry.io/docs/specs/semconv/database/ + +use sea_orm::{ConnectionTrait, DatabaseConnection, DbBackend}; + +/// Map a SeaORM backend to the OpenTelemetry `db.system` attribute value. +/// +/// The result is one of the standard `db.system` constants and is `'static` +/// so it can be embedded directly in span fields without allocation. +pub fn db_system_str(db: &DatabaseConnection) -> &'static str { + match db.get_database_backend() { + DbBackend::Sqlite => "sqlite", + DbBackend::Postgres => "postgresql", + DbBackend::MySql => "mysql", + } +} + +#[cfg(test)] +mod tests { + use super::*; + use sea_orm::{Database, DatabaseConnection}; + use std::collections::HashMap; + use std::sync::{Arc, Mutex}; + use tracing::field::{Field, Visit}; + use tracing_subscriber::Layer; + use tracing_subscriber::layer::{Context, SubscriberExt}; + + async fn in_memory_sqlite() -> DatabaseConnection { + Database::connect("sqlite::memory:") + .await + .expect("connect to in-memory sqlite") + } + + #[tokio::test] + async fn sqlite_backend_maps_to_db_system_sqlite() { + let db = in_memory_sqlite().await; + assert_eq!(db_system_str(&db), "sqlite"); + } + + /// Span metadata captured by [`CapturingLayer`] for assertions in tests. + #[derive(Debug, Default)] + struct CapturedSpan { + name: &'static str, + fields: HashMap, + } + + /// Tracing layer that records every span it sees so tests can assert on + /// span names and field values without a full OTel SDK. 
the `db.<entity>.<method>` naming and the `db.system`,
+ #[tokio::test] + async fn instrumented_repo_method_emits_named_span_with_semantic_conv_fields() { + use crate::db::repositories::UserRepository; + use uuid::Uuid; + + let db = in_memory_sqlite().await; + let (layer, captured) = CapturingLayer::new(); + let subscriber = tracing_subscriber::registry().with(layer); + + let _guard = tracing::subscriber::set_default(subscriber); + + // The lookup will fail (no users table), which is fine: we only care + // that the instrumented function created the expected span. + let _ = UserRepository::get_by_id(&db, Uuid::nil()).await; + + let spans = captured.lock().unwrap(); + let span = spans + .iter() + .find(|s| s.name == "db.user.get_by_id") + .expect("db.user.get_by_id span should be emitted"); + assert_eq!( + span.fields.get("db.system").map(String::as_str), + Some("sqlite") + ); + assert_eq!( + span.fields.get("db.operation").map(String::as_str), + Some("select") + ); + assert_eq!( + span.fields.get("otel.kind").map(String::as_str), + Some("client") + ); + } + + /// Microbench for instrumentation overhead. Not part of CI: run manually + /// with `cargo test --release -p codex -- --ignored bench_instrumentation_overhead --nocapture` + /// to get a feel for the per-call cost of `#[tracing::instrument]` under + /// the two configurations that matter: + /// + /// 1. No subscriber attached (production path when observability is off): + /// the macro short-circuits to a no-op. + /// 2. A capturing subscriber attached (closest in-process analogue to the + /// enabled path): the macro builds the span, records fields, and pushes + /// a frame onto the registry. + /// + /// The numbers are recorded in `tmp/implementation/planned/otlp-traces.md` + /// under Phase 5's progress log. 
+ #[tokio::test] + #[ignore = "manual benchmark; run with --ignored"] + async fn bench_instrumentation_overhead() { + use std::time::Instant; + + const ITERS: u32 = 200_000; + + #[tracing::instrument( + name = "db.bench.noop", + skip_all, + fields( + db.system = "sqlite", + db.operation = "select", + otel.kind = "client", + id = %0u32, + ) + )] + fn instrumented_call(_i: u32) -> u32 { + std::hint::black_box(_i.wrapping_mul(31)) + } + + // Warmup + for i in 0..1_000 { + std::hint::black_box(instrumented_call(i)); + } + + // No subscriber: the instrument! macro is meant to short-circuit. + let start = Instant::now(); + for i in 0..ITERS { + std::hint::black_box(instrumented_call(i)); + } + let disabled = start.elapsed(); + + // With a subscriber: full span construction + field recording. + let (layer, _captured) = CapturingLayer::new(); + let subscriber = tracing_subscriber::registry().with(layer); + let _guard = tracing::subscriber::set_default(subscriber); + + let start = Instant::now(); + for i in 0..ITERS { + std::hint::black_box(instrumented_call(i)); + } + let enabled = start.elapsed(); + + let per_call_disabled_ns = disabled.as_nanos() / u128::from(ITERS); + let per_call_enabled_ns = enabled.as_nanos() / u128::from(ITERS); + + println!("---"); + println!("Instrumentation overhead microbench ({ITERS} iters)"); + println!(" disabled (no subscriber): {disabled:?} ({per_call_disabled_ns} ns/call)"); + println!(" enabled (capturing layer): {enabled:?} ({per_call_enabled_ns} ns/call)"); + println!( + " per-call overhead added: {} ns", + per_call_enabled_ns.saturating_sub(per_call_disabled_ns) + ); + println!("---"); + } +} diff --git a/src/observability/stub.rs b/src/observability/stub.rs new file mode 100644 index 00000000..d02a96f9 --- /dev/null +++ b/src/observability/stub.rs @@ -0,0 +1,76 @@ +//! No-op stubs used when the `observability` feature is disabled. 
+ +use anyhow::Result; +use std::fmt; +use tracing::{Event, Subscriber}; +use tracing_subscriber::{ + fmt::{ + FmtContext, FormatEvent, FormatFields, + format::{Format, Writer}, + }, + registry::LookupSpan, +}; + +use crate::config::ObservabilityConfig; + +/// Empty handle. All accessors return as if observability is disabled. +pub struct ObservabilityHandle; + +impl ObservabilityHandle { + pub fn traces_enabled(&self) -> bool { + false + } + pub fn metrics_enabled(&self) -> bool { + false + } + pub fn shutdown(self) {} +} + +/// Init is a no-op when the feature is off. +/// +/// Logs a hint at info level if the operator asked for observability so they +/// realize the binary was built without the feature. +pub fn init(config: &ObservabilityConfig) -> Result { + if config.enabled { + tracing::info!( + "observability.enabled = true but binary was built without the `observability` feature; ignoring" + ); + } + Ok(ObservabilityHandle) +} + +/// Identity formatter that delegates to the inner formatter unchanged. +/// +/// Mirrors the real `TraceContextFormat` so [`crate::commands::common::init_tracing`] +/// can use the same type name regardless of feature state. +pub struct TraceContextFormat { + inner: F, +} + +impl TraceContextFormat { + pub fn new(inner: F) -> Self { + Self { inner } + } +} + +impl Default for TraceContextFormat { + fn default() -> Self { + Self::new(Format::default()) + } +} + +impl FormatEvent for TraceContextFormat +where + S: Subscriber + for<'a> LookupSpan<'a>, + N: for<'a> FormatFields<'a> + 'static, + F: FormatEvent, +{ + fn format_event( + &self, + ctx: &FmtContext<'_, S, N>, + writer: Writer<'_>, + event: &Event<'_>, + ) -> fmt::Result { + self.inner.format_event(ctx, writer, event) + } +} diff --git a/src/observability/trace_fmt.rs b/src/observability/trace_fmt.rs new file mode 100644 index 00000000..3ee5a4c5 --- /dev/null +++ b/src/observability/trace_fmt.rs @@ -0,0 +1,68 @@ +//! 
`tracing_subscriber::fmt::FormatEvent` wrapper that prepends the active +//! OpenTelemetry trace and span IDs to every emitted log line. +//! +//! Combined with the `tracing-opentelemetry` layer this makes log → trace +//! correlation a single grep away: the trace_id in a log line is the same +//! one the OTLP backend stores against the span tree. + +use std::fmt; + +use opentelemetry::trace::TraceContextExt; +use tracing::{Event, Span, Subscriber}; +use tracing_opentelemetry::OpenTelemetrySpanExt; +use tracing_subscriber::{ + fmt::{ + FmtContext, FormatEvent, FormatFields, + format::{Format, Writer}, + }, + registry::LookupSpan, +}; + +/// Wraps the default fmt event formatter to prepend `trace_id` / `span_id`. +/// +/// Reads the OTel context from `tracing::Span::current()` so this works for +/// any event emitted inside an active span carrying OTel context (e.g., +/// anything inside the HTTP request span installed by the OTel axum layer). +pub struct TraceContextFormat { + inner: F, +} + +impl TraceContextFormat { + pub fn new(inner: F) -> Self { + Self { inner } + } +} + +impl Default for TraceContextFormat { + fn default() -> Self { + Self::new(Format::default()) + } +} + +impl FormatEvent for TraceContextFormat +where + S: Subscriber + for<'a> LookupSpan<'a>, + N: for<'a> FormatFields<'a> + 'static, + F: FormatEvent, +{ + fn format_event( + &self, + ctx: &FmtContext<'_, S, N>, + mut writer: Writer<'_>, + event: &Event<'_>, + ) -> fmt::Result { + let span = Span::current(); + let otel_ctx = span.context(); + let otel_span = otel_ctx.span(); + let span_ctx = otel_span.span_context(); + if span_ctx.is_valid() { + write!( + writer, + "trace_id={} span_id={} ", + span_ctx.trace_id(), + span_ctx.span_id() + )?; + } + self.inner.format_event(ctx, writer, event) + } +} diff --git a/src/scanner/analyzer_queue.rs b/src/scanner/analyzer_queue.rs index 10375776..62d1a2db 100644 --- a/src/scanner/analyzer_queue.rs +++ b/src/scanner/analyzer_queue.rs @@ -39,6 +39,11 @@ 
pub struct AnalysisResult { /// # Arguments /// * `force` - If true, bypass full hash check and force re-analysis even if file hasn't changed /// * `event_broadcaster` - Optional event broadcaster for emitting entity change events +#[tracing::instrument( + name = "scanner.analyze_book", + skip_all, + fields(book.id = %book_id, force), +)] pub async fn analyze_book( db: &DatabaseConnection, book_id: Uuid, diff --git a/src/scanner/library_scanner.rs b/src/scanner/library_scanner.rs index d7ab2fcd..136dd1a6 100644 --- a/src/scanner/library_scanner.rs +++ b/src/scanner/library_scanner.rs @@ -434,6 +434,15 @@ impl BookBatch { } /// Main library scanner that orchestrates the scanning process +#[tracing::instrument( + name = "scanner.scan_library", + skip_all, + fields( + library.id = %library_id, + scan.mode = %mode, + task.id = ?task_id, + ), +)] pub async fn scan_library( db: &DatabaseConnection, library_id: Uuid, diff --git a/src/services/plugin/manager.rs b/src/services/plugin/manager.rs index e065c3cd..178e6353 100644 --- a/src/services/plugin/manager.rs +++ b/src/services/plugin/manager.rs @@ -43,7 +43,7 @@ use std::time::{Duration, Instant}; use sea_orm::DatabaseConnection; use tokio::sync::{Mutex, RwLock}; -use tracing::{debug, error, info, warn}; +use tracing::{Span, debug, error, field::Empty, info, warn}; use uuid::Uuid; use crate::db::entities::plugins; @@ -1087,6 +1087,19 @@ impl PluginManager { } /// Search for series metadata using a specific plugin + #[tracing::instrument( + name = "plugin.search_series", + skip_all, + fields( + plugin_id = %plugin_id, + plugin_name = Empty, + plugin.method = "search", + otel.kind = "client", + otel.status_code = Empty, + duration_ms = Empty, + error.code = Empty, + ), + )] pub async fn search_series( &self, plugin_id: Uuid, @@ -1094,6 +1107,7 @@ impl PluginManager { ) -> Result { // Check rate limit before making the request let plugin_name = self.check_rate_limit(plugin_id).await?; + 
Span::current().record("plugin_name", plugin_name.as_str()); let timeout_ms = self.config.default_request_timeout.as_millis(); debug!( @@ -1108,9 +1122,11 @@ impl PluginManager { let handle = self.get_or_spawn(plugin_id).await?; let result = handle.search_series(params.clone()).await; let duration_ms = start.elapsed().as_millis() as u64; + Span::current().record("duration_ms", duration_ms); match &result { Ok(response) => { + Span::current().record("otel.status_code", "OK"); debug!( plugin_id = %plugin_id, plugin_name = %plugin_name, @@ -1133,6 +1149,9 @@ impl PluginManager { } } Err(e) => { + let error_code = self.error_to_code(e); + Span::current().record("otel.status_code", "ERROR"); + Span::current().record("error.code", error_code); if e.rpc_retry_after_seconds().is_some() { warn!( plugin_id = %plugin_id, @@ -1159,7 +1178,6 @@ impl PluginManager { if e.rpc_retry_after_seconds().is_none() && let Some(ref metrics) = self.metrics_service { - let error_code = self.error_to_code(e); metrics .record_failure( plugin_id, @@ -1177,6 +1195,19 @@ impl PluginManager { } /// Get series metadata using a specific plugin + #[tracing::instrument( + name = "plugin.get_series_metadata", + skip_all, + fields( + plugin_id = %plugin_id, + plugin_name = Empty, + plugin.method = "get_metadata", + otel.kind = "client", + otel.status_code = Empty, + duration_ms = Empty, + error.code = Empty, + ), + )] pub async fn get_series_metadata( &self, plugin_id: Uuid, @@ -1184,14 +1215,17 @@ impl PluginManager { ) -> Result { // Check rate limit before making the request let plugin_name = self.check_rate_limit(plugin_id).await?; + Span::current().record("plugin_name", plugin_name.as_str()); let start = Instant::now(); let handle = self.get_or_spawn(plugin_id).await?; let result = handle.get_series_metadata(params).await; let duration_ms = start.elapsed().as_millis() as u64; + Span::current().record("duration_ms", duration_ms); match &result { Ok(_) => { + 
Span::current().record("otel.status_code", "OK"); // Update health status on success if self.config.auto_sync_health { let _ = PluginsRepository::record_success(&self.db, plugin_id).await; @@ -1205,11 +1239,13 @@ impl PluginManager { } } Err(e) => { + let error_code = self.error_to_code(e); + Span::current().record("otel.status_code", "ERROR"); + Span::current().record("error.code", error_code); // Don't record RPC rate limits as failures — the plugin is healthy if e.rpc_retry_after_seconds().is_none() && let Some(ref metrics) = self.metrics_service { - let error_code = self.error_to_code(e); metrics .record_failure( plugin_id, @@ -1227,6 +1263,19 @@ impl PluginManager { } /// Find best series match using a specific plugin + #[tracing::instrument( + name = "plugin.match_series", + skip_all, + fields( + plugin_id = %plugin_id, + plugin_name = Empty, + plugin.method = "match", + otel.kind = "client", + otel.status_code = Empty, + duration_ms = Empty, + error.code = Empty, + ), + )] pub async fn match_series( &self, plugin_id: Uuid, @@ -1234,14 +1283,17 @@ impl PluginManager { ) -> Result, PluginManagerError> { // Check rate limit before making the request let plugin_name = self.check_rate_limit(plugin_id).await?; + Span::current().record("plugin_name", plugin_name.as_str()); let start = Instant::now(); let handle = self.get_or_spawn(plugin_id).await?; let result = handle.match_series(params).await; let duration_ms = start.elapsed().as_millis() as u64; + Span::current().record("duration_ms", duration_ms); match &result { Ok(_) => { + Span::current().record("otel.status_code", "OK"); // Update health status on success if self.config.auto_sync_health { let _ = PluginsRepository::record_success(&self.db, plugin_id).await; @@ -1255,11 +1307,13 @@ impl PluginManager { } } Err(e) => { + let error_code = self.error_to_code(e); + Span::current().record("otel.status_code", "ERROR"); + Span::current().record("error.code", error_code); // Don't record RPC rate limits as failures 
— the plugin is healthy if e.rpc_retry_after_seconds().is_none() && let Some(ref metrics) = self.metrics_service { - let error_code = self.error_to_code(e); metrics .record_failure( plugin_id, @@ -1281,6 +1335,19 @@ impl PluginManager { // ========================================================================= /// Search for book metadata using a specific plugin + #[tracing::instrument( + name = "plugin.search_book", + skip_all, + fields( + plugin_id = %plugin_id, + plugin_name = Empty, + plugin.method = "book_search", + otel.kind = "client", + otel.status_code = Empty, + duration_ms = Empty, + error.code = Empty, + ), + )] pub async fn search_book( &self, plugin_id: Uuid, @@ -1288,14 +1355,17 @@ impl PluginManager { ) -> Result { // Check rate limit before making the request let plugin_name = self.check_rate_limit(plugin_id).await?; + Span::current().record("plugin_name", plugin_name.as_str()); let start = Instant::now(); let handle = self.get_or_spawn(plugin_id).await?; let result = handle.search_book(params.clone()).await; let duration_ms = start.elapsed().as_millis() as u64; + Span::current().record("duration_ms", duration_ms); match &result { Ok(response) => { + Span::current().record("otel.status_code", "OK"); debug!( plugin_id = %plugin_id, isbn = ?params.isbn, @@ -1318,6 +1388,9 @@ impl PluginManager { } } Err(e) => { + let error_code = self.error_to_code(e); + Span::current().record("otel.status_code", "ERROR"); + Span::current().record("error.code", error_code); if e.rpc_retry_after_seconds().is_some() { warn!( plugin_id = %plugin_id, @@ -1342,7 +1415,6 @@ impl PluginManager { if e.rpc_retry_after_seconds().is_none() && let Some(ref metrics) = self.metrics_service { - let error_code = self.error_to_code(e); metrics .record_failure( plugin_id, @@ -1360,6 +1432,19 @@ impl PluginManager { } /// Get full book metadata using a specific plugin + #[tracing::instrument( + name = "plugin.get_book_metadata", + skip_all, + fields( + plugin_id = %plugin_id, + 
plugin_name = Empty, + plugin.method = "book_get", + otel.kind = "client", + otel.status_code = Empty, + duration_ms = Empty, + error.code = Empty, + ), + )] pub async fn get_book_metadata( &self, plugin_id: Uuid, @@ -1367,14 +1452,17 @@ impl PluginManager { ) -> Result { // Check rate limit before making the request let plugin_name = self.check_rate_limit(plugin_id).await?; + Span::current().record("plugin_name", plugin_name.as_str()); let start = Instant::now(); let handle = self.get_or_spawn(plugin_id).await?; let result = handle.get_book_metadata(params).await; let duration_ms = start.elapsed().as_millis() as u64; + Span::current().record("duration_ms", duration_ms); match &result { Ok(_) => { + Span::current().record("otel.status_code", "OK"); // Update health status on success if self.config.auto_sync_health { let _ = PluginsRepository::record_success(&self.db, plugin_id).await; @@ -1388,11 +1476,13 @@ impl PluginManager { } } Err(e) => { + let error_code = self.error_to_code(e); + Span::current().record("otel.status_code", "ERROR"); + Span::current().record("error.code", error_code); // Don't record RPC rate limits as failures — the plugin is healthy if e.rpc_retry_after_seconds().is_none() && let Some(ref metrics) = self.metrics_service { - let error_code = self.error_to_code(e); metrics .record_failure( plugin_id, @@ -1410,6 +1500,19 @@ impl PluginManager { } /// Find best book match using a specific plugin + #[tracing::instrument( + name = "plugin.match_book", + skip_all, + fields( + plugin_id = %plugin_id, + plugin_name = Empty, + plugin.method = "book_match", + otel.kind = "client", + otel.status_code = Empty, + duration_ms = Empty, + error.code = Empty, + ), + )] pub async fn match_book( &self, plugin_id: Uuid, @@ -1417,14 +1520,17 @@ impl PluginManager { ) -> Result, PluginManagerError> { // Check rate limit before making the request let plugin_name = self.check_rate_limit(plugin_id).await?; + Span::current().record("plugin_name", 
plugin_name.as_str()); let start = Instant::now(); let handle = self.get_or_spawn(plugin_id).await?; let result = handle.match_book(params).await; let duration_ms = start.elapsed().as_millis() as u64; + Span::current().record("duration_ms", duration_ms); match &result { Ok(_) => { + Span::current().record("otel.status_code", "OK"); // Update health status on success if self.config.auto_sync_health { let _ = PluginsRepository::record_success(&self.db, plugin_id).await; @@ -1438,11 +1544,13 @@ impl PluginManager { } } Err(e) => { + let error_code = self.error_to_code(e); + Span::current().record("otel.status_code", "ERROR"); + Span::current().record("error.code", error_code); // Don't record RPC rate limits as failures — the plugin is healthy if e.rpc_retry_after_seconds().is_none() && let Some(ref metrics) = self.metrics_service { - let error_code = self.error_to_code(e); metrics .record_failure( plugin_id, @@ -1464,6 +1572,15 @@ impl PluginManager { // ========================================================================= /// Ping a plugin to check health + #[tracing::instrument( + name = "plugin.ping", + skip_all, + fields( + plugin_id = %plugin_id, + plugin.method = "ping", + otel.kind = "client", + ), + )] pub async fn ping(&self, plugin_id: Uuid) -> Result<(), PluginManagerError> { let handle = self.get_or_spawn(plugin_id).await?; handle.ping().await?; @@ -1479,6 +1596,16 @@ impl PluginManager { /// /// This is useful for admin testing of plugin configuration without /// affecting the managed plugin state. 
+ #[tracing::instrument( + name = "plugin.test_plugin", + skip_all, + fields( + plugin_id = %plugin.id, + plugin_name = %plugin.name, + plugin.method = "test", + otel.kind = "client", + ), + )] pub async fn test_plugin( &self, _db: &DatabaseConnection, @@ -2218,4 +2345,114 @@ mod tests { // Integration tests require a database connection // See tests/integration/plugin_manager.rs for full tests + + /// Phase 2 instrumentation smoke test: a call into `search_series` must + /// emit a span named `plugin.search_series` with the OTel client-kind + /// attributes set. The call itself fails because the manager has no + /// plugins loaded against a `Disconnected` database — that's fine, the + /// span is created at function entry before any error path runs. + #[tokio::test] + async fn search_series_emits_plugin_span_with_otel_kind_client() { + use std::collections::HashMap; + use std::sync::{Arc as StdArc, Mutex}; + use tracing::field::{Field, Visit}; + use tracing_subscriber::Layer; + use tracing_subscriber::layer::{Context, SubscriberExt}; + + #[derive(Debug, Default)] + struct CapturedSpan { + name: &'static str, + fields: HashMap, + } + + struct CapturingLayer { + captured: StdArc>>, + } + + struct FieldVisitor<'a>(&'a mut HashMap); + impl Visit for FieldVisitor<'_> { + fn record_str(&mut self, field: &Field, value: &str) { + self.0.insert(field.name().to_string(), value.to_string()); + } + fn record_debug(&mut self, field: &Field, value: &dyn std::fmt::Debug) { + self.0 + .insert(field.name().to_string(), format!("{value:?}")); + } + fn record_u64(&mut self, field: &Field, value: u64) { + self.0.insert(field.name().to_string(), value.to_string()); + } + fn record_i64(&mut self, field: &Field, value: i64) { + self.0.insert(field.name().to_string(), value.to_string()); + } + fn record_bool(&mut self, field: &Field, value: bool) { + self.0.insert(field.name().to_string(), value.to_string()); + } + } + + impl tracing_subscriber::registry::LookupSpan<'a>> Layer + for 
CapturingLayer + { + fn on_new_span( + &self, + attrs: &tracing::span::Attributes<'_>, + _id: &tracing::span::Id, + _ctx: Context<'_, S>, + ) { + let mut fields = HashMap::new(); + attrs.record(&mut FieldVisitor(&mut fields)); + self.captured.lock().unwrap().push(CapturedSpan { + name: attrs.metadata().name(), + fields, + }); + } + } + + let captured: StdArc>> = StdArc::new(Mutex::new(Vec::new())); + let layer = CapturingLayer { + captured: captured.clone(), + }; + let subscriber = tracing_subscriber::registry().with(layer); + let _guard = tracing::subscriber::set_default(subscriber); + + // Use a real in-memory sqlite connection so SeaORM doesn't panic on + // the cache-refresh read. The plugins table will not exist, so the + // refresh fails cleanly (logged, ignored) and the lookup misses, + // letting the call proceed to `get_or_spawn` and error out there. + let db = Arc::new( + sea_orm::Database::connect("sqlite::memory:") + .await + .expect("connect to in-memory sqlite"), + ); + let manager = PluginManager::new(db, PluginManagerConfig::default()); + + // Call into the instrumented entry method. We expect it to fail (the + // plugin isn't loaded), but the span is created before the failure. 
+ let params = MetadataSearchParams { + query: "anything".to_string(), + limit: None, + cursor: None, + }; + let _ = manager.search_series(Uuid::nil(), params).await; + + let spans = captured.lock().unwrap(); + let search_span = spans + .iter() + .find(|s| s.name == "plugin.search_series") + .expect("plugin.search_series span should be emitted"); + assert_eq!( + search_span.fields.get("otel.kind").map(String::as_str), + Some("client"), + "otel.kind should be client for plugin RPC spans" + ); + assert_eq!( + search_span.fields.get("plugin.method").map(String::as_str), + Some("search"), + "plugin.method should identify the RPC method" + ); + assert!( + search_span.fields.contains_key("plugin_id"), + "plugin_id must be on the span; got fields: {:?}", + search_span.fields + ); + } } diff --git a/src/services/plugin/rpc.rs b/src/services/plugin/rpc.rs index f2d80aab..530b74bd 100644 --- a/src/services/plugin/rpc.rs +++ b/src/services/plugin/rpc.rs @@ -12,7 +12,7 @@ use serde::de::DeserializeOwned; use serde_json::Value; use tokio::sync::{Mutex, RwLock, mpsc}; use tokio::time::timeout; -use tracing::{debug, error, warn}; +use tracing::{Instrument, debug, error, warn}; use super::permissions::{self, PermissionError}; use super::process::{PluginProcess, ProcessError}; @@ -342,10 +342,21 @@ impl RpcClient { self.remove_pending(id).await; return Err(RpcError::Process(ProcessError::ProcessTerminated)); } - { + // Span around the stdio write so its duration is attributable + // separately from waiting for the response. Most calls spend + // microseconds here; a slow write usually means a wedged plugin. 
+ async { let process = self.process.lock().await; - process.write_line(&request_json).await?; + process.write_line(&request_json).await } + .instrument(tracing::info_span!( + "plugin.rpc.write", + otel.kind = "internal", + rpc.id = id, + rpc.method = method, + request_len = request_json.len(), + )) + .await?; // Loop, servicing reverse-RPC frames until the response frame // arrives or we time out. Dispatching reverse-RPCs here (on the @@ -356,6 +367,13 @@ impl RpcClient { timeout_ms = request_timeout.as_millis(), "Waiting for RPC response" ); + let wait_span = tracing::info_span!( + "plugin.rpc.wait", + otel.kind = "internal", + rpc.id = id, + rpc.method = method, + timeout_ms = request_timeout.as_millis() as u64, + ); let response_result = timeout(request_timeout, async { loop { match rx.recv().await { @@ -401,6 +419,7 @@ impl RpcClient { } } }) + .instrument(wait_span) .await; let result = match response_result { diff --git a/src/services/plugin_metrics.rs b/src/services/plugin_metrics.rs index 9653830d..88c8c734 100644 --- a/src/services/plugin_metrics.rs +++ b/src/services/plugin_metrics.rs @@ -195,6 +195,15 @@ impl PluginMetricsService { method: &str, duration_ms: u64, ) { + // OTel dual-write: emit the counter + histogram before taking the + // write lock so the OTel cost doesn't widen the critical section. 
+ crate::observability::metrics::record_plugin_request( + &plugin_id.to_string(), + method, + "success", + duration_ms, + ); + let mut plugins = self.plugins.write().await; let entry = plugins .entry(plugin_id) @@ -244,6 +253,13 @@ impl PluginMetricsService { duration_ms: u64, error_code: Option<&str>, ) { + crate::observability::metrics::record_plugin_request( + &plugin_id.to_string(), + method, + "failure", + duration_ms, + ); + let mut plugins = self.plugins.write().await; let entry = plugins .entry(plugin_id) @@ -298,6 +314,8 @@ impl PluginMetricsService { /// Record a rate limit rejection pub async fn record_rate_limit(&self, plugin_id: Uuid, plugin_name: &str) { + crate::observability::metrics::record_plugin_rate_limit_rejection(&plugin_id.to_string()); + let mut plugins = self.plugins.write().await; let entry = plugins .entry(plugin_id) diff --git a/src/services/task_metrics.rs b/src/services/task_metrics.rs index 390a627d..c656c698 100644 --- a/src/services/task_metrics.rs +++ b/src/services/task_metrics.rs @@ -245,6 +245,24 @@ impl TaskMetricsService { bytes_processed: i64, error: Option, ) { + // OTel dual-write. Rate-limited / rescheduled tasks come through here + // with `success = false` + the literal `"rate_limited"` error string + // (set by the worker on rate-limit recovery); surface that as a + // distinct outcome so dashboards can filter it out of error rates. 
+ let outcome = if success { + "success" + } else if error.as_deref() == Some("rate_limited") { + "rate_limited" + } else { + "failure" + }; + crate::observability::metrics::record_task_completion( + &task_type, + outcome, + duration_ms, + queue_wait_ms, + ); + let completion = TaskCompletion { task_type, library_id, diff --git a/src/tasks/worker.rs b/src/tasks/worker.rs index 15c2657d..f31cfd82 100644 --- a/src/tasks/worker.rs +++ b/src/tasks/worker.rs @@ -39,6 +39,24 @@ use crate::tasks::handlers::{ UserPluginSyncHandler, }; +/// RAII guard that increments the OTel in-flight task gauge on creation and +/// decrements it on drop. Used by `process_next_task` to track currently- +/// executing tasks across all exit paths (success, failure, `?` propagation). +struct InFlightGuard; + +impl InFlightGuard { + fn new() -> Self { + crate::observability::metrics::task_in_flight_inc(); + Self + } +} + +impl Drop for InFlightGuard { + fn drop(&mut self) { + crate::observability::metrics::task_in_flight_dec(); + } +} + /// Task worker that processes tasks from the queue pub struct TaskWorker { db: DatabaseConnection, @@ -612,6 +630,10 @@ impl TaskWorker { } }; + // RAII guard for the OTel in-flight task gauge: increments on claim, + // decrements on every exit path (success, failure, error propagation). + let _in_flight = InFlightGuard::new(); + let started_at = Utc::now(); info!( @@ -647,6 +669,20 @@ impl TaskWorker { task.book_id, )); + // Each task gets its own root span so background work does not + // accidentally inherit an HTTP server span as its parent. The span + // covers handler execution across both single-process and + // distributed-mode branches below. 
+ let task_span = tracing::info_span!( + "task.execute", + task.id = %task.id, + task.type = %task.task_type, + library.id = ?task.library_id, + series.id = ?task.series_id, + book.id = ?task.book_id, + otel.kind = "internal", + ); + // In distributed mode, create a recording broadcaster to capture events // that need to be replayed by the TaskListener on the web server let (task_broadcaster, recorded_events): ( @@ -666,12 +702,15 @@ impl TaskWorker { // request id. Without these scopes, plugins that emit events // via reverse-RPC would have no recording context and their // events would never replay. - let result = crate::events::with_task_identity( - task_identity.clone(), - crate::events::with_recording_broadcaster( - recording_broadcaster.clone(), - handler.handle(&task, &self.db, Some(&recording_broadcaster)), + let result = tracing::Instrument::instrument( + crate::events::with_task_identity( + task_identity.clone(), + crate::events::with_recording_broadcaster( + recording_broadcaster.clone(), + handler.handle(&task, &self.db, Some(&recording_broadcaster)), + ), ), + task_span.clone(), ) .await; @@ -719,18 +758,24 @@ impl TaskWorker { // The shared broadcaster has recording disabled here (web/single- // process mode), so emits flow straight to live SSE subscribers. 
let result = if let Some(ref shared) = task_broadcaster { - crate::events::with_task_identity( - task_identity.clone(), - crate::events::with_recording_broadcaster( - shared.clone(), - handler.handle(&task, &self.db, task_broadcaster.as_ref()), + tracing::Instrument::instrument( + crate::events::with_task_identity( + task_identity.clone(), + crate::events::with_recording_broadcaster( + shared.clone(), + handler.handle(&task, &self.db, task_broadcaster.as_ref()), + ), ), + task_span.clone(), ) .await } else { - crate::events::with_task_identity( - task_identity.clone(), - handler.handle(&task, &self.db, task_broadcaster.as_ref()), + tracing::Instrument::instrument( + crate::events::with_task_identity( + task_identity.clone(), + handler.handle(&task, &self.db, task_broadcaster.as_ref()), + ), + task_span.clone(), ) .await }; diff --git a/tests/api/mod.rs b/tests/api/mod.rs index 51163162..5f9ea53f 100644 --- a/tests/api/mod.rs +++ b/tests/api/mod.rs @@ -26,6 +26,8 @@ mod library_jobs; mod metadata_locks; mod metadata_reset; mod metrics; +mod observability; +mod observability_proxy; mod oidc; mod opds; mod opds2; diff --git a/tests/api/observability.rs b/tests/api/observability.rs new file mode 100644 index 00000000..caf99db3 --- /dev/null +++ b/tests/api/observability.rs @@ -0,0 +1,151 @@ +//! HTTP integration tests for the OpenTelemetry middleware wiring. +//! +//! Phase 1 of the OTLP plan installs the `axum-tracing-opentelemetry` layers +//! into the router behind a config flag. These tests cover the wiring +//! decisions we make, not the end-to-end propagation behavior of the layers +//! themselves (which require a real SDK runtime + collector to observe +//! correctly and are validated by the manual SigNoz smoke test in the plan). +//! +//! What we DO test here: +//! - When observability is disabled in config, no OTel response headers +//! appear (the layers are absent). +//! - The OTel layer + tracer bridge attaches a valid trace context to a +//! 
span when scoped through `with_default`. This confirms our provider +//! construction is correct without polluting the global subscriber slot, +//! which would conflict with other tests' use of `tracing_test`. + +#![cfg(feature = "observability")] + +#[path = "../common/mod.rs"] +mod common; + +use codex::api::routes::create_router; +use codex::config::{Config, ObservabilityConfig, OtlpConfig, OtlpProtocol}; +use common::*; +use hyper::StatusCode; +use tracing_subscriber::layer::SubscriberExt; + +fn base_observability_cfg(enabled: bool) -> ObservabilityConfig { + ObservabilityConfig { + enabled, + service_name: "codex-tests".to_string(), + otlp: OtlpConfig { + // Unreachable endpoint by design: tests only verify layer wiring, + // not real export. + endpoint: "http://127.0.0.1:1".to_string(), + protocol: OtlpProtocol::HttpProtobuf, + headers: Default::default(), + timeout_ms: 100, + }, + traces: codex::config::ObservabilityTracesConfig { + enabled: true, + sample_ratio: 1.0, + }, + metrics: codex::config::ObservabilityMetricsConfig { + enabled: false, + export_interval_ms: 60_000, + }, + browser: Default::default(), + } +} + +fn config_with_observability(enabled: bool) -> Config { + let mut config = create_test_config(); + config.observability = base_observability_cfg(enabled); + config +} + +#[tokio::test] +async fn disabled_router_does_not_inject_traceparent() { + let (db, _temp_dir) = setup_test_db().await; + let (state, _router) = setup_test_app(db).await; + + let config = config_with_observability(false); + let app = create_router(state, &config); + + let incoming_traceparent = "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"; + let request = axum::http::Request::builder() + .method("GET") + .uri("/health") + .header("traceparent", incoming_traceparent) + .body(String::new()) + .unwrap(); + + let (status, headers, _body) = make_full_request(app, request).await; + assert_eq!(status, StatusCode::OK); + + assert!( + headers.get("traceparent").is_none(), 
+ "no traceparent should appear in response when observability is disabled" + ); +} + +#[tokio::test] +async fn enabled_router_health_still_responds() { + // Confirms layers don't break basic request handling when observability + // is enabled. End-to-end traceparent propagation is validated manually + // against a live collector (see Phase 1 manual verification task). + let handle = codex::observability::init(&base_observability_cfg(true)) + .expect("init OTel providers for the enabled-router smoke test"); + + let (db, _temp_dir) = setup_test_db().await; + let (state, _router) = setup_test_app(db).await; + + let config = config_with_observability(true); + let app = create_router(state, &config); + + let request = axum::http::Request::builder() + .method("GET") + .uri("/health") + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01", + ) + .body(String::new()) + .unwrap(); + + let (status, _headers, _body) = make_full_request(app, request).await; + assert_eq!(status, StatusCode::OK); + handle.shutdown(); +} + +#[tokio::test] +async fn otel_bridge_attaches_valid_trace_context_to_spans() { + use opentelemetry::trace::TraceContextExt; + use tracing_opentelemetry::OpenTelemetrySpanExt; + + // We deliberately scope the subscriber with `with_default` instead of + // calling `init()` because installing a global subscriber from a test + // would conflict with other tests (e.g. `#[tracing_test::traced_test]`) + // that need to install their own. + let handle = codex::observability::init(&base_observability_cfg(true)) + .expect("init OTel providers for the bridge test"); + let tracer = handle.tracer().cloned().expect("tracer should exist"); + let subscriber = + tracing_subscriber::registry().with(tracing_opentelemetry::layer().with_tracer(tracer)); + + tracing::subscriber::with_default(subscriber, || { + // Mirror the call OtelAxumLayer makes internally (TRACE level on the + // "otel::tracing" target). 
If the bridge is wired the span carries a + // valid OTel SpanContext with a non-INVALID trace_id. + let span = tracing::span!( + target: "otel::tracing", + tracing::Level::TRACE, + "phase1_smoke" + ); + let _entered = span.enter(); + let ctx = tracing::Span::current().context(); + let span = ctx.span(); + let span_ctx = span.span_context(); + assert!( + span_ctx.trace_id() != opentelemetry::trace::TraceId::INVALID, + "tracer + tracing-opentelemetry bridge must produce a valid trace ID" + ); + assert!( + span_ctx.span_id() != opentelemetry::trace::SpanId::INVALID, + "tracer + tracing-opentelemetry bridge must produce a valid span ID" + ); + }); + + handle.shutdown(); +} diff --git a/tests/api/observability_proxy.rs b/tests/api/observability_proxy.rs new file mode 100644 index 00000000..1c0e48b5 --- /dev/null +++ b/tests/api/observability_proxy.rs @@ -0,0 +1,258 @@ +//! Integration tests for the browser RUM bootstrap + OTLP forwarding proxy +//! (Phase 4 of the observability plan). +//! +//! Scope: the handler layer is reqwest + axum — no OTel SDK is required to +//! exercise it. These tests cover: +//! - `/api/v1/observability/config` requires auth and reflects the +//! server-side flag state without leaking secrets. +//! - `/api/v1/observability/otlp/v1/traces` rejects when the browser +//! feature is off (503). +//! - The same path forwards the body verbatim, stamps the configured +//! auth headers, and ignores browser-supplied headers when the +//! feature is on. +//! +//! Upstream collector is faked with a local axum listener so we can +//! observe what reached it. 
+ +#[path = "../common/mod.rs"] +mod common; + +use std::sync::Arc; + +use axum::{ + Router, + body::Bytes, + extract::State, + http::{HeaderMap, StatusCode}, + routing::post, +}; +use codex::api::extractors::AppState; +use codex::api::routes::create_router; +use codex::config::{Config, ObservabilityBrowserConfig, ObservabilityConfig}; +use codex::db::repositories::UserRepository; +use codex::utils::password; +use common::*; +use hyper::Request; +use tokio::sync::Mutex; + +/// One captured upstream POST. +#[derive(Clone, Debug)] +struct CapturedRequest { + path: String, + headers: Vec<(String, String)>, + body: Vec, +} + +#[derive(Default, Clone)] +struct CaptureState { + captures: Arc>>, +} + +async fn capture_handler( + State(state): State, + headers: HeaderMap, + axum::extract::OriginalUri(uri): axum::extract::OriginalUri, + body: Bytes, +) -> StatusCode { + let header_pairs = headers + .iter() + .filter_map(|(k, v)| v.to_str().ok().map(|s| (k.to_string(), s.to_string()))) + .collect(); + state.captures.lock().await.push(CapturedRequest { + path: uri.path().to_string(), + headers: header_pairs, + body: body.to_vec(), + }); + StatusCode::OK +} + +/// Spawn a one-listener axum collector. Returns the base URL (e.g. +/// `http://127.0.0.1:PORT`) and the capture state so the test can assert +/// what arrived. +async fn spawn_capture_upstream() -> (String, CaptureState) { + let state = CaptureState::default(); + let app = Router::new() + .route("/v1/traces", post(capture_handler)) + .route("/v1/metrics", post(capture_handler)) + .with_state(state.clone()); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + (format!("http://{}", addr), state) +} + +/// Build an observability config that points at the given upstream and has +/// the browser proxy enabled (or not). 
+fn observability_config( + upstream: &str, + browser_enabled: bool, + extra_headers: Vec<(String, String)>, +) -> ObservabilityConfig { + let mut cfg = ObservabilityConfig { + browser: ObservabilityBrowserConfig { + enabled: browser_enabled, + proxy_path: "/api/v1/observability/otlp".to_string(), + sample_ratio: 0.25, + }, + ..ObservabilityConfig::default() + }; + cfg.otlp.endpoint = upstream.to_string(); + cfg.otlp.timeout_ms = 2000; + for (k, v) in extra_headers { + cfg.otlp.headers.insert(k, v); + } + cfg.service_name = "codex-test".to_string(); + cfg +} + +/// Build an AppState that uses the supplied observability config. +async fn app_state_with_observability( + db: sea_orm::DatabaseConnection, + obs: ObservabilityConfig, +) -> Arc { + let mut state = (*create_test_app_state(db).await).clone(); + state.observability_config = Arc::new(obs); + Arc::new(state) +} + +async fn bootstrap_user( + db: &sea_orm::DatabaseConnection, + username: &str, +) -> (codex::db::entities::users::Model, String) { + let pwd_hash = password::hash_password("hunter2-for-the-tests").unwrap(); + let user = create_test_user( + username, + &format!("{username}@example.com"), + &pwd_hash, + false, + ); + UserRepository::create(db, &user).await.unwrap(); + (user, "hunter2-for-the-tests".to_string()) +} + +fn router_for(state: Arc) -> Router { + let config = Config::default(); + create_router(state, &config) +} + +#[tokio::test] +async fn observability_config_requires_auth() { + let (db, _temp) = setup_test_db().await; + let state = create_test_app_state(db).await; + let app = router_for(state); + + let request = Request::builder() + .method("GET") + .uri("/api/v1/observability/config") + .body(String::new()) + .unwrap(); + let (status, _body) = make_request(app, request).await; + assert_eq!(status, StatusCode::UNAUTHORIZED); +} + +#[tokio::test] +async fn observability_config_returns_disabled_payload_by_default() { + let (db, _temp) = setup_test_db().await; + let state = 
create_test_app_state(db).await; + let (user, _) = bootstrap_user(&state.db, "obs_default").await; + let token = generate_test_token(&state, &user); + let app = router_for(state); + + let request = get_request_with_auth("/api/v1/observability/config", &token); + let (status, body) = make_json_request::(app, request).await; + assert_eq!(status, StatusCode::OK); + let payload = body.expect("config payload"); + assert_eq!(payload["enabled"], serde_json::Value::Bool(false)); + assert_eq!(payload["proxyPath"], "/api/v1/observability/otlp"); + assert_eq!(payload["serviceName"], "codex"); +} + +#[tokio::test] +async fn observability_config_advertises_enabled_when_browser_on() { + let (db, _temp) = setup_test_db().await; + let obs = observability_config("http://example.invalid:4318", true, vec![]); + let state = app_state_with_observability(db, obs).await; + let (user, _) = bootstrap_user(&state.db, "obs_enabled").await; + let token = generate_test_token(&state, &user); + let app = router_for(state); + + let request = get_request_with_auth("/api/v1/observability/config", &token); + let (status, body) = make_json_request::(app, request).await; + assert_eq!(status, StatusCode::OK); + let payload = body.expect("config payload"); + assert_eq!(payload["enabled"], serde_json::Value::Bool(true)); + assert_eq!(payload["sampleRatio"], 0.25); + assert_eq!(payload["serviceName"], "codex-test"); +} + +#[tokio::test] +async fn otlp_proxy_rejects_when_browser_disabled() { + let (db, _temp) = setup_test_db().await; + let state = create_test_app_state(db).await; // browser_enabled=false by default + let (user, _) = bootstrap_user(&state.db, "obs_disabled_proxy").await; + let token = generate_test_token(&state, &user); + let app = router_for(state); + + let request = Request::builder() + .method("POST") + .uri("/api/v1/observability/otlp/v1/traces") + .header("authorization", format!("Bearer {token}")) + .header("content-type", "application/x-protobuf") + .body(String::from("anything")) + 
.unwrap(); + let (status, _body) = make_request(app, request).await; + assert_eq!(status, StatusCode::SERVICE_UNAVAILABLE); +} + +#[tokio::test] +async fn otlp_proxy_forwards_body_and_headers() { + let (upstream_url, capture) = spawn_capture_upstream().await; + let (db, _temp) = setup_test_db().await; + let obs = observability_config( + &upstream_url, + true, + vec![("x-tenant".to_string(), "test-tenant".to_string())], + ); + let state = app_state_with_observability(db, obs).await; + let (user, _) = bootstrap_user(&state.db, "obs_forward").await; + let token = generate_test_token(&state, &user); + let app = router_for(state); + + let payload = b"\x0aFAKE-OTLP-PROTO-BYTES".to_vec(); + let request = Request::builder() + .method("POST") + .uri("/api/v1/observability/otlp/v1/traces") + .header("authorization", format!("Bearer {token}")) + .header("content-type", "application/x-protobuf") + // Browser-supplied header that should NOT be forwarded. + .header("x-tenant", "evil-spoof") + .body(String::from_utf8_lossy(&payload).to_string()) + .unwrap(); + let (status, _body) = make_request(app, request).await; + assert_eq!(status, StatusCode::OK, "proxy should pass through 200"); + + let captured = capture.captures.lock().await.clone(); + assert_eq!(captured.len(), 1, "exactly one upstream POST should arrive"); + let c = &captured[0]; + assert_eq!(c.path, "/v1/traces"); + assert_eq!(c.body, payload, "body should reach upstream unmodified"); + let content_type = c + .headers + .iter() + .find(|(k, _)| k.eq_ignore_ascii_case("content-type")) + .map(|(_, v)| v.as_str()); + assert_eq!(content_type, Some("application/x-protobuf")); + let tenant = c + .headers + .iter() + .find(|(k, _)| k.eq_ignore_ascii_case("x-tenant")) + .map(|(_, v)| v.as_str()); + assert_eq!( + tenant, + Some("test-tenant"), + "operator-configured header must win; browser-supplied value is dropped" + ); +} diff --git a/tests/api/oidc.rs b/tests/api/oidc.rs index 7b311341..46187d03 100644 --- 
a/tests/api/oidc.rs +++ b/tests/api/oidc.rs @@ -11,8 +11,8 @@ use codex::api::extractors::AppState; use codex::api::extractors::auth::UserAuthCache; use codex::api::routes::create_router; use codex::config::{ - AuthConfig, Config, DatabaseConfig, EmailConfig, FilesConfig, OidcConfig, OidcDefaultRole, - OidcProviderConfig, PdfConfig, + AuthConfig, Config, DatabaseConfig, EmailConfig, FilesConfig, ObservabilityConfig, OidcConfig, + OidcDefaultRole, OidcProviderConfig, PdfConfig, }; use codex::events::EventBroadcaster; use codex::services::email::EmailService; @@ -84,6 +84,7 @@ async fn create_test_state_with_oidc( auth_config, database_config, pdf_config, + observability_config: Arc::new(ObservabilityConfig::default()), email_service, event_broadcaster, settings_service, diff --git a/tests/api/pdf_cache.rs b/tests/api/pdf_cache.rs index b9eb4443..3d021de3 100644 --- a/tests/api/pdf_cache.rs +++ b/tests/api/pdf_cache.rs @@ -10,7 +10,9 @@ use codex::api::routes::v1::dto::{ PdfCacheCleanupResultDto, PdfCacheStatsDto, PdfHandleCacheClearResultDto, PdfHandleCacheStatsDto, TriggerPdfCacheCleanupResponse, }; -use codex::config::{AuthConfig, DatabaseConfig, EmailConfig, FilesConfig, PdfConfig}; +use codex::config::{ + AuthConfig, DatabaseConfig, EmailConfig, FilesConfig, ObservabilityConfig, PdfConfig, +}; use codex::db::repositories::UserRepository; use codex::events::EventBroadcaster; use codex::parsers::pdf::{open_pdf_document, renderer}; @@ -81,6 +83,7 @@ async fn create_test_app_state_with_pdf_cache( auth_config, database_config, pdf_config, + observability_config: Arc::new(ObservabilityConfig::default()), email_service, event_broadcaster, settings_service, diff --git a/tests/api/rate_limit.rs b/tests/api/rate_limit.rs index 17642f39..e5251a50 100644 --- a/tests/api/rate_limit.rs +++ b/tests/api/rate_limit.rs @@ -64,7 +64,9 @@ async fn create_rate_limited_app_state( db: sea_orm::DatabaseConnection, config: &RateLimitConfig, ) -> Arc { - use codex::config::{AuthConfig, 
DatabaseConfig, EmailConfig, FilesConfig, PdfConfig}; + use codex::config::{ + AuthConfig, DatabaseConfig, EmailConfig, FilesConfig, ObservabilityConfig, PdfConfig, + }; use codex::events::EventBroadcaster; use codex::services::email::EmailService; use codex::services::{ @@ -114,6 +116,7 @@ async fn create_rate_limited_app_state( auth_config, database_config, pdf_config, + observability_config: Arc::new(ObservabilityConfig::default()), email_service, event_broadcaster, settings_service, diff --git a/tests/api/refresh_token.rs b/tests/api/refresh_token.rs index cd726f5b..b4c5f747 100644 --- a/tests/api/refresh_token.rs +++ b/tests/api/refresh_token.rs @@ -16,7 +16,9 @@ use codex::api::routes::create_router; use codex::api::routes::v1::dto::auth::{ LoginRequest, LoginResponse, LogoutRequest, RefreshRequest, TokenPair, }; -use codex::config::{AuthConfig, DatabaseConfig, EmailConfig, FilesConfig, PdfConfig}; +use codex::config::{ + AuthConfig, DatabaseConfig, EmailConfig, FilesConfig, ObservabilityConfig, PdfConfig, +}; use codex::db::repositories::{NewRefreshToken, RefreshTokenRepository, UserRepository}; use codex::events::EventBroadcaster; use codex::services::email::EmailService; @@ -77,6 +79,7 @@ async fn build_state(db: DatabaseConnection, refresh_enabled: bool) -> Arc Arc Arc { auth_config, database_config, pdf_config, + observability_config: Arc::new(ObservabilityConfig::default()), email_service, event_broadcaster, settings_service, @@ -123,6 +126,7 @@ pub async fn create_test_app_state(db: DatabaseConnection) -> Arc { auth_config, database_config, pdf_config, + observability_config: Arc::new(ObservabilityConfig::default()), email_service, event_broadcaster, settings_service, @@ -212,6 +216,7 @@ pub async fn create_test_router(state: Arc) -> Router { auth_config, database_config, pdf_config, + observability_config: Arc::new(ObservabilityConfig::default()), email_service, event_broadcaster, settings_service, diff --git a/web/openapi.json b/web/openapi.json 
index d07ef05c..c878be7c 100644 --- a/web/openapi.json +++ b/web/openapi.json @@ -7423,6 +7423,119 @@ ] } }, + "/api/v1/observability/config": { + "get": { + "tags": [ + "Observability" + ], + "summary": "Return the configuration the browser SDK needs to bootstrap itself.", + "description": "Authenticated to keep the response (which leaks the sample ratio /\nproxy path / service name) inside the existing trust boundary;\neverything sensitive (endpoint, headers) stays server-side.", + "operationId": "get_browser_config", + "responses": { + "200": { + "description": "Browser SDK bootstrap config", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BrowserObservabilityConfigDto" + } + } + } + }, + "401": { + "description": "Unauthorized" + } + }, + "security": [ + { + "jwt_bearer": [] + }, + { + "api_key": [] + } + ] + } + }, + "/api/v1/observability/otlp/v1/metrics": { + "post": { + "tags": [ + "Observability" + ], + "summary": "Forward a batched OTLP/HTTP metrics payload to the configured upstream.", + "operationId": "proxy_metrics", + "requestBody": { + "description": "OTLP/HTTP metrics payload (protobuf or JSON)", + "content": { + "application/x-protobuf": {} + } + }, + "responses": { + "200": { + "description": "Forwarded successfully" + }, + "400": { + "description": "Payload too large" + }, + "401": { + "description": "Unauthorized" + }, + "502": { + "description": "Upstream collector error" + }, + "503": { + "description": "Browser observability disabled" + } + }, + "security": [ + { + "jwt_bearer": [] + }, + { + "api_key": [] + } + ] + } + }, + "/api/v1/observability/otlp/v1/traces": { + "post": { + "tags": [ + "Observability" + ], + "summary": "Forward a batched OTLP/HTTP traces payload to the configured upstream.", + "operationId": "proxy_traces", + "requestBody": { + "description": "OTLP/HTTP traces payload (protobuf or JSON)", + "content": { + "application/x-protobuf": {} + } + }, + "responses": { + "200": { + 
"description": "Forwarded successfully" + }, + "400": { + "description": "Payload too large" + }, + "401": { + "description": "Unauthorized" + }, + "502": { + "description": "Upstream collector error" + }, + "503": { + "description": "Browser observability disabled" + } + }, + "security": [ + { + "jwt_bearer": [] + }, + { + "api_key": [] + } + ] + } + }, "/api/v1/plugins/actions": { "get": { "tags": [ @@ -21675,6 +21788,38 @@ "parentPath": "/home/user" } }, + "BrowserObservabilityConfigDto": { + "type": "object", + "description": "Browser RUM bootstrap configuration returned by\n`GET /api/v1/observability/config`.", + "required": [ + "enabled", + "serviceName", + "proxyPath", + "sampleRatio" + ], + "properties": { + "enabled": { + "type": "boolean", + "description": "Whether the browser SDK should initialize. False means the SDK\nbootstrap is a no-op even if the script is loaded." + }, + "proxyPath": { + "type": "string", + "description": "Same-origin path prefix on the Codex server where the browser SDK\nshould POST OTLP batches. The SDK appends `/v1/traces` and\n`/v1/metrics` to this base.", + "example": "/api/v1/observability/otlp" + }, + "sampleRatio": { + "type": "number", + "format": "double", + "description": "Parent-based sampling ratio applied client-side. 
Browsers are noisy;\ndefault low.", + "example": 0.1 + }, + "serviceName": { + "type": "string", + "description": "`service.name` resource attribute the browser SDK should set on\nevery span (matches the backend service name unless the operator\noverrode it specifically for the browser).", + "example": "codex-web" + } + } + }, "BulkAnalyzeBooksRequest": { "type": "object", "description": "Request to perform bulk analyze operations on multiple books", @@ -41463,6 +41608,10 @@ "name": "Metrics", "description": "Application metrics and statistics" }, + { + "name": "Observability", + "description": "Browser RUM bootstrap configuration and OTLP forwarding proxy" + }, { "name": "Filesystem", "description": "Filesystem browsing for library paths" @@ -41553,6 +41702,7 @@ "Plugins", "Plugin Actions", "Metrics", + "Observability", "Filesystem", "Duplicates", "Sharing Tags" diff --git a/web/package-lock.json b/web/package-lock.json index 22740ebb..4926ec33 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -13,6 +13,16 @@ "@mantine/form": "^8.3.15", "@mantine/hooks": "^8.3.15", "@mantine/notifications": "^8.3.15", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/context-zone": "^2.1.0", + "@opentelemetry/exporter-trace-otlp-http": "^0.207.0", + "@opentelemetry/instrumentation-document-load": "^0.51.0", + "@opentelemetry/instrumentation-fetch": "^0.207.0", + "@opentelemetry/instrumentation-user-interaction": "^0.51.0", + "@opentelemetry/resources": "^2.1.0", + "@opentelemetry/sdk-trace-base": "^2.1.0", + "@opentelemetry/sdk-trace-web": "^2.1.0", + "@opentelemetry/semantic-conventions": "^1.38.0", "@tabler/icons-react": "^3.37.1", "@tanstack/react-query": "^5.90.21", "@tanstack/react-router": "^1.162.1", @@ -3027,6 +3037,604 @@ "dev": true, "license": "MIT" }, + "node_modules/@opentelemetry/api": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.1.tgz", + "integrity": 
"sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/api-logs": { + "version": "0.205.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.205.0.tgz", + "integrity": "sha512-wBlPk1nFB37Hsm+3Qy73yQSobVn28F4isnWIBvKpd5IUH/eat8bwcL02H9yzmHyyPmukeccSl2mbN5sDQZYnPg==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.3.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/context-zone": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/context-zone/-/context-zone-2.7.1.tgz", + "integrity": "sha512-B42kO3zIMVbJ+wj5nlSkDvLF8cJY+7wDKLomHp10GL00nvUnhY67UQ/soZQgKR4dvPf8zTKbcONDsOiJLyRuXw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/context-zone-peer-dep": "2.7.1", + "zone.js": "^0.11.0 || ^0.12.0 || ^0.13.0 || ^0.14.0 || ^0.15.0 || ^0.16.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + } + }, + "node_modules/@opentelemetry/context-zone-peer-dep": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/context-zone-peer-dep/-/context-zone-peer-dep-2.7.1.tgz", + "integrity": "sha512-QPLvl82Ds+W9Tjz0s4b8UDUK9YkCb3pvaur4JQdgHe+eph6Ii20NbiC+wsdnBtG17DTPhmZcFvWMcQXZFBgeVw==", + "license": "Apache-2.0", + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0", + "zone.js": "^0.10.2 || ^0.11.0 || ^0.12.0 || ^0.13.0 || ^0.14.0 || ^0.15.0 || ^0.16.0" + } + }, + "node_modules/@opentelemetry/core": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.2.0.tgz", + "integrity": "sha512-FuabnnUm8LflnieVxs6eP7Z383hgQU4W1e3KJS6aOG3RxWxcHyBxH8fDMHNgu/gFx/M2jvTOW/4/PHhLz6bjWw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/semantic-conventions": "^1.29.0" 
+ }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/exporter-trace-otlp-http": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/exporter-trace-otlp-http/-/exporter-trace-otlp-http-0.207.0.tgz", + "integrity": "sha512-HSRBzXHIC7C8UfPQdu15zEEoBGv0yWkhEwxqgPCHVUKUQ9NLHVGXkVrf65Uaj7UwmAkC1gQfkuVYvLlD//AnUQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/otlp-exporter-base": "0.207.0", + "@opentelemetry/otlp-transformer": "0.207.0", + "@opentelemetry/resources": "2.2.0", + "@opentelemetry/sdk-trace-base": "2.2.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/exporter-trace-otlp-http/node_modules/@opentelemetry/resources": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.2.0.tgz", + "integrity": "sha512-1pNQf/JazQTMA0BiO5NINUzH0cbLbbl7mntLa4aJNmCCXSj0q03T5ZXXL0zw4G55TjdL9Tz32cznGClf+8zr5A==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/exporter-trace-otlp-http/node_modules/@opentelemetry/sdk-trace-base": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.2.0.tgz", + "integrity": "sha512-xWQgL0Bmctsalg6PaXExmzdedSp3gyKV8mQBwK/j9VGdCDu2fmXIb2gAehBKbkXCpJ4HPkgv3QfoJWRT4dHWbw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/resources": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + 
"peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/instrumentation": { + "version": "0.205.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/instrumentation/-/instrumentation-0.205.0.tgz", + "integrity": "sha512-cgvm7tvQdu9Qo7VurJP84wJ7ZV9F6WqDDGZpUc6rUEXwjV7/bXWs0kaYp9v+1Vh1+3TZCD3i6j/lUBcPhu8NhA==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.205.0", + "import-in-the-middle": "^1.8.1", + "require-in-the-middle": "^7.1.1" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/instrumentation-document-load": { + "version": "0.51.2", + "resolved": "https://registry.npmjs.org/@opentelemetry/instrumentation-document-load/-/instrumentation-document-load-0.51.2.tgz", + "integrity": "sha512-9ZhLEt8qGUFtOqhl/+ANQsZITbl502YF2vDovsKXbiODOjD3a73rgMXe4YEKv7a0Q6inREzZNqVrgKtAmeHnMw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "^2.0.0", + "@opentelemetry/instrumentation": "^0.205.0", + "@opentelemetry/sdk-trace-web": "^2.0.0", + "@opentelemetry/semantic-conventions": "^1.23.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/instrumentation-fetch": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/instrumentation-fetch/-/instrumentation-fetch-0.207.0.tgz", + "integrity": "sha512-Urqh7w/KIGNYeaRf5Ba9FdJYCUF/g8RpiyywsMRc8sTK6hyQsn2p2vh+MzUQacQ7vZPzBc2u1l2034sIMhvGzA==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/instrumentation": "0.207.0", + "@opentelemetry/sdk-trace-web": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + 
"node_modules/@opentelemetry/instrumentation-fetch/node_modules/@opentelemetry/api-logs": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.207.0.tgz", + "integrity": "sha512-lAb0jQRVyleQQGiuuvCOTDVspc14nx6XJjP4FspJ1sNARo3Regq4ZZbrc3rN4b1TYSuUCvgH+UXUPug4SLOqEQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.3.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/instrumentation-fetch/node_modules/@opentelemetry/instrumentation": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/instrumentation/-/instrumentation-0.207.0.tgz", + "integrity": "sha512-y6eeli9+TLKnznrR8AZlQMSJT7wILpXH+6EYq5Vf/4Ao+huI7EedxQHwRgVUOMLFbe7VFDvHJrX9/f4lcwnJsA==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.207.0", + "import-in-the-middle": "^2.0.0", + "require-in-the-middle": "^8.0.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/instrumentation-fetch/node_modules/@opentelemetry/resources": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.2.0.tgz", + "integrity": "sha512-1pNQf/JazQTMA0BiO5NINUzH0cbLbbl7mntLa4aJNmCCXSj0q03T5ZXXL0zw4G55TjdL9Tz32cznGClf+8zr5A==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/instrumentation-fetch/node_modules/@opentelemetry/sdk-trace-base": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.2.0.tgz", + "integrity": 
"sha512-xWQgL0Bmctsalg6PaXExmzdedSp3gyKV8mQBwK/j9VGdCDu2fmXIb2gAehBKbkXCpJ4HPkgv3QfoJWRT4dHWbw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/resources": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/instrumentation-fetch/node_modules/@opentelemetry/sdk-trace-web": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-web/-/sdk-trace-web-2.2.0.tgz", + "integrity": "sha512-x/LHsDBO3kfqaFx5qSzBljJ5QHsRXrvS4MybBDy1k7Svidb8ZyIPudWVzj3s5LpPkYZIgi9e+7tdsNCnptoelw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/sdk-trace-base": "2.2.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/instrumentation-fetch/node_modules/cjs-module-lexer": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-2.2.0.tgz", + "integrity": "sha512-4bHTS2YuzUvtoLjdy+98ykbNB5jS0+07EvFNXerqZQJ89F7DI6ET7OQo/HJuW6K0aVsKA9hj9/RVb2kQVOrPDQ==", + "license": "MIT" + }, + "node_modules/@opentelemetry/instrumentation-fetch/node_modules/import-in-the-middle": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/import-in-the-middle/-/import-in-the-middle-2.0.6.tgz", + "integrity": "sha512-3vZV3jX0XRFW3EJDTwzWoZa+RH1b8eTTx6YOCjglrLyPuepwoBti1k3L2dKwdCUrnVEfc5CuRuGstaC/uQJJaw==", + "license": "Apache-2.0", + "dependencies": { + "acorn": "^8.15.0", + "acorn-import-attributes": "^1.9.5", + "cjs-module-lexer": "^2.2.0", + "module-details-from-path": "^1.0.4" + } + }, + "node_modules/@opentelemetry/instrumentation-fetch/node_modules/require-in-the-middle": { + "version": "8.0.1", + "resolved": 
"https://registry.npmjs.org/require-in-the-middle/-/require-in-the-middle-8.0.1.tgz", + "integrity": "sha512-QT7FVMXfWOYFbeRBF6nu+I6tr2Tf3u0q8RIEjNob/heKY/nh7drD/k7eeMFmSQgnTtCzLDcCu/XEnpW2wk4xCQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.3.5", + "module-details-from-path": "^1.0.3" + }, + "engines": { + "node": ">=9.3.0 || >=8.10.0 <9.0.0" + } + }, + "node_modules/@opentelemetry/instrumentation-user-interaction": { + "version": "0.51.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/instrumentation-user-interaction/-/instrumentation-user-interaction-0.51.0.tgz", + "integrity": "sha512-v7LfzdGlbu3+/CXoK1PG8m05s3mz2K+03sOPsq7Y0HoJ1JRMCwF0uMUujeMDcno3EVImHkngzfH+/F/cjRggUw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "^2.0.0", + "@opentelemetry/instrumentation": "^0.206.0", + "@opentelemetry/sdk-trace-web": "^2.0.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0", + "zone.js": "^0.11.4 || ^0.13.0 || ^0.14.0 || ^0.15.0" + } + }, + "node_modules/@opentelemetry/instrumentation-user-interaction/node_modules/@opentelemetry/api-logs": { + "version": "0.206.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.206.0.tgz", + "integrity": "sha512-yIVDu9jX//nV5wSMLZLdHdb1SKHIMj9k+wQVFtln5Flcgdldz9BkHtavvExQiJqBZg2OpEEJEZmzQazYztdz2A==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.3.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/instrumentation-user-interaction/node_modules/@opentelemetry/instrumentation": { + "version": "0.206.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/instrumentation/-/instrumentation-0.206.0.tgz", + "integrity": "sha512-anPU9GAn3vSH/0JFQZ4e626xRw8p8R21kxM7xammFk9BRhfDw1IpgqvFMllbb+1MSHHEX9EiUqYHJyWo/B6KGA==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.206.0", + "import-in-the-middle": 
"^1.8.1", + "require-in-the-middle": "^8.0.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/instrumentation-user-interaction/node_modules/require-in-the-middle": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/require-in-the-middle/-/require-in-the-middle-8.0.1.tgz", + "integrity": "sha512-QT7FVMXfWOYFbeRBF6nu+I6tr2Tf3u0q8RIEjNob/heKY/nh7drD/k7eeMFmSQgnTtCzLDcCu/XEnpW2wk4xCQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.3.5", + "module-details-from-path": "^1.0.3" + }, + "engines": { + "node": ">=9.3.0 || >=8.10.0 <9.0.0" + } + }, + "node_modules/@opentelemetry/otlp-exporter-base": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-exporter-base/-/otlp-exporter-base-0.207.0.tgz", + "integrity": "sha512-4RQluMVVGMrHok/3SVeSJ6EnRNkA2MINcX88sh+d/7DjGUrewW/WT88IsMEci0wUM+5ykTpPPNbEOoW+jwHnbw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/otlp-transformer": "0.207.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + "node_modules/@opentelemetry/otlp-transformer": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-transformer/-/otlp-transformer-0.207.0.tgz", + "integrity": "sha512-+6DRZLqM02uTIY5GASMZWUwr52sLfNiEe20+OEaZKhztCs3+2LxoTjb6JxFRd9q1qNqckXKYlUKjbH/AhG8/ZA==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.207.0", + "@opentelemetry/core": "2.2.0", + "@opentelemetry/resources": "2.2.0", + "@opentelemetry/sdk-logs": "0.207.0", + "@opentelemetry/sdk-metrics": "2.2.0", + "@opentelemetry/sdk-trace-base": "2.2.0", + "protobufjs": "^7.3.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.3.0" + } + }, + 
"node_modules/@opentelemetry/otlp-transformer/node_modules/@opentelemetry/api-logs": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.207.0.tgz", + "integrity": "sha512-lAb0jQRVyleQQGiuuvCOTDVspc14nx6XJjP4FspJ1sNARo3Regq4ZZbrc3rN4b1TYSuUCvgH+UXUPug4SLOqEQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.3.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/otlp-transformer/node_modules/@opentelemetry/resources": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.2.0.tgz", + "integrity": "sha512-1pNQf/JazQTMA0BiO5NINUzH0cbLbbl7mntLa4aJNmCCXSj0q03T5ZXXL0zw4G55TjdL9Tz32cznGClf+8zr5A==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/otlp-transformer/node_modules/@opentelemetry/sdk-trace-base": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.2.0.tgz", + "integrity": "sha512-xWQgL0Bmctsalg6PaXExmzdedSp3gyKV8mQBwK/j9VGdCDu2fmXIb2gAehBKbkXCpJ4HPkgv3QfoJWRT4dHWbw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/resources": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/resources": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.7.1.tgz", + "integrity": "sha512-DeT6KKolmC4e/dRQvMQ/RwlnzhaqeiFOXY5ngoOPJ07GgVVKxZOg9EcrNZb5aTzUn+iCrJldAgOfQm1O/QfPAQ==", + "license": "Apache-2.0", + "dependencies": { + 
"@opentelemetry/core": "2.7.1", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/resources/node_modules/@opentelemetry/core": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.7.1.tgz", + "integrity": "sha512-QAqIj32AtK6+pEVNG7EOVxHdE06RP+FM5qpiEJ4RtDcFIqKUZHYhl7/7UY5efhwmwNAg7j8QbJVBLxMerc0+gw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-logs": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-logs/-/sdk-logs-0.207.0.tgz", + "integrity": "sha512-4MEQmn04y+WFe6cyzdrXf58hZxilvY59lzZj2AccuHW/+BxLn/rGVN/Irsi/F0qfBOpMOrrCLKTExoSL2zoQmg==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api-logs": "0.207.0", + "@opentelemetry/core": "2.2.0", + "@opentelemetry/resources": "2.2.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.4.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-logs/node_modules/@opentelemetry/api-logs": { + "version": "0.207.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.207.0.tgz", + "integrity": "sha512-lAb0jQRVyleQQGiuuvCOTDVspc14nx6XJjP4FspJ1sNARo3Regq4ZZbrc3rN4b1TYSuUCvgH+UXUPug4SLOqEQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.3.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/@opentelemetry/sdk-logs/node_modules/@opentelemetry/resources": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.2.0.tgz", + "integrity": 
"sha512-1pNQf/JazQTMA0BiO5NINUzH0cbLbbl7mntLa4aJNmCCXSj0q03T5ZXXL0zw4G55TjdL9Tz32cznGClf+8zr5A==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-metrics": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.2.0.tgz", + "integrity": "sha512-G5KYP6+VJMZzpGipQw7Giif48h6SGQ2PFKEYCybeXJsOCB4fp8azqMAAzE5lnnHK3ZVwYQrgmFbsUJO/zOnwGw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/resources": "2.2.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.9.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-metrics/node_modules/@opentelemetry/resources": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.2.0.tgz", + "integrity": "sha512-1pNQf/JazQTMA0BiO5NINUzH0cbLbbl7mntLa4aJNmCCXSj0q03T5ZXXL0zw4G55TjdL9Tz32cznGClf+8zr5A==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.2.0", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-base": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.7.1.tgz", + "integrity": "sha512-NAYIlsF8MPUsKqJMiDQJTMPOmlbawC1Iz/omMLygZ1C9am8fTKYjTaI+OZM+WTY3t3Glo0wnOg/6/pac6RGPPw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.7.1", + "@opentelemetry/resources": "2.7.1", + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + 
"peerDependencies": { + "@opentelemetry/api": ">=1.3.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-base/node_modules/@opentelemetry/core": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.7.1.tgz", + "integrity": "sha512-QAqIj32AtK6+pEVNG7EOVxHdE06RP+FM5qpiEJ4RtDcFIqKUZHYhl7/7UY5efhwmwNAg7j8QbJVBLxMerc0+gw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-web": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-web/-/sdk-trace-web-2.7.1.tgz", + "integrity": "sha512-K806OouCSOjMd8Nr7+ZCq3QT22tdAzzS/7h8vprfiKjkgFQ99/dvwU8d12WJANA6D5Qtme65hyBAqAu9CkQuxQ==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/core": "2.7.1", + "@opentelemetry/sdk-trace-base": "2.7.1" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/sdk-trace-web/node_modules/@opentelemetry/core": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.7.1.tgz", + "integrity": "sha512-QAqIj32AtK6+pEVNG7EOVxHdE06RP+FM5qpiEJ4RtDcFIqKUZHYhl7/7UY5efhwmwNAg7j8QbJVBLxMerc0+gw==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/semantic-conventions": "^1.29.0" + }, + "engines": { + "node": "^18.19.0 || >=20.6.0" + }, + "peerDependencies": { + "@opentelemetry/api": ">=1.0.0 <1.10.0" + } + }, + "node_modules/@opentelemetry/semantic-conventions": { + "version": "1.41.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.41.1.tgz", + "integrity": "sha512-/UhIkaZgPutTFmQ7RnIJGgDXZmtEJ7Dvi86xNTFWcnRxVRNk/aotsqDJYeEvDP+FSMB2SdW+pQzNMcWP0rwuNA==", + "license": 
"Apache-2.0", + "engines": { + "node": ">=14" + } + }, "node_modules/@polka/url": { "version": "1.0.0-next.29", "resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.29.tgz", @@ -3034,6 +3642,69 @@ "dev": true, "license": "MIT" }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.5.tgz", + "integrity": "sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.1.tgz", + "integrity": "sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.1.tgz", + "integrity": "sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==", + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": 
"sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.2.tgz", + "integrity": "sha512-pa0vFRuws4wkvaXKK1uXZMAwAX4/t8ANaJo45iw/oQHNQ9q5xUzwgFmVJGXiga2BeN+zpX7Vf9vmsiIa2J+MUw==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.1.tgz", + "integrity": "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==", + "license": "BSD-3-Clause" + }, "node_modules/@redocly/ajv": { "version": "8.17.3", "resolved": "https://registry.npmjs.org/@redocly/ajv/-/ajv-8.17.3.tgz", @@ -4229,7 +4900,6 @@ "version": "25.3.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-25.3.0.tgz", "integrity": "sha512-4K3bqJpXpqfg2XKGK9bpDTc6xO/xoUP/RBWS7AtRMug6zZFaRekiLzjVtAoZMquxoAbzBvy5nxQ7veS5eYzf8A==", - "dev": true, "license": "MIT", "dependencies": { "undici-types": "~7.18.0" @@ -4504,7 +5174,6 @@ "version": "8.16.0", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz", "integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==", - "dev": true, "license": "MIT", "bin": { "acorn": "bin/acorn" @@ 
-4513,6 +5182,15 @@ "node": ">=0.4.0" } }, + "node_modules/acorn-import-attributes": { + "version": "1.9.5", + "resolved": "https://registry.npmjs.org/acorn-import-attributes/-/acorn-import-attributes-1.9.5.tgz", + "integrity": "sha512-n02Vykv5uA3eHGM/Z2dQrcD56kL8TyDb2p1+0P83PClMnC/nc+anbQRhIOWnSq4Ke/KvDPrY3C9hDtC/A3eHnQ==", + "license": "MIT", + "peerDependencies": { + "acorn": "^8" + } + }, "node_modules/agent-base": { "version": "7.1.4", "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", @@ -5027,6 +5705,12 @@ "node": ">=8" } }, + "node_modules/cjs-module-lexer": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/cjs-module-lexer/-/cjs-module-lexer-1.4.3.tgz", + "integrity": "sha512-9z8TZaGM1pfswYeXrUpzPrkx8UnWYdhJclsiYMm6x/w5+nN+8Tf/LnAgfLGQCm59qAOxU8WwHEq2vNwF6i4j+Q==", + "license": "MIT" + }, "node_modules/cli-width": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/cli-width/-/cli-width-4.1.0.tgz", @@ -6830,6 +7514,18 @@ "url": "https://opencollective.com/immer" } }, + "node_modules/import-in-the-middle": { + "version": "1.15.0", + "resolved": "https://registry.npmjs.org/import-in-the-middle/-/import-in-the-middle-1.15.0.tgz", + "integrity": "sha512-bpQy+CrsRmYmoPMAE/0G33iwRqwW4ouqdRg8jgbH3aKuCtOc8lxgmYXg2dMM92CRiGP660EtBcymH/eVUpCSaA==", + "license": "Apache-2.0", + "dependencies": { + "acorn": "^8.14.0", + "acorn-import-attributes": "^1.9.5", + "cjs-module-lexer": "^1.2.2", + "module-details-from-path": "^1.0.3" + } + }, "node_modules/indent-string": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", @@ -6992,7 +7688,6 @@ "version": "2.16.2", "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.2.tgz", "integrity": "sha512-evOr8xfXKxE6qSR0hSXL2r3sd7ALj8+7jQEUvPYcm5sgZFdJ+AYzT6yNmJenvIYQBgIGwfwz08sL8zoL7yq2BA==", - "dev": true, "license": "MIT", "dependencies": { "hasown": "^2.0.3" @@ -7838,6 +8533,12 @@ "dev": true, "license": 
"MIT" }, + "node_modules/long": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==", + "license": "Apache-2.0" + }, "node_modules/longest-streak": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz", @@ -8866,6 +9567,12 @@ "node": ">=16 || 14 >=14.17" } }, + "node_modules/module-details-from-path": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/module-details-from-path/-/module-details-from-path-1.0.4.tgz", + "integrity": "sha512-EGWKgxALGMgzvxYF1UyGTy0HXX/2vHLkw6+NvDKW2jypWbHpjQuj4UMcqQWXHERJhVGKikolT06G3bcKe4fi7w==", + "license": "MIT" + }, "node_modules/motion": { "version": "12.38.0", "resolved": "https://registry.npmjs.org/motion/-/motion-12.38.0.tgz", @@ -9289,7 +9996,6 @@ "version": "1.0.7", "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", - "dev": true, "license": "MIT" }, "node_modules/path-scurry": { @@ -9629,6 +10335,30 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/protobufjs": { + "version": "7.6.1", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.6.1.tgz", + "integrity": "sha512-4K0myLaWL5EteuSAro91EGFgcfVgxb64Jx+7oDAY6GOkXD4M69yuSEljNcInGVCA5sOPxmZ/EqDLj2x0Q0+Ygg==", + "hasInstallScript": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.5", + "@protobufjs/eventemitter": "^1.1.1", + "@protobufjs/fetch": "^1.1.1", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.2", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.1", + "@types/node": ">=13.7.0", + "long": "^5.3.2" + }, + "engines": { + "node": 
">=12.0.0" + } + }, "node_modules/proxy-from-env": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", @@ -10157,11 +10887,24 @@ "node": ">=0.10.0" } }, + "node_modules/require-in-the-middle": { + "version": "7.5.2", + "resolved": "https://registry.npmjs.org/require-in-the-middle/-/require-in-the-middle-7.5.2.tgz", + "integrity": "sha512-gAZ+kLqBdHarXB64XpAe2VCjB7rIRv+mU8tfRWziHRJ5umKsIHN2tLLv6EtMw7WCdP19S0ERVMldNvxYCHnhSQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.3.5", + "module-details-from-path": "^1.0.3", + "resolve": "^1.22.8" + }, + "engines": { + "node": ">=8.6.0" + } + }, "node_modules/resolve": { "version": "1.22.12", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.12.tgz", "integrity": "sha512-TyeJ1zif53BPfHootBGwPRYT1RUt6oGWsaQr8UyZW/eAm9bKoijtvruSDEmZHm92CwS9nj7/fWttqPCgzep8CA==", - "dev": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0", @@ -10937,7 +11680,6 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", - "dev": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -11382,7 +12124,6 @@ "version": "7.18.2", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", - "dev": true, "license": "MIT" }, "node_modules/unicode-canonical-property-names-ecmascript": { @@ -12586,6 +13327,12 @@ "url": "https://github.com/sponsors/colinhacks" } }, + "node_modules/zone.js": { + "version": "0.15.1", + "resolved": "https://registry.npmjs.org/zone.js/-/zone.js-0.15.1.tgz", + "integrity": "sha512-XE96n56IQpJM7NAoXswY3XRLcWFW83xe0BiAOeMD7K5k5xecOeul3Qcpx6GqEeeHNkW5DWL5zOyTbEfB4eti8w==", + "license": "MIT" + }, "node_modules/zustand": 
{ "version": "5.0.11", "resolved": "https://registry.npmjs.org/zustand/-/zustand-5.0.11.tgz", diff --git a/web/package.json b/web/package.json index f3bd208c..1721ebd0 100644 --- a/web/package.json +++ b/web/package.json @@ -22,6 +22,16 @@ "@mantine/form": "^8.3.15", "@mantine/hooks": "^8.3.15", "@mantine/notifications": "^8.3.15", + "@opentelemetry/api": "^1.9.0", + "@opentelemetry/context-zone": "^2.1.0", + "@opentelemetry/exporter-trace-otlp-http": "^0.207.0", + "@opentelemetry/instrumentation-document-load": "^0.51.0", + "@opentelemetry/instrumentation-fetch": "^0.207.0", + "@opentelemetry/instrumentation-user-interaction": "^0.51.0", + "@opentelemetry/resources": "^2.1.0", + "@opentelemetry/sdk-trace-base": "^2.1.0", + "@opentelemetry/sdk-trace-web": "^2.1.0", + "@opentelemetry/semantic-conventions": "^1.38.0", "@tabler/icons-react": "^3.37.1", "@tanstack/react-query": "^5.90.21", "@tanstack/react-router": "^1.162.1", diff --git a/web/src/lib/observability/index.ts b/web/src/lib/observability/index.ts new file mode 100644 index 00000000..e74cc914 --- /dev/null +++ b/web/src/lib/observability/index.ts @@ -0,0 +1,60 @@ +// Lightweight entry point: ask the server whether RUM is enabled, then +// dynamically import the SDK bundle only if we need it. The full SDK +// pulls in ~120 KB of JS (gzipped) and we do not want that cost on every +// page load when observability is off (the default). + +const CONFIG_URL = "/api/v1/observability/config"; + +export interface BrowserObservabilityConfig { + enabled: boolean; + serviceName: string; + proxyPath: string; + sampleRatio: number; +} + +let initPromise: Promise<void> | null = null; + +/** + * Fetch the server-side bootstrap config and, if RUM is enabled, lazily + * import and start the OTel web SDK. Safe to call multiple times — only + * the first invocation actually does work. + * + * Failures are logged and swallowed: observability must never break the + * SPA.
If the server is unreachable or the user is not yet authenticated, + * we just leave the SDK uninitialized and the app keeps working. + */ +export function initObservability(): Promise<void> { + if (initPromise) { + return initPromise; + } + initPromise = (async () => { + let config: BrowserObservabilityConfig | null = null; + try { + const res = await fetch(CONFIG_URL, { + credentials: "include", + headers: { Accept: "application/json" }, + }); + if (!res.ok) { + return; + } + config = (await res.json()) as BrowserObservabilityConfig; + } catch { + // Network error, server not reachable, etc. Stay silent. + return; + } + + if (!config?.enabled) { + return; + } + + try { + const { startTracer } = await import("./tracer"); + startTracer(config); + } catch (err) { + // SDK import failed — possibly a code split error. Log to console + // for debugging; do not surface to the user. + console.warn("[observability] failed to start OTel web SDK", err); + } + })(); + return initPromise; +} diff --git a/web/src/lib/observability/tracer.ts b/web/src/lib/observability/tracer.ts new file mode 100644 index 00000000..4c511217 --- /dev/null +++ b/web/src/lib/observability/tracer.ts @@ -0,0 +1,119 @@ +// Heavyweight bootstrap for the OTel web SDK. Only imported when the +// server config flag turns RUM on — see ../observability/index.ts for +// the gated entry point.
+ +import { ZoneContextManager } from "@opentelemetry/context-zone"; +import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http"; +import { registerInstrumentations } from "@opentelemetry/instrumentation"; +import { DocumentLoadInstrumentation } from "@opentelemetry/instrumentation-document-load"; +import { FetchInstrumentation } from "@opentelemetry/instrumentation-fetch"; +import { UserInteractionInstrumentation } from "@opentelemetry/instrumentation-user-interaction"; +import { resourceFromAttributes } from "@opentelemetry/resources"; +import { + BatchSpanProcessor, + ParentBasedSampler, + TraceIdRatioBasedSampler, +} from "@opentelemetry/sdk-trace-base"; +import { WebTracerProvider } from "@opentelemetry/sdk-trace-web"; +import { + ATTR_SERVICE_NAME, + ATTR_SERVICE_VERSION, +} from "@opentelemetry/semantic-conventions"; +import type { BrowserObservabilityConfig } from "."; + +const APP_VERSION = (import.meta.env.PACKAGE_VERSION as string) || "unknown"; + +let started = false; + +/** + * Register the OTel web tracer provider with the document-load, fetch, + * and user-interaction instrumentations. Idempotent; second + later + * calls are no-ops. + */ +export function startTracer(config: BrowserObservabilityConfig): void { + if (started) { + return; + } + started = true; + + const tracesUrl = `${trimTrailingSlash(config.proxyPath)}/v1/traces`; + + const provider = new WebTracerProvider({ + resource: resourceFromAttributes({ + [ATTR_SERVICE_NAME]: config.serviceName || "codex-web", + [ATTR_SERVICE_VERSION]: APP_VERSION, + }), + sampler: new ParentBasedSampler({ + root: new TraceIdRatioBasedSampler(clampRatio(config.sampleRatio)), + }), + spanProcessors: [ + new BatchSpanProcessor( + new OTLPTraceExporter({ + url: tracesUrl, + // The proxy is same-origin; cookies / bearer headers go along + // for free. We deliberately do NOT set custom Authorization + // headers here — the server proxy adds the upstream auth. 
+ }), + { + // Modest defaults: flush every ~5s or 512 spans, whichever first. + maxExportBatchSize: 512, + maxQueueSize: 2048, + scheduledDelayMillis: 5000, + }, + ), + ], + }); + + provider.register({ + // ZoneContextManager preserves the active span across async + // callbacks (setTimeout, fetch promises, etc.) on browsers without + // AsyncContext support. + contextManager: new ZoneContextManager(), + }); + + registerInstrumentations({ + instrumentations: [ + new DocumentLoadInstrumentation(), + new FetchInstrumentation({ + // Only inject traceparent on same-origin (Codex API) requests. + // We don't want to leak trace context to third-party CDNs. + propagateTraceHeaderCorsUrls: [ + new RegExp(`^${escapeRegExp(window.location.origin)}/`), + ], + }), + // Default event set is hover-heavy. Restrict to clicks + form + // submits so the trace volume stays sane on busy pages. + new UserInteractionInstrumentation({ + eventNames: ["click", "submit"], + }), + ], + }); + + // Flush on the tab going away. The OTel BatchSpanProcessor wires its + // own `pagehide` / `visibilitychange` listeners internally, but we + // also kick `forceFlush` to be explicit during a hot reload. + window.addEventListener("pagehide", () => { + void provider.forceFlush(); + }); +} + +function trimTrailingSlash(s: string): string { + return s.endsWith("/") ?
s.slice(0, -1) : s; +} + +function clampRatio(r: number): number { + if (!Number.isFinite(r)) { + return 0; + } + if (r < 0) { + return 0; + } + if (r > 1) { + return 1; + } + return r; +} + +function escapeRegExp(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} diff --git a/web/src/main.tsx b/web/src/main.tsx index ac6dbe6b..218ca2dd 100644 --- a/web/src/main.tsx +++ b/web/src/main.tsx @@ -7,6 +7,7 @@ import App from "./App.tsx"; import { InstallPrompt, PwaUpdatePrompt } from "./components/pwa"; import { ThemeSync } from "./components/ThemeSync.tsx"; import { MotionProvider } from "./lib/motion/MotionProvider"; +import { initObservability } from "./lib/observability"; import { installOutboxDrainListeners } from "./lib/offline/outbox"; import { cssVariablesResolver, theme } from "./theme"; @@ -57,6 +58,12 @@ async function enableMocking() { // no-op if there is nothing queued, and double-install is guarded. installOutboxDrainListeners(); +// Kick off the OTel web SDK bootstrap. The call returns immediately; +// the network round-trip + SDK code-split happen in the background and +// never block render. If the server says RUM is disabled we never load +// the SDK bundle in the first place. +void initObservability(); + // Start the application after mocking is ready enableMocking().then(() => { const rootElement = document.getElementById("root"); diff --git a/web/src/types/api.generated.ts b/web/src/types/api.generated.ts index 272e5183..0ad42142 100644 --- a/web/src/types/api.generated.ts +++ b/web/src/types/api.generated.ts @@ -2531,6 +2531,62 @@ export interface paths { patch?: never; trace?: never; }; + "/api/v1/observability/config": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** + * Return the configuration the browser SDK needs to bootstrap itself. 
+ * @description Authenticated to keep the response (which leaks the sample ratio / + * proxy path / service name) inside the existing trust boundary; + * everything sensitive (endpoint, headers) stays server-side. + */ + get: operations["get_browser_config"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/v1/observability/otlp/v1/metrics": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Forward a batched OTLP/HTTP metrics payload to the configured upstream. */ + post: operations["proxy_metrics"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/v1/observability/otlp/v1/traces": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Forward a batched OTLP/HTTP traces payload to the configured upstream. */ + post: operations["proxy_traces"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/v1/plugins/actions": { parameters: { query?: never; @@ -8649,6 +8705,38 @@ export interface components { /** @description Parent directory path (None if at root) */ parentPath?: string | null; }; + /** + * @description Browser RUM bootstrap configuration returned by + * `GET /api/v1/observability/config`. + */ + BrowserObservabilityConfigDto: { + /** + * @description Whether the browser SDK should initialize. False means the SDK + * bootstrap is a no-op even if the script is loaded. + */ + enabled: boolean; + /** + * @description Same-origin path prefix on the Codex server where the browser SDK + * should POST OTLP batches. The SDK appends `/v1/traces` and + * `/v1/metrics` to this base. + * @example /api/v1/observability/otlp + */ + proxyPath: string; + /** + * Format: double + * @description Parent-based sampling ratio applied client-side. 
Browsers are noisy; + * default low. + * @example 0.1 + */ + sampleRatio: number; + /** + * @description `service.name` resource attribute the browser SDK should set on + * every span (matches the backend service name unless the operator + * overrode it specifically for the browser). + * @example codex-web + */ + serviceName: string; + }; /** @description Request to perform bulk analyze operations on multiple books */ BulkAnalyzeBooksRequest: { /** @@ -24639,6 +24727,135 @@ export interface operations { }; }; }; + get_browser_config: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Browser SDK bootstrap config */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["BrowserObservabilityConfigDto"]; + }; + }; + /** @description Unauthorized */ + 401: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + }; + }; + proxy_metrics: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** @description OTLP/HTTP metrics payload (protobuf or JSON) */ + requestBody?: { + content: { + "application/x-protobuf": unknown; + }; + }; + responses: { + /** @description Forwarded successfully */ + 200: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Payload too large */ + 400: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Unauthorized */ + 401: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Upstream collector error */ + 502: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Browser observability disabled */ + 503: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + }; + }; + proxy_traces: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + 
/** @description OTLP/HTTP traces payload (protobuf or JSON) */ + requestBody?: { + content: { + "application/x-protobuf": unknown; + }; + }; + responses: { + /** @description Forwarded successfully */ + 200: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Payload too large */ + 400: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Unauthorized */ + 401: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Upstream collector error */ + 502: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Browser observability disabled */ + 503: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + }; + }; get_plugin_actions: { parameters: { query: {