From 5fb51ac6f2513d292fc2cc59ebed361bc1739675 Mon Sep 17 00:00:00 2001 From: Vincent Caux-Brisebois Date: Sat, 11 Apr 2026 00:16:07 +0000 Subject: [PATCH 1/5] GPU support design and implementation plan Signed-off-by: Vincent Caux-Brisebois --- vm-gpu-passthrough-implementation.md | 246 +++++++++++++++++++++++++++ vm-gpu-passthrough.md | 225 ++++++++++++++++++++++++ 2 files changed, 471 insertions(+) create mode 100644 vm-gpu-passthrough-implementation.md create mode 100644 vm-gpu-passthrough.md diff --git a/vm-gpu-passthrough-implementation.md b/vm-gpu-passthrough-implementation.md new file mode 100644 index 000000000..77ea95971 --- /dev/null +++ b/vm-gpu-passthrough-implementation.md @@ -0,0 +1,246 @@ +# VM GPU passthrough: implementation plan + +> Design: [vm-gpu-passthrough.md](vm-gpu-passthrough.md) + +## Phase 0 -- Specification and failing test (current) + +- [x] Design doc. +- [x] Phase 0.5 VMM decision (cloud-hypervisor selected). +- [ ] **`gpu_passthrough` module** integrated into `crates/openshell-vm/src/`: + - `probe_host_nvidia_vfio_readiness()` -- Linux sysfs scan; non-Linux returns `UnsupportedPlatform`. + - `nvidia_gpu_available_for_vm_passthrough()` -- hard-coded `false` until end-to-end passthrough works. + - **Note:** `gpu_passthrough.rs` and `gpu_passthrough_implementation.rs` exist as untracked files at the repo root but are not wired into the crate module tree (`lib.rs` does not `mod gpu_passthrough;`). Move them into `crates/openshell-vm/src/`, add `pub mod gpu_passthrough;`, and ensure `cargo test -p openshell-vm` compiles them. +- [ ] **Failing integration test** `tests/gpu_passthrough_implementation.rs` -- documents the target and fails until implementation is finished. + +**Running the red test:** `cargo test -p openshell-vm --test gpu_passthrough_implementation` + +**Note:** `mise run test` uses `cargo test --workspace --exclude openshell-vm`, so default CI stays green. 
+
+---
+
+## Phase 1 -- VMM backend abstraction and cloud-hypervisor integration
+
+### 1a. Backend trait and libkrun extraction
+
+Refactor only -- no behavior changes. Existing tests must still pass.
+
+- [ ] Create `src/backend.rs` with the `VmBackend` trait:
+
+```rust
+pub trait VmBackend {
+    fn launch(&self, config: &VmLaunchConfig) -> Result;
+}
+
+pub struct VmLaunchConfig {
+    pub base: VmConfig,
+    pub vfio_device: Option<String>,
+}
+```
+
+- [ ] Create `src/backend/libkrun.rs` -- move into `LibkrunBackend`:
+  - `VmContext` struct and all methods (current `lib.rs` lines 584-811)
+  - gvproxy setup block inside `NetBackend::Gvproxy` (lines 1337-1466)
+  - fork + waitpid + signal forwarding (lines 1525-1710)
+  - bootstrap block (lines 1648-1663)
+- [ ] Extract shared gvproxy startup into a helper used by both backends.
+- [ ] Update `launch()` to dispatch:
+
+```rust
+pub fn launch(config: &VmLaunchConfig) -> Result {
+    // ... existing pre-launch checks ...
+
+    if config.vfio_device.is_some() {
+        #[cfg(not(target_os = "linux"))]
+        return Err(VmError::HostSetup(
+            "GPU passthrough requires Linux with KVM and IOMMU".into(),
+        ));
+
+        #[cfg(target_os = "linux")]
+        {
+            let backend = CloudHypervisorBackend::new()?;
+            return backend.launch(config);
+        }
+    }
+
+    LibkrunBackend.launch(config)
+}
+```
+
+- [ ] `ffi.rs` stays as-is -- only used by `LibkrunBackend`.
+
+### 1b. cloud-hypervisor backend
+
+- [ ] Create `src/backend/cloud_hypervisor.rs` implementing `VmBackend`.
+- [ ] REST API client -- HTTP/1.1 over Unix socket, ~5 endpoints:
+
+```
+PUT /api/v1/vm.create    -- configure VM
+PUT /api/v1/vm.boot      -- start VM
+PUT /api/v1/vm.shutdown  -- graceful stop
+GET /api/v1/vm.info      -- status check
+PUT /api/v1/vm.delete    -- cleanup
+```
+
+Use `hyper` over Unix socket (already in dependency tree) or raw HTTP. Avoid adding `cloud-hypervisor-client` crate for ~5 calls. 
+ +- [ ] VM create payload mapping from `VmConfig`: + +```json +{ + "cpus": { "boot_vcpus": 4 }, + "memory": { "size": 8589934592 }, + "payload": { + "kernel": "/path/to/vmlinux", + "cmdline": "console=hvc0 root=virtiofs:rootfs rw init=/srv/openshell-vm-init.sh" + }, + "fs": [ + { "tag": "rootfs", "socket": "/path/to/virtiofsd.sock", "num_queues": 1, "queue_size": 1024 } + ], + "disks": [ + { "path": "/path/to/state.raw", "readonly": false } + ], + "net": [ + { "socket": "/path/to/gvproxy-qemu.sock", "mac": "5a:94:ef:e4:0c:ee" } + ], + "vsock": { + "cid": 3, + "socket": "/path/to/vsock.sock" + }, + "devices": [ + { "path": "/sys/bus/pci/devices/0000:41:00.0/" } + ], + "serial": { "mode": "File", "file": "/path/to/console.log" }, + "console": { "mode": "Off" } +} +``` + +- [ ] Process lifecycle: + 1. Start `cloud-hypervisor --api-socket /tmp/ovm-chv-{id}.sock` as subprocess + 2. Wait for API socket to appear (exponential backoff, same pattern as gvproxy) + 3. `PUT vm.create` with config payload + 4. `PUT vm.boot` + 5. Parent waits on subprocess + 6. Signal forwarding: SIGINT/SIGTERM -> `PUT vm.shutdown` + subprocess SIGTERM + 7. Cleanup: remove API socket + +### 1c. Kernel extraction and build pipeline + +- [ ] Modify `build-libkrun.sh`: after building libkrunfw, copy `vmlinux` from the kernel build tree to `target/libkrun-build/vmlinux` before cleanup. +- [ ] Add to `openshell.kconfig` (harmless for non-GPU boots): + +``` +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DRM=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +``` + +- [ ] Add to `pins.env`: + +```bash +CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" +``` + +- [ ] Create `build-cloud-hypervisor.sh` (or download step): download pre-built static binary from cloud-hypervisor GitHub releases for the target architecture. +- [ ] Update `package-vm-runtime.sh`: include `cloud-hypervisor`, `vmlinux`, and `virtiofsd` in the runtime tarball for Linux builds. 
- [ ] `validate_runtime_dir()` in `lib.rs` must **not** require GPU binaries. Only `CloudHypervisorBackend::new()` validates their presence.
+
+### 1d. vsock exec agent compatibility
+
+libkrun uses per-port vsock bridging (`krun_add_vsock_port2`): each guest vsock port maps to a host Unix socket. cloud-hypervisor uses standard vhost-vsock with a single socket and CID-based addressing.
+
+- [ ] Update `exec.rs` to support both connection modes:
+  - **libkrun**: connect to `vm_exec_socket_path()` (existing)
+  - **cloud-hypervisor**: connect via `AF_VSOCK` (CID 3, port 10777) or bridge with `socat`
+- [ ] Test exec agent communication (cat, env) over both backends.
+
+### 1e. Plumb `--gpu` flag
+
+- [ ] Add fields to `VmConfig`:
+
+```rust
+pub vfio_device: Option<String>,
+pub gpu_enabled: bool,
+```
+
+- [ ] When `gpu_enabled` is set, add `GPU_ENABLED=true` to guest environment.
+- [ ] Wire `--gpu` / `--gpu <addr>` from the CLI to `VmConfig`.
+
+---
+
+## Phase 1.5 -- Guest rootfs: NVIDIA driver and toolkit
+
+- [ ] **NVIDIA driver in rootfs.** Options:
+  - **Separate GPU rootfs artifact**: build `rootfs-gpu.tar.zst` alongside `rootfs.tar.zst`. Launcher selects GPU variant when `--gpu` is passed.
+  - **Bake into rootfs**: use `nvcr.io/nvidia/base/ubuntu` base image from `pins.env`. Heavier (~2-3 GB) but self-contained.
+  - **Runtime injection via virtio-fs**: stage driver packages on host, mount into guest. Lighter but more complex.
+- [ ] **Driver version compatibility**: document minimum driver version and GPU compute capability.
+- [ ] **NVIDIA container toolkit**: install `nvidia-container-toolkit` so `nvidia-container-runtime` is available to containerd/k3s.
+- [ ] **Smoke test**: `nvidia-smi` runs inside the guest after rootfs build. 
+
+---
+
+## Phase 2 -- Guest appliance parity
+
+- [ ] **Init script changes** (`openshell-vm-init.sh`): when `GPU_ENABLED=true`:
+  - Load NVIDIA kernel modules (`nvidia`, `nvidia_uvm`, `nvidia_modeset`)
+  - Run `nvidia-smi` -- fail fast if device not visible
+  - Copy `gpu-manifests/*.yaml` into k3s auto-deploy directory (mirrors `cluster-entrypoint.sh` ~line 384)
+  - Verify `nvidia-container-runtime` is registered with containerd
+- [ ] **End-to-end validation**: sandbox pod requesting `nvidia.com/gpu: 1` gets scheduled and can run `nvidia-smi` inside the pod.
+
+---
+
+## Phase 3 -- CLI / UX
+
+- [ ] Mirror `openshell gateway start --gpu` semantics for VM backend.
+- [ ] Support `--gpu <addr>` for multi-GPU hosts.
+- [ ] Document host preparation (IOMMU, `vfio-pci`, unbinding `nvidia`).
+- [ ] Document single-GPU caveats (host display loss, headless operation).
+
+---
+
+## Phase 4 -- CI
+
+- [ ] GPU E2E job: optional runner with `OPENSHELL_VM_GPU_E2E=1` and a VFIO-bound GPU. Tighten `nvidia_gpu_available_for_vm_passthrough()` to require `VfioBoundReady` + guest smoke.
+- [ ] Non-GPU cloud-hypervisor CI test: boot and exec agent check without VFIO. Catches backend regressions without GPU hardware.
+
+---
+
+## Test evolution
+
+Today `nvidia_gpu_available_for_vm_passthrough()` returns `false`. When complete, it should compose:
+
+1. `probe_host_nvidia_vfio_readiness()` returns `VfioBoundReady` (clean IOMMU group)
+2. cloud-hypervisor binary present in runtime bundle
+3. `/dev/vfio/vfio` and `/dev/vfio/{group}` accessible
+4. Guest rootfs includes NVIDIA driver and toolkit
+
+Options for the final gate:
+- `true` only when CI env var is set and hardware verified
+- Replace boolean with full integration check
+- Remove `#[ignore]` and run only on GPU runners
+
+Pick one in the final PR so `mise run test` policy stays intentional. 
+ +--- + +## File change index + +| File | Change | +|---|---| +| `crates/openshell-vm/src/lib.rs` | Extract `launch()` internals into backend dispatch; add `vfio_device` / `gpu_enabled` to `VmConfig` | +| `crates/openshell-vm/src/backend.rs` (new) | `VmBackend` trait, `VmLaunchConfig` | +| `crates/openshell-vm/src/backend/libkrun.rs` (new) | `LibkrunBackend` -- moved from `lib.rs` (mechanical refactor) | +| `crates/openshell-vm/src/backend/cloud_hypervisor.rs` (new) | `CloudHypervisorBackend` -- REST API client, process lifecycle, VFIO assignment | +| `crates/openshell-vm/src/ffi.rs` | No changes (used only by `LibkrunBackend`) | +| `crates/openshell-vm/src/exec.rs` | Support both libkrun Unix socket and vhost-vsock connection modes | +| `crates/openshell-vm/src/gpu_passthrough.rs` (move from repo root) | `probe_host_nvidia_vfio_readiness()` with IOMMU group check | +| `crates/openshell-vm/runtime/kernel/openshell.kconfig` | Add `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM`, `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | +| `crates/openshell-vm/pins.env` | Add `CLOUD_HYPERVISOR_VERSION`, `VIRTIOFSD_VERSION` | +| `crates/openshell-vm/scripts/openshell-vm-init.sh` | GPU-gated block: module loading, `nvidia-smi` check, manifest copy | +| `tasks/scripts/vm/build-libkrun.sh` | Preserve `vmlinux` in `target/libkrun-build/` | +| `tasks/scripts/vm/build-cloud-hypervisor.sh` (new) | Download or build cloud-hypervisor static binary | +| `tasks/scripts/vm/package-vm-runtime.sh` | Include `cloud-hypervisor`, `vmlinux`, `virtiofsd` for Linux builds | diff --git a/vm-gpu-passthrough.md b/vm-gpu-passthrough.md new file mode 100644 index 000000000..02262ddd9 --- /dev/null +++ b/vm-gpu-passthrough.md @@ -0,0 +1,225 @@ +# VM GPU passthrough: design + +> Status: **Design complete.** Implementation tracked in [vm-gpu-passthrough-implementation.md](vm-gpu-passthrough-implementation.md). 
+ +## Goal + +Match the **Docker cluster GPU path** (`openshell gateway start --gpu`): the k3s node inside the microVM sees a **real NVIDIA GPU** so sandbox pods can request `nvidia.com/gpu`, the NVIDIA device plugin, and `nvidia` RuntimeClass behave identically to the Docker path. + +This is **PCI passthrough** (VFIO) of the physical GPU into the guest -- not virtio-gpu / Venus / virgl. + +## Decision record + +### Venus / virgl rejected + +libkrun's virtio-gpu Venus path forwards **Vulkan** API calls, not NVIDIA's proprietary CUDA stack. The guest never loads the NVIDIA kernel driver and has no `/dev/nvidia*` device nodes. This rules out `nvidia-smi`, CUDA workloads, the k8s device plugin, and the NVIDIA container runtime -- all of which the Docker `--gpu` path depends on. + +| Requirement | Venus | VFIO passthrough | +|---|---|---| +| `nvidia-smi` in guest | No (Vulkan only) | Yes (bare-metal driver) | +| CUDA workloads | No | Yes | +| `nvidia.com/gpu` k8s resource | No | Yes | +| NVIDIA container runtime | No | Yes | +| Performance | ~75-80% (forwarding overhead) | ~100% (bare-metal) | +| macOS support | Yes (MoltenVK) | No (Linux IOMMU only) | + +### libkrun VFIO rejected + +libkrun upstream closed the device passthrough request ([containers/libkrun#32](https://github.com/containers/libkrun/issues/32), March 2023). VFIO would require PCI bus emulation and ACPI tables -- outside libkrun's MMIO-only virtio design. No known forks add this. 
+ +### VMM selection: dual backend + +| VMM | VFIO | Size | vsock | Rust | macOS | Decision | +|-----|------|------|-------|------|-------|----------| +| **libkrun** v1.17.4 | No | ~5 MB | Yes | Yes | Yes (HVF) | Keep for non-GPU | +| **cloud-hypervisor** | Yes | ~10 MB | Yes | Yes | No | **GPU backend** | +| QEMU | Yes | ~50+ MB | Yes | No (C) | Limited | Rejected: size, C | +| crosvm | Yes | ~15 MB | Yes | Yes | No | Rejected: heavier | +| libkrun fork | Needs patches | ~5 MB | Yes | Yes | Possible | Rejected: maintenance | + +**cloud-hypervisor** is the GPU-only VMM backend. libkrun remains the default for all non-GPU workloads and is the only backend on macOS. + +--- + +## Architecture + +``` + openshell gateway start + | + ┌───────┴───────┐ + │ --gpu flag? │ + └───────┬───────┘ + no / \ yes (Linux only) + / \ + ┌──────┴──────┐ ┌────┴─────────────┐ + │ LibkrunBack │ │ CloudHvBackend │ + │ end │ │ │ + │ │ │ REST API over │ + │ ffi.rs │ │ Unix socket │ + │ (dlopen) │ │ │ + └──────┬──────┘ └────┬─────────────┘ + │ │ + ┌──────┴──────┐ ┌────┴─────────────┐ + │ libkrun VM │ │ cloud-hypervisor │ + │ │ │ VM │ + │ virtio-fs │ │ virtio-fs │ + │ virtio-net │ │ virtio-net │ + │ vsock │ │ vsock │ + │ virtio-blk │ │ virtio-blk │ + │ │ │ VFIO PCI (GPU) │ + └─────────────┘ └──────────────────┘ +``` + +### Shared across both backends + +- **Guest rootfs**: Same directory tree under `~/.local/share/openshell/openshell-vm/{version}/instances//rootfs/`. +- **Init script**: `/srv/openshell-vm-init.sh` runs as PID 1. GPU behavior is gated on `GPU_ENABLED=true`. +- **Exec agent**: `openshell-vm-exec-agent.py` on vsock port 10777. +- **gvproxy**: DNS, DHCP, and port forwarding. Both backends connect to gvproxy's QEMU-mode Unix socket. +- **Host bootstrap**: `bootstrap_gateway()` fetches PKI over the exec agent and stores mTLS creds. 
### Per-backend differences
+
+| Concern | libkrun | cloud-hypervisor |
+|---|---|---|
+| **Process model** | Library via `dlopen`; `fork()` + `krun_start_enter()` | Subprocess; REST API over Unix socket |
+| **Boot model** | `krun_set_root(dir)` + `krun_set_exec(init)` -- kernel in libkrunfw | `--kernel vmlinux` + virtio-fs via virtiofsd -- explicit kernel binary |
+| **Networking** | `krun_add_net_unixstream` (Linux) / `krun_add_net_unixgram` (macOS) | `--net socket=/path/to/gvproxy.sock` |
+| **vsock** | `krun_add_vsock_port2(port, socket)` per port | `--vsock cid=3,socket=/path/to/vsock.sock` (vhost-vsock) |
+| **Block storage** | `krun_add_disk3(id, path, format, ...)` | `--disk path=/path/to/state.raw` |
+| **GPU** | N/A | `--device path=/sys/bus/pci/devices/ADDR/` (VFIO) |
+| **Console** | `krun_set_console_output(path)` | `--serial file=/path` |
+| **Lifecycle** | `krun_free_ctx` in `Drop`; `waitpid` on child | REST: `vm.create` -> `vm.boot` -> `vm.shutdown`; wait on subprocess |
+| **macOS** | Yes (HVF) | No (KVM only) |
+
+---
+
+## Host requirements (GPU path)
+
+### Host kernel
+
+- `CONFIG_VFIO`, `CONFIG_VFIO_PCI`, `CONFIG_VFIO_IOMMU_TYPE1`
+- IOMMU enabled: BIOS (VT-d / AMD-Vi) + kernel params (`intel_iommu=on iommu=pt` or AMD equivalent)
+
+### Host preparation
+
+1. Unbind GPU from `nvidia` driver: `echo <addr> > /sys/bus/pci/drivers/nvidia/unbind`
+2. Bind to `vfio-pci`: `echo <vendor> <device> > /sys/bus/pci/drivers/vfio-pci/new_id`
+3. Verify: `readlink /sys/bus/pci/devices/<addr>/driver` points to `vfio-pci`
+4. 
Ensure `/dev/vfio/vfio` and `/dev/vfio/{group}` are accessible
+
+### Host preflight state machine
+
+The stack classifies each NVIDIA PCI device into one of these states:
+
+| State | Meaning | Action |
+|---|---|---|
+| `NoNvidiaDevice` | No NVIDIA PCI device found | Error: no GPU to pass through |
+| `BoundToNvidia` | Device on `nvidia` driver | Not available until unbound and rebound to `vfio-pci` |
+| `VfioBoundDirtyGroup` | On `vfio-pci` but IOMMU group has non-VFIO peers | Report which peers need unbinding |
+| `VfioBoundReady` | On `vfio-pci`, IOMMU group clean | Ready for passthrough |
+
+`probe_host_nvidia_vfio_readiness()` scans sysfs for vendor ID `0x10de`, checks the driver symlink, and inspects `/sys/bus/pci/devices/<addr>/iommu_group/devices/` for group cleanliness. Returns per-device readiness for multi-GPU hosts.
+
+---
+
+## Guest requirements (GPU path)
+
+### Guest kernel (`openshell.kconfig` additions)
+
+| Config | Purpose |
+|---|---|
+| `CONFIG_PCI`, `CONFIG_PCI_MSI` | PCIe device visibility and interrupts |
+| `CONFIG_DRM` | GPU device node creation (`/dev/dri/*`) |
+| `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | NVIDIA proprietary driver is a loadable module |
+| `CONFIG_FB` / `CONFIG_FRAMEBUFFER_CONSOLE` | Optional: if GPU is the only display device |
+
+Do **not** enable `CONFIG_VFIO` in the guest (no nested passthrough).
+
+### Guest rootfs (GPU variant)
+
+The GPU rootfs extends the base rootfs with:
+
+- **NVIDIA kernel driver** matching the target GPU hardware generation
+- **NVIDIA container toolkit** (`nvidia-container-toolkit`) so `nvidia-container-runtime` is available to containerd/k3s
+- **`nvidia-smi`** for health checks
+
+Distribution: separate `rootfs-gpu.tar.zst` artifact alongside the base `rootfs.tar.zst`. The launcher selects the GPU variant when `--gpu` is passed.
+
+### Guest init (`openshell-vm-init.sh`)
+
+When `GPU_ENABLED=true` is set in the environment:
+
+1. 
Load NVIDIA kernel modules (`nvidia`, `nvidia_uvm`, `nvidia_modeset`) +2. Run `nvidia-smi` -- fail fast with a clear error if the device is not visible +3. Copy `gpu-manifests/*.yaml` (NVIDIA device plugin HelmChart CR) into the k3s auto-deploy directory +4. Verify `nvidia-container-runtime` is registered with containerd + +When `GPU_ENABLED` is unset or false: no GPU paths execute (current behavior). + +--- + +## CLI interface + +### `--gpu` + +``` +openshell gateway start --gpu # Auto-select first VFIO-ready GPU +openshell gateway start --gpu 0000:41:00.0 # Select specific PCI address +``` + +Errors: +- No NVIDIA PCI device found +- GPU not bound to `vfio-pci` (with instructions to bind) +- IOMMU group not clean (lists non-VFIO peers) +- GPU passthrough not supported on macOS +- cloud-hypervisor binary not found in runtime bundle + +### Runtime bundle + +``` +~/.local/share/openshell/vm-runtime/{version}/ +├── libkrun.so # existing +├── libkrunfw.so.5 # existing +├── gvproxy # existing +├── provenance.json # existing +├── cloud-hypervisor # new (GPU path, ~10 MB, Linux only) +├── vmlinux # new (GPU path, ~15 MB, from libkrunfw build) +└── virtiofsd # new (GPU path, ~5 MB) +``` + +`cloud-hypervisor`, `vmlinux`, and `virtiofsd` are only required for `--gpu` launches. Non-GPU launches do not validate their presence. + +--- + +## Security model + +GPU passthrough grants the guest **full device access** -- the same trust model as passing a GPU into the Docker cluster container today. The guest can issue arbitrary PCIe transactions to the device. IOMMU protects host memory from DMA attacks by the device, but the guest has unrestricted control of the GPU itself. 
+ +--- + +## Constraints and limitations + +| Constraint | Impact | Mitigation | +|---|---|---| +| **Dual-backend maintenance** | Two VMM code paths for boot, networking, vsock, console | `VmBackend` trait limits blast radius; CI tests for both | +| **Linux-only GPU path** | macOS cannot use VFIO passthrough | macOS uses libkrun exclusively; GPU is out of scope for macOS | +| **NVIDIA FLR quirks** | Consumer GeForce may not reset on VM shutdown | Target data-center GPUs (A100, H100, L40) first; document supported list | +| **Single-GPU display loss** | Binding only GPU to `vfio-pci` removes host display | Document headless operation; recommend secondary GPU | +| **NVIDIA driver coupling** | Guest driver must match GPU generation | Pin driver version in rootfs; test against GPU matrix | +| **IOMMU group granularity** | Some boards group GPU with other devices | Recommend server hardware; document ACS override (unsupported) | +| **BAR size / MMIO** | Large-BAR GPUs need 64-bit MMIO support | Document BIOS settings (Above 4G Decoding, Resizable BAR) | +| **cloud-hypervisor NVIDIA issues** | Some driver failures reported upstream | Target data-center GPUs; pin cloud-hypervisor version | +| **GPU rootfs size** | NVIDIA driver + toolkit adds ~2-3 GB | Separate `rootfs-gpu.tar.zst` artifact | +| **Runtime bundle size** | cloud-hypervisor + vmlinux + virtiofsd add ~30 MB | Only in Linux GPU builds; separate tarball if needed | + +--- + +## Related documents + +- [Custom libkrun VM runtime](architecture/custom-vm-runtime.md) -- microVM layout, build pipeline, networking +- [Cluster bootstrap (Docker)](architecture/gateway-single-node.md) -- existing `--gpu` / `GPU_ENABLED` behavior +- [Implementation plan](vm-gpu-passthrough-implementation.md) -- phased work to build this +- [cloud-hypervisor VFIO docs](https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/vfio.md) -- upstream VFIO reference +- [cloud-hypervisor REST 
API](https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/api.md) -- programmatic VM management +- [rust-vmm/vfio](https://github.com/rust-vmm/vfio) -- VFIO Rust bindings used by cloud-hypervisor From a9491fd0a6b2feabc657c8658567fdf2e5937c0c Mon Sep 17 00:00:00 2001 From: Vincent Caux-Brisebois Date: Wed, 15 Apr 2026 17:52:01 +0000 Subject: [PATCH 2/5] feat(vm): add GPU passthrough with cloud-hypervisor backend and nvidia unbind hardening Signed-off-by: Vincent Caux-Brisebois --- .github/workflows/test-gpu.yml | 2 +- Cargo.lock | 2 + architecture/README.md | 2 + architecture/custom-vm-runtime.md | 163 +- architecture/vm-gpu-passthrough.md | 413 ++++ crates/openshell-cli/Cargo.toml | 1 + crates/openshell-cli/src/main.rs | 39 +- crates/openshell-cli/src/run.rs | 138 +- crates/openshell-vm/Cargo.toml | 3 + crates/openshell-vm/build.rs | 4 +- crates/openshell-vm/pins.env | 30 + .../runtime/kernel/openshell.kconfig | 30 + crates/openshell-vm/scripts/build-rootfs.sh | 150 +- .../scripts/gpu-manifests/README.md | 41 + .../gpu-manifests/nvidia-device-plugin.yaml | 47 + .../gpu-manifests/nvidia-runtime-class.yaml | 13 + .../openshell-vm/scripts/openshell-vm-init.sh | 105 +- .../src/backend/cloud_hypervisor.rs | 1476 +++++++++++++ crates/openshell-vm/src/backend/libkrun.rs | 469 ++++ crates/openshell-vm/src/backend/mod.rs | 208 ++ crates/openshell-vm/src/exec.rs | 37 +- crates/openshell-vm/src/gpu_passthrough.rs | 1959 +++++++++++++++++ crates/openshell-vm/src/lib.rs | 848 ++----- crates/openshell-vm/src/main.rs | 81 +- .../tests/gpu_passthrough_implementation.rs | 114 + crates/openshell-vm/tests/vm_boot_smoke.rs | 151 ++ tasks/scripts/vm/build-cloud-hypervisor.sh | 75 + tasks/scripts/vm/build-libkrun.sh | 12 + tasks/scripts/vm/download-kernel-runtime.sh | 4 +- tasks/scripts/vm/package-vm-runtime.sh | 7 + tasks/scripts/vm/sync-vm-rootfs.sh | 16 + vm-gpu-passthrough-implementation.md | 246 --- vm-gpu-passthrough.md | 225 -- 33 files changed, 5907 insertions(+), 
1204 deletions(-) create mode 100644 architecture/vm-gpu-passthrough.md create mode 100644 crates/openshell-vm/scripts/gpu-manifests/README.md create mode 100644 crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml create mode 100644 crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml create mode 100644 crates/openshell-vm/src/backend/cloud_hypervisor.rs create mode 100644 crates/openshell-vm/src/backend/libkrun.rs create mode 100644 crates/openshell-vm/src/backend/mod.rs create mode 100644 crates/openshell-vm/src/gpu_passthrough.rs create mode 100644 crates/openshell-vm/tests/gpu_passthrough_implementation.rs create mode 100644 crates/openshell-vm/tests/vm_boot_smoke.rs create mode 100755 tasks/scripts/vm/build-cloud-hypervisor.sh delete mode 100644 vm-gpu-passthrough-implementation.md delete mode 100644 vm-gpu-passthrough.md diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index df953b5d3..6dd98b1cd 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -22,7 +22,7 @@ jobs: - id: get_pr_info if: github.event_name == 'push' continue-on-error: true - uses: nv-gha-runners/get-pr-info@main + uses: nv-gha-runners/get-pr-info@090577647b8ddc4e06e809e264f7881650ecdccf - id: gate shell: bash diff --git a/Cargo.lock b/Cargo.lock index e4057f75c..d347ff86c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3028,6 +3028,7 @@ dependencies = [ "openshell-prover", "openshell-providers", "openshell-tui", + "openshell-vm", "owo-colors", "prost-types", "rcgen", @@ -3288,6 +3289,7 @@ dependencies = [ "serde", "serde_json", "tar", + "tempfile", "thiserror 2.0.18", "tokio", "tokio-rustls", diff --git a/architecture/README.md b/architecture/README.md index 570fce660..45457d37c 100644 --- a/architecture/README.md +++ b/architecture/README.md @@ -301,4 +301,6 @@ This opens an interactive SSH session into the sandbox, with all provider creden | [Inference Routing](inference-routing.md) | Transparent 
interception and sandbox-local routing of AI inference API calls to configured backends. | | [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. | | [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. | +| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / cloud-hypervisor), kernel configuration, and build pipeline. | +| [VM GPU Passthrough](vm-gpu-passthrough.md) | VFIO GPU passthrough for VMs: host preparation, safety checks, nvidia driver hardening, and troubleshooting. | | [TUI](tui.md) | Terminal user interface for sandbox interaction. | diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index ce4d0bf39..6dac41064 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,18 +1,31 @@ -# Custom libkrunfw VM Runtime +# Custom VM Runtime > Status: Experimental and work in progress (WIP). VM support is under active development and may change. ## Overview -The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a -lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel -is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. +The OpenShell gateway VM supports two hypervisor backends: -The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or -conntrack support. This is insufficient for Kubernetes pod networking. +- **libkrun** (default) — lightweight VMM using Apple Hypervisor.framework (macOS) or KVM + (Linux). The kernel is embedded inside `libkrunfw`. Uses virtio-MMIO device transport and + gvproxy for user-space networking. +- **cloud-hypervisor** — Linux-only KVM-based VMM used for GPU passthrough (VFIO). 
Uses + virtio-PCI device transport, TAP networking, and requires a separate `vmlinux` kernel and + `virtiofsd` for rootfs access. + +Backend selection is automatic: `--gpu` selects cloud-hypervisor, otherwise libkrun is used. +The `--backend` flag provides explicit control (`auto`, `libkrun`, `cloud-hypervisor`). + +When `--gpu` is passed, `openshell-vm` automatically binds an eligible GPU to `vfio-pci` +and restores it to the original driver on shutdown. See +[vm-gpu-passthrough.md](vm-gpu-passthrough.md) for the full lifecycle description. + +Both backends share the same guest kernel (built from a single `openshell.kconfig` fragment) +and rootfs. -The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to -the VM kernel, enabling standard Kubernetes networking. +The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or +conntrack support. This is insufficient for Kubernetes pod networking. The custom kconfig +adds bridge CNI, iptables/nftables, conntrack, and cloud-hypervisor compatibility. ## Architecture @@ -20,10 +33,11 @@ the VM kernel, enabling standard Kubernetes networking. 
graph TD subgraph Host["Host (macOS / Linux)"] BIN[openshell-vm binary] - EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"] + EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy · rootfs"] CACHE["~/.local/share/openshell/vm-runtime/{version}/"] PROV[Runtime provenance logging] GVP[gvproxy networking proxy] + CHV_BIN["cloud-hypervisor · virtiofsd · vmlinux\n(GPU runtime bundle)"] BIN --> EMB BIN -->|extracts to| CACHE @@ -44,8 +58,9 @@ graph TD INIT --> VAL --> CNI --> EXECA --> PKI --> K3S end - BIN -- "fork + krun_start_enter" --> INIT - GVP -- "virtio-net" --> Guest + BIN -- "libkrun: fork + krun_start_enter" --> INIT + BIN -- "CHV: cloud-hypervisor API + virtiofsd" --> INIT + GVP -- "virtio-net (libkrun only)" --> Guest ``` ## Embedded Runtime @@ -67,9 +82,23 @@ these to XDG cache directories with progress bars: └── ... ``` -This eliminates the need for separate bundles or downloads - a single ~120MB binary -provides everything needed to run the VM. Old cache versions are automatically -cleaned up when a new version is extracted. +When using cloud-hypervisor, an additional runtime bundle is required alongside the +binary: + +``` +target/debug/openshell-vm.runtime/ (or alongside the installed binary) +├── cloud-hypervisor # CHV binary +├── virtiofsd # virtio-fs daemon +└── vmlinux # extracted guest kernel +``` + +This bundle is built with `mise run vm:bundle-runtime` and is separate from the +embedded runtime because CHV and virtiofsd are Linux-only and not embedded in the +self-extracting binary. + +This eliminates the need for separate bundles or downloads for the default (libkrun) +path — a single ~120MB binary provides everything needed. Old cache versions are +automatically cleaned up when a new version is extracted. 
### Hybrid Approach @@ -86,6 +115,31 @@ mise run vm:rootfs # Full rootfs (~2GB, includes images) mise run vm:build # Rebuild binary with full rootfs ``` +## Backend Comparison + +| | libkrun (default) | cloud-hypervisor | +|---|---|---| +| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | +| Device transport | virtio-MMIO | virtio-PCI | +| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | +| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | +| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | +| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | +| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | +| GPU passthrough | Not supported | VFIO PCI passthrough | +| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | +| CLI flag | `--backend libkrun` | `--backend cloud-hypervisor` or `--gpu` | + +### Exec mode differences + +With libkrun, when `--exec ` is used, the command replaces the init process and +the VM exits when PID 1 exits. + +With cloud-hypervisor, the VM does not automatically exit when PID 1 terminates. A +wrapper init script is dynamically written to the guest rootfs that mounts necessary +filesystems, executes the user command, captures the exit code, and calls +`poweroff -f` to trigger an ACPI shutdown that cloud-hypervisor detects. + ## Network Profile The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and @@ -100,6 +154,26 @@ fast with an actionable error if they are missing. - Service VIPs: functional (ClusterIP, NodePort) - hostNetwork workarounds: not required +### Networking by backend + +- **libkrun**: Uses gvproxy for user-space virtio-net networking. No root privileges + needed. Port forwarding is handled via gvproxy configuration. +- **cloud-hypervisor**: Uses TAP networking (requires root or CAP_NET_ADMIN). 
When + `--net none` is passed, networking is disabled entirely (useful for `--exec` mode + tests). gvproxy is not used with cloud-hypervisor. + +## Guest Init Script + +The init script (`openshell-vm-init.sh`) runs as PID 1 in the guest. After mounting essential filesystems, it performs: + +1. **Kernel cmdline parsing** — exports environment variables passed via the kernel command line (`GPU_ENABLED`, `OPENSHELL_VM_STATE_DISK_DEVICE`, `VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). This runs after `/proc` is mounted so `/proc/cmdline` is available. + +2. **Cgroup v2 controller enablement** — enables `cpu`, `cpuset`, `memory`, `pids`, and `io` controllers in the root cgroup hierarchy (`cgroup.subtree_control`). k3s/kubelet requires these controllers; the `cpu` controller depends on `CONFIG_CGROUP_SCHED` in the kernel. + +3. **Networking** — detects `eth0` and attempts DHCP (via `udhcpc`). On failure, falls back to static IP configuration using `VM_NET_IP` and `VM_NET_GW` from the kernel cmdline (set by the CHV backend for TAP networking). DNS is configured from `VM_NET_DNS` if set, overriding any stale `/etc/resolv.conf` entries. + +4. **Capability validation** — verifies required kernel features (bridge networking, netfilter, cgroups) and fails fast with actionable errors if missing. 
+ ## Runtime Provenance At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: @@ -128,21 +202,35 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end + subgraph CHV["Linux CI (build-cloud-hypervisor.sh)"] + BUILD_CHV["Build cloud-hypervisor + virtiofsd"] + end + subgraph Output["target/libkrun-build/"] LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] + CHV_OUT["cloud-hypervisor + virtiofsd\n(Linux)"] + VMLINUX["vmlinux\n(extracted from libkrunfw)"] end KCONF --> BUILD_L BUILD_L --> LIB_SO + BUILD_L --> VMLINUX KCONF --> BUILD_M BUILD_M --> LIB_DY + BUILD_CHV --> CHV_OUT ``` +The `vmlinux` kernel is extracted from the libkrunfw build and reused by cloud-hypervisor. +Both backends boot the same kernel — the kconfig fragment includes drivers for both +virtio-MMIO (libkrun) and virtio-PCI (CHV) transports. + ## Kernel Config Fragment The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel: +libkrunfw kernel. A single kernel binary is shared by both libkrun and cloud-hypervisor — +backend-specific drivers coexist safely (the kernel probes whichever transport the +hypervisor provides). 
| Feature | Key Configs | Purpose | |---------|-------------|---------| @@ -158,11 +246,18 @@ libkrunfw kernel: | IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing | | IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) | | Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS | -| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits | +| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_CGROUP_CPUACCT`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS`, `CONFIG_CGROUP_FREEZER` | Container resource limits | +| Cgroup CPU | `CONFIG_CGROUP_SCHED`, `CONFIG_FAIR_GROUP_SCHED`, `CONFIG_CFS_BANDWIDTH` | cgroup v2 `cpu` controller for k3s/kubelet | | TUN/TAP | `CONFIG_TUN` | CNI plugin support | | Dummy interface | `CONFIG_DUMMY` | Fallback networking | | Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | | Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | +| PCI / GPU | `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM` | GPU passthrough via VFIO | +| Kernel modules | `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | Loading NVIDIA drivers in guest | +| virtio-PCI transport | `CONFIG_VIRTIO_PCI` | cloud-hypervisor device bus (libkrun uses MMIO) | +| Serial console | `CONFIG_SERIAL_8250`, `CONFIG_SERIAL_8250_CONSOLE` | cloud-hypervisor console (`ttyS0`) | +| ACPI | `CONFIG_ACPI` | cloud-hypervisor power management / clean shutdown | +| x2APIC | `CONFIG_X86_X2APIC` | Multi-vCPU support (CHV uses x2APIC MADT entries) | See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with inline comments explaining why each option is needed. 
@@ -189,13 +284,21 @@ The standalone `openshell-vm` binary supports `openshell-vm exec -- `openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style commands work the same way they would inside the VM shell. +### Vsock by backend + +- **libkrun**: Uses libkrun's built-in vsock port mapping, which transparently + bridges the guest vsock port to a host Unix socket. +- **cloud-hypervisor**: Uses a vsock exec bridge — a host-side process that + connects an AF_VSOCK socket to a Unix domain socket, providing the same + interface to the exec agent. + ## Build Commands ```bash # One-time setup: download pre-built runtime (~30s) mise run vm:setup -# Build and run +# Build and run (libkrun, default) mise run vm # Build embedded binary with base rootfs (~120MB, recommended) @@ -210,6 +313,13 @@ mise run vm:build # Rebuild binary FROM_SOURCE=1 mise run vm:setup # Build runtime from source mise run vm:build # Then build embedded binary +# Build cloud-hypervisor runtime bundle (Linux only) +mise run vm:bundle-runtime # Builds CHV + virtiofsd + extracts vmlinux + +# Run with cloud-hypervisor backend +openshell-vm --backend cloud-hypervisor # Requires runtime bundle +openshell-vm --gpu # Auto-selects CHV with GPU passthrough + # Wipe everything and start over mise run vm:clean ``` @@ -221,20 +331,23 @@ rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all -supported platforms. Runs on-demand or when the kernel config / pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), gvproxy, cloud-hypervisor, +and virtiofsd for all supported platforms. Runs on-demand or when the kernel config / +pinned versions change. 
| Platform | Runner | Build Method | |----------|--------|-------------| -| Linux ARM64 | `build-arm64` (self-hosted) | Native `build-libkrun.sh` | -| Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` | -| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` | +| Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | +| Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | +| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no CHV) | -Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and -provenance metadata. +Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, +and provenance metadata. Linux artifacts additionally include cloud-hypervisor, +virtiofsd, and the extracted `vmlinux` kernel. Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. +libkrunfw is always Linux regardless of host platform. cloud-hypervisor and virtiofsd +are Linux-only (macOS does not support VFIO/KVM passthrough). ### VM Binary (`release-vm-dev.yml`) diff --git a/architecture/vm-gpu-passthrough.md b/architecture/vm-gpu-passthrough.md new file mode 100644 index 000000000..c15fd668b --- /dev/null +++ b/architecture/vm-gpu-passthrough.md @@ -0,0 +1,413 @@ +# VM GPU Passthrough + +> Status: Experimental and work in progress (WIP). GPU passthrough for the VM backend is under active development. + +## Overview + +OpenShell's VM backend can pass a physical NVIDIA GPU into a microVM using VFIO (Virtual Function I/O). This gives the guest direct access to GPU hardware, enabling CUDA workloads and `nvidia-smi` inside sandboxes without virtualization overhead. + +GPU passthrough uses cloud-hypervisor (instead of the default libkrun backend) to attach a VFIO device to the VM. 
The guest sees a real PCI GPU device and loads standard NVIDIA drivers. + +## Architecture + +``` +Host │ Guest (microVM) +──────────────────────────────│─────────────────────────── + NVIDIA GPU (PCI BDF addr) │ nvidia driver + CUDA + ↕ bound to vfio-pci │ ↕ + /dev/vfio/ │ /dev/nvidia* + ↕ │ ↕ + cloud-hypervisor (VFIO) ────│→ PCI device visible + ↕ │ ↕ + TAP networking │ k3s + device plugin + virtiofsd (rootfs) │ ↕ + │ sandbox pods (nvidia.com/gpu) +``` + +### Backend selection + +| Flag | Backend | GPU attached? | +|------|---------|---------------| +| (none) | libkrun | No | +| `--gpu` | cloud-hypervisor | Yes (auto-detect and bind) | +| `--gpu 0000:41:00.0` | cloud-hypervisor | Yes (specific PCI device) | +| `--backend cloud-hypervisor` | cloud-hypervisor | No (force CHV without GPU) | + +Auto mode (`--backend auto`, the default) selects cloud-hypervisor when `--gpu` is used or a VFIO PCI address is configured. Otherwise libkrun is used. + +### Automatic GPU binding + +When `--gpu` is passed (with or without a specific PCI address), the launcher automatically prepares the GPU for VFIO passthrough: + +1. **Probe** — scans `/sys/bus/pci/devices` for NVIDIA devices (vendor `0x10de`). +2. **Safety checks** — for each candidate GPU, verifies it is safe to claim (see below). If any check fails, the launcher refuses to proceed and exits with an actionable error. +3. **Bind** — unbinds the selected GPU from the `nvidia` driver and binds it to `vfio-pci`. Also binds any IOMMU group peers to `vfio-pci` for group cleanliness. +4. **Launch** — starts cloud-hypervisor with the VFIO device attached and sets `GPU_ENABLED=true` in the guest kernel cmdline. +5. **Rebind on shutdown** — when the VM exits (clean shutdown, Ctrl+C, or crash), the launcher rebinds the GPU back to the `nvidia` driver and clears `driver_override`, restoring host GPU access. Cleanup is guaranteed by a `GpuBindGuard` RAII guard that calls restore on drop, covering normal exit, early return, and panic. 
Only `SIGKILL` (kill -9) bypasses the guard — see Troubleshooting below for manual recovery.
+
+When a specific PCI address is given (`--gpu 0000:41:00.0`), the launcher targets that exact device. When `--gpu` is used without an address (`auto` mode), the launcher selects the best available GPU using the multi-GPU selection strategy.
+
+### Safety checks
+
+All safety checks are hard failures — if any check fails, the launcher prints an error and exits without binding. There is no `--force` override.
+
+| Check | What it detects | Failure behavior |
+|-------|----------------|------------------|
+| **Display attached** | GPU drives an active DRM framebuffer or is the primary rendering device | Error: "GPU 0000:xx:xx.x has active display outputs — cannot passthrough without losing host display" |
+| **Active processes** | Processes holding `/dev/nvidia*` file descriptors (CUDA jobs, monitoring) | Error: "GPU 0000:xx:xx.x is in use by PID(s) — stop these processes first" |
+| **IOMMU enabled** | `/sys/kernel/iommu_groups/` exists and the GPU has a group assignment | Error: "IOMMU is not enabled — add intel_iommu=on or amd_iommu=on to kernel cmdline" |
+| **VFIO modules loaded** | `vfio-pci` and `vfio_iommu_type1` kernel modules are loaded | Error: "vfio-pci kernel module not loaded — run: sudo modprobe vfio-pci" |
+| **Permissions** | Write access to sysfs bind/unbind and `/dev/vfio/` | Error: "insufficient sysfs permissions — run as root" |
+
+### Multi-GPU selection (`--gpu` auto mode)
+
+On hosts with multiple NVIDIA GPUs, the launcher selects a GPU using this priority:
+
+1. **Already on vfio-pci** with a clean IOMMU group — use immediately (no rebind needed).
+2. **Idle (no processes, no display)** — preferred for binding.
+3. **Skip** GPUs with active displays or running processes.
+
+If no GPU passes all safety checks, the launcher fails with per-device status listing what blocked each GPU.
+
+## Host preparation
+
+The launcher handles GPU driver binding automatically. The host only needs IOMMU and VFIO kernel modules configured.
+
+### 1. Enable IOMMU
+
+IOMMU must be enabled in both BIOS/UEFI and the Linux kernel.
+
+**Intel systems:**
+
+```shell
+# Add to kernel command line (e.g. /etc/default/grub GRUB_CMDLINE_LINUX)
+intel_iommu=on iommu=pt
+```
+
+**AMD systems:**
+
+```shell
+# AMD IOMMU is usually enabled by default; verify or add:
+amd_iommu=on iommu=pt
+```
+
+After editing, run `update-grub` (or equivalent) and reboot. Verify IOMMU is active:
+
+```shell
+dmesg | grep -i iommu
+# Should show: "DMAR: IOMMU enabled" or "AMD-Vi: AMD IOMMUv2"
+```
+
+### 2. Load VFIO kernel modules
+
+```shell
+sudo modprobe vfio-pci
+sudo modprobe vfio_iommu_type1
+
+# Persist across reboots
+echo "vfio-pci" | sudo tee /etc/modules-load.d/vfio-pci.conf
+echo "vfio_iommu_type1" | sudo tee /etc/modules-load.d/vfio_iommu_type1.conf
+```
+
+### 3. Device permissions
+
+The launcher needs root to bind/unbind GPU drivers via sysfs; `CAP_NET_ADMIN` alone covers only the TAP networking setup:
+
+```shell
+# Option A: run as root (simplest)
+sudo openshell-vm --gpu
+
+# Option B: set udev rules for /dev/vfio/ access (still needs sysfs write via root)
+echo 'SUBSYSTEM=="vfio", OWNER="root", GROUP="kvm", MODE="0660"' | \
+  sudo tee /etc/udev/rules.d/99-vfio.rules
+sudo udevadm control --reload-rules
+sudo usermod -aG kvm $USER
+```
+
+### What the launcher does automatically
+
+When `--gpu` is passed, the launcher performs the following steps that previously required manual intervention:
+
+1. **Identifies NVIDIA GPUs** via sysfs (`/sys/bus/pci/devices/*/vendor`)
+2. **Runs safety checks** — display, active processes, IOMMU, VFIO modules (see Safety checks above)
+3. **Unbinds from nvidia** — writes to `/sys/bus/pci/devices/<BDF>/driver/unbind`
+4. **Sets driver override** — writes `vfio-pci` to `/sys/bus/pci/devices/<BDF>/driver_override`
+5. 
**Binds to vfio-pci** — writes to `/sys/bus/pci/drivers/vfio-pci/bind` +6. **Handles IOMMU group peers** — binds other devices in the same IOMMU group to `vfio-pci` +7. **On shutdown** — reverses all bindings, clears `driver_override`, rebinds to `nvidia` + +## Single-GPU caveats + +When the host has only one NVIDIA GPU: + +- **Display-attached GPUs are blocked.** The safety checks detect if the GPU drives an active display (DRM framebuffer). If so, the launcher refuses to bind it — this prevents accidentally killing the host desktop. On headless data center servers (the typical deployment), this check passes and the GPU is bound automatically. +- **Recovery is automatic.** When the VM exits (clean shutdown, Ctrl+C, or process crash), the launcher rebinds the GPU to the `nvidia` driver and clears `driver_override`. No manual intervention is needed. +- **Process check.** If CUDA processes are using the GPU (visible via `/dev/nvidia*` file descriptors), the launcher refuses to unbind. Stop those processes first. + +## Supported GPUs + +GPU passthrough is validated with NVIDIA data center GPUs. Consumer GPUs may work but are not officially supported (NVIDIA restricts GeForce passthrough in some driver versions). + +| GPU | Architecture | Compute Capability | Status | +|-----|-------------|-------------------|--------| +| A100 | Ampere | 8.0 | Supported | +| A30 | Ampere | 8.0 | Supported | +| H100 | Hopper | 9.0 | Supported | +| H200 | Hopper | 9.0 | Supported | +| L40 | Ada Lovelace | 8.9 | Supported | +| L40S | Ada Lovelace | 8.9 | Supported | +| L4 | Ada Lovelace | 8.9 | Supported | + +## CLI usage + +### Auto-select GPU + +```shell +# openshell-vm binary (VM backend directly) +sudo openshell-vm --gpu + +# openshell CLI (gateway deployment — requires VM backend) +OPENSHELL_GATEWAY_BACKEND=vm sudo openshell gateway start --gpu +``` + +> **Note:** The default gateway backend is Docker (containers). GPU passthrough +> requires the VM backend. 
Set `OPENSHELL_GATEWAY_BACKEND=vm` (or `microvm`) +> to use the VM path with `openshell gateway start`. + +### Specific PCI address (multi-GPU hosts) + +```shell +sudo openshell-vm --gpu 0000:41:00.0 +``` + +### Backend selection + +The `--backend` flag controls hypervisor selection independently of `--gpu`: + +```shell +sudo openshell-vm --gpu # auto: selects cloud-hypervisor +sudo openshell-vm --backend cloud-hypervisor # explicit CHV, no GPU +sudo openshell-vm --backend libkrun # explicit libkrun (no GPU support) +``` + +The `chv` alias is accepted as shorthand for `cloud-hypervisor`. + +### Diagnostics + +When `--gpu` is passed, the launcher runs safety checks before unbinding. If +checks fail, it exits with an actionable error: + +```text +$ sudo openshell-vm --gpu +GPU passthrough blocked by safety checks. + + Detected devices: + 0000:41:00.0: has active display outputs + 0000:42:00.0: in use by PIDs: 12345 (python3), 12400 (nvidia-smi) + + No GPU is available for passthrough. +``` + +On a headless server with an idle GPU, the pre-unbind preparation runs first: + +```text +$ sudo openshell-vm --gpu +GPU 0000:41:00.0: disabled nvidia persistence mode +GPU 0000:41:00.0: unloaded nvidia_uvm +GPU 0000:41:00.0: unloaded nvidia_drm +GPU 0000:41:00.0: unloaded nvidia_modeset +GPU 0000:41:00.0: device already unbound after nvidia module cleanup +GPU: binding 0000:41:00.0 for VFIO passthrough +``` + +On shutdown (Ctrl+C or VM exit), the original driver is restored: + +```text +^C +GPU: restoring 0000:41:00.0 (cleanup) +GPU: rebinding 0000:41:00.0 to nvidia +``` + +## VM Networking (Cloud Hypervisor) + +Cloud Hypervisor uses TAP-based networking instead of the gvproxy user-mode networking used by the libkrun backend. This has several implications for connectivity and port forwarding. 
+ +### Network topology + +``` +Host Guest (microVM) +───────────────────────────────────── ────────────────────────── + eth0 (or primary NIC) eth0 (virtio-net) + ↕ ↕ + iptables MASQUERADE ←── NAT ──→ 192.168.249.2/24 + ↕ ↕ default gw 192.168.249.1 + vmtap0 (TAP device) ↕ + 192.168.249.1/24 ←─── L2 bridge ──→ (kernel routes) + ↕ + 127.0.0.1:{port} ←── TCP proxy ──→ {port} (k3s NodePort) +``` + +### How it works + +The CHV backend configures networking in three layers: + +**1. TAP device and guest IP assignment** + +Cloud Hypervisor creates a TAP device on the host side with IP `192.168.249.1/24`. The guest is assigned `192.168.249.2/24` via kernel command line parameters (`VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). The init script reads these from `/proc/cmdline` and uses them as the static fallback when DHCP is unavailable (CHV does not run a DHCP server). + +**2. Host-side NAT and IP forwarding** + +After booting the VM, the launcher: +- Enables IP forwarding (`/proc/sys/net/ipv4/ip_forward`) +- Adds iptables MASQUERADE rules for the `192.168.249.0/24` subnet +- Adds FORWARD rules to allow traffic to/from the VM + +This gives the guest internet access through the host. Rules are cleaned up on VM shutdown. + +**3. TCP port forwarding** + +Unlike gvproxy (which provides built-in port forwarding), CHV TAP networking requires explicit port forwarding. The launcher starts a userspace TCP proxy for each port mapping (e.g., `30051:30051`). The proxy binds to `127.0.0.1:{host_port}` and forwards connections to `192.168.249.2:{guest_port}`. + +### DNS resolution + +The launcher detects the host's upstream DNS server using a two-step lookup: + +1. Reads `/etc/resolv.conf` and picks the first nameserver that does not start with `127.` (skipping systemd-resolved's `127.0.0.53` stub and other loopback addresses). +2. If all nameservers in `/etc/resolv.conf` are loopback, falls back to `/run/systemd/resolve/resolv.conf` (the upstream resolv.conf maintained by systemd-resolved). 
+3. If no non-loopback nameserver is found in either file, falls back to `8.8.8.8`. + +The resolved DNS server is passed to the guest via `VM_NET_DNS=` on the kernel command line. The init script writes it to `/etc/resolv.conf` inside the guest, unconditionally overriding any stale entries from previous boot cycles. + +### Key constants + +| Constant | Value | Purpose | +|----------|-------|---------| +| `CHV_TAP_HOST_IP` | `192.168.249.1` | Host side of the TAP device | +| `CHV_TAP_GUEST_IP` | `192.168.249.2` | Guest static IP | +| `CHV_TAP_SUBNET` | `192.168.249.0/24` | Subnet for iptables rules | +| `CHV_TAP_NETMASK` | `255.255.255.0` | Subnet mask in VM payload | + +### Differences from libkrun/gvproxy networking + +| Feature | libkrun + gvproxy | CHV + TAP | +|---------|------------------|-----------| +| Network mode | User-mode (SLIRP-like) | Kernel TAP device | +| DHCP | Built-in (gvproxy) | None (static IP via cmdline) | +| Guest IP | `192.168.127.2/24` | `192.168.249.2/24` | +| Port forwarding | Built-in (gvproxy `-forward`) | Userspace TCP proxy | +| Privileges | Unprivileged | Root or `CAP_NET_ADMIN` | +| NAT | Handled by gvproxy | iptables MASQUERADE | +| DNS | gvproxy provides | Host resolver passed via cmdline | + +### Troubleshooting networking + +**"lookup registry-1.docker.io: Try again" (DNS failure)** + +The VM cannot resolve DNS. 
Check: + +```shell +# Verify the host DNS is non-loopback +grep nameserver /etc/resolv.conf +# If only 127.0.0.53 (systemd-resolved), find the upstream: +resolvectl status | grep 'DNS Servers' + +# Verify iptables rules are in place +sudo iptables -t nat -L POSTROUTING -n -v | grep 192.168.249 +sudo iptables -L FORWARD -n -v | grep 192.168.249 + +# Verify IP forwarding is enabled +cat /proc/sys/net/ipv4/ip_forward +``` + +**Gateway health check fails (port 30051 unreachable)** + +The TCP port forwarder may not have started, or the guest service is not yet listening: + +```shell +# Check if the port forwarder is bound on the host +ss -tlnp | grep 30051 + +# Check if the guest is reachable +ping -c1 192.168.249.2 +``` + +### Host mTLS cache and state disk + +The launcher caches mTLS certificates on the host after the first successful boot (warm boot path). If the state disk is deleted or `--reset` is used, the VM generates new PKI that won't match the cached certs. The launcher detects this — when the state disk is freshly created or reset, it clears the stale host mTLS cache and runs the cold-boot PKI fetch path. This prevents `transport error` failures on the gateway health check after a state disk reset. + +## Troubleshooting + +### "no NVIDIA PCI device found" + +The host has no NVIDIA GPU installed, or the PCI device is not visible: + +```shell +lspci -nn | grep -i nvidia +# If empty, the GPU is not detected at the PCI level +``` + +### "has active display outputs" + +The GPU drives a DRM framebuffer or is the boot VGA device. This is a hard safety check — the launcher will not unbind a display GPU. Options: + +- Use a different GPU for the monitor (iGPU, secondary card) +- Stop the display manager first: `sudo systemctl stop gdm` +- On headless servers, this should not occur — verify with `ls /sys/class/drm/card*/device` + +### "in use by PIDs: ..." + +Active processes hold `/dev/nvidia*` file descriptors. 
The check is host-wide +(across all NVIDIA GPUs, not per-device). The launcher lists the PIDs and +process names. Stop those processes before retrying. + +### "IOMMU not enabled or device has no IOMMU group" + +IOMMU must be enabled in both BIOS/UEFI and kernel cmdline. See Host Preparation above. + +### "VFIO kernel modules not loaded" + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 +``` + +### "insufficient sysfs permissions — run as root" + +The launcher needs root to write to sysfs bind/unbind paths. Run with `sudo`. + +### GPU not rebound after crash + +If the launcher process is killed with `SIGKILL` (kill -9), the cleanup handler cannot run and the GPU remains on `vfio-pci`. Manually rebind: + +```shell +PCI_ADDR="0000:41:00.0" +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver/unbind +echo "" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver_override +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/drivers/nvidia/bind +``` + +### nvidia driver unbind deadlock (kernel bug) + +Some nvidia driver versions deadlock in their sysfs `unbind` handler — the `write()` syscall to `/sys/bus/pci/drivers/nvidia/unbind` never returns. When this happens, the subprocess enters uninterruptible sleep (D state) and becomes unkillable even by `SIGKILL`. The GPU's PCI subsystem state is corrupted and all subsequent PCI operations on the device hang. Only a host reboot clears this state. + +This is a kernel/nvidia driver bug, not an openshell-vm issue. Three mitigation layers are in place: + +1. **Pre-unbind preparation**: Before the raw sysfs unbind, the launcher disables nvidia persistence mode (`nvidia-smi -pm 0`) and unloads nvidia submodules (`nvidia_uvm`, `nvidia_drm`, `nvidia_modeset`) via `modprobe -r`. This often cascade-removes the base nvidia module entirely, unbinding the device automatically without ever touching the dangerous sysfs path. + +2. 
**Subprocess isolation with timeout**: All sysfs writes (and the nvidia prep commands) run in a subprocess with a timeout (10s for sysfs, 15s for prep). On timeout, the subprocess is killed and dropped without calling `wait()` — preventing the parent process from being dragged into D-state. + +3. **Post-timeout verification**: If the unbind subprocess times out but the device is actually unbound at the hardware level (which the nvidia bug can cause — the operation completes but the syscall never returns), the launcher detects this and continues with the VFIO bind. + +If you hit this issue repeatedly, check for nvidia driver updates or file a bug with NVIDIA. + +### VM boots but `nvidia-smi` fails inside guest + +- Verify the GPU rootfs includes NVIDIA drivers: `chroot /path/to/rootfs which nvidia-smi` +- Check that NVIDIA kernel modules load: `openshell-vm exec -- lsmod | grep nvidia` +- Inspect dmesg for NVIDIA driver errors: `openshell-vm exec -- dmesg | grep -i nvidia` + +## Related + +- [Custom VM Runtime](custom-vm-runtime.md) — building and customizing the libkrun VM runtime +- [System Architecture](system-architecture.md) — overall OpenShell architecture +- Implementation: [`crates/openshell-vm/src/gpu_passthrough.rs`](../crates/openshell-vm/src/gpu_passthrough.rs) diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index b3a006fdd..dd8f83bb8 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -21,6 +21,7 @@ openshell-policy = { path = "../openshell-policy" } openshell-providers = { path = "../openshell-providers" } openshell-prover = { path = "../openshell-prover" } openshell-tui = { path = "../openshell-tui" } +openshell-vm = { path = "../openshell-vm" } serde = { workspace = true } serde_json = { workspace = true } prost-types = { workspace = true } diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 292922411..05d1fb7c1 100644 --- 
a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -807,18 +807,21 @@ enum GatewayCommands { #[arg(long, env = "OPENSHELL_REGISTRY_TOKEN")] registry_token: Option, - /// Enable NVIDIA GPU passthrough. + /// Enable NVIDIA GPU support for the gateway cluster. /// - /// Passes all host GPUs into the cluster container and deploys the - /// NVIDIA k8s-device-plugin so Kubernetes workloads can request - /// `nvidia.com/gpu` resources. Requires NVIDIA drivers and the - /// NVIDIA Container Toolkit on the host. + /// **Docker path (default):** passes GPUs into the gateway container via + /// the NVIDIA Container Toolkit — CDI when the daemon supports it, else + /// Docker's `--gpus all` — and deploys the NVIDIA device plugin. Use + /// `--gpu` or `--gpu auto` only; PCI addresses are not valid CDI device + /// names on this path. /// - /// When enabled, OpenShell auto-selects CDI when the Docker daemon has - /// CDI enabled and falls back to Docker's NVIDIA GPU request path - /// (`--gpus all`) otherwise. - #[arg(long)] - gpu: bool, + /// **MicroVM path:** set `OPENSHELL_GATEWAY_BACKEND=vm` for deployments + /// that use the VM gateway. Then you may pass `--gpu` / `--gpu auto` for + /// VFIO auto-select, or `--gpu 0000:41:00.0` (PCI BDF) for a specific GPU. + /// Requires IOMMU and the GPU bound to `vfio-pci`. See + /// `architecture/vm-gpu-passthrough.md`. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, }, /// Stop the gateway (preserves state). @@ -1129,10 +1132,9 @@ enum SandboxCommands { /// Request GPU resources for the sandbox. /// /// When no gateway is running, auto-bootstrap starts a GPU-enabled - /// gateway using the same automatic injection selection as - /// `openshell gateway start --gpu`. GPU intent is also inferred - /// automatically for known GPU-designated image names such as - /// `nvidia-gpu`. 
+ /// gateway using the Docker NVIDIA path (`--gpu auto`), same as + /// `openshell gateway start --gpu` without the microVM backend. GPU + /// intent is also inferred for known GPU image names (e.g. `nvidia-gpu`). #[arg(long)] gpu: bool, @@ -1655,12 +1657,11 @@ async fn main() -> Result<()> { registry_token, gpu, } => { - let gpu = if gpu { - vec!["auto".to_string()] - } else { - vec![] + let gpu = match gpu { + Some(val) => vec![val], + None => vec![], }; - run::gateway_admin_deploy( + let _gpu_guard = run::gateway_admin_deploy( &name, remote.as_deref(), ssh_key.as_deref(), diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index c41b53518..247f41d11 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1434,7 +1434,9 @@ pub async fn gateway_admin_deploy( registry_username: Option<&str>, registry_token: Option<&str>, gpu: Vec, -) -> Result<()> { +) -> Result> { + let (gpu, gpu_guard) = prepare_gateway_deploy_gpu(gpu, remote.as_deref())?; + let location = if remote.is_some() { "remote" } else { "local" }; // Build remote options once so we can reuse them for the existence check @@ -1457,7 +1459,7 @@ pub async fn gateway_admin_deploy( "{} Gateway '{name}' is already running.", "✓".green().bold() ); - return Ok(()); + return Ok(gpu_guard); } } } @@ -1518,7 +1520,7 @@ pub async fn gateway_admin_deploy( save_active_gateway(name)?; eprintln!("{} Active gateway set to '{name}'", "✓".green().bold()); - Ok(()) + Ok(gpu_guard) } /// Resolve the remote SSH destination for a gateway. @@ -5193,6 +5195,126 @@ fn format_timestamp_ms(ms: i64) -> String { } } +/// Environment variable selecting the gateway deployment backend for GPU checks. +/// +/// VFIO sysfs probes apply only to the microVM (`openshell-vm`) deploy path. +/// The default `openshell gateway start` flow uses Docker with the NVIDIA +/// Container Toolkit; leave this unset for that path. 
+const OPENSHELL_GATEWAY_BACKEND_ENV: &str = "OPENSHELL_GATEWAY_BACKEND"; + +fn gateway_deploy_uses_vm_backend() -> bool { + std::env::var(OPENSHELL_GATEWAY_BACKEND_ENV) + .ok() + .map(|v| { + matches!( + v.trim().to_ascii_lowercase().as_str(), + "vm" | "microvm" | "openshell-vm" + ) + }) + .unwrap_or(false) +} + +/// Heuristic: value looks like a PCI domain:bus:dev.fn address (Linux sysfs BDF). +fn looks_like_pci_bdf(s: &str) -> bool { + let s = s.trim(); + let rest = if let Some((prefix, after_colon)) = s.split_once(':') { + if prefix.len() == 4 && prefix.chars().all(|c| c.is_ascii_hexdigit()) { + after_colon + } else { + s + } + } else { + return false; + }; + + let Some((bus, dev_fn)) = rest.split_once(':') else { + return false; + }; + if bus.len() != 2 || !bus.chars().all(|c| c.is_ascii_hexdigit()) { + return false; + } + let Some((dev, func)) = dev_fn.split_once('.') else { + return false; + }; + if dev.len() != 2 || !dev.chars().all(|c| c.is_ascii_hexdigit()) { + return false; + } + if func.len() != 1 || !func.chars().all(|c| ('0'..='7').contains(&c)) { + return false; + } + true +} + +/// Validate `--gpu` for `gateway start`, run VFIO checks only for the VM deploy path, +/// and normalize Docker-path requests to CDI-compatible `auto`. 
+fn prepare_gateway_deploy_gpu(
+    gpu: Vec<String>,
+    remote: Option<&str>,
+) -> Result<(
+    Vec<String>,
+    Option<openshell_vm::gpu_passthrough::GpuBindGuard>,
+)> {
+    if gpu.is_empty() {
+        return Ok((gpu, None));
+    }
+
+    if gateway_deploy_uses_vm_backend() {
+        if remote.is_none() {
+            let guard = check_gpu_readiness(&gpu)?;
+            let selected_bdf = guard.pci_addr().unwrap_or("auto").to_string();
+            let updated_gpu = vec![selected_bdf];
+            return Ok((updated_gpu, Some(guard)));
+        } else {
+            eprintln!(
+                "{} Local VFIO GPU probe skipped (--remote): GPU readiness is checked on the remote host during deployment.",
+                "ℹ".cyan().bold()
+            );
+        }
+        return Ok((gpu, None));
+    }
+
+    let Some(first) = gpu.first() else {
+        return Ok((gpu, None));
+    };
+    if first.as_str() != "auto" {
+        if looks_like_pci_bdf(first) {
+            return Err(miette!(
+                "PCI address GPU selection ({first}) is only supported for the microVM gateway backend.\n\n\
+                 `openshell gateway start` uses Docker by default (NVIDIA Container Toolkit / CDI, or Docker `--gpus all`). \
+                 Use `--gpu` or `--gpu auto` for that path.\n\n\
+                 For VFIO passthrough, set {}=vm and follow architecture/vm-gpu-passthrough.md.",
+                OPENSHELL_GATEWAY_BACKEND_ENV,
+            ));
+        }
+        return Err(miette!(
+            "Unrecognized --gpu value `{first}` for Docker gateway deploy. Use `--gpu` or `--gpu auto`.",
+        ));
+    }
+
+    Ok((vec!["auto".to_string()], None))
+}
+
+/// Bind a GPU for VFIO passthrough and return an RAII guard that restores it on drop. 
+fn check_gpu_readiness(gpu: &[String]) -> Result { + use openshell_vm::gpu_passthrough::{GpuBindGuard, prepare_gpu_for_passthrough}; + + let requested_addr = gpu + .first() + .filter(|v| v.as_str() != "auto") + .map(|v| v.as_str()); + + let bind_state = prepare_gpu_for_passthrough(requested_addr).map_err(|e| miette!("{e}"))?; + + eprintln!( + "{} GPU {} bound to vfio-pci (was: {})", + "✓".green().bold(), + bind_state.pci_addr, + bind_state.original_driver, + ); + + Ok(GpuBindGuard::new(bind_state)) +} + #[cfg(test)] mod tests { use super::{ @@ -5416,6 +5538,16 @@ mod tests { assert!(sandbox_should_persist(false, Some(&spec))); } + #[test] + fn looks_like_pci_bdf_recognizes_sysfs_addresses() { + assert!(super::looks_like_pci_bdf("0000:41:00.0")); + assert!(super::looks_like_pci_bdf("41:00.0")); + assert!(super::looks_like_pci_bdf(" 0a:1f.7 ")); + assert!(!super::looks_like_pci_bdf("auto")); + assert!(!super::looks_like_pci_bdf("nvidia.com/gpu=all")); + assert!(!super::looks_like_pci_bdf("00:00.8")); // invalid function + } + #[test] fn image_requests_gpu_matches_known_gpu_image_names() { for image in [ diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml index 7d74b3139..388e42351 100644 --- a/crates/openshell-vm/Cargo.toml +++ b/crates/openshell-vm/Cargo.toml @@ -46,5 +46,8 @@ tokio-rustls = { workspace = true } [build-dependencies] zstd = "0.13" +[dev-dependencies] +tempfile = "3" + [lints] workspace = true diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs index 33fab9a78..f448ed0bc 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -12,7 +12,7 @@ //! Environment: //! `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` - Path to compressed artifacts -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::{env, fs}; fn main() { @@ -116,7 +116,7 @@ fn main() { /// Generate stub (empty) resource files so the build can complete. 
/// The embedded module will fail at runtime if these stubs are used. -fn generate_stub_resources(out_dir: &PathBuf) { +fn generate_stub_resources(out_dir: &Path) { let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let (libkrun_name, libkrunfw_name) = match target_os.as_str() { diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index b3d802292..d44f044c8 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -42,3 +42,33 @@ GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # Repo: https://github.com/containers/libkrunfw # Pinned: 2026-03-27 (main branch HEAD at time of pinning) LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" + +# ── cloud-hypervisor (GPU passthrough VMM) ────────────────────────────── +# Repo: https://github.com/cloud-hypervisor/cloud-hypervisor +CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" + +# ── virtiofsd (virtio-fs daemon for cloud-hypervisor rootfs) ──────────── +# Repo: https://gitlab.com/virtio-fs/virtiofsd +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" + +# ── NVIDIA GPU support (GPU rootfs variant) ──────────────────────────── +# Driver branch: 570.x (open kernel modules, data-center/workstation) +# +# Compatibility matrix: +# Minimum driver version: 570 (NVIDIA 570.x open kernel modules) +# Minimum compute capability: sm_70 (Volta V100 and newer) +# Supported architectures: Volta (V100), Turing (T4, RTX 20xx), +# Ampere (A100, A10, RTX 30xx), +# Hopper (H100, H200), Ada Lovelace (L40S), +# Blackwell (B100, B200) +# Guest architecture: x86_64 only (NVIDIA does not publish +# aarch64 data-center drivers in APT form) +# Host requirements: IOMMU enabled, GPU bound to vfio-pci driver, +# host driver version >= guest driver version +# +# The 570.x branch uses the open kernel module flavour +# (nvidia-headless-570-open), required for data-center GPUs (Turing+). 
+# Consumer GPUs (GeForce) may work but are not officially supported +# for VFIO passthrough. +NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" +NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.17.5}" diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index b5f0330af..5ce14a683 100644 --- a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -115,6 +115,10 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PIDS=y CONFIG_MEMCG=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_FREEZER=y # ── Disable kernel headers archive (avoids cpio issues in CI) ────────── # CONFIG_IKHEADERS is not set @@ -126,3 +130,29 @@ CONFIG_POSIX_MQUEUE_SYSCTL=y # ── Security features required by the sandbox runtime ─────────────────── CONFIG_SECURITY_LANDLOCK=y CONFIG_SECCOMP_FILTER=y + +# ── PCI / GPU passthrough (harmless for non-GPU boots) ────────────────── +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DRM=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# ── cloud-hypervisor support ──────────────────────────────────────────── +# CHV uses virtio-PCI transport (libkrun uses virtio-MMIO). Both drivers +# coexist safely — the kernel probes whichever transport the hypervisor +# provides. +CONFIG_VIRTIO_PCI=y + +# Serial console for cloud-hypervisor (8250/16550 UART). libkrun uses +# virtio-console which is already enabled in the base config. +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y + +# ACPI support for cloud-hypervisor power management. Required for +# `poweroff -f` to trigger a clean ACPI shutdown that CHV detects. +CONFIG_ACPI=y + +# x2APIC support — Cloud Hypervisor uses x2APIC MADT entries for +# multi-vCPU VMs. Without this, only the bootstrap CPU is activated. 
+CONFIG_X86_X2APIC=y diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..99a301f85 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -18,11 +18,16 @@ # - NO pre-initialized k3s state (cold start on first boot) # First boot will be slower (~30-60s) as k3s initializes and pulls images. # +# With --gpu, installs NVIDIA driver packages and the nvidia-container-toolkit +# into the rootfs, producing a GPU-capable variant. The launcher selects this +# rootfs when `--gpu` is passed. Only supported on x86_64 (NVIDIA does not +# publish aarch64 data-center drivers for Ubuntu in this packaging form). +# # Supports aarch64 and x86_64 guest architectures. The target architecture # is auto-detected from the host but can be overridden with --arch. # # Usage: -# ./build-rootfs.sh [--base] [--arch aarch64|x86_64] [output_dir] +# ./build-rootfs.sh [--base] [--gpu] [--arch aarch64|x86_64] [output_dir] # # If output_dir is omitted, the rootfs is built under target/rootfs-build. # @@ -43,12 +48,15 @@ fi # ── Argument parsing ─────────────────────────────────────────────────── BASE_ONLY=false +GPU_BUILD=false GUEST_ARCH="" POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in --base) BASE_ONLY=true; shift ;; + --gpu) + GPU_BUILD=true; shift ;; --arch) GUEST_ARCH="$2"; shift 2 ;; *) @@ -90,6 +98,14 @@ case "$GUEST_ARCH" in ;; esac +# GPU builds are only supported on x86_64 — NVIDIA does not publish +# aarch64 data-center driver packages in the same APT repository. +if [ "$GPU_BUILD" = true ] && [ "$GUEST_ARCH" != "x86_64" ]; then + echo "ERROR: --gpu is only supported for x86_64 guest architecture." >&2 + echo " Current arch: ${GUEST_ARCH}" >&2 + exit 1 +fi + # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" DEFAULT_ROOTFS="${PROJECT_ROOT}/target/rootfs-build" @@ -125,6 +141,9 @@ if [ "$BASE_ONLY" = true ]; then echo " k3s version: ${K3S_VERSION}" echo " Output: ${ROOTFS_DIR}" echo " Mode: base (no pre-loaded images, cold start)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi else echo "==> Building openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -132,6 +151,9 @@ else echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" echo " Output: ${ROOTFS_DIR}" echo " Mode: full (pre-loaded images, pre-initialized)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi fi echo "" @@ -222,8 +244,55 @@ fi docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." -docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' +if [ "$GPU_BUILD" = true ]; then + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" \ + --build-arg "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \ + --build-arg "NVIDIA_CONTAINER_TOOLKIT_VERSION=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ + -f - . <<'DOCKERFILE' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} +ARG NVIDIA_DRIVER_VERSION +ARG NVIDIA_CONTAINER_TOOLKIT_VERSION +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + e2fsprogs \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + sqlite3 \ + util-linux \ + zstd \ + gnupg \ + curl \ + && rm -rf /var/lib/apt/lists/* +# busybox-static provides udhcpc for DHCP inside the VM. 
+RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +# ── NVIDIA driver and container toolkit ────────────────────────────── +# Add the NVIDIA package repository and install the open kernel module +# flavour of the driver plus nvidia-container-toolkit. The open modules +# are required for data-center GPUs (Turing+ / compute capability >= 7.0). +RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + nvidia-headless-${NVIDIA_DRIVER_VERSION}-open \ + nvidia-utils-${NVIDIA_DRIVER_VERSION} \ + nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}-1 \ + && rm -rf /var/lib/apt/lists/* +# Configure the NVIDIA container runtime as the default for containerd. +RUN nvidia-ctk runtime configure --runtime=containerd --set-as-default +DOCKERFILE +else + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} RUN apt-get update && \ @@ -243,6 +312,7 @@ RUN mkdir -p /usr/share/udhcpc && \ ln -sf /bin/busybox /sbin/udhcpc RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s DOCKERFILE +fi # Create a container and export the filesystem echo "==> Creating container..." @@ -363,6 +433,28 @@ for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do fi done +# ── Inject GPU manifests (when building GPU rootfs) ─────────────────── +# These are deployed by openshell-vm-init.sh when GPU_ENABLED=true. 
+GPU_MANIFEST_SRC="${SCRIPT_DIR}/gpu-manifests" +GPU_MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + echo "==> Injecting GPU manifests..." + mkdir -p "${GPU_MANIFEST_DEST}" + GPU_MANIFEST_COPIED=0 + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${GPU_MANIFEST_DEST}/" + echo " $(basename "$manifest")" + GPU_MANIFEST_COPIED=$((GPU_MANIFEST_COPIED + 1)) + done + # Sentinel only when at least one manifest was staged (empty glob must not create it). + if [ "$GPU_MANIFEST_COPIED" -gt 0 ]; then + echo "gpu" > "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" + else + echo "WARNING: No GPU manifests (*.yaml) found in ${GPU_MANIFEST_SRC}; not writing .rootfs-gpu sentinel." >&2 + fi +fi + # ── Base mode: mark rootfs type and skip pre-loading ─────────────────── if [ "$BASE_ONLY" = true ]; then @@ -384,10 +476,33 @@ if [ "$BASE_ONLY" = true ]; then exit 1 fi + if [ "$GPU_BUILD" = true ]; then + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + if [ ! -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ]; then + echo "ERROR: GPU sentinel file not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + # nvidia-container-runtime is installed via nvidia-container-toolkit. + if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi + fi + echo "" echo "==> Base rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" - echo " Type: base (cold start, images pulled on demand)" + if [ "$GPU_BUILD" = true ]; then + echo " Type: base + GPU (cold start, NVIDIA driver ${NVIDIA_DRIVER_VERSION})" + else + echo " Type: base (cold start, images pulled on demand)" + fi echo "" echo "Note: First boot will take ~30-60s as k3s initializes." echo " Container images will be pulled from registries on first use." @@ -475,6 +590,15 @@ for manifest in "${MANIFEST_DEST}"/*.yaml; do cp "$manifest" "${INIT_MANIFESTS}/" done +# GPU manifests: same pre-init path as other auto-deploy manifests so k3s +# sees them during cluster bake (not only under /opt/openshell/gpu-manifests). +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_DEST}" ]; then + for manifest in "${GPU_MANIFEST_DEST}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${INIT_MANIFESTS}/" + done +fi + # Patch HelmChart for local images and VM settings. HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -741,10 +865,28 @@ if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then exit 1 fi +# ── GPU verification (full mode) ────────────────────────────────────── +if [ "$GPU_BUILD" = true ]; then + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi +fi + echo "" echo "==> Rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" +if [ "$GPU_BUILD" = true ]; then + echo " GPU: NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION}" +fi # Show k3s data size K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" diff --git a/crates/openshell-vm/scripts/gpu-manifests/README.md b/crates/openshell-vm/scripts/gpu-manifests/README.md new file mode 100644 index 000000000..c72deb1aa --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/README.md @@ -0,0 +1,41 @@ +# GPU Rootfs Manifests + +These Kubernetes manifests are injected into the VM rootfs when +`build-rootfs.sh --gpu` is used. During a **full** rootfs build they are +also copied into the k3s auto-deploy manifest directory so they are +applied at pre-init time. + +**Phase 2:** deployment from `openshell-vm-init.sh` when +`GPU_ENABLED=true` is not implemented yet; that path will copy or +reconcile these manifests at VM boot. + +## NVIDIA Driver Compatibility + +| Property | Value | +|---|---| +| Driver branch | 570.x (open kernel modules) | +| Minimum compute capability | sm_70 (Volta V100 and newer) | +| Container toolkit | nvidia-container-toolkit 1.17.x | +| Device plugin Helm chart | 0.18.2 | + +### Why open kernel modules? + +The 570.x open kernel modules are required for data-center GPUs +(Volta, Turing, Ampere, Hopper, Blackwell). They are the +NVIDIA-recommended driver for passthrough and container workloads. +Consumer GPUs (GeForce) prior to Turing (sm_75) are **not supported** +with open modules — use the proprietary driver branch if needed. 
+ +### Host requirements + +- IOMMU enabled in BIOS and kernel (`intel_iommu=on` or `amd_iommu=on`) +- GPU bound to `vfio-pci` driver on the host +- `/dev/vfio/vfio` and `/dev/vfio/` accessible +- Host NVIDIA driver version >= 570 (must match or exceed guest driver) + +### Files + +- `nvidia-device-plugin.yaml` — HelmChart CR that deploys the NVIDIA + k8s-device-plugin via the k3s Helm controller. +- `nvidia-runtime-class.yaml` — RuntimeClass object so pods can use + `runtimeClassName: nvidia`. diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml new file mode 100644 index 000000000..c1cbeaa8a --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# HelmChart CR for auto-deploying the NVIDIA k8s-device-plugin via k3s Helm controller. +# +# This manifest is copied into /var/lib/rancher/k3s/server/manifests/ by the +# VM init script when GPU_ENABLED=true. It is the VM-specific equivalent of +# deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml used by the +# Docker-based gateway. +# +# The chart installs: +# - NVIDIA device plugin DaemonSet (advertises nvidia.com/gpu resources) +# +# NFD and GFD are disabled; the device plugin's default nodeAffinity +# (which requires nvidia.com/gpu.present=true) is overridden to empty +# so it schedules on any node without requiring NFD/GFD labels. +# +# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that +# devices are injected via CDI hooks before container start. Sandbox pods only +# need the nvidia.com/gpu resource request — no runtimeClassName is required. 
+# +# k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" +# RuntimeClass automatically, so no manual RuntimeClass manifest is needed. + +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: nvidia-device-plugin + namespace: kube-system +spec: + repo: https://nvidia.github.io/k8s-device-plugin + chart: nvidia-device-plugin + version: "0.18.2" + targetNamespace: nvidia-device-plugin + createNamespace: true + valuesContent: |- + runtimeClassName: nvidia + deviceListStrategy: cdi-cri + deviceIDStrategy: index + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" + gfd: + enabled: false + nfd: + enabled: false + affinity: null diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml new file mode 100644 index 000000000..fe2ccbd6e --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# RuntimeClass for NVIDIA GPU workloads. +# Deployed alongside the device plugin when GPU_ENABLED=true. +# Pods requesting nvidia.com/gpu resources should set +# runtimeClassName: nvidia to use the NVIDIA container runtime. +--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index 1cb686a31..222bcc641 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -46,6 +46,31 @@ mkdir -p /sys/fs/cgroup mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & wait +# ── Parse kernel cmdline for env vars (cloud-hypervisor path) ──────── +# cloud-hypervisor passes environment variables via kernel cmdline +# (KEY=VALUE tokens). 
These are not automatically exported to init. +# Must run after /proc is mounted. +if [ -f /proc/cmdline ]; then + for token in $(cat /proc/cmdline); do + case "$token" in + GPU_ENABLED=*|OPENSHELL_VM_STATE_DISK_DEVICE=*|VM_NET_IP=*|VM_NET_GW=*|VM_NET_DNS=*) + export "$token" + ;; + esac + done +fi + +# Enable cgroup v2 controllers in the root cgroup hierarchy. +# k3s/kubelet requires cpu, cpuset, memory, and pids controllers. +# The kernel must have CONFIG_CGROUP_SCHED=y for the cpu controller. +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + for ctrl in cpu cpuset memory pids io; do + if grep -qw "$ctrl" /sys/fs/cgroup/cgroup.controllers; then + echo "+$ctrl" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true + fi + done +fi + ts "filesystems mounted" # ── Networking ────────────────────────────────────────────────────────── @@ -97,20 +122,26 @@ DHCP_SCRIPT # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries # -A 1: wait 1s before first retry (aggressive for local gvproxy) if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then - ts "WARNING: DHCP failed, falling back to static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "WARNING: DHCP failed, falling back to static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi else - # Fallback to static config if no DHCP client available. 
- ts "no DHCP client, using static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "no DHCP client, using static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi - # Ensure DNS is configured. DHCP should have set /etc/resolv.conf, - # but if it didn't (or static fallback was used), provide a default. - if [ ! -s /etc/resolv.conf ]; then + # Ensure DNS is configured. When VM_NET_DNS is set (TAP networking), + # always use it — the rootfs may have a stale resolv.conf from a + # previous gvproxy run that points to an unreachable gateway. + if [ -n "${VM_NET_DNS:-}" ]; then + echo "nameserver $VM_NET_DNS" > /etc/resolv.conf + elif [ ! -s /etc/resolv.conf ]; then echo "nameserver 8.8.8.8" > /etc/resolv.conf echo "nameserver 8.8.4.4" >> /etc/resolv.conf fi @@ -366,6 +397,35 @@ if [ "$_caps_ok" = false ]; then exit 1 fi +# ── GPU: NVIDIA driver and device plugin ───────────────────────────── +# When the VM is launched with --gpu, the Rust launcher passes +# GPU_ENABLED=true. Load the NVIDIA kernel modules, verify the device +# is visible via nvidia-smi, and confirm that the container runtime is +# available before k3s starts. + +if [ "${GPU_ENABLED:-false}" = "true" ]; then + ts "GPU mode enabled — loading NVIDIA drivers" + + modprobe nvidia || { echo "FATAL: failed to load nvidia kernel module" >&2; exit 1; } + modprobe nvidia_uvm || { echo "FATAL: failed to load nvidia_uvm kernel module" >&2; exit 1; } + modprobe nvidia_modeset || { echo "FATAL: failed to load nvidia_modeset kernel module" >&2; exit 1; } + ts "NVIDIA kernel modules loaded" + + if ! 
nvidia-smi > /dev/null 2>&1; then + echo "FATAL: GPU_ENABLED=true but nvidia-smi failed — GPU not visible to guest" >&2 + echo "Check: VFIO passthrough, IOMMU groups, guest kernel modules" >&2 + exit 1 + fi + ts "nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)" + + if command -v nvidia-container-runtime >/dev/null 2>&1; then + ts "nvidia-container-runtime: $(command -v nvidia-container-runtime)" + else + echo "FATAL: nvidia-container-runtime not found — GPU pods will fail" >&2 + exit 1 + fi +fi + # ── Deploy bundled manifests (cold boot only) ─────────────────────────── # On pre-initialized rootfs, manifests are already in place from the # build-time k3s boot. Skip this entirely for fast startup. @@ -411,6 +471,29 @@ else ts "skipping manifest deploy (pre-initialized)" fi +# ── GPU manifests (device plugin, runtime class) ───────────────────── +# Deployed on every boot (not just cold boot) so the device plugin is +# always present when GPU_ENABLED=true. Mirrors cluster-entrypoint.sh. +if [ "${GPU_ENABLED:-false}" = "true" ]; then + GPU_MANIFESTS="/opt/openshell/gpu-manifests" + if [ ! -d "$GPU_MANIFESTS" ]; then + echo "FATAL: GPU_ENABLED=true but GPU manifests directory missing: $GPU_MANIFESTS" >&2 + exit 1 + fi + mkdir -p "$K3S_MANIFESTS" + _gpu_manifest_deployed=false + for manifest in "$GPU_MANIFESTS"/*.yaml; do + [ -f "$manifest" ] || continue + _gpu_manifest_deployed=true + cp "$manifest" "$K3S_MANIFESTS/" + ts "deployed GPU manifest: $(basename "$manifest")" + done + if [ "$_gpu_manifest_deployed" = false ]; then + echo "FATAL: GPU_ENABLED=true but no YAML manifests found in $GPU_MANIFESTS" >&2 + exit 1 + fi +fi + # Patch manifests for VM deployment constraints. 
HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -737,7 +820,7 @@ K3S_ARGS=( --node-ip="$NODE_IP" --kube-apiserver-arg=bind-address=0.0.0.0 --resolv-conf=/etc/resolv.conf - --tls-san=localhost,127.0.0.1,10.0.2.15,192.168.127.2 + --tls-san="localhost,127.0.0.1,10.0.2.15,192.168.127.2,$NODE_IP" --flannel-backend=none --snapshotter=overlayfs --kube-proxy-arg=proxy-mode=nftables diff --git a/crates/openshell-vm/src/backend/cloud_hypervisor.rs b/crates/openshell-vm/src/backend/cloud_hypervisor.rs new file mode 100644 index 000000000..869b1747d --- /dev/null +++ b/crates/openshell-vm/src/backend/cloud_hypervisor.rs @@ -0,0 +1,1476 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! cloud-hypervisor backend for GPU passthrough VMs. +//! +//! Uses the cloud-hypervisor REST API over a Unix socket to manage VMs +//! with VFIO device passthrough. This backend is Linux-only and requires +//! a separate kernel image (`vmlinux`) and `virtiofsd` for the root +//! filesystem. + +use std::io::{Read, Write}; +use std::os::unix::net::UnixStream; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use super::VmBackend; +use crate::exec::{ + VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, +}; +use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; + +/// cloud-hypervisor hypervisor backend for GPU passthrough. +pub struct CloudHypervisorBackend { + /// Path to the cloud-hypervisor binary. + chv_binary: PathBuf, + /// Path to the vmlinux kernel image. + vmlinux: PathBuf, + /// Path to the virtiofsd binary. + virtiofsd: PathBuf, +} + +impl CloudHypervisorBackend { + /// Create a new cloud-hypervisor backend, validating required binaries. 
+    pub fn new() -> Result<Self, VmError> {
+        let runtime_dir = crate::configured_runtime_dir()?;
+
+        let chv_binary = runtime_dir.join("cloud-hypervisor");
+        if !chv_binary.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: chv_binary.display().to_string(),
+                hint: "GPU passthrough requires cloud-hypervisor. Run the GPU build pipeline or set OPENSHELL_VM_RUNTIME_DIR".to_string(),
+            });
+        }
+
+        let vmlinux = runtime_dir.join("vmlinux");
+        if !vmlinux.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: vmlinux.display().to_string(),
+                hint: "GPU passthrough requires a vmlinux kernel. Run the GPU build pipeline"
+                    .to_string(),
+            });
+        }
+
+        let virtiofsd = runtime_dir.join("virtiofsd");
+        if !virtiofsd.is_file() {
+            return Err(VmError::BinaryNotFound {
+                path: virtiofsd.display().to_string(),
+                hint: "GPU passthrough requires virtiofsd. Run the GPU build pipeline".to_string(),
+            });
+        }
+
+        Ok(Self {
+            chv_binary,
+            vmlinux,
+            virtiofsd,
+        })
+    }
+}
+
+impl VmBackend for CloudHypervisorBackend {
+    fn launch(&self, config: &VmConfig) -> Result {
+        launch_cloud_hypervisor(self, config)
+    }
+}
+
+// ── REST API client ─────────────────────────────────────────────────────
+
+/// Send a raw HTTP/1.1 request over a Unix socket and return the response body.
+///
+/// Parses the response headers to determine Content-Length so we read exactly
+/// the right number of bytes without relying on EOF or Connection: close.
+fn http_request_unix(
+    socket_path: &Path,
+    method: &str,
+    path: &str,
+    body: Option<&str>,
+) -> Result<(u16, String), String> {
+    use std::io::BufRead;
+
+    let stream = UnixStream::connect(socket_path)
+        .map_err(|e| format!("connect to cloud-hypervisor API: {e}"))?;
+
+    stream
+        .set_read_timeout(Some(Duration::from_secs(30)))
+        .map_err(|e| format!("set read timeout: {e}"))?;
+
+    let request = if let Some(body) = body {
+        format!(
+            "{method} {path} HTTP/1.1\r\n\
+             Host: localhost\r\n\
+             Content-Type: application/json\r\n\
+             Content-Length: {}\r\n\
+             \r\n\
+             {body}",
+            body.len(),
+        )
+    } else {
+        format!(
+            "{method} {path} HTTP/1.1\r\n\
+             Host: localhost\r\n\
+             \r\n"
+        )
+    };
+
+    {
+        let mut writer = &stream;
+        writer
+            .write_all(request.as_bytes())
+            .map_err(|e| format!("write to cloud-hypervisor API: {e}"))?;
+    }
+
+    let mut reader = std::io::BufReader::new(&stream);
+
+    // Read status line
+    let mut status_line = String::new();
+    reader
+        .read_line(&mut status_line)
+        .map_err(|e| format!("read status line: {e}"))?;
+
+    let status_code = status_line
+        .split_whitespace()
+        .nth(1)
+        .and_then(|code| code.parse::<u16>().ok())
+        .unwrap_or(0);
+
+    // Read headers to find Content-Length
+    let mut content_length: usize = 0;
+    loop {
+        let mut header_line = String::new();
+        reader
+            .read_line(&mut header_line)
+            .map_err(|e| format!("read header: {e}"))?;
+        if header_line.trim().is_empty() {
+            break;
+        }
+        if let Some(val) = header_line
+            .strip_prefix("Content-Length:")
+            .or_else(|| header_line.strip_prefix("content-length:"))
+        {
+            if let Ok(len) = val.trim().parse::<usize>() {
+                content_length = len;
+            }
+        }
+    }
+
+    // Read body based on Content-Length
+    let mut body_bytes = vec![0u8; content_length];
+    if content_length > 0 {
+        reader
+            .read_exact(&mut body_bytes)
+            .map_err(|e| format!("read body ({content_length} bytes): {e}"))?;
+    }
+
+    let body_str = String::from_utf8_lossy(&body_bytes).to_string();
+    Ok((status_code, body_str))
+}
+
+/// Wait for a Unix 
socket to appear on the filesystem. +fn wait_for_socket(socket_path: &Path, label: &str, timeout: Duration) -> Result<(), VmError> { + let deadline = Instant::now() + timeout; + let mut interval = Duration::from_millis(10); + + while !socket_path.exists() { + if Instant::now() >= deadline { + return Err(VmError::HostSetup(format!( + "{label} socket did not appear within {}s: {}", + timeout.as_secs(), + socket_path.display(), + ))); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + + Ok(()) +} + +/// Create the VM via the cloud-hypervisor REST API. +fn api_vm_create(socket_path: &Path, payload: &str) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.create", Some(payload)) + .map_err(|e| VmError::HostSetup(format!("vm.create: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.create returned HTTP {status}: {body}" + ))) + } +} + +/// Boot the VM. +fn api_vm_boot(socket_path: &Path) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.boot", None) + .map_err(|e| VmError::HostSetup(format!("vm.boot: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.boot returned HTTP {status}: {body}" + ))) + } +} + +/// Request a graceful shutdown. +fn api_vm_shutdown(socket_path: &Path) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.shutdown", None) + .map_err(|e| VmError::HostSetup(format!("vm.shutdown: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.shutdown returned HTTP {status}: {body}" + ))) + } +} + +/// Query VM info/status. 
+#[allow(dead_code)] +fn api_vm_info(socket_path: &Path) -> Result { + let (status, body) = http_request_unix(socket_path, "GET", "/api/v1/vm.info", None) + .map_err(|e| VmError::HostSetup(format!("vm.info: {e}")))?; + + if status >= 200 && status < 300 { + Ok(body) + } else { + Err(VmError::HostSetup(format!( + "vm.info returned HTTP {status}: {body}" + ))) + } +} + +/// Delete the VM. +#[allow(dead_code)] +fn api_vm_delete(socket_path: &Path) -> Result<(), VmError> { + let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.delete", None) + .map_err(|e| VmError::HostSetup(format!("vm.delete: {e}")))?; + + if status >= 200 && status < 300 { + Ok(()) + } else { + Err(VmError::HostSetup(format!( + "vm.delete returned HTTP {status}: {body}" + ))) + } +} + +// ── Build the VM create payload ───────────────────────────────────────── + +fn build_vm_create_payload( + backend: &CloudHypervisorBackend, + config: &VmConfig, + effective_exec_path: &str, + vfio_device: Option<&str>, + virtiofsd_sock: &Path, + state_disk_path: Option<&Path>, + use_tap_net: bool, + vsock_sock: &Path, + console_log: &Path, +) -> Result { + let mem_bytes = u64::from(config.mem_mib) * 1024 * 1024; + + let mut cmdline_parts = vec![ + "console=ttyS0".to_string(), + "root=rootfs".to_string(), + "rootfstype=virtiofs".to_string(), + "rw".to_string(), + "panic=-1".to_string(), + format!("init={effective_exec_path}"), + ]; + + // Pass environment variables via kernel cmdline. Unrecognised kernel + // parameters are forwarded to init as env vars. Only simple KEY=VALUE + // pairs without spaces are safe (cmdline is space-delimited, ~4096 B). 
+ if config.gpu_enabled && config.vfio_device.is_some() { + cmdline_parts.push("GPU_ENABLED=true".to_string()); + } + if let Some(state_disk) = &config.state_disk { + cmdline_parts.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + for var in &config.env { + if var.contains('=') && !var.contains(' ') && !var.contains('"') { + cmdline_parts.push(var.clone()); + } + } + + if use_tap_net { + cmdline_parts.push(format!("VM_NET_IP={CHV_TAP_GUEST_IP}")); + cmdline_parts.push(format!("VM_NET_GW={CHV_TAP_HOST_IP}")); + cmdline_parts.push(format!("VM_NET_DNS={}", host_dns_server())); + } + + let cmdline = cmdline_parts.join(" "); + + let mut payload = serde_json::json!({ + "cpus": { + "boot_vcpus": config.vcpus, + "max_vcpus": config.vcpus, + }, + "memory": { + "size": mem_bytes, + "shared": true, + }, + "payload": { + "kernel": backend.vmlinux.display().to_string(), + "cmdline": cmdline, + }, + "fs": [{ + "tag": "rootfs", + "socket": virtiofsd_sock.display().to_string(), + "num_queues": 1, + "queue_size": 1024, + }], + "vsock": { + "cid": VSOCK_GUEST_CID, + "socket": vsock_sock.display().to_string(), + }, + "serial": { + "mode": "File", + "file": console_log.display().to_string(), + }, + "console": { + "mode": "Off", + }, + }); + + if let Some(disk_path) = state_disk_path { + payload["disks"] = serde_json::json!([{ + "path": disk_path.display().to_string(), + "readonly": false, + }]); + } + + // Cloud-hypervisor uses TAP devices for networking (requires root or + // CAP_NET_ADMIN). The gvproxy QEMU-style socket protocol is not + // compatible with CHV's NetConfig. GPU passthrough already requires + // elevated privileges, so TAP access is expected. 
+ if use_tap_net { + payload["net"] = serde_json::json!([{ + "mac": "5a:94:ef:e4:0c:ee", + "ip": CHV_TAP_HOST_IP, + "mask": CHV_TAP_NETMASK, + }]); + } + + if let Some(vfio_path) = vfio_device { + payload["devices"] = serde_json::json!([{ + "path": format!("/sys/bus/pci/devices/{vfio_path}/"), + }]); + } + + serde_json::to_string(&payload) + .map_err(|e| VmError::HostSetup(format!("serialize vm.create payload: {e}"))) +} + +// ── Launch ────────────────────────────────────────────────────────────── + +#[allow(clippy::similar_names)] +fn launch_cloud_hypervisor( + backend: &CloudHypervisorBackend, + config: &VmConfig, +) -> Result { + let launch_start = Instant::now(); + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + + // Unix domain sockets are limited to 108 characters (SUN_LEN). + // Instance rootfs paths can be deeply nested, so place sockets + // under /tmp to stay within the limit. + let sock_dir = PathBuf::from(format!("/tmp/ovm-chv-{}", std::process::id())); + std::fs::create_dir_all(&sock_dir).map_err(|e| { + VmError::HostSetup(format!("create socket dir {}: {e}", sock_dir.display())) + })?; + + let api_sock_path = sock_dir.join("api.sock"); + let vsock_sock_path = sock_dir.join("vsock.sock"); + let virtiofsd_sock_path = sock_dir.join("virtiofsd.sock"); + let console_log = config + .console_output + .clone() + .unwrap_or_else(|| run_dir.join(format!("{rootfs_key}-console.log"))); + + // Clean stale sockets + let _ = std::fs::remove_file(&api_sock_path); + let _ = std::fs::remove_file(&vsock_sock_path); + let _ = std::fs::remove_file(&virtiofsd_sock_path); + + // Start virtiofsd for the rootfs + eprintln!("Starting virtiofsd: {}", backend.virtiofsd.display()); + let virtiofsd_log = run_dir.join(format!("{rootfs_key}-virtiofsd.log")); + let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) + .map_err(|e| VmError::Fork(format!("create virtiofsd log: 
{e}")))?; + + let mut virtiofsd_child = std::process::Command::new(&backend.virtiofsd) + .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) + .arg(format!("--shared-dir={}", config.rootfs.display())) + .arg("--cache=always") + .stdout(std::process::Stdio::null()) + .stderr(virtiofsd_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; + + eprintln!( + "virtiofsd started (pid {}) [{:.1}s]", + virtiofsd_child.id(), + launch_start.elapsed().as_secs_f64() + ); + + // Wait for virtiofsd socket + wait_for_socket(&virtiofsd_sock_path, "virtiofsd", Duration::from_secs(5))?; + + // CHV uses TAP networking (requires root/CAP_NET_ADMIN). The gvproxy + // QEMU-style socket protocol is not compatible with cloud-hypervisor's + // NetConfig. GPU passthrough already requires elevated privileges. + let use_tap_net = !matches!(config.net, NetBackend::None); + + // For --exec mode: wrap the command so the VM powers off after it exits. + // Unlike libkrun (which exits when init terminates), cloud-hypervisor + // keeps running after PID 1 exits (kernel panics). A wrapper init script + // runs the command then calls `poweroff -f` for a clean ACPI shutdown. 
+ let is_exec_mode = config.exec_path != "/srv/openshell-vm-init.sh"; + let wrapper_path = config.rootfs.join("tmp/chv-exec-wrapper.sh"); + let effective_exec_path; + if is_exec_mode { + let args_str = config + .args + .iter() + .map(|a| shell_escape(a)) + .collect::>() + .join(" "); + + let env_str = config + .env + .iter() + .map(|v| format!("export {}", shell_escape(v))) + .collect::>() + .join("\n"); + + let wrapper = format!( + "#!/bin/sh\n\ + mount -t proc proc /proc 2>/dev/null\n\ + mount -t sysfs sysfs /sys 2>/dev/null\n\ + mount -t devtmpfs devtmpfs /dev 2>/dev/null\n\ + {env_str}\n\ + cd {workdir}\n\ + {exec} {args}\n\ + RC=$?\n\ + # Trigger ACPI power-off so cloud-hypervisor exits cleanly.\n\ + # The rootfs may not have a `poweroff` binary, so try multiple methods.\n\ + if command -v poweroff >/dev/null 2>&1; then\n\ + poweroff -f\n\ + elif [ -x /usr/bin/busybox ]; then\n\ + /usr/bin/busybox poweroff -f\n\ + else\n\ + echo o > /proc/sysrq-trigger\n\ + fi\n\ + exit $RC\n", + env_str = env_str, + workdir = shell_escape(&config.workdir), + exec = shell_escape(&config.exec_path), + args = args_str, + ); + + if let Some(parent) = wrapper_path.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| VmError::HostSetup(format!("create wrapper dir: {e}")))?; + } + std::fs::write(&wrapper_path, &wrapper) + .map_err(|e| VmError::HostSetup(format!("write exec wrapper: {e}")))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = std::fs::set_permissions(&wrapper_path, std::fs::Permissions::from_mode(0o755)); + } + effective_exec_path = "/tmp/chv-exec-wrapper.sh".to_string(); + } else { + effective_exec_path = config.exec_path.clone(); + } + + // Start cloud-hypervisor process + eprintln!( + "Starting cloud-hypervisor: {}", + backend.chv_binary.display() + ); + + let chv_log = run_dir.join(format!("{rootfs_key}-cloud-hypervisor.log")); + let chv_log_file = std::fs::File::create(&chv_log) + .map_err(|e| VmError::Fork(format!("create 
cloud-hypervisor log: {e}")))?; + + let mut chv_child = std::process::Command::new(&backend.chv_binary) + .arg("--api-socket") + .arg(&api_sock_path) + .stdout(std::process::Stdio::null()) + .stderr(chv_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("start cloud-hypervisor: {e}")))?; + + let chv_pid = chv_child.id() as i32; + eprintln!( + "cloud-hypervisor started (pid {chv_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + // Wait for API socket + wait_for_socket(&api_sock_path, "cloud-hypervisor", Duration::from_secs(10))?; + + // Build and send VM create payload + let state_disk_path = config.state_disk.as_ref().map(|sd| sd.path.as_path()); + let payload = build_vm_create_payload( + backend, + config, + &effective_exec_path, + config.vfio_device.as_deref(), + &virtiofsd_sock_path, + state_disk_path, + use_tap_net, + &vsock_sock_path, + &console_log, + )?; + + api_vm_create(&api_sock_path, &payload)?; + eprintln!("VM created [{:.1}s]", launch_start.elapsed().as_secs_f64()); + + api_vm_boot(&api_sock_path)?; + let boot_start = Instant::now(); + eprintln!("VM booting [{:.1}s]", launch_start.elapsed().as_secs_f64()); + + // Set up host-side networking for TAP (NAT, IP forwarding, masquerade) + // so the guest can reach the internet through the host. 
+ let mut original_ip_forward: Option = None; + if use_tap_net { + match setup_chv_host_networking() { + Ok(orig) => original_ip_forward = Some(orig), + Err(e) => { + eprintln!("WARNING: host networking setup failed: {e}"); + eprintln!(" The VM may not have internet access."); + } + } + } + + // Write runtime state (vsock_bridge: true — CHV uses AF_VSOCK bridging) + if config.exec_path == "/srv/openshell-vm-init.sh" { + if let Err(err) = write_vm_runtime_state(&config.rootfs, chv_pid, &console_log, None, true) + { + let _ = api_vm_shutdown(&api_sock_path); + let _ = chv_child.kill(); + let _ = chv_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_chv_host_networking(orig); + } + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + + // CHV TAP networking doesn't provide built-in port forwarding like + // gvproxy. Start a TCP proxy for each port mapping so the host can + // reach guest services (e.g., the gateway health check on :30051). + if use_tap_net { + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + if parts.len() == 2 { + if let (Ok(hp), Ok(gp)) = (parts[0].parse::(), parts[1].parse::()) { + start_tcp_port_forwarder(hp, CHV_TAP_GUEST_IP, gp)?; + } + } + } + } + + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + // Start vsock exec bridge (exec Unix socket → CHV vsock Unix socket). + // The bridge allows `openshell-vm exec` and bootstrap to communicate + // with the guest exec agent over the standard exec socket path. + let exec_socket = vm_exec_socket_path(&config.rootfs); + start_vsock_exec_bridge(&exec_socket, &vsock_sock_path, VM_EXEC_VSOCK_PORT)?; + + // Gateway bootstrap and health check (mirrors libkrun backend). 
+ if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { + let gateway_port = crate::gateway_host_port(config); + crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; + crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + // Signal forwarding: SIGINT/SIGTERM -> graceful shutdown + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + crate::CHILD_PID.store(chv_pid, std::sync::atomic::Ordering::Relaxed); + } + + // Wait for cloud-hypervisor to exit + let status = chv_child + .wait() + .map_err(|e| VmError::HostSetup(format!("wait for cloud-hypervisor: {e}")))?; + + // Clean up host networking rules + if let Some(ref orig) = original_ip_forward { + teardown_chv_host_networking(orig); + } + + // Cleanup + if config.exec_path == "/srv/openshell-vm-init.sh" { + clear_vm_runtime_state(&config.rootfs); + } + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + eprintln!("virtiofsd stopped"); + + // Clean up sockets and wrapper + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + if is_exec_mode { + let _ = std::fs::remove_file(&wrapper_path); + } + + let code = status.code().unwrap_or(1); + eprintln!("VM exited with code {code}"); + Ok(code) +} + +/// Escape a string for use in a shell script. Wraps in single quotes. 
+fn shell_escape(s: &str) -> String { + if s.is_empty() { + return "''".to_string(); + } + if !s.contains('\'') && !s.contains(' ') && !s.contains('"') && !s.contains('\\') { + return s.to_string(); + } + format!("'{}'", s.replace('\'', "'\\''")) +} + +// ── Vsock exec bridge ─────────────────────────────────────────────────── + +/// Guest CID assigned in the cloud-hypervisor vsock config. +const VSOCK_GUEST_CID: u32 = 3; + +// ── CHV TAP networking constants ──────────────────────────────────────── +// cloud-hypervisor defaults to 192.168.249.1/24 on the host side of the +// TAP device. The guest uses .2 with the host as its gateway. + +const CHV_TAP_HOST_IP: &str = "192.168.249.1"; +const CHV_TAP_GUEST_IP: &str = "192.168.249.2"; +const CHV_TAP_SUBNET: &str = "192.168.249.0/24"; +const CHV_TAP_NETMASK: &str = "255.255.255.0"; + +/// Start a background bridge: exec Unix socket → CHV vsock Unix socket. +/// +/// cloud-hypervisor exposes guest vsock via a host-side Unix socket with a +/// text protocol: connect to the socket, send `CONNECT \n`, read +/// back `OK \n`, then the stream is a raw bidirectional channel to +/// the guest vsock port. This is different from kernel `AF_VSOCK` (which +/// `vhost-vsock` uses) — CHV manages its own transport. +/// +/// This bridge creates a Unix socket at `exec_socket` and, for each +/// incoming connection, opens a connection to the CHV vsock socket, +/// performs the CONNECT handshake, and forwards data bidirectionally. 
+fn start_vsock_exec_bridge( + exec_socket: &Path, + chv_vsock_socket: &Path, + guest_port: u32, +) -> Result<(), VmError> { + use std::os::unix::net::UnixListener; + + if let Some(parent) = exec_socket.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::HostSetup(format!("create exec bridge dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(exec_socket); + + let listener = UnixListener::bind(exec_socket).map_err(|e| { + VmError::HostSetup(format!( + "bind vsock exec bridge {}: {e}", + exec_socket.display() + )) + })?; + + let chv_vsock = chv_vsock_socket.to_path_buf(); + eprintln!( + "vsock exec bridge: {} → {} port {}", + exec_socket.display(), + chv_vsock.display(), + guest_port, + ); + + std::thread::spawn(move || { + vsock_bridge_accept_loop(listener, &chv_vsock, guest_port); + }); + + Ok(()) +} + +/// Accept loop for the vsock bridge background thread. +/// +/// "CONNECT rejected" (empty response) is normal during boot — the guest +/// exec agent isn't listening yet. We keep retrying those indefinitely +/// since the bootstrap caller has its own 120s timeout. Only fatal errors +/// (socket gone = VM died) cause the bridge to give up. 
+fn vsock_bridge_accept_loop( + listener: std::os::unix::net::UnixListener, + chv_vsock_socket: &Path, + port: u32, +) { + let mut fatal_failures: u32 = 0; + let mut logged_transient = false; + + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(e) => { + eprintln!("vsock bridge: accept: {e}"); + continue; + } + }; + + match chv_vsock_connect(chv_vsock_socket, port) { + Ok(guest) => { + fatal_failures = 0; + bridge_bidirectional(client, guest); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + fatal_failures += 1; + if fatal_failures <= 2 { + eprintln!("vsock bridge: CHV socket gone (VM exited?): {e}"); + } + if fatal_failures >= 3 { + eprintln!("vsock bridge: CHV socket not found, stopping bridge"); + return; + } + } + Err(e) => { + if !logged_transient { + eprintln!( + "vsock bridge: guest not ready on port {port} ({e}), \ + will keep retrying..." + ); + logged_transient = true; + } + } + } + } +} + +/// Connect to a guest vsock port via cloud-hypervisor's Unix socket protocol. +/// +/// CHV exposes guest vsock through a host Unix socket. The protocol is: +/// 1. Connect to the CHV vsock Unix socket +/// 2. Send: `CONNECT \n` +/// 3. Read: `OK \n` on success +/// 4. 
The stream is now a raw bidirectional channel to the guest port +fn chv_vsock_connect(chv_vsock_socket: &Path, port: u32) -> std::io::Result { + let mut stream = UnixStream::connect(chv_vsock_socket)?; + stream.set_read_timeout(Some(Duration::from_secs(5)))?; + stream.set_write_timeout(Some(Duration::from_secs(5)))?; + + let connect_msg = format!("CONNECT {port}\n"); + stream.write_all(connect_msg.as_bytes())?; + + let mut buf = [0u8; 64]; + let n = stream.read(&mut buf)?; + let response = std::str::from_utf8(&buf[..n]).unwrap_or(""); + + if !response.starts_with("OK") { + return Err(std::io::Error::new( + std::io::ErrorKind::ConnectionRefused, + format!("CHV vsock CONNECT rejected: {}", response.trim()), + )); + } + + stream.set_read_timeout(None)?; + stream.set_write_timeout(None)?; + Ok(stream) +} + +/// Spawn two threads that copy data between two Unix streams. +fn bridge_bidirectional(client: UnixStream, guest: UnixStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut guest_r) = guest.try_clone() else { + return; + }; + let mut guest_w = guest; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut guest_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut guest_r, &mut client_w); + }); +} + +// ── CHV host networking ───────────────────────────────────────────────── + +/// Parse a DNS server from resolv.conf content. +/// +/// Returns the first non-`127.x.x.x` nameserver, or `8.8.8.8` if none found. +/// Extracted from [`host_dns_server`] for testability. +fn parse_dns_server(content: &str) -> String { + content + .lines() + .filter(|line| line.starts_with("nameserver")) + .filter_map(|line| line.split_whitespace().nth(1)) + .find(|ip| !ip.starts_with("127.")) + .map(String::from) + .unwrap_or_else(|| "8.8.8.8".to_string()) +} + +/// Read the host's primary DNS server. +/// +/// Checks `/etc/resolv.conf` first. 
If every nameserver there is a loopback +/// address (e.g. systemd-resolved's `127.0.0.53`), falls back to the +/// upstream resolv.conf at `/run/systemd/resolve/resolv.conf` which +/// contains the real upstream nameservers. Final fallback is `8.8.8.8`. +fn host_dns_server() -> String { + for path in &["/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"] { + if let Ok(content) = std::fs::read_to_string(path) { + let server = parse_dns_server(&content); + if server != "8.8.8.8" { + return server; + } + } + } + "8.8.8.8".to_string() +} + +/// Run a command, returning an error if it fails. +fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), VmError> { + let output = std::process::Command::new(cmd) + .args(args) + .output() + .map_err(|e| VmError::HostSetup(format!("{cmd}: {e}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(VmError::HostSetup(format!( + "{cmd} {}: {}", + args.join(" "), + stderr.trim() + ))); + } + + Ok(()) +} + +/// Set up host-side networking so the CHV guest can reach the internet. +/// +/// 1. Enable IP forwarding (saving the original value for teardown) +/// 2. MASQUERADE outbound traffic from the VM subnet +/// 3. Allow forwarding to/from the VM subnet +/// +/// Returns the original value of `ip_forward` so the caller can restore it. 
+fn setup_chv_host_networking() -> Result { + let original_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward") + .map(|s| s.trim().to_string()) + .unwrap_or_else(|_| "0".to_string()); + + std::fs::write("/proc/sys/net/ipv4/ip_forward", "1") + .map_err(|e| VmError::HostSetup(format!("enable IP forwarding: {e}")))?; + + run_cmd( + "iptables", + &[ + "-t", + "nat", + "-A", + "POSTROUTING", + "-s", + CHV_TAP_SUBNET, + "!", + "-d", + CHV_TAP_SUBNET, + "-j", + "MASQUERADE", + ], + )?; + + run_cmd( + "iptables", + &["-A", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"], + )?; + + run_cmd( + "iptables", + &["-A", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"], + )?; + + eprintln!("host networking: IP forwarding + NAT masquerade for {CHV_TAP_SUBNET}"); + Ok(original_ip_forward) +} + +/// Remove the iptables rules added by [`setup_chv_host_networking`] and +/// restore the original `ip_forward` sysctl value. +fn teardown_chv_host_networking(original_ip_forward: &str) { + let _ = run_cmd( + "iptables", + &[ + "-t", + "nat", + "-D", + "POSTROUTING", + "-s", + CHV_TAP_SUBNET, + "!", + "-d", + CHV_TAP_SUBNET, + "-j", + "MASQUERADE", + ], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"], + ); + if original_ip_forward != "1" { + let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward); + } + eprintln!("host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}"); +} + +/// Start a background TCP proxy that forwards `127.0.0.1:{host_port}` +/// to `{guest_ip}:{guest_port}`. +/// +/// Each accepted connection spawns two threads for bidirectional copy. +/// The listener thread runs until the process exits. 
+fn start_tcp_port_forwarder( + host_port: u16, + guest_ip: &str, + guest_port: u16, +) -> Result<(), VmError> { + use std::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind(("127.0.0.1", host_port)) + .map_err(|e| VmError::HostSetup(format!("bind port forwarder on :{host_port}: {e}")))?; + + let guest_addr = format!("{guest_ip}:{guest_port}"); + eprintln!("port forwarder: 127.0.0.1:{host_port} -> {guest_addr}"); + + std::thread::spawn(move || { + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(_) => continue, + }; + + let addr = guest_addr.clone(); + std::thread::spawn(move || { + if let Ok(remote) = TcpStream::connect(&addr) { + forward_tcp_bidirectional(client, remote); + } + }); + } + }); + + Ok(()) +} + +/// Copy data bidirectionally between two TCP streams until either side closes. +fn forward_tcp_bidirectional(client: std::net::TcpStream, remote: std::net::TcpStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut remote_r) = remote.try_clone() else { + return; + }; + let mut remote_w = remote; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut remote_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut remote_r, &mut client_w); + }); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn http_request_format_with_body() { + let payload = r#"{"cpus":{"boot_vcpus":4}}"#; + let request = format!( + "PUT /api/v1/vm.create HTTP/1.1\r\n\ + Host: localhost\r\n\ + Content-Type: application/json\r\n\ + Content-Length: {}\r\n\ + Connection: close\r\n\ + \r\n\ + {payload}", + payload.len(), + ); + assert!(request.contains("Content-Length: 25")); + assert!(request.contains("boot_vcpus")); + } + + #[test] + fn http_request_format_without_body() { + let request = format!( + "GET /api/v1/vm.info HTTP/1.1\r\n\ + Host: localhost\r\n\ + Connection: close\r\n\ + \r\n" + ); + assert!(request.contains("GET 
/api/v1/vm.info")); + assert!(!request.contains("Content-Length")); + } + + #[test] + fn build_payload_includes_vfio_device() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + payload.contains("0000:41:00.0"), + "payload should contain VFIO device" + ); + assert!( + payload.contains("boot_vcpus"), + "payload should contain vcpus config" + ); + assert!( + payload.contains("GPU_ENABLED=true"), + "payload should contain GPU_ENABLED in cmdline" + ); + } + + #[test] + fn build_payload_without_vfio() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::Auto, + }; + + let backend = CloudHypervisorBackend { + chv_binary: 
"/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + !payload.contains("devices"), + "payload without VFIO should not have devices key" + ); + assert!( + !payload.contains("GPU_ENABLED"), + "payload should not contain GPU_ENABLED" + ); + } + + #[test] + fn build_payload_with_tap_net_includes_ip_and_cmdline() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec!["30051:30051".into()], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + true, // use_tap_net + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + payload.contains("192.168.249.1"), + "net should contain TAP host IP" + ); + assert!( + payload.contains("255.255.255.0"), + "net should contain TAP netmask" + ); + assert!( + payload.contains("VM_NET_IP=192.168.249.2"), + "cmdline should contain guest IP" + ); + assert!( + payload.contains("VM_NET_GW=192.168.249.1"), + 
"cmdline should contain gateway IP" + ); + assert!( + payload.contains("VM_NET_DNS="), + "cmdline should contain DNS server" + ); + } + + #[test] + fn build_payload_tap_net_false_omits_net_and_vm_net_vars() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::Auto, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + assert!( + !payload.contains("\"net\""), + "no-tap payload should not contain net section" + ); + assert!( + !payload.contains("VM_NET_IP"), + "no-tap payload should not contain VM_NET_IP" + ); + assert!( + !payload.contains("VM_NET_GW"), + "no-tap payload should not contain VM_NET_GW" + ); + assert!( + !payload.contains("VM_NET_DNS"), + "no-tap payload should not contain VM_NET_DNS" + ); + } + + #[test] + fn build_payload_tap_net_has_correct_mac_ip_mask() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 2, + mem_mib: 4096, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: 
"test".into(), + state_disk: None, + gpu_enabled: false, + vfio_device: None, + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + true, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); + let net = &json["net"][0]; + assert_eq!(net["mac"], "5a:94:ef:e4:0c:ee"); + assert_eq!(net["ip"], "192.168.249.1"); + assert_eq!(net["mask"], "255.255.255.0"); + } + + #[test] + fn build_payload_vfio_and_tap_net_coexist() { + use crate::{NetBackend, VmConfig}; + + let config = VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: true, + vfio_device: Some("0000:41:00.0".into()), + backend: crate::VmBackendChoice::CloudHypervisor, + }; + + let backend = CloudHypervisorBackend { + chv_binary: "/usr/bin/cloud-hypervisor".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + }; + + let payload = build_vm_create_payload( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + true, + Path::new("/tmp/vsock.sock"), + Path::new("/tmp/console.log"), + ) + .unwrap(); + + let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); + assert!( + json["devices"].is_array(), + "devices section should exist for VFIO" 
+ ); + assert!(json["net"].is_array(), "net section should exist for TAP"); + assert!( + json["devices"][0]["path"] + .as_str() + .unwrap() + .contains("0000:41:00.0"), + "VFIO device path should be present" + ); + assert_eq!(json["net"][0]["ip"], "192.168.249.1"); + } + + // ── parse_dns_server tests ────────────────────────────────────────── + + #[test] + fn parse_dns_server_returns_first_non_loopback() { + let content = "nameserver 10.0.0.1\nnameserver 8.8.8.8\n"; + assert_eq!(parse_dns_server(content), "10.0.0.1"); + } + + #[test] + fn parse_dns_server_skips_systemd_resolved() { + let content = "nameserver 127.0.0.53\nnameserver 1.1.1.1\n"; + assert_eq!(parse_dns_server(content), "1.1.1.1"); + } + + #[test] + fn parse_dns_server_skips_all_loopback_variants() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 172.16.0.1\n"; + assert_eq!(parse_dns_server(content), "172.16.0.1"); + } + + #[test] + fn parse_dns_server_falls_back_when_only_loopback() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\n"; + assert_eq!(parse_dns_server(content), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_handles_empty_content() { + assert_eq!(parse_dns_server(""), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_ignores_comments_and_other_lines() { + let content = "# Generated by NetworkManager\nsearch example.com\nnameserver 10.1.2.3\n"; + assert_eq!(parse_dns_server(content), "10.1.2.3"); + } + + // ── shell_escape tests ────────────────────────────────────────────── + + #[test] + fn shell_escape_empty_string() { + assert_eq!(shell_escape(""), "''"); + } + + #[test] + fn shell_escape_simple_string() { + assert_eq!(shell_escape("hello"), "hello"); + } + + #[test] + fn shell_escape_string_with_single_quotes() { + assert_eq!(shell_escape("it's"), "'it'\\''s'"); + } + + #[test] + fn shell_escape_string_with_spaces() { + assert_eq!(shell_escape("hello world"), "'hello world'"); + } + + #[test] + fn shell_escape_string_with_double_quotes() { + 
assert_eq!(shell_escape(r#"say "hi""#), r#"'say "hi"'"#); + } + + #[test] + fn shell_escape_string_with_backslash() { + assert_eq!(shell_escape("path\\to"), "'path\\to'"); + } +} diff --git a/crates/openshell-vm/src/backend/libkrun.rs b/crates/openshell-vm/src/backend/libkrun.rs new file mode 100644 index 000000000..1f077563a --- /dev/null +++ b/crates/openshell-vm/src/backend/libkrun.rs @@ -0,0 +1,469 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! libkrun hypervisor backend. +//! +//! Implements [`VmBackend`] using the libkrun C API for lightweight microVMs. +//! This is the original backend — on macOS it uses Hypervisor.framework, +//! on Linux it uses KVM. + +use std::ffi::CString; +use std::path::Path; +use std::time::Instant; + +use super::{VmBackend, setup_gvproxy_port_forwarding, start_gvproxy}; +use crate::exec::{clear_vm_runtime_state, write_vm_runtime_state}; +use crate::{ + GvproxyGuard, NetBackend, StateDiskConfig, VmConfig, VmError, VsockPort, bootstrap_gateway, + c_string_array, check, ffi, gateway_host_port, health, path_to_cstring, vm_rootfs_key, +}; + +/// libkrun hypervisor backend. +pub struct LibkrunBackend; + +impl VmBackend for LibkrunBackend { + fn launch(&self, config: &VmConfig) -> Result { + launch_libkrun(config) + } +} + +/// VM context wrapping the libkrun FFI context ID. 
+// NOTE(review): several generic parameter lists appear to have been stripped in
+// this patch chunk (e.g. `-> Result` without `<i32, VmError>`, `Option` without
+// `<GvproxyGuard>`, `Vec` without `<String>`); restore them before applying.
+struct VmContext {
+    // Loaded libkrun function table; lives for the process lifetime.
+    krun: &'static ffi::LibKrun,
+    // Context handle returned by krun_create_ctx; released in Drop.
+    ctx_id: u32,
+}
+
+impl VmContext {
+    /// Initialize libkrun logging and allocate a fresh VM context.
+    ///
+    /// A negative return from `krun_create_ctx` is an error code.
+    fn create(log_level: u32) -> Result {
+        let krun = ffi::libkrun()?;
+        unsafe {
+            check(
+                (krun.krun_init_log)(
+                    ffi::KRUN_LOG_TARGET_DEFAULT,
+                    crate::clamp_log_level(log_level),
+                    ffi::KRUN_LOG_STYLE_AUTO,
+                    ffi::KRUN_LOG_OPTION_NO_ENV,
+                ),
+                "krun_init_log",
+            )?;
+        }
+
+        let ctx_id = unsafe { (krun.krun_create_ctx)() };
+        if ctx_id < 0 {
+            return Err(VmError::Krun {
+                func: "krun_create_ctx",
+                code: ctx_id,
+            });
+        }
+
+        Ok(Self {
+            krun,
+            ctx_id: ctx_id as u32,
+        })
+    }
+
+    fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> {
+        unsafe {
+            check(
+                (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib),
+                "krun_set_vm_config",
+            )
+        }
+    }
+
+    fn set_root(&self, rootfs: &Path) -> Result<(), VmError> {
+        let rootfs_c = path_to_cstring(rootfs)?;
+        unsafe {
+            check(
+                (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()),
+                "krun_set_root",
+            )
+        }
+    }
+
+    /// Attach the persistent state disk via the optional `krun_add_disk3`
+    /// entry point (raw format). Errors out with a rebuild hint when the
+    /// loaded libkrun does not export the symbol.
+    fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> {
+        let Some(add_disk3) = self.krun.krun_add_disk3 else {
+            return Err(VmError::HostSetup(
+                "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support"
+                    .to_string(),
+            ));
+        };
+
+        let block_id_c = CString::new(state_disk.block_id.as_str())?;
+        let disk_path_c = path_to_cstring(&state_disk.path)?;
+        unsafe {
+            check(
+                add_disk3(
+                    self.ctx_id,
+                    block_id_c.as_ptr(),
+                    disk_path_c.as_ptr(),
+                    ffi::KRUN_DISK_FORMAT_RAW,
+                    false,
+                    false,
+                    crate::state_disk_sync_mode(),
+                ),
+                "krun_add_disk3",
+            )
+        }
+    }
+
+    fn set_workdir(&self, workdir: &str) -> Result<(), VmError> {
+        let workdir_c = CString::new(workdir)?;
+        unsafe {
+            check(
+                (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()),
+                "krun_set_workdir",
+            )
+        }
+    }
+
+    fn disable_implicit_vsock(&self) -> Result<(), VmError> {
+        unsafe {
+            check(
+                (self.krun.krun_disable_implicit_vsock)(self.ctx_id),
+                "krun_disable_implicit_vsock",
+            )
+        }
+    }
+
+    fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> {
+        unsafe {
+            check(
+                (self.krun.krun_add_vsock)(self.ctx_id, tsi_features),
+                "krun_add_vsock",
+            )
+        }
+    }
+
+    #[cfg(target_os = "macos")]
+    fn add_net_unixgram(
+        &self,
+        socket_path: &Path,
+        mac: &[u8; 6],
+        features: u32,
+        flags: u32,
+    ) -> Result<(), VmError> {
+        let sock_c = path_to_cstring(socket_path)?;
+        unsafe {
+            check(
+                (self.krun.krun_add_net_unixgram)(
+                    self.ctx_id,
+                    sock_c.as_ptr(),
+                    -1,
+                    mac.as_ptr(),
+                    features,
+                    flags,
+                ),
+                "krun_add_net_unixgram",
+            )
+        }
+    }
+
+    #[allow(dead_code)]
+    fn add_net_unixstream(
+        &self,
+        socket_path: &Path,
+        mac: &[u8; 6],
+        features: u32,
+    ) -> Result<(), VmError> {
+        let sock_c = path_to_cstring(socket_path)?;
+        unsafe {
+            check(
+                (self.krun.krun_add_net_unixstream)(
+                    self.ctx_id,
+                    sock_c.as_ptr(),
+                    -1,
+                    mac.as_ptr(),
+                    features,
+                    0,
+                ),
+                "krun_add_net_unixstream",
+            )
+        }
+    }
+
+    fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> {
+        let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect();
+        let (_port_owners, port_ptrs) = c_string_array(&port_strs)?;
+        unsafe {
+            check(
+                (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()),
+                "krun_set_port_map",
+            )
+        }
+    }
+
+    fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> {
+        let socket_c = path_to_cstring(&port.socket_path)?;
+        unsafe {
+            check(
+                (self.krun.krun_add_vsock_port2)(
+                    self.ctx_id,
+                    port.port,
+                    socket_c.as_ptr(),
+                    port.listen,
+                ),
+                "krun_add_vsock_port2",
+            )
+        }
+    }
+
+    fn set_console_output(&self, path: &Path) -> Result<(), VmError> {
+        let console_c = path_to_cstring(path)?;
+        unsafe {
+            check(
+                (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()),
+                "krun_set_console_output",
+            )
+        }
+    }
+
+    fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> {
+        let exec_c = CString::new(exec_path)?;
+        let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect();
+        let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?;
+        let env_strs: Vec<&str> = env.iter().map(String::as_str).collect();
+        let (_env_owners, env_ptrs) = c_string_array(&env_strs)?;
+
+        unsafe {
+            check(
+                (self.krun.krun_set_exec)(
+                    self.ctx_id,
+                    exec_c.as_ptr(),
+                    argv_ptrs.as_ptr(),
+                    env_ptrs.as_ptr(),
+                ),
+                "krun_set_exec",
+            )
+        }
+    }
+
+    fn start_enter(&self) -> i32 {
+        unsafe { (self.krun.krun_start_enter)(self.ctx_id) }
+    }
+}
+
+impl Drop for VmContext {
+    fn drop(&mut self) {
+        unsafe {
+            // Best-effort free: Drop cannot return errors, so a failure is
+            // only logged.
+            let ret = (self.krun.krun_free_ctx)(self.ctx_id);
+            if ret < 0 {
+                eprintln!(
+                    "warning: krun_free_ctx({}) failed with code {ret}",
+                    self.ctx_id
+                );
+            }
+        }
+    }
+}
+
+/// Launch a VM using the libkrun backend.
+///
+/// This contains the VM-specific configuration, networking, fork/exec,
+/// signal forwarding, bootstrap, and cleanup logic that was previously
+/// inline in `lib.rs::launch()`.
+#[allow(clippy::similar_names)]
+fn launch_libkrun(config: &VmConfig) -> Result {
+    let launch_start = Instant::now();
+
+    let vm = VmContext::create(config.log_level)?;
+    vm.set_vm_config(config.vcpus, config.mem_mib)?;
+    vm.set_root(&config.rootfs)?;
+    if let Some(state_disk) = &config.state_disk {
+        vm.add_state_disk(state_disk)?;
+    }
+    vm.set_workdir(&config.workdir)?;
+
+    let mut gvproxy_guard: Option = None;
+    let mut gvproxy_api_sock: Option = None;
+
+    match &config.net {
+        NetBackend::Tsi => {}
+        NetBackend::None => {
+            vm.disable_implicit_vsock()?;
+            vm.add_vsock(0)?;
+            eprintln!("Networking: disabled (no TSI, no virtio-net)");
+        }
+        NetBackend::Gvproxy { .. } => {
+            let gvproxy_setup = start_gvproxy(config, launch_start)?;
+
+            vm.disable_implicit_vsock()?;
+            vm.add_vsock(0)?;
+            // Fixed, locally-administered unicast guest MAC address.
+            let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee];
+
+            // virtio-net feature bits — the positions match the VIRTIO_NET_F_*
+            // constants (checksum offload plus TSO4/UFO in both directions).
+            const NET_FEATURE_CSUM: u32 = 1 << 0;
+            const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1;
+            const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7;
+            const NET_FEATURE_GUEST_UFO: u32 = 1 << 10;
+            const NET_FEATURE_HOST_TSO4: u32 = 1 << 11;
+            const NET_FEATURE_HOST_UFO: u32 = 1 << 14;
+            const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM
+                | NET_FEATURE_GUEST_CSUM
+                | NET_FEATURE_GUEST_TSO4
+                | NET_FEATURE_GUEST_UFO
+                | NET_FEATURE_HOST_TSO4
+                | NET_FEATURE_HOST_UFO;
+
+            // Linux gvproxy speaks a stream socket; macOS uses the vfkit
+            // datagram protocol.
+            #[cfg(target_os = "linux")]
+            vm.add_net_unixstream(&gvproxy_setup.net_sock, &mac, COMPAT_NET_FEATURES)?;
+            #[cfg(target_os = "macos")]
+            {
+                const NET_FLAG_VFKIT: u32 = 1 << 0;
+                vm.add_net_unixgram(
+                    &gvproxy_setup.net_sock,
+                    &mac,
+                    COMPAT_NET_FEATURES,
+                    NET_FLAG_VFKIT,
+                )?;
+            }
+
+            eprintln!(
+                "Networking: gvproxy (virtio-net) [{:.1}s]",
+                launch_start.elapsed().as_secs_f64()
+            );
+            gvproxy_api_sock = Some(gvproxy_setup.api_sock);
+            gvproxy_guard = Some(gvproxy_setup.guard);
+        }
+    }
+
+    // TSI port maps go through libkrun; gvproxy port maps are exposed later
+    // via the gvproxy HTTP API.
+    if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) {
+        vm.set_port_map(&config.port_map)?;
+    }
+
+    for vsock_port in &config.vsock_ports {
+        if let Some(parent) = vsock_port.socket_path.parent() {
+            std::fs::create_dir_all(parent).map_err(|e| {
+                VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display()))
+            })?;
+        }
+        let _ = std::fs::remove_file(&vsock_port.socket_path);
+        vm.add_vsock_port(vsock_port)?;
+    }
+
+    let console_log = config.console_output.clone().unwrap_or_else(|| {
+        config
+            .rootfs
+            .parent()
+            .unwrap_or(&config.rootfs)
+            .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs)))
+    });
+    vm.set_console_output(&console_log)?;
+
+    let mut env: Vec = if config.env.is_empty() {
+        vec![
+            "HOME=/root",
+            "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+            "TERM=xterm",
+        ]
+        .into_iter()
+        .map(ToOwned::to_owned)
+        .collect()
+    } else {
+        config.env.clone()
+    };
+    if let Some(state_disk) = &config.state_disk
+        && !env
+            .iter()
+            .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE="))
+    {
+        env.push(format!(
+            "OPENSHELL_VM_STATE_DISK_DEVICE={}",
+            state_disk.guest_device
+        ));
+    }
+    if config.gpu_enabled {
+        env.push("GPU_ENABLED=true".to_string());
+    }
+    vm.set_exec(&config.exec_path, &config.args, &env)?;
+
+    // Fork and enter the VM
+    let boot_start = Instant::now();
+    eprintln!("Booting microVM...");
+
+    let pid = unsafe { libc::fork() };
+    match pid {
+        -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())),
+        0 => {
+            // Child: enter the VM. If start_enter returns, it failed.
+            let ret = vm.start_enter();
+            eprintln!("krun_start_enter failed: {ret}");
+            std::process::exit(1);
+        }
+        _ => {
+            if config.exec_path == "/srv/openshell-vm-init.sh" {
+                let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id);
+                // `false`: libkrun bridges vsock ports to host Unix sockets,
+                // so this VM does not use the CHV vsock-bridge connect mode.
+                if let Err(err) =
+                    write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid, false)
+                {
+                    // Could not persist the runtime state — tear the VM back
+                    // down so we don't leak an untracked child.
+                    unsafe {
+                        libc::kill(pid, libc::SIGTERM);
+                    }
+                    drop(gvproxy_guard);
+                    clear_vm_runtime_state(&config.rootfs);
+                    return Err(err);
+                }
+            }
+            eprintln!(
+                "VM started (child pid {pid}) [{:.1}s]",
+                boot_start.elapsed().as_secs_f64()
+            );
+            for pm in &config.port_map {
+                let host_port = pm.split(':').next().unwrap_or(pm);
+                eprintln!(" port {pm} -> http://localhost:{host_port}");
+            }
+            eprintln!("Console output: {}", console_log.display());
+
+            if let Some(ref api_sock) = gvproxy_api_sock {
+                setup_gvproxy_port_forwarding(api_sock, &config.port_map)?;
+            }
+
+            if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() {
+                let gateway_port = gateway_host_port(config);
+                bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?;
+                health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?;
+            }
+
+            eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64());
+            eprintln!("Press Ctrl+C to stop.");
+
+            // Install handlers that forward SIGINT/SIGTERM to the child
+            // (crate::forward_signal reads CHILD_PID), then wait for it.
+            unsafe {
+                libc::signal(
+                    libc::SIGINT,
+                    crate::forward_signal as *const () as libc::sighandler_t,
+                );
+                libc::signal(
+                    libc::SIGTERM,
+                    crate::forward_signal as *const () as libc::sighandler_t,
+                );
+                crate::CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed);
+            }
+
+            let mut status: libc::c_int = 0;
+            unsafe {
+                libc::waitpid(pid, &raw mut status, 0);
+            }
+
+            if config.exec_path == "/srv/openshell-vm-init.sh" {
+                clear_vm_runtime_state(&config.rootfs);
+            }
+            if let Some(mut guard) = gvproxy_guard
+                && let Some(mut child) = guard.disarm()
+            {
+                let _ = child.kill();
+                let _ = child.wait();
+                eprintln!("gvproxy stopped");
+            }
+
+            // Map the wait status to a shell-style exit code (128 + signal
+            // when the VM was killed by a signal).
+            if libc::WIFEXITED(status) {
+                let code = libc::WEXITSTATUS(status);
+                eprintln!("VM exited with code {code}");
+                return Ok(code);
+            } else if libc::WIFSIGNALED(status) {
+                let sig = libc::WTERMSIG(status);
+                eprintln!("VM killed by signal {sig}");
+                return Ok(128 + sig);
+            }
+
+            Ok(status)
+        }
+    }
+}
diff --git a/crates/openshell-vm/src/backend/mod.rs b/crates/openshell-vm/src/backend/mod.rs
new file mode 100644
index 000000000..9c2167fc5
--- /dev/null
+++ b/crates/openshell-vm/src/backend/mod.rs
@@ -0,0 +1,208 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! VM backend abstraction layer.
+//!
+//! Defines the [`VmBackend`] trait that all hypervisor backends implement,
+//! and shared infrastructure (gvproxy startup, networking helpers) used by
+//! both the libkrun and cloud-hypervisor backends.
+
+pub mod cloud_hypervisor;
+pub mod libkrun;
+
+use std::path::{Path, PathBuf};
+use std::time::Instant;
+
+use crate::{
+    GvproxyGuard, NetBackend, VmConfig, VmError, gvproxy_expose, gvproxy_socket_dir,
+    kill_stale_gvproxy, kill_stale_gvproxy_by_port, pick_gvproxy_ssh_port, vm_rootfs_key,
+};
+
+/// Trait implemented by each hypervisor backend (libkrun, cloud-hypervisor).
+pub trait VmBackend {
+    /// Launch a VM with the given configuration.
+    ///
+    /// Returns the VM exit code.
+    fn launch(&self, config: &VmConfig) -> Result;
+}
+
+/// Result of starting a gvproxy instance, used by both backends.
+pub(crate) struct GvproxySetup {
+    // Kills the spawned gvproxy process unless disarmed.
+    pub(crate) guard: GvproxyGuard,
+    // Unix socket for gvproxy's HTTP control API (port forwarding).
+    pub(crate) api_sock: PathBuf,
+    // Unix socket the VM's virtio-net device attaches to.
+    pub(crate) net_sock: PathBuf,
+}
+
+/// Start gvproxy for the given configuration.
+///
+/// Shared between libkrun and cloud-hypervisor backends. Handles stale
+/// process cleanup, socket setup, and process spawning with exponential
+/// backoff waiting for the network socket.
+pub(crate) fn start_gvproxy(
+    config: &VmConfig,
+    launch_start: Instant,
+) -> Result {
+    let binary = match &config.net {
+        NetBackend::Gvproxy { binary } => binary,
+        _ => {
+            return Err(VmError::HostSetup(
+                "start_gvproxy called without Gvproxy net backend".into(),
+            ));
+        }
+    };
+
+    if !binary.exists() {
+        return Err(VmError::BinaryNotFound {
+            path: binary.display().to_string(),
+            hint: "Install Podman Desktop or place gvproxy in PATH".to_string(),
+        });
+    }
+
+    let run_dir = config
+        .rootfs
+        .parent()
+        .unwrap_or(&config.rootfs)
+        .to_path_buf();
+    let rootfs_key = vm_rootfs_key(&config.rootfs);
+    // Terse ".v"/".a" extensions — NOTE(review): presumably kept short to stay
+    // under the Unix socket sun_path length limit; confirm.
+    let sock_base = gvproxy_socket_dir(&config.rootfs)?;
+    let net_sock = sock_base.with_extension("v");
+    let api_sock = sock_base.with_extension("a");
+
+    // Clean up any gvproxy left over from a previous run, both by rootfs and
+    // by each mapped host port.
+    kill_stale_gvproxy(&config.rootfs);
+    for pm in &config.port_map {
+        if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) {
+            kill_stale_gvproxy_by_port(host_port);
+        }
+    }
+
+    let _ = std::fs::remove_file(&net_sock);
+    let _ = std::fs::remove_file(&api_sock);
+    // NOTE(review): assumes a peer socket is created at "<net sock>-krun.sock";
+    // remove any stale copy of it as well — confirm against libkrun behavior.
+    let krun_sock = sock_base.with_extension("v-krun.sock");
+    let _ = std::fs::remove_file(&krun_sock);
+
+    eprintln!("Starting gvproxy: {}", binary.display());
+    let ssh_port = pick_gvproxy_ssh_port()?;
+    let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log"));
+    let gvproxy_log_file = std::fs::File::create(&gvproxy_log)
+        .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?;
+
+    // gvproxy's network listener flag/URL scheme differs per platform:
+    // stream socket on Linux, vfkit datagram socket on macOS.
+    #[cfg(target_os = "linux")]
+    let (gvproxy_net_flag, gvproxy_net_url) =
+        ("-listen-qemu", format!("unix://{}", net_sock.display()));
+    #[cfg(target_os = "macos")]
+    let (gvproxy_net_flag, gvproxy_net_url) = (
+        "-listen-vfkit",
+        format!("unixgram://{}", net_sock.display()),
+    );
+
+    let child = std::process::Command::new(binary)
+        .arg(gvproxy_net_flag)
+        .arg(&gvproxy_net_url)
+        .arg("-listen")
+        .arg(format!("unix://{}", api_sock.display()))
+        .arg("-ssh-port")
+        .arg(ssh_port.to_string())
+        .stdout(std::process::Stdio::null())
+        .stderr(gvproxy_log_file)
+        .spawn()
+        .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?;
+
+    eprintln!(
+        "gvproxy started (pid {}, ssh port {}) [{:.1}s]",
+        child.id(),
+        ssh_port,
+        launch_start.elapsed().as_secs_f64()
+    );
+
+    {
+        // Poll for the network socket with exponential backoff (5ms doubling,
+        // capped at 100ms) for up to 5s.
+        let deadline = Instant::now() + std::time::Duration::from_secs(5);
+        let mut interval = std::time::Duration::from_millis(5);
+        while !net_sock.exists() {
+            if Instant::now() >= deadline {
+                return Err(VmError::Fork(
+                    "gvproxy socket did not appear within 5s".to_string(),
+                ));
+            }
+            std::thread::sleep(interval);
+            interval = (interval * 2).min(std::time::Duration::from_millis(100));
+        }
+    }
+
+    Ok(GvproxySetup {
+        guard: GvproxyGuard::new(child),
+        api_sock,
+        net_sock,
+    })
+}
+
+/// Set up port forwarding via the gvproxy HTTP API.
+///
+/// Translates `host:guest` port map entries into gvproxy expose calls.
+pub(crate) fn setup_gvproxy_port_forwarding(
+    api_sock: &Path,
+    port_map: &[String],
+) -> Result<(), VmError> {
+    let fwd_start = Instant::now();
+
+    // Give gvproxy up to 2s to create its API socket (5ms poll, doubling,
+    // capped at 200ms). If it never appears, warn and try anyway — the
+    // expose calls below have their own retry loop.
+    let socket_deadline = Instant::now() + std::time::Duration::from_secs(2);
+    let mut poll = std::time::Duration::from_millis(5);
+    while !api_sock.exists() {
+        if Instant::now() >= socket_deadline {
+            eprintln!("warning: gvproxy API socket not ready after 2s, attempting anyway");
+            break;
+        }
+        std::thread::sleep(poll);
+        poll = (poll * 2).min(std::time::Duration::from_millis(200));
+    }
+
+    // Guest address on gvproxy's internal subnet.
+    let guest_ip = "192.168.127.2";
+
+    for pm in port_map {
+        // "host:guest" forwards host -> guest; a bare "port" maps to itself.
+        let pieces: Vec<&str> = pm.split(':').collect();
+        let (host_port, guest_port) = match pieces.as_slice() {
+            [host, guest] => (*host, *guest),
+            [single] => (*single, *single),
+            _ => {
+                eprintln!(" skipping invalid port mapping: {pm}");
+                continue;
+            }
+        };
+
+        let expose_body = format!(
+            r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"#
+        );
+
+        // Retry the expose call with exponential backoff (100ms doubling,
+        // capped at 1s) for up to 10s.
+        let expose_deadline = Instant::now() + std::time::Duration::from_secs(10);
+        let mut retry_interval = std::time::Duration::from_millis(100);
+        let forwarded = loop {
+            match gvproxy_expose(api_sock, &expose_body) {
+                Ok(()) => {
+                    eprintln!(" port {host_port} -> {guest_ip}:{guest_port}");
+                    break true;
+                }
+                Err(e) if Instant::now() >= expose_deadline => {
+                    eprintln!(" port {host_port}: {e} (retries exhausted)");
+                    break false;
+                }
+                Err(_) => {
+                    std::thread::sleep(retry_interval);
+                    retry_interval = (retry_interval * 2).min(std::time::Duration::from_secs(1));
+                }
+            }
+        };
+
+        if !forwarded {
+            return Err(VmError::HostSetup(format!(
+                "failed to forward port {host_port} via gvproxy"
+            )));
+        }
+    }
+    eprintln!(
+        "Port forwarding ready [{:.1}s]",
+        fwd_start.elapsed().as_secs_f64()
+    );
+
+    Ok(())
+}
diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs
index 6195556e1..1f8ad03fe 100644
--- a/crates/openshell-vm/src/exec.rs
+++ b/crates/openshell-vm/src/exec.rs
@@
-48,6 +48,22 @@ fn safe_remove_dir_all(path: &Path) -> Result { pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; +/// How to connect to the VM exec agent. +/// +/// libkrun bridges each guest vsock port to a host Unix socket via +/// `krun_add_vsock_port2`. cloud-hypervisor uses standard vhost-vsock +/// with CID-based addressing — the host connects via `AF_VSOCK` or a +/// vsock-proxy/socat bridge. +#[derive(Debug, Clone)] +pub enum VsockConnectMode { + /// Connect via a host Unix socket (libkrun per-port bridging). + UnixSocket(PathBuf), + /// Connect via a vsock proxy bridge (cloud-hypervisor). + /// The path points to a socat-bridged Unix socket that forwards + /// to guest CID 3, port [`VM_EXEC_VSOCK_PORT`]. + VsockBridge(PathBuf), +} + const VM_STATE_NAME: &str = "vm-state.json"; const VM_LOCK_NAME: &str = "vm.lock"; const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"; @@ -72,6 +88,10 @@ pub struct VmRuntimeState { /// PID of the gvproxy process (if networking uses gvproxy). #[serde(default, skip_serializing_if = "Option::is_none")] pub gvproxy_pid: Option, + /// Whether this VM uses vsock-bridge mode (cloud-hypervisor) vs + /// Unix socket mode (libkrun). Defaults to false for backward compat. 
+ #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub vsock_bridge: bool, } #[derive(Debug, Serialize)] @@ -132,6 +152,7 @@ pub fn write_vm_runtime_state( pid: i32, console_log: &Path, gvproxy_pid: Option, + vsock_bridge: bool, ) -> Result<(), VmError> { let state = VmRuntimeState { pid, @@ -141,6 +162,7 @@ pub fn write_vm_runtime_state( console_log: console_log.to_path_buf(), started_at_ms: now_ms()?, gvproxy_pid, + vsock_bridge, }; let path = vm_state_path(rootfs); let bytes = serde_json::to_vec_pretty(&state) @@ -471,10 +493,21 @@ pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { pub fn exec_running_vm(options: VmExecOptions) -> Result { let state = load_vm_runtime_state(options.rootfs.as_deref())?; - let mut stream = UnixStream::connect(&state.socket_path).map_err(|e| { + + let connect_mode = if state.vsock_bridge { + VsockConnectMode::VsockBridge(state.socket_path.clone()) + } else { + VsockConnectMode::UnixSocket(state.socket_path.clone()) + }; + + let socket_path = match &connect_mode { + VsockConnectMode::UnixSocket(p) | VsockConnectMode::VsockBridge(p) => p, + }; + + let mut stream = UnixStream::connect(socket_path).map_err(|e| { VmError::Exec(format!( "connect to VM exec socket {}: {e}", - state.socket_path.display() + socket_path.display() )) })?; let mut writer = stream diff --git a/crates/openshell-vm/src/gpu_passthrough.rs b/crates/openshell-vm/src/gpu_passthrough.rs new file mode 100644 index 000000000..b835bca89 --- /dev/null +++ b/crates/openshell-vm/src/gpu_passthrough.rs @@ -0,0 +1,1959 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side NVIDIA GPU VFIO readiness probing for VM passthrough. +//! +//! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs +//! (vendor ID `0x10de`), checks their driver binding, and verifies IOMMU +//! 
group cleanliness — the prerequisites for passing a physical GPU into
+//! a cloud-hypervisor VM via VFIO.
+//!
+//! Returns per-device readiness for multi-GPU hosts.
+//!
+//! On non-Linux platforms, probing returns an empty list.
+
+use std::fmt;
+use std::path::PathBuf;
+use std::time::Duration;
+
+/// Per-device readiness state for NVIDIA GPU VFIO passthrough.
+///
+/// Each variant represents a distinct readiness state for a single PCI device.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum HostNvidiaVfioReadiness {
+    /// The current platform does not support VFIO passthrough (non-Linux).
+    UnsupportedPlatform,
+
+    /// No PCI device with NVIDIA vendor ID (`0x10de`) was found.
+    NoNvidiaDevice,
+
+    /// An NVIDIA device exists but is not bound to `vfio-pci` — it is bound
+    /// to the nvidia (or another) driver, or has no driver bound at all.
+    BoundToNvidia,
+
+    /// An NVIDIA device is bound to `vfio-pci` and its IOMMU group is clean — ready for passthrough.
+    VfioBoundReady,
+
+    /// An NVIDIA device is bound to `vfio-pci` but its IOMMU group contains
+    /// devices not bound to `vfio-pci`, which prevents safe passthrough.
+    VfioBoundDirtyGroup,
+
+    /// Some NVIDIA devices are bound to `vfio-pci` while others use
+    /// a different driver (mixed fleet).
+ MixedVfioAndOther, +} + +impl fmt::Display for HostNvidiaVfioReadiness { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::UnsupportedPlatform => write!( + f, + "VFIO passthrough is not supported on this platform (Linux required)" + ), + Self::NoNvidiaDevice => write!(f, "no NVIDIA PCI device found"), + Self::BoundToNvidia => { + write!(f, "NVIDIA device found but not bound to vfio-pci driver") + } + Self::VfioBoundReady => write!( + f, + "NVIDIA device bound to vfio-pci and IOMMU group is clean" + ), + Self::VfioBoundDirtyGroup => write!( + f, + "NVIDIA device bound to vfio-pci but IOMMU group contains non-VFIO devices" + ), + Self::MixedVfioAndOther => write!( + f, + "some NVIDIA devices are on vfio-pci while others use a different driver" + ), + } + } +} + +const NVIDIA_VENDOR_ID: &str = "0x10de"; + +#[cfg(target_os = "linux")] +const SYSFS_WRITE_TIMEOUT: Duration = Duration::from_secs(10); + +#[cfg(target_os = "linux")] +fn sysfs_write_with_timeout( + path: &std::path::Path, + data: &str, + timeout: Duration, +) -> Result<(), std::io::Error> { + use std::process::{Command, Stdio}; + use std::thread; + + let mut child = Command::new("sh") + .arg("-c") + .arg(format!( + r#"printf '%s' '{}' > '{}'"#, + data.replace('\'', "'\\''"), + path.display().to_string().replace('\'', "'\\''") + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!( + "failed to spawn sysfs write subprocess for {}: {e}", + path.display() + ), + ) + })?; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + if status.success() { + return Ok(()); + } + let mut stderr_buf = String::new(); + if let Some(mut stderr) = child.stderr.take() { + use std::io::Read; + let _ = stderr.read_to_string(&mut stderr_buf); + } + let hint = if stderr_buf.contains("Permission 
denied") { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "sysfs write to {} failed (exit {}){hint}: {stderr_buf}", + path.display(), + status.code().unwrap_or(-1), + ), + )); + } + Ok(None) => { + if start.elapsed() > timeout { + let pid = child.id(); + let _ = child.kill(); + // CRITICAL: Do NOT call child.wait() here. If the child + // is stuck in uninterruptible sleep (D-state) — which is + // the nvidia unbind deadlock scenario — wait() will block + // the parent indefinitely, making it unkillable too. + // + // Dropping the Child struct closes pipe handles but does + // NOT wait. The zombie child is reparented to init and + // reaped when/if it eventually exits. + drop(child); + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "sysfs write to {} timed out after {:.0}s (subprocess pid {pid}) — \ + possible nvidia driver deadlock. The subprocess may still be \ + stuck in kernel space; a reboot may be required to clear it.", + path.display(), + timeout.as_secs_f64(), + ), + )); + } + thread::sleep(poll_interval); + } + Err(e) => return Err(e), + } + } +} + +/// Validates that `addr` matches the PCI BDF format `DDDD:BB:DD.F`. +fn validate_pci_addr(addr: &str) -> Result<(), std::io::Error> { + let bytes = addr.as_bytes(); + let valid = bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' + && bytes[..4].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[5..7].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[8..10].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[11].is_ascii_digit(); + if valid { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("invalid PCI address '{addr}': expected DDDD:BB:DD.F format"), + )) + } +} + +/// Probe the host for NVIDIA GPU VFIO readiness by scanning Linux sysfs. 
+/// +/// Returns a per-device list of `(pci_address, readiness)` tuples for every +/// NVIDIA GPU found. On non-Linux platforms the list is empty. +/// +/// On Linux, walks `/sys/bus/pci/devices/` and for each device: +/// 1. Reads `vendor` to check for NVIDIA (`0x10de`). +/// 2. Reads the `driver` symlink to determine which kernel driver is bound. +/// 3. If bound to `vfio-pci`, inspects the `iommu_group/devices/` directory +/// to verify all group members are also on `vfio-pci`. +pub fn probe_host_nvidia_vfio_readiness() -> Vec<(String, HostNvidiaVfioReadiness)> { + #[cfg(not(target_os = "linux"))] + { + Vec::new() + } + + #[cfg(target_os = "linux")] + { + probe_linux_sysfs() + } +} + +#[cfg(target_os = "linux")] +fn probe_linux_sysfs() -> Vec<(String, HostNvidiaVfioReadiness)> { + use std::fs; + use std::path::Path; + + let pci_devices = Path::new("/sys/bus/pci/devices"); + let entries = match fs::read_dir(pci_devices) { + Ok(e) => e, + Err(_) => return Vec::new(), + }; + + let mut results = Vec::new(); + + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + + let vendor = match fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + + if vendor != NVIDIA_VENDOR_ID { + continue; + } + + let pci_addr = entry.file_name().to_string_lossy().to_string(); + + let driver_link = dev_path.join("driver"); + let driver_name = fs::read_link(&driver_link).ok().and_then(|target| { + target + .file_name() + .map(|name| name.to_string_lossy().to_string()) + }); + + let state = match driver_name.as_deref() { + Some("vfio-pci") => { + let iommu_group_devices = dev_path.join("iommu_group/devices"); + let group_clean = match fs::read_dir(&iommu_group_devices) { + Ok(group_entries) => group_entries.filter_map(Result::ok).all(|ge| { + let peer_path = iommu_group_devices.join(ge.file_name()).join("driver"); + fs::read_link(&peer_path) + .ok() + .and_then(|t| t.file_name().map(|n| 
n.to_string_lossy().to_string())
+                            })
+                            .as_deref()
+                            == Some("vfio-pci")
+                    }),
+                    Err(_) => false,
+                };
+
+                if group_clean {
+                    HostNvidiaVfioReadiness::VfioBoundReady
+                } else {
+                    HostNvidiaVfioReadiness::VfioBoundDirtyGroup
+                }
+            }
+            _ => HostNvidiaVfioReadiness::BoundToNvidia,
+        };
+
+        results.push((pci_addr, state));
+    }
+
+    results
+}
+
+/// Returns whether any NVIDIA GPU is fully available for VM passthrough.
+///
+/// Requires `OPENSHELL_VM_GPU_E2E=1` to activate probing. When the env var
+/// is unset or not `"1"`, returns `false` unconditionally so non-GPU CI
+/// runners are never affected.
+///
+/// When activated, checks two conditions:
+/// 1. At least one NVIDIA device reports [`HostNvidiaVfioReadiness::VfioBoundReady`].
+/// 2. The cloud-hypervisor binary exists in the runtime bundle.
+pub fn nvidia_gpu_available_for_vm_passthrough() -> bool {
+    if std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() != Ok("1") {
+        return false;
+    }
+
+    let has_vfio_ready = probe_host_nvidia_vfio_readiness()
+        .iter()
+        .any(|(_, state)| *state == HostNvidiaVfioReadiness::VfioBoundReady);
+
+    if !has_vfio_ready {
+        return false;
+    }
+
+    let chv_exists = crate::configured_runtime_dir()
+        .map(|dir| dir.join("cloud-hypervisor").is_file())
+        .unwrap_or(false);
+
+    chv_exists
+}
+
+/// Sysfs root path, defaulting to "/" in production and a temp dir in tests.
+#[derive(Debug, Clone)] +pub(crate) struct SysfsRoot(PathBuf); + +impl Default for SysfsRoot { + fn default() -> Self { + Self(PathBuf::from("/")) + } +} + +impl SysfsRoot { + #[cfg(test)] + pub fn new(root: PathBuf) -> Self { + Self(root) + } + + pub fn sys_bus_pci_devices(&self) -> PathBuf { + self.0.join("sys/bus/pci/devices") + } + + pub fn sys_class_drm(&self) -> PathBuf { + self.0.join("sys/class/drm") + } + + pub fn sys_module(&self, module: &str) -> PathBuf { + self.0.join("sys/module").join(module) + } + + pub fn sys_bus_pci_drivers(&self, driver: &str) -> PathBuf { + self.0.join("sys/bus/pci/drivers").join(driver) + } + + pub fn sys_kernel_iommu_groups(&self) -> PathBuf { + self.0.join("sys/kernel/iommu_groups") + } + + fn is_real_sysfs(&self) -> bool { + self.0 == std::path::Path::new("/") + } + + #[cfg(target_os = "linux")] + fn write_sysfs(&self, path: &std::path::Path, data: &str) -> Result<(), std::io::Error> { + if self.is_real_sysfs() { + sysfs_write_with_timeout(path, data, SYSFS_WRITE_TIMEOUT) + } else { + std::fs::write(path, data).map_err(|e| { + std::io::Error::new(e.kind(), format!("failed to write {}: {e}", path.display())) + }) + } + } +} + +#[cfg(target_os = "linux")] +pub(crate) fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use std::fs; + + let drm_dir = sysfs.sys_class_drm(); + let entries = match fs::read_dir(&drm_dir) { + Ok(e) => e, + Err(_) => return false, + }; + + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if !name.starts_with("card") || name.contains('-') { + continue; + } + + let card_dir = entry.path(); + let device_link = card_dir.join("device"); + + let target = match fs::read_link(&device_link) { + Ok(t) => t, + Err(_) => continue, + }; + if !target.to_string_lossy().ends_with(pci_addr) { + continue; + } + + let boot_vga_path = card_dir.join("device").join("boot_vga"); + if let Ok(val) = fs::read_to_string(&boot_vga_path) { + if 
val.trim() == "1" { + return true; + } + } + + if let Ok(sub_entries) = fs::read_dir(&card_dir) { + for sub in sub_entries.filter_map(Result::ok) { + let sub_name = sub.file_name().to_string_lossy().to_string(); + if sub_name.starts_with(&format!("{name}-")) { + if let Ok(status) = fs::read_to_string(sub.path().join("status")) { + if status.trim() == "connected" { + return true; + } + } + } + } + } + } + + false +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +/// Checks whether any process on the host has an open handle to an NVIDIA GPU +/// device (`/dev/nvidia*`). This is a host-wide check across ALL NVIDIA GPUs, +/// not scoped to a single PCI address. Returns a list of (pid, comm) pairs. +pub(crate) fn check_active_gpu_processes() -> std::io::Result> { + use std::fs; + + let mut result = Vec::new(); + + let proc_dir = match fs::read_dir("/proc") { + Ok(d) => d, + Err(e) => { + return Err(std::io::Error::new( + e.kind(), + format!( + "cannot scan /proc for active GPU processes: {e} — \ + refusing to unbind (fail-closed)" + ), + )); + } + }; + + for proc_entry in proc_dir.filter_map(Result::ok) { + let pid: u32 = match proc_entry.file_name().to_string_lossy().parse() { + Ok(p) => p, + Err(_) => continue, + }; + + let fd_dir = proc_entry.path().join("fd"); + let fds = match fs::read_dir(&fd_dir) { + Ok(d) => d, + Err(_) => continue, + }; + + for fd_entry in fds.filter_map(Result::ok) { + if let Ok(target) = fs::read_link(fd_entry.path()) { + if target.to_string_lossy().starts_with("/dev/nvidia") { + let comm = fs::read_to_string(format!("/proc/{pid}/comm")) + .unwrap_or_default() + .trim() + .to_string(); + result.push((pid, comm)); + break; + } + } + } + } + + Ok(result) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn check_active_gpu_processes() -> std::io::Result> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub(crate) fn 
check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool {
    // IOMMU support is considered on only when the kernel exposes iommu_groups
    // AND this specific device was placed into a group at enumeration time.
    let iommu_groups = sysfs.sys_kernel_iommu_groups();
    if !iommu_groups.is_dir() {
        return false;
    }
    sysfs
        .sys_bus_pci_devices()
        .join(pci_addr)
        .join("iommu_group")
        .exists()
}

#[cfg(not(target_os = "linux"))]
pub(crate) fn check_iommu_enabled(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool {
    false
}

/// Returns `true` when both VFIO kernel modules required for PCI passthrough
/// (`vfio_pci` and `vfio_iommu_type1`) are loaded, judged by their presence
/// under `/sys/module`.
#[cfg(target_os = "linux")]
pub(crate) fn check_vfio_modules_loaded(sysfs: &SysfsRoot) -> bool {
    sysfs.sys_module("vfio_pci").is_dir() && sysfs.sys_module("vfio_iommu_type1").is_dir()
}

#[cfg(not(target_os = "linux"))]
pub(crate) fn check_vfio_modules_loaded(_sysfs: &SysfsRoot) -> bool {
    false
}

/// Checks that the sysfs files a bind operation will write to are writable
/// by the current user: the device's `driver_override`, its `driver/unbind`
/// (only if currently bound), and the vfio-pci driver's `bind` file.
#[cfg(target_os = "linux")]
pub(crate) fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool {
    use nix::unistd::{AccessFlags, access};

    let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr);
    let driver_override = dev_dir.join("driver_override");
    let unbind = dev_dir.join("driver/unbind");
    let bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind");

    // access(2) with W_OK checks real (not effective) permissions.
    let writable = |path: &std::path::Path| -> bool { access(path, AccessFlags::W_OK).is_ok() };

    // An unbound device has no driver/unbind file; that is not a failure.
    let unbind_ok = !unbind.exists() || writable(&unbind);
    writable(&driver_override) && unbind_ok && writable(&bind)
}

#[cfg(not(target_os = "linux"))]
pub(crate) fn check_sysfs_permissions(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool {
    false
}

/// Resolves the driver currently bound to `pci_addr` by reading the
/// device's `driver` symlink; `None` when the device is unbound or the
/// link cannot be read.
#[cfg(target_os = "linux")]
pub(crate) fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option<String> {
    let driver_link = sysfs.sys_bus_pci_devices().join(pci_addr).join("driver");
    std::fs::read_link(&driver_link)
        .ok()
        .and_then(|target| target.file_name().map(|n| n.to_string_lossy().to_string()))
}

#[cfg(not(target_os = "linux"))]
pub(crate) fn current_driver(_sysfs: &SysfsRoot, _pci_addr: &str) -> Option<String> {
    None
}

/// Nvidia kernel modules that hold internal references to GPU devices and can
/// prevent a clean unbind. Unloaded in order (most-dependent first).
#[cfg(target_os = "linux")]
const NVIDIA_SUBMODULES: &[&str] = &["nvidia_uvm", "nvidia_drm", "nvidia_modeset"];

/// Timeout for nvidia prep commands (nvidia-smi, modprobe). These commands
/// can wedge if the nvidia driver is in a bad state.
#[cfg(target_os = "linux")]
const NVIDIA_PREP_TIMEOUT: Duration = Duration::from_secs(15);

/// Run a command with a timeout. Returns `Some(ExitStatus)` on success,
/// `None` on timeout or spawn failure. On timeout, kills the child and
/// drops it without calling `wait()` (same D-state safety as sysfs writes).
#[cfg(target_os = "linux")]
fn run_with_timeout(
    mut cmd: std::process::Command,
    timeout: Duration,
) -> Option<std::process::ExitStatus> {
    use std::thread;

    let mut child = match cmd.spawn() {
        Ok(c) => c,
        Err(_) => return None,
    };

    // Poll try_wait() rather than blocking in wait(): a child stuck in
    // uninterruptible sleep (D state) would otherwise block us forever.
    let poll_interval = Duration::from_millis(100);
    let start = std::time::Instant::now();

    loop {
        match child.try_wait() {
            Ok(Some(status)) => return Some(status),
            Ok(None) => {
                if start.elapsed() > timeout {
                    let _ = child.kill();
                    // Deliberately no wait() after kill: if the child is in
                    // D state the reap would hang. The zombie is accepted.
                    drop(child);
                    return None;
                }
                thread::sleep(poll_interval);
            }
            Err(_) => return None,
        }
    }
}

/// Best-effort preparation of the nvidia driver before a raw sysfs unbind.
///
/// Reduces the probability of the nvidia unbind deadlock by:
/// 1. Disabling persistence mode (nvidia-persistenced holds device refs).
/// 2. Unloading nvidia submodules that keep internal references open.
///
/// All commands run with a timeout — if `nvidia-smi` or `modprobe` hangs
/// (which can happen when the nvidia driver is in a bad state), the parent
/// process is not blocked. Failures are logged but not fatal.
#[cfg(target_os = "linux")]
fn nvidia_pre_unbind_prep(pci_addr: &str) {
    use std::process::{Command, Stdio};

    // 1. Disable persistence mode via nvidia-smi (if available).
+ let mut cmd = Command::new("nvidia-smi"); + cmd.args(["-i", pci_addr, "-pm", "0"]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: disabled nvidia persistence mode"); + } + None => { + eprintln!( + "GPU {pci_addr}: nvidia-smi timed out after {:.0}s — skipping persistence mode", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + + // 2. Unload nvidia submodules that hold device references. + // This is best-effort — modules may be in use by other GPUs. + for module in NVIDIA_SUBMODULES { + let mut cmd = Command::new("modprobe"); + cmd.args(["-r", module]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: unloaded {module}"); + } + None => { + eprintln!( + "GPU {pci_addr}: modprobe -r {module} timed out after {:.0}s", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + } +} + +#[cfg(target_os = "linux")] +pub(crate) fn bind_gpu_to_vfio( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result { + validate_pci_addr(pci_addr)?; + let drv = current_driver(sysfs, pci_addr); + + if drv.as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if drv.is_some() { + let is_nvidia = drv.as_deref() == Some("nvidia"); + if is_nvidia && sysfs.is_real_sysfs() { + nvidia_pre_unbind_prep(pci_addr); + + // nvidia_pre_unbind_prep may cascade-remove the nvidia module when + // all submodules are unloaded, which automatically unbinds the device. + // Re-check before attempting the sysfs unbind write. 
            if current_driver(sysfs, pci_addr).is_none() {
                eprintln!("GPU {pci_addr}: device already unbound after nvidia module cleanup");
            // NOTE(review): if the cascade-removal left the device auto-bound to
            // vfio-pci, this returns "vfio-pci" even though `drv` observed the
            // original driver (e.g. "nvidia") above — the caller then records
            // "vfio-pci" as the original driver and shutdown restore will not
            // rebind nvidia. Confirm whether that is intended.
            } else if current_driver(sysfs, pci_addr).as_deref() == Some("vfio-pci") {
                return Ok("vfio-pci".to_string());
            }
        }

        // Only attempt the sysfs unbind if a driver is still bound.
        if current_driver(sysfs, pci_addr).is_some() {
            let unbind = dev_dir.join("driver/unbind");
            let unbind_result = sysfs.write_sysfs(&unbind, pci_addr);

            if let Err(ref e) = unbind_result {
                if e.kind() == std::io::ErrorKind::TimedOut {
                    // The nvidia unbind deadlock can complete the unbind at the
                    // hardware level while the syscall never returns to userspace.
                    // Check if the device is actually unbound despite the timeout.
                    if current_driver(sysfs, pci_addr).is_none() {
                        eprintln!(
                            "GPU {pci_addr}: sysfs unbind timed out but device is unbound — \
                             continuing (zombie subprocess may linger until reboot)"
                        );
                    } else {
                        // Timed out AND still bound: nothing more can be done
                        // safely from userspace; surface a hard error.
                        return Err(std::io::Error::new(
                            std::io::ErrorKind::TimedOut,
                            format!(
                                "Failed to unbind {pci_addr}: timed out and device is still \
                                 bound to {}. 
A reboot may be required.", + drv.as_deref().unwrap_or("unknown"), + ), + )); + } + } else { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } + } + } + + let driver_override = dev_dir.join("driver_override"); + if let Err(e) = sysfs.write_sysfs(&driver_override, "vfio-pci") { + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to write driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + + let vfio_bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + if let Err(e) = sysfs.write_sysfs(&vfio_bind, pci_addr) { + let _ = sysfs.write_sysfs(&driver_override, ""); + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind to vfio-pci at {path}{hint} — is the vfio-pci module loaded?", + path = vfio_bind.display() + ), + )); + } + + Ok(drv.unwrap_or_default()) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn bind_gpu_to_vfio( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result { + Ok(String::new()) +} + +#[cfg(target_os = "linux")] +pub(crate) fn rebind_gpu_to_original( + sysfs: &SysfsRoot, + pci_addr: &str, + original_driver: &str, +) -> Result<(), std::io::Error> { + validate_pci_addr(pci_addr)?; + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if current_driver(sysfs, 
pci_addr).is_some() { + let unbind = dev_dir.join("driver/unbind"); + sysfs.write_sysfs(&unbind, pci_addr).map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + ) + })?; + } + + let driver_override = dev_dir.join("driver_override"); + sysfs.write_sysfs(&driver_override, "").map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to clear driver_override at {path}{hint}", + path = driver_override.display() + ), + ) + })?; + + if !original_driver.is_empty() && original_driver != "none" { + let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); + sysfs.write_sysfs(&bind, pci_addr).map_err(|e| { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + std::io::Error::new( + e.kind(), + format!( + "Failed to rebind to {original_driver} at {path}{hint}", + path = bind.display() + ), + ) + })?; + } else { + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + let _ = sysfs.write_sysfs(&rescan, "1"); + } + + Ok(()) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn rebind_gpu_to_original( + _sysfs: &SysfsRoot, + _pci_addr: &str, + _original_driver: &str, +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +pub(crate) fn iommu_group_peers( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result, std::io::Error> { + validate_pci_addr(pci_addr)?; + let iommu_devices = sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group/devices"); + + let entries = match std::fs::read_dir(&iommu_devices) { + Ok(e) => e, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(vec![]), + Err(e) => return Err(e), + }; + + let mut peers = Vec::new(); + for entry in 
entries.filter_map(Result::ok) {
        let name = entry.file_name().to_string_lossy().to_string();
        // The device itself appears in its own group listing; report peers only.
        if name != pci_addr {
            peers.push(name);
        }
    }
    Ok(peers)
}

#[cfg(not(target_os = "linux"))]
pub(crate) fn iommu_group_peers(
    _sysfs: &SysfsRoot,
    _pci_addr: &str,
) -> Result<Vec<String>, std::io::Error> {
    Ok(vec![])
}

/// Binds every IOMMU-group peer of `pci_addr` to vfio-pci.
///
/// VFIO requires the whole IOMMU group to be on vfio-pci before the group
/// can be handed to a guest. Returns the list of (peer address, original
/// driver) pairs that were actually moved, for later restoration — peers
/// already on vfio-pci are not recorded.
///
/// On the first peer failure, peers bound so far are rolled back
/// (best-effort) and the error is returned.
#[cfg(target_os = "linux")]
pub(crate) fn bind_iommu_group_peers(
    sysfs: &SysfsRoot,
    pci_addr: &str,
) -> Result<Vec<(String, String)>, std::io::Error> {
    let peers = iommu_group_peers(sysfs, pci_addr)?;
    let mut restore_list = Vec::new();

    for peer in peers {
        match bind_gpu_to_vfio(sysfs, &peer) {
            Ok(original) => {
                if original != "vfio-pci" {
                    restore_list.push((peer, original));
                }
            }
            Err(e) => {
                // Roll back the peers we already moved; ignore rollback
                // errors — the bind error below is the primary failure.
                let _ = rebind_iommu_group_peers(sysfs, &restore_list);
                return Err(std::io::Error::new(
                    e.kind(),
                    format!(
                        "Failed to bind IOMMU peer {peer}: {e}. Rolled back {} peer(s).",
                        restore_list.len()
                    ),
                ));
            }
        }
    }

    Ok(restore_list)
}

#[cfg(not(target_os = "linux"))]
pub(crate) fn bind_iommu_group_peers(
    _sysfs: &SysfsRoot,
    _pci_addr: &str,
) -> Result<Vec<(String, String)>, std::io::Error> {
    Ok(vec![])
}

/// Restores each peer in `peers` to its recorded original driver.
///
/// Attempts every peer even after a failure, so one stuck device does not
/// leave the rest on vfio-pci; only the first error encountered is returned.
#[cfg(target_os = "linux")]
pub(crate) fn rebind_iommu_group_peers(
    sysfs: &SysfsRoot,
    peers: &[(String, String)],
) -> Result<(), std::io::Error> {
    let mut first_err = None;
    for (peer_addr, original_driver) in peers {
        if let Err(e) = rebind_gpu_to_original(sysfs, peer_addr, original_driver) {
            if first_err.is_none() {
                first_err = Some(e);
            }
        }
    }
    match first_err {
        Some(e) => Err(e),
        None => Ok(()),
    }
}

#[cfg(not(target_os = "linux"))]
pub(crate) fn rebind_iommu_group_peers(
    _sysfs: &SysfsRoot,
    _peers: &[(String, String)],
) -> Result<(), std::io::Error> {
    Ok(())
}

/// A group is "clean" when every member (including the device itself,
/// which `iommu_group_peers` excludes but which callers check separately)
/// is bound to vfio-pci. Read errors count as not clean (fail-closed).
#[cfg(target_os = "linux")]
fn is_iommu_group_clean(sysfs: &SysfsRoot, pci_addr: &str) -> bool {
    let peers = match iommu_group_peers(sysfs, pci_addr) {
        Ok(p) => p,
        Err(_) => return false,
    };
    peers
        .iter()
        .all(|peer| 
current_driver(sysfs, peer).as_deref() == Some("vfio-pci"))
}

#[cfg(not(target_os = "linux"))]
fn is_iommu_group_clean(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool {
    false
}

/// Captures the bind state for a GPU so it can be restored on shutdown.
#[derive(Debug)]
pub struct GpuBindState {
    /// PCI address of the GPU that was bound.
    pub pci_addr: String,
    /// Driver the GPU was on before binding (e.g. "nvidia").
    pub original_driver: String,
    /// IOMMU group peers that were rebound, with their original drivers.
    pub peer_binds: Vec<(String, String)>,
    /// Whether this instance performed the bind (false if GPU was already on vfio-pci).
    pub did_bind: bool,
}

impl GpuBindState {
    /// Restore the GPU and its IOMMU peers to their original drivers.
    pub fn restore(&self) -> Result<(), std::io::Error> {
        self.restore_with_sysfs(&SysfsRoot::default())
    }

    /// Test-injectable variant of [`restore`](Self::restore).
    ///
    /// No-op when `did_bind` is false — this instance did not move the
    /// device, so there is nothing to undo. The GPU itself is rebound
    /// first, then its IOMMU peers.
    pub(crate) fn restore_with_sysfs(&self, sysfs: &SysfsRoot) -> Result<(), std::io::Error> {
        if !self.did_bind {
            return Ok(());
        }
        eprintln!(
            "GPU: rebinding {} to {}",
            self.pci_addr, self.original_driver
        );
        rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver)?;
        rebind_iommu_group_peers(sysfs, &self.peer_binds)?;
        Ok(())
    }
}

/// RAII guard that restores GPU driver binding when dropped.
///
/// Ensures the GPU is rebound to its original driver on normal exit,
/// early return (?), or panic. Cannot protect against SIGKILL.
pub struct GpuBindGuard {
    // None once disarmed; Drop then does nothing.
    state: Option<GpuBindState>,
}

impl GpuBindGuard {
    /// Wraps `state` so it is restored when the guard is dropped.
    pub fn new(state: GpuBindState) -> Self {
        Self { state: Some(state) }
    }

    /// Take the state out, preventing restore on drop.
    pub fn disarm(&mut self) -> Option<GpuBindState> {
        self.state.take()
    }

    /// Get the PCI address of the bound GPU, if any.
    pub fn pci_addr(&self) -> Option<&str> {
        self.state.as_ref().map(|s| s.pci_addr.as_str())
    }
}

impl Drop for GpuBindGuard {
    fn drop(&mut self) {
        if let Some(ref state) = self.state {
            eprintln!(
                "GPU: restoring {} to {} (cleanup)",
                state.pci_addr, state.original_driver
            );
            // Drop cannot propagate errors; log and continue unwinding.
            if let Err(e) = state.restore() {
                eprintln!("GPU: restore failed: {e}");
            }
        }
    }
}

/// Prepare a GPU for VFIO passthrough: run safety checks, select, and bind.
///
/// When `requested_bdf` is Some, targets that specific device.
/// When None (auto mode), selects the best available GPU.
///
/// All safety checks are hard failures — if any check fails, this returns
/// an error and does not bind anything.
pub fn prepare_gpu_for_passthrough(
    requested_bdf: Option<&str>,
) -> Result<GpuBindState, std::io::Error> {
    prepare_gpu_with_sysfs(&SysfsRoot::default(), requested_bdf)
}

/// Test-injectable variant of [`prepare_gpu_for_passthrough`]: dispatches
/// to the specific-device or auto-selection path.
pub(crate) fn prepare_gpu_with_sysfs(
    sysfs: &SysfsRoot,
    requested_bdf: Option<&str>,
) -> Result<GpuBindState, std::io::Error> {
    match requested_bdf {
        Some(bdf) => prepare_specific_gpu(sysfs, bdf),
        None => prepare_auto_gpu(sysfs),
    }
}

/// Validates that `bdf` names an existing NVIDIA display-class PCI device,
/// then runs the safety checks and binds it (and its IOMMU peers) to
/// vfio-pci. Checks run in order: existence, vendor, class, then the
/// already-bound fast path and per-device safety checks.
fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result<GpuBindState, std::io::Error> {
    validate_pci_addr(bdf)?;

    let dev_dir = sysfs.sys_bus_pci_devices().join(bdf);
    if !dev_dir.exists() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::NotFound,
            format!("PCI device {bdf} not found in sysfs"),
        ));
    }

    let vendor = std::fs::read_to_string(dev_dir.join("vendor"))
        .map(|v| v.trim().to_lowercase())
        .unwrap_or_default();
    if vendor != NVIDIA_VENDOR_ID {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!("PCI device {bdf} is not an NVIDIA device (vendor: {vendor})"),
        ));
    }
    // PCI class 0x03xxxx = display controller.
    let class = std::fs::read_to_string(dev_dir.join("class"))
        .map(|c| c.trim().to_lowercase())
        .unwrap_or_default();
    if !class.starts_with("0x03") {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!("PCI device {bdf} is not a GPU (class: {class})"),
        ));
+ } + + if current_driver(sysfs, bdf).as_deref() == Some("vfio-pci") && is_iommu_group_clean(sysfs, bdf) + { + return Ok(GpuBindState { + pci_addr: bdf.to_string(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }); + } + + if check_display_attached(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: has active display outputs"), + )); + } + + let procs = check_active_gpu_processes().map_err(|e| { + std::io::Error::new( + e.kind(), + format!("GPU {bdf}: cannot verify GPU is idle — {e}"), + ) + })?; + if !procs.is_empty() { + let desc: Vec = procs + .iter() + .map(|(pid, comm)| format!("{pid} ({comm})")) + .collect(); + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: in use by PIDs: {}", desc.join(", ")), + )); + } + + if !check_iommu_enabled(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: IOMMU not enabled or device has no IOMMU group"), + )); + } + + if !check_vfio_modules_loaded(sysfs) { + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!("GPU {bdf}: VFIO kernel modules not loaded"), + )); + } + + if !check_sysfs_permissions(sysfs, bdf) { + return Err(std::io::Error::new( + std::io::ErrorKind::PermissionDenied, + format!("GPU {bdf}: insufficient sysfs permissions — run as root"), + )); + } + + let original_driver = bind_gpu_to_vfio(sysfs, bdf)?; + let peer_binds = match bind_iommu_group_peers(sysfs, bdf) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, bdf, &original_driver); + return Err(e); + } + }; + + Ok(GpuBindState { + pci_addr: bdf.to_string(), + original_driver, + peer_binds, + did_bind: true, + }) +} + +fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { + let pci_dir = sysfs.sys_bus_pci_devices(); + let entries = std::fs::read_dir(&pci_dir).map_err(|e| { + std::io::Error::new(e.kind(), format!("cannot read {}: {e}", pci_dir.display())) 
+ })?; + + let mut nvidia_addrs = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + let vendor = match std::fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + let class = match std::fs::read_to_string(dev_path.join("class")) { + Ok(c) => c.trim().to_lowercase(), + Err(_) => continue, + }; + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") { + nvidia_addrs.push(entry.file_name().to_string_lossy().to_string()); + } + } + + if nvidia_addrs.is_empty() { + return Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + "no NVIDIA PCI device found", + )); + } + + nvidia_addrs.sort(); + + for addr in &nvidia_addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") + && is_iommu_group_clean(sysfs, addr) + { + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }); + } + } + + let mut blocked: Vec<(String, String)> = Vec::new(); + let active_procs = check_active_gpu_processes() + .map_err(|e| std::io::Error::new(e.kind(), format!("cannot verify GPUs are idle — {e}")))?; + + for addr in &nvidia_addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") { + blocked.push((addr.clone(), "IOMMU group not clean".to_string())); + continue; + } + + if check_display_attached(sysfs, addr) { + blocked.push((addr.clone(), "has active display outputs".to_string())); + continue; + } + + if !active_procs.is_empty() { + let desc: Vec = active_procs + .iter() + .map(|(pid, comm)| format!("{pid} ({comm})")) + .collect(); + blocked.push((addr.clone(), format!("in use by PIDs: {}", desc.join(", ")))); + continue; + } + + if !check_iommu_enabled(sysfs, addr) { + blocked.push((addr.clone(), "IOMMU not enabled".to_string())); + continue; + } + + if !check_vfio_modules_loaded(sysfs) { + blocked.push((addr.clone(), "VFIO modules not loaded".to_string())); + continue; + } + + 
if !check_sysfs_permissions(sysfs, addr) { + blocked.push((addr.clone(), "insufficient sysfs permissions".to_string())); + continue; + } + + eprintln!("GPU: binding {addr} for VFIO passthrough"); + let original_driver = bind_gpu_to_vfio(sysfs, addr)?; + let peer_binds = match bind_iommu_group_peers(sysfs, addr) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, addr, &original_driver); + return Err(e); + } + }; + + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver, + peer_binds, + did_bind: true, + }); + } + + let mut msg = + String::from("GPU passthrough blocked by safety checks.\n\n Detected devices:\n"); + for (addr, reason) in &blocked { + msg.push_str(&format!(" {addr}: {reason}\n")); + } + msg.push_str("\n No GPU is available for passthrough."); + + Err(std::io::Error::new(std::io::ErrorKind::Other, msg)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::path::Path; + + #[test] + fn passthrough_gate_is_false_without_env_var() { + // SAFETY: test runs single-threaded; no other thread reads this var. + unsafe { std::env::remove_var("OPENSHELL_VM_GPU_E2E") }; + assert!( + !nvidia_gpu_available_for_vm_passthrough(), + "gate must return false when OPENSHELL_VM_GPU_E2E is unset" + ); + } + + #[test] + fn probe_returns_no_device_or_readiness_on_typical_ci() { + let results = probe_host_nvidia_vfio_readiness(); + + #[cfg(not(target_os = "linux"))] + assert!(results.is_empty(), "non-Linux should return empty Vec"); + + #[cfg(target_os = "linux")] + { + // CI machines typically have no NVIDIA GPU bound to vfio-pci. + // Accept an empty list or any per-device readiness state. 
+ for (addr, state) in &results { + assert!(!addr.is_empty(), "PCI address should not be empty"); + assert!( + matches!( + state, + HostNvidiaVfioReadiness::BoundToNvidia + | HostNvidiaVfioReadiness::VfioBoundReady + | HostNvidiaVfioReadiness::VfioBoundDirtyGroup + ), + "unexpected per-device readiness state for {addr}: {state:?}" + ); + } + } + } + + #[test] + fn display_impl_is_meaningful() { + let states = [ + HostNvidiaVfioReadiness::UnsupportedPlatform, + HostNvidiaVfioReadiness::NoNvidiaDevice, + HostNvidiaVfioReadiness::BoundToNvidia, + HostNvidiaVfioReadiness::VfioBoundReady, + HostNvidiaVfioReadiness::VfioBoundDirtyGroup, + HostNvidiaVfioReadiness::MixedVfioAndOther, + ]; + for state in &states { + let msg = format!("{state}"); + assert!(!msg.is_empty(), "Display for {state:?} should not be empty"); + } + } + + fn mock_pci_device(root: &Path, pci_addr: &str, vendor: &str, driver: Option<&str>) { + use std::fs; + let dev_dir = root.join("sys/bus/pci/devices").join(pci_addr); + fs::create_dir_all(&dev_dir).unwrap(); + fs::write(dev_dir.join("vendor"), vendor).unwrap(); + fs::write(dev_dir.join("class"), "0x030000").unwrap(); + if let Some(drv) = driver { + let driver_dir = root.join("sys/bus/pci/drivers").join(drv); + fs::create_dir_all(&driver_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink(&driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(dev_dir.join("driver_override"), "").unwrap(); + } + + fn mock_drm_card(root: &Path, card: &str, pci_addr: &str, outputs: &[(&str, &str)]) { + use std::fs; + let card_dir = root.join("sys/class/drm").join(card); + fs::create_dir_all(&card_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink( + root.join("sys/bus/pci/devices").join(pci_addr), + card_dir.join("device"), + ) + .unwrap(); + for (output, status) in outputs { + let out_dir = card_dir.join(format!("{card}-{output}")); + fs::create_dir_all(&out_dir).unwrap(); + fs::write(out_dir.join("status"), status).unwrap(); + } + } + + fn 
mock_iommu_group(root: &Path, group_id: u32, members: &[&str]) { + use std::fs; + let group_dir = root.join(format!("sys/kernel/iommu_groups/{group_id}/devices")); + fs::create_dir_all(&group_dir).unwrap(); + for member in members { + let dev_dir = root.join("sys/bus/pci/devices").join(member); + fs::create_dir_all(&dev_dir).unwrap(); + #[cfg(unix)] + { + let iommu_group_target = root.join(format!("sys/kernel/iommu_groups/{group_id}")); + let _ = + std::os::unix::fs::symlink(&iommu_group_target, dev_dir.join("iommu_group")); + let _ = std::os::unix::fs::symlink(&dev_dir, group_dir.join(member)); + } + } + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_detects_active_framebuffer() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + assert!(check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_on_headless() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_no_drm_card() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_fails_without_groups_dir() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", 
"0x10de", None); + assert!(!check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_passes_with_group() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + assert!(check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_loaded_true() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + assert!(check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_missing() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + assert!(!check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_writable() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + fs::write(bind_dir.join("bind"), "").unwrap(); + assert!(check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_driver_override() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let driver_override = root + .path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"); + 
fs::set_permissions(&driver_override, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_bind() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + let bind_path = bind_dir.join("bind"); + fs::write(&bind_path, "").unwrap(); + fs::set_permissions(&bind_path, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + fn mock_bindable_gpu(root: &Path, pci_addr: &str) { + mock_pci_device(root, pci_addr, "0x10de", Some("nvidia")); + let drv_unbind = root.join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + mock_iommu_group(root, 15, &[pci_addr]); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_gpu_writes_correct_sysfs_paths() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let unbind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/nvidia/unbind")).unwrap(); + assert_eq!(unbind_content, "0000:41:00.0"); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + + let bind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/vfio-pci/bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + 
#[cfg(target_os = "linux")] + fn bind_returns_original_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_noop_when_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + + let nvidia_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::create_dir_all(nvidia_unbind.parent().unwrap()).unwrap(); + fs::write(&nvidia_unbind, "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "vfio-pci"); + + let unbind_content = fs::read_to_string(&nvidia_unbind).unwrap(); + assert_eq!( + unbind_content, "", + "nvidia unbind should NOT have been written" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_clears_driver_override() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + 
rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_writes_to_original_driver_bind() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_listed_correctly() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let peers = iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(peers, vec!["0000:41:00.1"]); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_bound_together() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", 
Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + restore, + vec![("0000:41:00.1".to_string(), "nvidia".to_string())] + ); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.1/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + } + + #[test] + #[cfg(target_os = "linux")] + fn peer_restore_rebinds_to_original() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.1"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_iommu_group_peers(&sysfs, &restore).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + fn mock_multi_gpu_host(root: &Path) { + // GPU 0: on nvidia, has display attached + mock_bindable_gpu(root, "0000:41:00.0"); + mock_drm_card(root, "card0", 
"0000:41:00.0", &[("DP-1", "connected")]); + + // GPU 1: on nvidia, idle (no display, no processes) + mock_bindable_gpu(root, "0000:42:00.0"); + + // GPU 2: already on vfio-pci, clean IOMMU group + mock_pci_device(root, "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root, 17, &["0000:43:00.0"]); + + fs::create_dir_all(root.join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.join("sys/module/vfio_iommu_type1")).unwrap(); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_prefers_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_multi_gpu_host(root.path()); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:43:00.0"); + assert!(!state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_selects_idle_gpu_when_no_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:42:00.0"); + assert!(state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_when_all_blocked() { 
+ let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card1", + "0000:42:00.0", + &[("HDMI-1", "connected")], + ); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("display"), + "error should mention display: {msg}" + ); + assert!( + msg.contains("0000:41:00.0"), + "error should list first GPU: {msg}" + ); + assert!( + msg.contains("0000:42:00.0"), + "error should list second GPU: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_on_empty_host() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + fs::create_dir_all(root.path().join("sys/bus/pci/devices")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + assert!( + err.to_string().contains("no NVIDIA PCI device found"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_binds_target() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert_eq!(state.pci_addr, "0000:41:00.0"); + 
assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_validates_format() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("invalid")).unwrap_err(); + assert!( + err.to_string().contains("invalid PCI address"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_display_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("display"), + "error should mention display: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_iommu_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("IOMMU"), + "error should mention IOMMU: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_round_trips() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let 
vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:43:00.0".to_string(), + original_driver: "vfio-pci".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + state.restore_with_sysfs(&sysfs).unwrap(); + } + + #[test] + fn guard_has_pci_addr() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let guard = GpuBindGuard::new(state); + assert_eq!(guard.pci_addr(), Some("0000:41:00.0")); + } + + #[test] + fn guard_disarm_returns_state() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let mut guard = GpuBindGuard::new(state); + let taken = guard.disarm(); + assert!(taken.is_some()); + assert_eq!(guard.pci_addr(), None); + } + + #[test] + fn guard_disarm_prevents_double_restore() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + }; + let mut guard = 
GpuBindGuard::new(state); + let _ = guard.disarm(); + let second = guard.disarm(); + assert!(second.is_none()); + } + + #[test] + fn guard_drop_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let guard = GpuBindGuard::new(state); + drop(guard); + } + + #[test] + fn guard_drop_on_panic_is_safe() { + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + }; + let _guard = GpuBindGuard::new(state); + panic!("test panic"); + })); + assert!(result.is_err()); + } +} diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 2b78a7669..9b70b32cf 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -14,9 +14,11 @@ #![allow(unsafe_code)] +pub mod backend; mod embedded; mod exec; mod ffi; +pub mod gpu_passthrough; mod health; use std::ffi::CString; @@ -25,9 +27,10 @@ use std::ptr; use std::time::Instant; pub use exec::{ - VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_capture, exec_running_vm, recover_corrupt_kine_db, - reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, + VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, VsockConnectMode, acquire_rootfs_lock, + clear_vm_runtime_state, ensure_vm_not_running, exec_capture, exec_running_vm, + recover_corrupt_kine_db, reset_runtime_state, vm_exec_socket_path, vm_state_path, + write_vm_runtime_state, }; // ── Error type ───────────────────────────────────────────────────────── @@ -45,6 +48,19 @@ pub enum VmError { )] RootfsNotFound { path: String }, + /// The GPU rootfs directory does not exist. 
+    #[error(
+        "GPU rootfs not found: {path}\n\
+         The --gpu flag requires a rootfs built with GPU support (NVIDIA drivers,\n\
+         nvidia-container-toolkit, and GPU manifests).\n\
+         Build one with:\n\
+         \x20 ./crates/openshell-vm/scripts/build-rootfs.sh --gpu <output-dir>\n\
+         Then either:\n\
+         \x20 - Copy it to: {path}\n\
+         \x20 - Or use: openshell-vm --gpu --rootfs <path>"
+    )]
+    GpuRootfsNotFound { path: String },
+
     /// A path contained invalid UTF-8.
     #[error("path is not valid UTF-8: {0}")]
     InvalidPath(String),
@@ -98,6 +114,18 @@ fn check(ret: i32, func: &'static str) -> Result<(), VmError> {
 
 // ── Configuration ──────────────────────────────────────────────────────
 
+/// Hypervisor backend selection.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum VmBackendChoice {
+    /// Auto-select: cloud-hypervisor when a VFIO device is configured, libkrun otherwise.
+    #[default]
+    Auto,
+    /// Force the libkrun backend.
+    Libkrun,
+    /// Force the cloud-hypervisor backend (even without GPU/VFIO).
+    CloudHypervisor,
+}
+
 /// Networking backend for the microVM.
 #[derive(Debug, Clone)]
 pub enum NetBackend {
@@ -202,6 +230,16 @@ pub struct VmConfig {
 
     /// Optional host-backed raw block image for mutable guest state.
     pub state_disk: Option<StateDiskConfig>,
+
+    /// Whether GPU passthrough is enabled for this VM.
+    pub gpu_enabled: bool,
+
+    /// VFIO PCI device address for GPU passthrough (e.g. `0000:41:00.0`).
+    /// When set, the cloud-hypervisor backend is used instead of libkrun.
+    pub vfio_device: Option<String>,
+
+    /// Hypervisor backend override. Defaults to [`VmBackendChoice::Auto`].
+    pub backend: VmBackendChoice,
 }
 
 impl VmConfig {
@@ -245,6 +283,9 @@ impl VmConfig {
             reset: false,
             gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"),
             state_disk: Some(state_disk),
+            gpu_enabled: false,
+            vfio_device: None,
+            backend: VmBackendChoice::Auto,
         }
     }
 }
@@ -277,6 +318,38 @@ pub fn named_rootfs_dir(instance_name: &str) -> Result<PathBuf, VmError> {
         .join("rootfs"))
 }
 
+/// Resolve the GPU rootfs path for a named instance.
+///
+/// Layout: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances/{name}/rootfs-gpu`
+///
+/// The GPU rootfs is built separately with `build-rootfs.sh --gpu` and is
+/// never embedded (too large with NVIDIA drivers). If it doesn't exist,
+/// callers should return [`VmError::GpuRootfsNotFound`].
+pub fn named_gpu_rootfs_dir(instance_name: &str) -> Result<PathBuf, VmError> {
+    let name = sanitize_instance_name(instance_name)?;
+    let base = openshell_bootstrap::paths::openshell_vm_base_dir()
+        .map_err(|e| VmError::RuntimeState(format!("resolve openshell-vm base dir: {e}")))?;
+    Ok(base
+        .join(env!("CARGO_PKG_VERSION"))
+        .join("instances")
+        .join(name)
+        .join("rootfs-gpu"))
+}
+
+/// Ensure a GPU rootfs exists for the named instance.
+///
+/// Unlike [`ensure_named_rootfs`], there is no embedded GPU rootfs to
+/// extract — the user must pre-build it with `build-rootfs.sh --gpu`.
+pub fn ensure_gpu_rootfs(instance_name: &str) -> Result<PathBuf, VmError> {
+    let gpu_rootfs = named_gpu_rootfs_dir(instance_name)?;
+    if gpu_rootfs.is_dir() {
+        return Ok(gpu_rootfs);
+    }
+    Err(VmError::GpuRootfsNotFound {
+        path: gpu_rootfs.display().to_string(),
+    })
+}
+
 /// Ensure a named instance rootfs exists, extracting from the embedded
 /// rootfs tarball on first use.
 ///
@@ -365,7 +438,9 @@ fn sanitize_instance_name(name: &str) -> Result<String, VmError> {
 /// Build a null-terminated C string array from a slice of strings.
 ///
 /// Returns both the `CString` owners (to keep them alive) and the pointer array.
-fn c_string_array(strings: &[&str]) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { +pub(crate) fn c_string_array( + strings: &[&str], +) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { let owned: Vec = strings .iter() .map(|s| CString::new(*s)) @@ -570,7 +645,7 @@ fn extract_json_string(json: &str, key: &str) -> Option { map.get(key)?.as_str().map(ToOwned::to_owned) } -fn clamp_log_level(level: u32) -> u32 { +pub(crate) fn clamp_log_level(level: u32) -> u32 { match level { 0 => ffi::KRUN_LOG_LEVEL_OFF, 1 => ffi::KRUN_LOG_LEVEL_ERROR, @@ -581,258 +656,29 @@ fn clamp_log_level(level: u32) -> u32 { } } -struct VmContext { - krun: &'static ffi::LibKrun, - ctx_id: u32, -} - -impl VmContext { - fn create(log_level: u32) -> Result { - let krun = ffi::libkrun()?; - unsafe { - check( - (krun.krun_init_log)( - ffi::KRUN_LOG_TARGET_DEFAULT, - clamp_log_level(log_level), - ffi::KRUN_LOG_STYLE_AUTO, - ffi::KRUN_LOG_OPTION_NO_ENV, - ), - "krun_init_log", - )?; - } - - let ctx_id = unsafe { (krun.krun_create_ctx)() }; - if ctx_id < 0 { - return Err(VmError::Krun { - func: "krun_create_ctx", - code: ctx_id, - }); - } - - Ok(Self { - krun, - ctx_id: ctx_id as u32, - }) - } - - fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), - "krun_set_vm_config", - ) - } - } - - fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { - let rootfs_c = path_to_cstring(rootfs)?; - unsafe { - check( - (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), - "krun_set_root", - ) - } - } - - fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { - let Some(add_disk3) = self.krun.krun_add_disk3 else { - return Err(VmError::HostSetup( - "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" - .to_string(), - )); - }; - - let block_id_c = CString::new(state_disk.block_id.as_str())?; - let disk_path_c = 
path_to_cstring(&state_disk.path)?; - unsafe { - check( - add_disk3( - self.ctx_id, - block_id_c.as_ptr(), - disk_path_c.as_ptr(), - ffi::KRUN_DISK_FORMAT_RAW, - false, - false, - state_disk_sync_mode(), - ), - "krun_add_disk3", - ) - } - } - - fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { - let workdir_c = CString::new(workdir)?; - unsafe { - check( - (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), - "krun_set_workdir", - ) - } - } - - fn disable_implicit_vsock(&self) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_disable_implicit_vsock)(self.ctx_id), - "krun_disable_implicit_vsock", - ) - } - } - - fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), - "krun_add_vsock", - ) - } - } - - #[cfg(target_os = "macos")] - fn add_net_unixgram( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - flags: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixgram)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - flags, - ), - "krun_add_net_unixgram", - ) - } - } - - #[allow(dead_code)] // FFI binding for future use (e.g. 
Linux networking) - fn add_net_unixstream( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixstream)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - 0, - ), - "krun_add_net_unixstream", - ) - } - } - - fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { - let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; - unsafe { - check( - (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), - "krun_set_port_map", - ) - } - } - - fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { - let socket_c = path_to_cstring(&port.socket_path)?; - unsafe { - check( - (self.krun.krun_add_vsock_port2)( - self.ctx_id, - port.port, - socket_c.as_ptr(), - port.listen, - ), - "krun_add_vsock_port2", - ) - } - } - - fn set_console_output(&self, path: &Path) -> Result<(), VmError> { - let console_c = path_to_cstring(path)?; - unsafe { - check( - (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), - "krun_set_console_output", - ) - } - } - - fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { - let exec_c = CString::new(exec_path)?; - let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); - let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; - let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); - let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; - - unsafe { - check( - (self.krun.krun_set_exec)( - self.ctx_id, - exec_c.as_ptr(), - argv_ptrs.as_ptr(), - env_ptrs.as_ptr(), - ), - "krun_set_exec", - ) - } - } - - fn start_enter(&self) -> i32 { - unsafe { (self.krun.krun_start_enter)(self.ctx_id) } - } -} - -impl Drop for VmContext { - fn drop(&mut self) { - unsafe { - let ret = 
(self.krun.krun_free_ctx)(self.ctx_id); - if ret < 0 { - eprintln!( - "warning: krun_free_ctx({}) failed with code {ret}", - self.ctx_id - ); - } - } - } -} - /// RAII guard that kills and waits on a gvproxy child process when dropped. /// /// This prevents orphaned gvproxy processes when early `?` returns in the /// launch function cause the child to be dropped before cleanup code runs. /// Call [`GvproxyGuard::disarm`] to take ownership of the child when it /// should outlive the guard (i.e., after a successful fork). -struct GvproxyGuard { +pub(crate) struct GvproxyGuard { child: Option, } impl GvproxyGuard { - fn new(child: std::process::Child) -> Self { + pub(crate) fn new(child: std::process::Child) -> Self { Self { child: Some(child) } } /// Take the child out of the guard, preventing it from being killed on drop. /// Use this after the launch is successful and the parent will manage cleanup. - fn disarm(&mut self) -> Option { + pub(crate) fn disarm(&mut self) -> Option { self.child.take() } /// Get the child's PID without disarming. - fn id(&self) -> Option { + pub(crate) fn id(&self) -> Option { self.child.as_ref().map(std::process::Child::id) } } @@ -852,7 +698,7 @@ impl Drop for GvproxyGuard { /// /// Sends a raw HTTP/1.1 POST request over the unix socket to avoid /// depending on `curl` being installed on the host. -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { +pub(crate) fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { use std::io::{Read, Write}; use std::os::unix::net::UnixStream; @@ -908,7 +754,7 @@ fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { /// runtime state. If the state file was deleted (e.g. the user ran /// `rm -rf` on the data directory), we fall back to killing any gvproxy /// process holding the target ports. 
-fn kill_stale_gvproxy(rootfs: &Path) {
+pub(crate) fn kill_stale_gvproxy(rootfs: &Path) {
     kill_stale_gvproxy_by_state(rootfs);
 }
 
@@ -929,7 +775,7 @@ fn kill_stale_gvproxy_by_state(rootfs: &Path) {
 ///
 /// Used as a fallback when the VM state file is missing (e.g. after the
 /// user deleted the data directory while a VM was running).
-fn kill_stale_gvproxy_by_port(port: u16) {
+pub(crate) fn kill_stale_gvproxy_by_port(port: u16) {
     // Use lsof to find PIDs listening on the target port.
     let output = std::process::Command::new("lsof")
         .args(["-ti", &format!(":{port}")])
@@ -1009,7 +855,7 @@ fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool {
     false
 }
 
-fn vm_rootfs_key(rootfs: &Path) -> String {
+pub(crate) fn vm_rootfs_key(rootfs: &Path) -> String {
     let name = rootfs
         .file_name()
         .and_then(|part| part.to_str())
@@ -1078,7 +924,7 @@ fn ensure_state_disk_image(state_disk: &StateDiskConfig) -> Result<(), VmError>
     Ok(())
 }
 
-fn state_disk_sync_mode() -> u32 {
+pub(crate) fn state_disk_sync_mode() -> u32 {
     #[cfg(target_os = "macos")]
     {
         ffi::KRUN_SYNC_RELAXED
@@ -1154,7 +1000,7 @@ fn secure_socket_base(subdir: &str) -> Result<PathBuf, VmError> {
     Ok(dir)
 }
 
-fn gvproxy_socket_dir(rootfs: &Path) -> Result<PathBuf, VmError> {
+pub(crate) fn gvproxy_socket_dir(rootfs: &Path) -> Result<PathBuf, VmError> {
     let dir = secure_socket_base("ovm-gv")?;
 
     // macOS unix socket path limit is tight (~104 bytes). Keep paths very short.
@@ -1162,7 +1008,30 @@ fn gvproxy_socket_dir(rootfs: &Path) -> Result<PathBuf, VmError> {
     Ok(dir.join(id))
 }
 
-fn gateway_host_port(config: &VmConfig) -> u16 {
+/// Validate that a VFIO PCI address matches the BDF format `DDDD:BB:DD.F`.
+///
+/// Rejects strings containing `/`, `..`, or non-hex characters to prevent
+/// path traversal when the address is interpolated into sysfs paths.
+fn validate_vfio_address(addr: &str) -> Result<(), VmError> {
+    let bytes = addr.as_bytes();
+    if bytes.len() == 12
+        && bytes[4] == b':'
+        && bytes[7] == b':'
+        && bytes[10] == b'.'
+        && bytes[..4].iter().all(u8::is_ascii_hexdigit)
+        && bytes[5..7].iter().all(u8::is_ascii_hexdigit)
+        && bytes[8..10].iter().all(u8::is_ascii_hexdigit)
+        && bytes[11].is_ascii_digit()
+        && bytes[11] <= b'7'
+    {
+        return Ok(());
+    }
+    Err(VmError::HostSetup(format!(
+        "invalid VFIO PCI address '{addr}': expected BDF format DDDD:BB:DD.F (e.g. 0000:41:00.0)"
+    )))
+}
+
+pub(crate) fn gateway_host_port(config: &VmConfig) -> u16 {
     config
         .port_map
         .first()
@@ -1171,7 +1040,7 @@ fn gateway_host_port(config: &VmConfig) -> u16 {
         .unwrap_or(DEFAULT_GATEWAY_PORT)
 }
 
-fn pick_gvproxy_ssh_port() -> Result<u16, VmError> {
+pub(crate) fn pick_gvproxy_ssh_port() -> Result<u16, VmError> {
     let listener = std::net::TcpListener::bind(("127.0.0.1", 0))
         .map_err(|e| VmError::HostSetup(format!("allocate gvproxy ssh port on localhost: {e}")))?;
     let port = listener
@@ -1182,7 +1051,7 @@ fn pick_gvproxy_ssh_port() -> Result<u16, VmError> {
     Ok(port)
 }
 
-fn path_to_cstring(path: &Path) -> Result<CString, VmError> {
+pub(crate) fn path_to_cstring(path: &Path) -> Result<CString, VmError> {
     let s = path
         .to_str()
         .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?;
@@ -1277,11 +1146,22 @@ pub fn launch(config: &VmConfig) -> Result {
             state_disk.path.display()
         )));
     }
-    if let Some(state_disk) = &config.state_disk {
+    let fresh_state_disk = if let Some(state_disk) = &config.state_disk {
+        let existed_before = state_disk.path.is_file();
         ensure_state_disk_image(state_disk)?;
+        !existed_before
+    } else {
+        false
+    };
+
+    // When the state disk is freshly created (deleted by user, --reset, or
+    // first boot), the VM will generate new PKI. Clear any cached host-side
+    // mTLS certs so `bootstrap_gateway` runs the cold-boot PKI fetch path
+    // instead of using stale certs that won't match the new VM CA.
+ if fresh_state_disk || config.reset { + clear_warm_boot_certs(&config.gateway_name); } - let launch_start = Instant::now(); eprintln!("rootfs: {}", config.rootfs.display()); if let Some(state_disk) = &config.state_disk { eprintln!( @@ -1292,8 +1172,34 @@ pub fn launch(config: &VmConfig) -> Result { } eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); - // The runtime is embedded in the binary and extracted on first use. - // Can be overridden via OPENSHELL_VM_RUNTIME_DIR for development. + raise_nofile_limit(); + + // ── Dispatch to the appropriate backend ───────────────────────── + + let use_chv = match config.backend { + VmBackendChoice::CloudHypervisor => true, + VmBackendChoice::Libkrun => false, + VmBackendChoice::Auto => config.gpu_enabled || config.vfio_device.is_some(), + }; + + if use_chv { + #[cfg(not(target_os = "linux"))] + return Err(VmError::HostSetup( + "cloud-hypervisor backend requires Linux with KVM".into(), + )); + + #[cfg(target_os = "linux")] + { + if let Some(ref addr) = config.vfio_device { + validate_vfio_address(addr)?; + } + let chv_backend = backend::cloud_hypervisor::CloudHypervisorBackend::new()?; + return backend::VmBackend::launch(&chv_backend, config); + } + } + + // libkrun path: resolve the embedded runtime bundle and load libkrun. + // Cloud-hypervisor resolves its own binaries in CloudHypervisorBackend::new(). let runtime_gvproxy = resolve_runtime_bundle()?; let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { VmError::HostSetup(format!( @@ -1302,413 +1208,12 @@ pub fn launch(config: &VmConfig) -> Result { )) })?; configure_runtime_loader_env(runtime_dir)?; - raise_nofile_limit(); - // ── Log runtime provenance ───────────────────────────────────── - // After configuring the loader, trigger library loading so that - // provenance is captured before we proceed with VM configuration. 
let _ = ffi::libkrun()?; log_runtime_provenance(runtime_dir); - // ── Configure the microVM ────────────────────────────────────── - - let vm = VmContext::create(config.log_level)?; - vm.set_vm_config(config.vcpus, config.mem_mib)?; - vm.set_root(&config.rootfs)?; - if let Some(state_disk) = &config.state_disk { - vm.add_state_disk(state_disk)?; - } - vm.set_workdir(&config.workdir)?; - - // Networking setup — use a drop guard so gvproxy is killed if we - // return early via `?` before reaching the parent's cleanup code. - let mut gvproxy_guard: Option = None; - let mut gvproxy_api_sock: Option = None; - - match &config.net { - NetBackend::Tsi => { - // Default TSI — no special setup needed. - } - NetBackend::None => { - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - eprintln!("Networking: disabled (no TSI, no virtio-net)"); - } - NetBackend::Gvproxy { binary } => { - if !binary.exists() { - return Err(VmError::BinaryNotFound { - path: binary.display().to_string(), - hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), - }); - } - - // Create temp socket paths - let run_dir = config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .to_path_buf(); - let rootfs_key = vm_rootfs_key(&config.rootfs); - let sock_base = gvproxy_socket_dir(&config.rootfs)?; - let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); - - // Kill any stale gvproxy process from a previous run. - // First try via the saved PID in the state file, then fall - // back to killing any gvproxy holding our target ports (covers - // the case where the state file was deleted). - kill_stale_gvproxy(&config.rootfs); - for pm in &config.port_map { - if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { - kill_stale_gvproxy_by_port(host_port); - } - } - - // Clean stale sockets (including the -krun.sock file that - // libkrun creates as its datagram endpoint on macOS). 
- let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); - let krun_sock = sock_base.with_extension("v-krun.sock"); - let _ = std::fs::remove_file(&krun_sock); - - // Start gvproxy - eprintln!("Starting gvproxy: {}", binary.display()); - let ssh_port = pick_gvproxy_ssh_port()?; - let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); - let gvproxy_log_file = std::fs::File::create(&gvproxy_log) - .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; - - // On Linux, gvproxy uses QEMU mode (SOCK_STREAM) since the vfkit - // unixgram scheme is macOS/vfkit-specific. On macOS, use vfkit mode. - #[cfg(target_os = "linux")] - let (gvproxy_net_flag, gvproxy_net_url) = - ("-listen-qemu", format!("unix://{}", net_sock.display())); - #[cfg(target_os = "macos")] - let (gvproxy_net_flag, gvproxy_net_url) = ( - "-listen-vfkit", - format!("unixgram://{}", net_sock.display()), - ); - - let child = std::process::Command::new(binary) - .arg(gvproxy_net_flag) - .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) - .arg("-ssh-port") - .arg(ssh_port.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(gvproxy_log_file) - .spawn() - .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; - - eprintln!( - "gvproxy started (pid {}, ssh port {}) [{:.1}s]", - child.id(), - ssh_port, - launch_start.elapsed().as_secs_f64() - ); - - // Wait for the socket to appear (exponential backoff: 5ms → 100ms). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(5); - let mut interval = std::time::Duration::from_millis(5); - while !net_sock.exists() { - if Instant::now() >= deadline { - return Err(VmError::Fork( - "gvproxy socket did not appear within 5s".to_string(), - )); - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(100)); - } - } - - // Disable implicit TSI and add virtio-net via gvproxy - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - // This MAC matches gvproxy's default static DHCP lease for - // 192.168.127.2. Using a different MAC can cause the gVisor - // network stack to misroute or drop packets. - let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; - - // COMPAT_NET_FEATURES from libkrun.h - const NET_FEATURE_CSUM: u32 = 1 << 0; - const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; - const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; - const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; - const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; - const NET_FEATURE_HOST_UFO: u32 = 1 << 14; - const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_UFO; - - // On Linux use unixstream (SOCK_STREAM) to connect to gvproxy's - // QEMU listener. On macOS use unixgram (SOCK_DGRAM) with the vfkit - // magic byte for the vfkit listener. 
- #[cfg(target_os = "linux")] - vm.add_net_unixstream(&net_sock, &mac, COMPAT_NET_FEATURES)?; - #[cfg(target_os = "macos")] - { - const NET_FLAG_VFKIT: u32 = 1 << 0; - vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; - } - - eprintln!( - "Networking: gvproxy (virtio-net) [{:.1}s]", - launch_start.elapsed().as_secs_f64() - ); - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); - } - } - - // Port mapping (TSI only) - if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { - vm.set_port_map(&config.port_map)?; - } - - for vsock_port in &config.vsock_ports { - if let Some(parent) = vsock_port.socket_path.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) - })?; - } - // libkrun returns EEXIST if the socket file is already present from a - // previous run. Remove any stale socket before registering the port. - let _ = std::fs::remove_file(&vsock_port.socket_path); - vm.add_vsock_port(vsock_port)?; - } - - // Console output - let console_log = config.console_output.clone().unwrap_or_else(|| { - config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) - }); - vm.set_console_output(&console_log)?; - - // envp: use provided env or minimal defaults - let mut env: Vec = if config.env.is_empty() { - vec![ - "HOME=/root", - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - ] - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - config.env.clone() - }; - if let Some(state_disk) = &config.state_disk - && !env - .iter() - .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) - { - env.push(format!( - "OPENSHELL_VM_STATE_DISK_DEVICE={}", - state_disk.guest_device - )); - } - vm.set_exec(&config.exec_path, &config.args, &env)?; - - // ── Fork and enter the VM 
────────────────────────────────────── - // - // krun_start_enter() never returns — it calls exit() when the guest - // process exits. We fork so the parent can monitor and report. - - let boot_start = Instant::now(); - eprintln!("Booting microVM..."); - - let pid = unsafe { libc::fork() }; - match pid { - -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), - 0 => { - // Child process: enter the VM (never returns on success) - let ret = vm.start_enter(); - eprintln!("krun_start_enter failed: {ret}"); - std::process::exit(1); - } - _ => { - // Parent: wait for child - if config.exec_path == "/srv/openshell-vm-init.sh" { - let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); - if let Err(err) = - write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid) - { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - // Guard drop will kill gvproxy automatically - drop(gvproxy_guard); - clear_vm_runtime_state(&config.rootfs); - return Err(err); - } - } - eprintln!( - "VM started (child pid {pid}) [{:.1}s]", - boot_start.elapsed().as_secs_f64() - ); - for pm in &config.port_map { - let host_port = pm.split(':').next().unwrap_or(pm); - eprintln!(" port {pm} -> http://localhost:{host_port}"); - } - eprintln!("Console output: {}", console_log.display()); - - // Set up gvproxy port forwarding via its HTTP API. - // The port_map entries use the same "host:guest" format - // as TSI, but here we translate them into gvproxy expose - // calls targeting the guest IP (192.168.127.2). - // - // Instead of a fixed 500ms sleep, poll the API socket with - // exponential backoff (5ms → 200ms, ~1s total budget). - if let Some(ref api_sock) = gvproxy_api_sock { - let fwd_start = Instant::now(); - // Wait for the API socket to appear (it lags slightly - // behind the vfkit data socket). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(2); - let mut interval = std::time::Duration::from_millis(5); - while !api_sock.exists() { - if Instant::now() >= deadline { - eprintln!( - "warning: gvproxy API socket not ready after 2s, attempting anyway" - ); - break; - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(200)); - } - } - - let guest_ip = "192.168.127.2"; - - for pm in &config.port_map { - let parts: Vec<&str> = pm.split(':').collect(); - let (host_port, guest_port) = match parts.len() { - 2 => (parts[0], parts[1]), - 1 => (parts[0], parts[0]), - _ => { - eprintln!(" skipping invalid port mapping: {pm}"); - continue; - } - }; - - let expose_body = format!( - r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# - ); - - // Retry with exponential backoff — gvproxy's internal - // netstack may not be ready immediately after socket creation. - let mut expose_ok = false; - let mut retry_interval = std::time::Duration::from_millis(100); - let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); - loop { - match gvproxy_expose(api_sock, &expose_body) { - Ok(()) => { - eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); - expose_ok = true; - break; - } - Err(e) => { - if Instant::now() >= expose_deadline { - eprintln!(" port {host_port}: {e} (retries exhausted)"); - break; - } - std::thread::sleep(retry_interval); - retry_interval = - (retry_interval * 2).min(std::time::Duration::from_secs(1)); - } - } - } - if !expose_ok { - return Err(VmError::HostSetup(format!( - "failed to forward port {host_port} via gvproxy" - ))); - } - } - eprintln!( - "Port forwarding ready [{:.1}s]", - fwd_start.elapsed().as_secs_f64() - ); - } - - // Bootstrap the OpenShell control plane and wait for the - // service to be reachable. Only for the gateway preset, and - // only when port forwarding is configured (i.e. 
the gateway - // is reachable from the host). During rootfs pre-init builds, - // no --port is specified so there is nothing to health-check - // — the build script has its own kubectl-based readiness - // checks inside the VM. - if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { - // Bootstrap stores host-side metadata and mTLS creds. - // With pre-baked rootfs (Path 1) this reads PKI directly - // from virtio-fs — no kubectl or port forwarding needed. - // Cold boot (Path 2) writes secret manifests into the - // k3s auto-deploy directory via virtio-fs. - let gateway_port = gateway_host_port(config); - bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - - // Wait for the gRPC health check to pass. This ensures - // the service is fully operational, not just accepting - // TCP connections. The health check confirms the full - // path (gvproxy → kube-proxy nftables → pod:8080) and - // that the gRPC service is responding to requests. - health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; - } - - eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); - eprintln!("Press Ctrl+C to stop."); - - // Forward signals to child - unsafe { - libc::signal( - libc::SIGINT, - forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - forward_signal as *const () as libc::sighandler_t, - ); - CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); - } - - let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(pid, &raw mut status, 0); - } - - // Clean up gvproxy — disarm the guard and do explicit cleanup - // so we can print the "stopped" message. 
- if config.exec_path == "/srv/openshell-vm-init.sh" { - clear_vm_runtime_state(&config.rootfs); - } - if let Some(mut guard) = gvproxy_guard - && let Some(mut child) = guard.disarm() - { - let _ = child.kill(); - let _ = child.wait(); - eprintln!("gvproxy stopped"); - } - - if libc::WIFEXITED(status) { - let code = libc::WEXITSTATUS(status); - eprintln!("VM exited with code {code}"); - return Ok(code); - } else if libc::WIFSIGNALED(status) { - let sig = libc::WTERMSIG(status); - eprintln!("VM killed by signal {sig}"); - return Ok(128 + sig); - } - - Ok(status) - } - } + let libkrun_backend = backend::libkrun::LibkrunBackend; + backend::VmBackend::launch(&libkrun_backend, config) } // ── Post-boot bootstrap ──────────────────────────────────────────────── @@ -1727,7 +1232,11 @@ const DEFAULT_GATEWAY_PORT: u16 = 30051; /// 2. **First boot / post-reset**: poll the exec agent to `cat` each PEM file /// from `/opt/openshell/pki/` until the files exist (PKI generation has /// finished), then store them in `~/.config/openshell/gateways//mtls/`. -fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Result<(), VmError> { +pub(crate) fn bootstrap_gateway( + rootfs: &Path, + gateway_name: &str, + gateway_port: u16, +) -> Result<(), VmError> { let bootstrap_start = Instant::now(); let metadata = openshell_bootstrap::GatewayMetadata { @@ -1921,6 +1430,31 @@ fn is_warm_boot(gateway_name: &str) -> bool { true } +/// Remove cached mTLS certs from the host so the next `bootstrap_gateway` +/// call treats this as a cold boot and fetches fresh PKI from the VM. +/// +/// Called when the state disk is freshly created or `--reset` is used, +/// since the VM will generate new PKI that won't match stale host certs. 
+fn clear_warm_boot_certs(gateway_name: &str) { + let Ok(home) = std::env::var("HOME") else { + return; + }; + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + let mtls_dir = PathBuf::from(&config_base) + .join("openshell/gateways") + .join(gateway_name) + .join("mtls"); + + if mtls_dir.is_dir() { + if let Err(e) = std::fs::remove_dir_all(&mtls_dir) { + eprintln!("Warning: failed to clear stale mTLS certs: {e}"); + } else { + eprintln!("Cleared stale host mTLS certs"); + } + } +} + /// Compare the CA cert on the rootfs (authoritative source) against the /// host-side copy. If they differ, re-copy all client certs from the rootfs. /// @@ -1956,9 +1490,9 @@ fn sync_host_certs_if_stale( Ok(()) } -static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); +pub(crate) static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); -extern "C" fn forward_signal(_sig: libc::c_int) { +pub(crate) extern "C" fn forward_signal(_sig: libc::c_int) { let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); if pid > 0 { unsafe { diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index bb9d854b1..1b3aa6423 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -92,6 +92,16 @@ struct Cli { /// unclean shutdown. #[arg(long)] reset: bool, + + /// Enable GPU passthrough. Optionally specify a PCI address + /// (e.g. `0000:41:00.0`). Uses cloud-hypervisor backend with VFIO. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, + + /// Hypervisor backend: "auto" (default), "libkrun", or "cloud-hypervisor". + /// Auto selects cloud-hypervisor when --gpu is set, libkrun otherwise. 
+ #[arg(long, default_value = "auto")] + backend: String, } #[derive(Subcommand)] @@ -196,12 +206,16 @@ fn run(cli: Cli) -> Result> { return Err("openshell-vm exec requires a command when stdin is not a TTY".into()); } } + let exec_rootfs = if let Some(explicit) = cli.rootfs { + explicit + } else if cli.gpu.is_some() { + openshell_vm::named_gpu_rootfs_dir(&cli.name)? + } else { + openshell_vm::named_rootfs_dir(&cli.name)? + }; return Ok(openshell_vm::exec_running_vm( openshell_vm::VmExecOptions { - rootfs: Some( - cli.rootfs - .unwrap_or(openshell_vm::named_rootfs_dir(&cli.name)?), - ), + rootfs: Some(exec_rootfs), command, workdir, env, @@ -223,12 +237,59 @@ fn run(cli: Cli) -> Result> { } }; - let rootfs = cli - .rootfs - .map_or_else(|| openshell_vm::ensure_named_rootfs(&cli.name), Ok)?; + let rootfs = if let Some(explicit) = cli.rootfs { + Ok(explicit) + } else if cli.gpu.is_some() { + openshell_vm::ensure_gpu_rootfs(&cli.name) + } else { + openshell_vm::ensure_named_rootfs(&cli.name) + }?; let gateway_name = openshell_vm::gateway_name(&cli.name)?; + let (gpu_enabled, vfio_device, _gpu_guard) = match cli.gpu { + Some(ref addr) if addr != "auto" => { + let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(Some(addr))?; + let bdf = state.pci_addr.clone(); + ( + true, + Some(bdf), + Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + ) + } + Some(_) => { + let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(None)?; + let bdf = state.pci_addr.clone(); + ( + true, + Some(bdf), + Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + ) + } + None => (false, None, None), + }; + + let backend_choice = match cli.backend.as_str() { + "cloud-hypervisor" | "chv" => openshell_vm::VmBackendChoice::CloudHypervisor, + "libkrun" => { + if gpu_enabled { + return Err( + "--backend libkrun is incompatible with --gpu (libkrun does not support \ + VFIO passthrough). Use --backend auto or --backend cloud-hypervisor." 
+ .into(), + ); + } + openshell_vm::VmBackendChoice::Libkrun + } + "auto" => openshell_vm::VmBackendChoice::Auto, + other => { + return Err(format!( + "unknown --backend: {other} (expected: auto, libkrun, cloud-hypervisor)" + ) + .into()); + } + }; + let mut config = if let Some(exec_path) = cli.exec { openshell_vm::VmConfig { rootfs, @@ -246,6 +307,9 @@ fn run(cli: Cli) -> Result> { reset: cli.reset, gateway_name, state_disk: None, + gpu_enabled, + vfio_device, + backend: backend_choice, } } else { let mut c = openshell_vm::VmConfig::gateway(rootfs); @@ -261,6 +325,9 @@ fn run(cli: Cli) -> Result> { c.net = net_backend; c.reset = cli.reset; c.gateway_name = gateway_name; + c.gpu_enabled = gpu_enabled; + c.vfio_device = vfio_device; + c.backend = backend_choice; if state_disk_disabled() { c.state_disk = None; } diff --git a/crates/openshell-vm/tests/gpu_passthrough_implementation.rs b/crates/openshell-vm/tests/gpu_passthrough_implementation.rs new file mode 100644 index 000000000..4985ba39b --- /dev/null +++ b/crates/openshell-vm/tests/gpu_passthrough_implementation.rs @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for GPU passthrough on real hardware. +//! +//! Gated by `OPENSHELL_VM_GPU_E2E=1`. On machines without a real GPU, +//! all tests early-return and pass. 
+ +use openshell_vm::gpu_passthrough::{ + GpuBindGuard, HostNvidiaVfioReadiness, prepare_gpu_for_passthrough, + probe_host_nvidia_vfio_readiness, +}; + +fn gpu_e2e_enabled() -> bool { + std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() == Ok("1") +} + +#[test] +fn nvidia_gpu_passthrough_is_available() { + if !gpu_e2e_enabled() { + eprintln!("OPENSHELL_VM_GPU_E2E not set — skipping GPU passthrough gate test"); + return; + } + assert!( + openshell_vm::gpu_passthrough::nvidia_gpu_available_for_vm_passthrough(), + "GPU passthrough gate returned false on a GPU CI runner — \ + check VFIO binding and cloud-hypervisor runtime bundle" + ); +} + +#[test] +fn bind_and_rebind_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("bound GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::VfioBoundReady); + + state.restore().expect("restore should succeed"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("restored GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::BoundToNvidia); +} + +#[test] +fn safety_checks_pass_on_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + // `prepare_gpu_for_passthrough` runs all safety checks internally + // (display-attached, IOMMU enabled, VFIO modules loaded, sysfs + // permissions). Success here validates that the CI GPU is headless, + // IOMMU is on, and VFIO modules are loaded. 
+ let state = prepare_gpu_for_passthrough(None) + .expect("all safety checks should pass on a headless CI GPU"); + assert!(!state.pci_addr.is_empty()); + + state.restore().expect("restore should succeed"); +} + +#[test] +fn guard_restores_on_drop_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + let pci_addr = state.pci_addr.clone(); + + let guard = GpuBindGuard::new(state); + drop(guard); + + let output = std::process::Command::new("nvidia-smi") + .arg("--query-gpu=pci.bus_id") + .arg("--format=csv,noheader") + .output() + .expect("nvidia-smi should be available after guard drop"); + assert!( + output.status.success(), + "nvidia-smi failed after guard drop" + ); + + let stdout = String::from_utf8_lossy(&output.stdout); + let normalized_addr = pci_addr.to_uppercase(); + assert!( + stdout.to_uppercase().contains(&normalized_addr), + "nvidia-smi should list the restored GPU {pci_addr}, got: {stdout}" + ); +} + +#[test] +fn auto_select_finds_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("auto-select should find a GPU on CI"); + assert!(!state.pci_addr.is_empty()); + assert!(state.did_bind); + + state.restore().expect("restore should succeed"); +} diff --git a/crates/openshell-vm/tests/vm_boot_smoke.rs b/crates/openshell-vm/tests/vm_boot_smoke.rs new file mode 100644 index 000000000..ffdb16595 --- /dev/null +++ b/crates/openshell-vm/tests/vm_boot_smoke.rs @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Non-GPU cloud-hypervisor boot smoke test. +//! +//! Boots a cloud-hypervisor VM **without** VFIO/GPU passthrough and verifies +//! the kernel boots and init runs. This catches backend regressions on regular +//! CI runners that lack GPU hardware. +//! +//! 
Gated on `OPENSHELL_VM_BACKEND=cloud-hypervisor` — skipped when the env +//! var is absent or set to a different backend. +//! +//! Requires the VM runtime bundle (cloud-hypervisor, vmlinux, virtiofsd, +//! rootfs) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run +//! `mise run vm:bundle-runtime` first. +//! +//! Run explicitly: +//! +//! ```sh +//! OPENSHELL_VM_BACKEND=cloud-hypervisor cargo test -p openshell-vm --test vm_boot_smoke +//! ``` + +#![allow(unsafe_code)] + +use std::process::{Command, Stdio}; +use std::time::Duration; + +const GATEWAY: &str = env!("CARGO_BIN_EXE_openshell-vm"); + +fn runtime_bundle_dir() -> std::path::PathBuf { + std::path::Path::new(GATEWAY) + .parent() + .expect("openshell-vm binary has no parent") + .join("openshell-vm.runtime") +} + +fn skip_unless_chv() -> bool { + if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("cloud-hypervisor") { + eprintln!("OPENSHELL_VM_BACKEND != cloud-hypervisor — skipping"); + return true; + } + false +} + +fn require_bundle() { + let bundle = runtime_bundle_dir(); + if !bundle.is_dir() { + panic!( + "VM runtime bundle not found at {}. Run `mise run vm:bundle-runtime` first.", + bundle.display() + ); + } +} + +#[test] +fn cloud_hypervisor_exec_exits_cleanly() { + if skip_unless_chv() { + return; + } + require_bundle(); + + // Boot with --exec /bin/true --net none. The cloud-hypervisor backend + // wraps the exec command in a script that calls `poweroff -f` after + // completion, causing a clean ACPI shutdown. + let mut child = Command::new(GATEWAY) + .args([ + "--backend", + "cloud-hypervisor", + "--net", + "none", + "--exec", + "/bin/true", + ]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + // The VM should boot, run /bin/true, and exit within ~5s. + // Give 30s for slow CI. 
+ let timeout = Duration::from_secs(30); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + assert!( + status.success(), + "cloud-hypervisor --exec /bin/true exited with {status}" + ); + return; + } + Ok(None) => { + if start.elapsed() > timeout { + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGKILL) }; + let _ = child.wait(); + panic!("cloud-hypervisor VM did not exit within {timeout:?}"); + } + std::thread::sleep(Duration::from_millis(500)); + } + Err(e) => panic!("error waiting for openshell-vm: {e}"), + } + } +} + +#[test] +fn cloud_hypervisor_boots_without_gpu() { + if skip_unless_chv() { + return; + } + require_bundle(); + + // Full gateway boot requires TAP networking (root/CAP_NET_ADMIN). + // Skip unless running as root. + if !nix_is_root() { + eprintln!("skipping full gateway boot — requires root for TAP networking"); + return; + } + + let mut child = Command::new(GATEWAY) + .args(["--backend", "cloud-hypervisor"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let addr: std::net::SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = std::time::Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "cloud-hypervisor VM service on port 30051 not reachable within {timeout:?}" + ); +} + +fn nix_is_root() -> bool { + unsafe { libc::geteuid() == 0 } +} diff --git a/tasks/scripts/vm/build-cloud-hypervisor.sh b/tasks/scripts/vm/build-cloud-hypervisor.sh new file mode 100755 index 000000000..af0c913b1 --- /dev/null +++ b/tasks/scripts/vm/build-cloud-hypervisor.sh @@ -0,0 +1,75 @@ 
+#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Fetch a pre-built cloud-hypervisor binary and build virtiofsd from source for GPU passthrough. +# +# These are only needed on Linux for VFIO GPU passthrough via the +# cloud-hypervisor backend. cloud-hypervisor is downloaded from its GitHub +# release page; virtiofsd is built from its GitLab source tarball. +# +# Usage: +# ./build-cloud-hypervisor.sh [--output-dir ] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/_lib.sh" +ROOT="$(vm_lib_root)" + +source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true + +CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" +OUTPUT_DIR="${ROOT}/target/libkrun-build" + +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + *) echo "Unknown argument: $1" >&2; exit 1 ;; + esac +done + +if [ "$(uname -s)" != "Linux" ]; then + echo "Error: cloud-hypervisor GPU passthrough is Linux-only" >&2 + exit 1 +fi + +mkdir -p "$OUTPUT_DIR" + +HOST_ARCH="$(uname -m)" +case "$HOST_ARCH" in + aarch64) CHV_ARCH="aarch64"; VIRTIOFSD_ARCH="aarch64" ;; + x86_64) CHV_ARCH="x86_64"; VIRTIOFSD_ARCH="x86_64" ;; + *) echo "Error: Unsupported architecture: ${HOST_ARCH}" >&2; exit 1 ;; +esac + +echo "==> Downloading cloud-hypervisor ${CLOUD_HYPERVISOR_VERSION} for ${HOST_ARCH}..." 
+CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static" +if [ "$CHV_ARCH" = "aarch64" ]; then + CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static-aarch64" +fi + +curl -fsSL -o "${OUTPUT_DIR}/cloud-hypervisor" "$CHV_URL" +chmod +x "${OUTPUT_DIR}/cloud-hypervisor" +echo " Downloaded: cloud-hypervisor" + +echo "==> Building virtiofsd ${VIRTIOFSD_VERSION} from source..." +VIRTIOFSD_SRC="$(mktemp -d)" +VIRTIOFSD_TARBALL_URL="https://gitlab.com/virtio-fs/virtiofsd/-/archive/${VIRTIOFSD_VERSION}/virtiofsd-${VIRTIOFSD_VERSION}.tar.gz" +curl -fsSL "$VIRTIOFSD_TARBALL_URL" | tar -xzf - -C "$VIRTIOFSD_SRC" --strip-components=1 +rm -f "${VIRTIOFSD_SRC}/Cargo.lock" + +CARGO_CMD="cargo" +if command -v mise &>/dev/null; then + CARGO_CMD="mise exec -- cargo" +fi +$CARGO_CMD build --release --manifest-path "${VIRTIOFSD_SRC}/Cargo.toml" +cp "${VIRTIOFSD_SRC}/target/release/virtiofsd" "${OUTPUT_DIR}/virtiofsd" +chmod +x "${OUTPUT_DIR}/virtiofsd" +rm -rf "$VIRTIOFSD_SRC" +echo " Built: virtiofsd" + +echo "" +echo "==> GPU passthrough binaries ready in ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/cloud-hypervisor" "${OUTPUT_DIR}/virtiofsd" 2>/dev/null || true diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 9e2217f50..621332366 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -239,6 +239,18 @@ make -j"$(nproc)" cp libkrunfw.so* "$OUTPUT_DIR/" echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" +# Copy vmlinux kernel image for cloud-hypervisor GPU passthrough. +# This is the uncompressed kernel built by libkrunfw's kernel build. 
+if [ -f "${KERNEL_SOURCES}/vmlinux" ]; then + cp "${KERNEL_SOURCES}/vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for cloud-hypervisor GPU passthrough" +elif [ -f "vmlinux" ]; then + cp "vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for cloud-hypervisor GPU passthrough" +else + echo " Warning: vmlinux not found in kernel build tree (GPU passthrough will not be available)" >&2 +fi + cd "$BUILD_DIR" # ── Build libkrun (VMM) ───────────────────────────────────────────────── diff --git a/tasks/scripts/vm/download-kernel-runtime.sh b/tasks/scripts/vm/download-kernel-runtime.sh index 8f0427af9..5e60d3c75 100755 --- a/tasks/scripts/vm/download-kernel-runtime.sh +++ b/tasks/scripts/vm/download-kernel-runtime.sh @@ -81,11 +81,11 @@ DOWNLOAD_DIR="${ROOT}/target/vm-runtime-download" mkdir -p "$DOWNLOAD_DIR" "$OUTPUT_DIR" echo "==> Downloading ${TARBALL_NAME} from ${RELEASE_TAG}..." +rm -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" gh release download "${RELEASE_TAG}" \ --repo "${REPO}" \ --pattern "${TARBALL_NAME}" \ - --dir "${DOWNLOAD_DIR}" \ - --clobber + --dir "${DOWNLOAD_DIR}" if [ ! -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" ]; then echo "Error: Download failed — ${TARBALL_NAME} not found." 
>&2 diff --git a/tasks/scripts/vm/package-vm-runtime.sh b/tasks/scripts/vm/package-vm-runtime.sh index f97eec870..8b09c91ba 100755 --- a/tasks/scripts/vm/package-vm-runtime.sh +++ b/tasks/scripts/vm/package-vm-runtime.sh @@ -84,6 +84,13 @@ case "$PLATFORM" in versioned="$(ls "${PACKAGE_DIR}"/libkrunfw.so.5.* 2>/dev/null | head -n1 || true)" [ -n "$versioned" ] && cp "$versioned" "${PACKAGE_DIR}/libkrunfw.so.5" fi + # GPU passthrough binaries (optional — only included if present) + for gpu_bin in cloud-hypervisor vmlinux virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "${PACKAGE_DIR}/" + echo " Included GPU passthrough binary: ${gpu_bin}" + fi + done ;; darwin-aarch64) cp "${BUILD_DIR}/libkrun.dylib" "${PACKAGE_DIR}/" diff --git a/tasks/scripts/vm/sync-vm-rootfs.sh b/tasks/scripts/vm/sync-vm-rootfs.sh index 727a9dd18..2c22e360b 100755 --- a/tasks/scripts/vm/sync-vm-rootfs.sh +++ b/tasks/scripts/vm/sync-vm-rootfs.sh @@ -141,6 +141,22 @@ fi patch_vm_helmchart "${MANIFEST_DST}/openshell-helmchart.yaml" patch_vm_helmchart "${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml" +# ── GPU manifests ────────────────────────────────────────────────────── +# Only sync if the rootfs was built with --gpu (sentinel file present). +GPU_MANIFEST_SRC="${ROOT}/crates/openshell-vm/scripts/gpu-manifests" +GPU_MANIFEST_DST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + mkdir -p "${GPU_MANIFEST_DST}" + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + base=$(basename "$manifest") + if ! 
cmp -s "$manifest" "${GPU_MANIFEST_DST}/${base}" 2>/dev/null; then + cp "$manifest" "${GPU_MANIFEST_DST}/${base}" + echo " updated: /opt/openshell/gpu-manifests/${base}" + fi + done +fi + # ── Gateway image tarball ────────────────────────────────────────────── # The VM rootfs airgap-imports openshell/gateway:dev from k3s/agent/images/. # Keep that tarball in sync with the local Docker image so `mise run e2e:vm` diff --git a/vm-gpu-passthrough-implementation.md b/vm-gpu-passthrough-implementation.md deleted file mode 100644 index 77ea95971..000000000 --- a/vm-gpu-passthrough-implementation.md +++ /dev/null @@ -1,246 +0,0 @@ -# VM GPU passthrough: implementation plan - -> Design: [vm-gpu-passthrough.md](vm-gpu-passthrough.md) - -## Phase 0 -- Specification and failing test (current) - -- [x] Design doc. -- [x] Phase 0.5 VMM decision (cloud-hypervisor selected). -- [ ] **`gpu_passthrough` module** integrated into `crates/openshell-vm/src/`: - - `probe_host_nvidia_vfio_readiness()` -- Linux sysfs scan; non-Linux returns `UnsupportedPlatform`. - - `nvidia_gpu_available_for_vm_passthrough()` -- hard-coded `false` until end-to-end passthrough works. - - **Note:** `gpu_passthrough.rs` and `gpu_passthrough_implementation.rs` exist as untracked files at the repo root but are not wired into the crate module tree (`lib.rs` does not `mod gpu_passthrough;`). Move them into `crates/openshell-vm/src/`, add `pub mod gpu_passthrough;`, and ensure `cargo test -p openshell-vm` compiles them. -- [ ] **Failing integration test** `tests/gpu_passthrough_implementation.rs` -- documents the target and fails until implementation is finished. - -**Running the red test:** `cargo test -p openshell-vm --test gpu_passthrough_implementation` - -**Note:** `mise run test` uses `cargo test --workspace --exclude openshell-vm`, so default CI stays green. - ---- - -## Phase 1 -- VMM backend abstraction and cloud-hypervisor integration - -### 1a. 
Backend trait and libkrun extraction - -Refactor only -- no behavior changes. Existing tests must still pass. - -- [ ] Create `src/backend.rs` with the `VmBackend` trait: - -```rust -pub trait VmBackend { - fn launch(&self, config: &VmLaunchConfig) -> Result; -} - -pub struct VmLaunchConfig { - pub base: VmConfig, - pub vfio_device: Option, -} -``` - -- [ ] Create `src/backend/libkrun.rs` -- move into `LibkrunBackend`: - - `VmContext` struct and all methods (current `lib.rs` lines 584-811) - - gvproxy setup block inside `NetBackend::Gvproxy` (lines 1337-1466) - - fork + waitpid + signal forwarding (lines 1525-1710) - - bootstrap block (lines 1648-1663) -- [ ] Extract shared gvproxy startup into a helper used by both backends. -- [ ] Update `launch()` to dispatch: - -```rust -pub fn launch(config: &VmLaunchConfig) -> Result { - // ... existing pre-launch checks ... - - if config.vfio_device.is_some() { - #[cfg(not(target_os = "linux"))] - return Err(VmError::HostSetup( - "GPU passthrough requires Linux with KVM and IOMMU".into(), - )); - - #[cfg(target_os = "linux")] - { - let backend = CloudHypervisorBackend::new()?; - return backend.launch(config); - } - } - - LibkrunBackend.launch(config) -} -``` - -- [ ] `ffi.rs` stays as-is -- only used by `LibkrunBackend`. - -### 1b. cloud-hypervisor backend - -- [ ] Create `src/backend/cloud_hypervisor.rs` implementing `VmBackend`. -- [ ] REST API client -- HTTP/1.1 over Unix socket, ~5 endpoints: - -``` -PUT /api/v1/vm.create -- configure VM -PUT /api/v1/vm.boot -- start VM -PUT /api/v1/vm.shutdown -- graceful stop -GET /api/v1/vm.info -- status check -PUT /api/v1/vm.delete -- cleanup -``` - -Use `hyper` over Unix socket (already in dependency tree) or raw HTTP. Avoid adding `cloud-hypervisor-client` crate for ~5 calls. 
- -- [ ] VM create payload mapping from `VmConfig`: - -```json -{ - "cpus": { "boot_vcpus": 4 }, - "memory": { "size": 8589934592 }, - "payload": { - "kernel": "/path/to/vmlinux", - "cmdline": "console=hvc0 root=virtiofs:rootfs rw init=/srv/openshell-vm-init.sh" - }, - "fs": [ - { "tag": "rootfs", "socket": "/path/to/virtiofsd.sock", "num_queues": 1, "queue_size": 1024 } - ], - "disks": [ - { "path": "/path/to/state.raw", "readonly": false } - ], - "net": [ - { "socket": "/path/to/gvproxy-qemu.sock", "mac": "5a:94:ef:e4:0c:ee" } - ], - "vsock": { - "cid": 3, - "socket": "/path/to/vsock.sock" - }, - "devices": [ - { "path": "/sys/bus/pci/devices/0000:41:00.0/" } - ], - "serial": { "mode": "File", "file": "/path/to/console.log" }, - "console": { "mode": "Off" } -} -``` - -- [ ] Process lifecycle: - 1. Start `cloud-hypervisor --api-socket /tmp/ovm-chv-{id}.sock` as subprocess - 2. Wait for API socket to appear (exponential backoff, same pattern as gvproxy) - 3. `PUT vm.create` with config payload - 4. `PUT vm.boot` - 5. Parent waits on subprocess - 6. Signal forwarding: SIGINT/SIGTERM -> `PUT vm.shutdown` + subprocess SIGTERM - 7. Cleanup: remove API socket - -### 1c. Kernel extraction and build pipeline - -- [ ] Modify `build-libkrun.sh`: after building libkrunfw, copy `vmlinux` from the kernel build tree to `target/libkrun-build/vmlinux` before cleanup. -- [ ] Add to `openshell.kconfig` (harmless for non-GPU boots): - -``` -CONFIG_PCI=y -CONFIG_PCI_MSI=y -CONFIG_DRM=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -``` - -- [ ] Add to `pins.env`: - -```bash -CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" -VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" -``` - -- [ ] Create `build-cloud-hypervisor.sh` (or download step): download pre-built static binary from cloud-hypervisor GitHub releases for the target architecture. -- [ ] Update `package-vm-runtime.sh`: include `cloud-hypervisor`, `vmlinux`, and `virtiofsd` in the runtime tarball for Linux builds. 
-- [ ] `validate_runtime_dir()` in `lib.rs` must **not** require GPU binaries. Only `CloudHypervisorBackend::new()` validates their presence. - -### 1d. vsock exec agent compatibility - -libkrun uses per-port vsock bridging (`krun_add_vsock_port2`): each guest vsock port maps to a host Unix socket. cloud-hypervisor uses standard vhost-vsock with a single socket and CID-based addressing. - -- [ ] Update `exec.rs` to support both connection modes: - - **libkrun**: connect to `vm_exec_socket_path()` (existing) - - **cloud-hypervisor**: connect via `AF_VSOCK` (CID 3, port 10777) or bridge with `socat` -- [ ] Test exec agent communication (cat, env) over both backends. - -### 1e. Plumb `--gpu` flag - -- [ ] Add fields to `VmConfig`: - -```rust -pub vfio_device: Option, -pub gpu_enabled: bool, -``` - -- [ ] When `gpu_enabled` is set, add `GPU_ENABLED=true` to guest environment. -- [ ] Wire `--gpu` / `--gpu ` from the CLI to `VmConfig`. - ---- - -## Phase 1.5 -- Guest rootfs: NVIDIA driver and toolkit - -- [ ] **NVIDIA driver in rootfs.** Options: - - **Separate GPU rootfs artifact**: build `rootfs-gpu.tar.zst` alongside `rootfs.tar.zst`. Launcher selects GPU variant when `--gpu` is passed. - - **Bake into rootfs**: use `nvcr.io/nvidia/base/ubuntu` base image from `pins.env`. Heavier (~2-3 GB) but self-contained. - - **Runtime injection via virtio-fs**: stage driver packages on host, mount into guest. Lighter but more complex. -- [ ] **Driver version compatibility**: document minimum driver version and GPU compute capability. -- [ ] **NVIDIA container toolkit**: install `nvidia-container-toolkit` so `nvidia-container-runtime` is available to containerd/k3s. -- [ ] **Smoke test**: `nvidia-smi` runs inside the guest after rootfs build. 
- ---- - -## Phase 2 -- Guest appliance parity - -- [ ] **Init script changes** (`openshell-vm-init.sh`): when `GPU_ENABLED=true`: - - Load NVIDIA kernel modules (`nvidia`, `nvidia_uvm`, `nvidia_modeset`) - - Run `nvidia-smi` -- fail fast if device not visible - - Copy `gpu-manifests/*.yaml` into k3s auto-deploy directory (mirrors `cluster-entrypoint.sh` ~line 384) - - Verify `nvidia-container-runtime` is registered with containerd -- [ ] **End-to-end validation**: sandbox pod requesting `nvidia.com/gpu: 1` gets scheduled and can run `nvidia-smi` inside the pod. - ---- - -## Phase 3 -- CLI / UX - -- [ ] Mirror `openshell gateway start --gpu` semantics for VM backend. -- [ ] Support `--gpu ` for multi-GPU hosts. -- [ ] Document host preparation (IOMMU, `vfio-pci`, unbinding `nvidia`). -- [ ] Document single-GPU caveats (host display loss, headless operation). - ---- - -## Phase 4 -- CI - -- [ ] GPU E2E job: optional runner with `OPENSHELL_VM_GPU_E2E=1` and a VFIO-bound GPU. Tighten `nvidia_gpu_available_for_vm_passthrough()` to require `VfioBoundReady` + guest smoke. -- [ ] Non-GPU cloud-hypervisor CI test: boot and exec agent check without VFIO. Catches backend regressions without GPU hardware. - ---- - -## Test evolution - -Today `nvidia_gpu_available_for_vm_passthrough()` returns `false`. When complete, it should compose: - -1. `probe_host_nvidia_vfio_readiness()` returns `VfioBoundReady` (clean IOMMU group) -2. cloud-hypervisor binary present in runtime bundle -3. `/dev/vfio/vfio` and `/dev/vfio/{group}` accessible -4. Guest rootfs includes NVIDIA driver and toolkit - -Options for the final gate: -- `true` only when CI env var is set and hardware verified -- Replace boolean with full integration check -- Remove `#[ignore]` and run only on GPU runners - -Pick one in the final PR so `mise run test` policy stays intentional. 
- ---- - -## File change index - -| File | Change | -|---|---| -| `crates/openshell-vm/src/lib.rs` | Extract `launch()` internals into backend dispatch; add `vfio_device` / `gpu_enabled` to `VmConfig` | -| `crates/openshell-vm/src/backend.rs` (new) | `VmBackend` trait, `VmLaunchConfig` | -| `crates/openshell-vm/src/backend/libkrun.rs` (new) | `LibkrunBackend` -- moved from `lib.rs` (mechanical refactor) | -| `crates/openshell-vm/src/backend/cloud_hypervisor.rs` (new) | `CloudHypervisorBackend` -- REST API client, process lifecycle, VFIO assignment | -| `crates/openshell-vm/src/ffi.rs` | No changes (used only by `LibkrunBackend`) | -| `crates/openshell-vm/src/exec.rs` | Support both libkrun Unix socket and vhost-vsock connection modes | -| `crates/openshell-vm/src/gpu_passthrough.rs` (move from repo root) | `probe_host_nvidia_vfio_readiness()` with IOMMU group check | -| `crates/openshell-vm/runtime/kernel/openshell.kconfig` | Add `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM`, `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | -| `crates/openshell-vm/pins.env` | Add `CLOUD_HYPERVISOR_VERSION`, `VIRTIOFSD_VERSION` | -| `crates/openshell-vm/scripts/openshell-vm-init.sh` | GPU-gated block: module loading, `nvidia-smi` check, manifest copy | -| `tasks/scripts/vm/build-libkrun.sh` | Preserve `vmlinux` in `target/libkrun-build/` | -| `tasks/scripts/vm/build-cloud-hypervisor.sh` (new) | Download or build cloud-hypervisor static binary | -| `tasks/scripts/vm/package-vm-runtime.sh` | Include `cloud-hypervisor`, `vmlinux`, `virtiofsd` for Linux builds | diff --git a/vm-gpu-passthrough.md b/vm-gpu-passthrough.md deleted file mode 100644 index 02262ddd9..000000000 --- a/vm-gpu-passthrough.md +++ /dev/null @@ -1,225 +0,0 @@ -# VM GPU passthrough: design - -> Status: **Design complete.** Implementation tracked in [vm-gpu-passthrough-implementation.md](vm-gpu-passthrough-implementation.md). 
- -## Goal - -Match the **Docker cluster GPU path** (`openshell gateway start --gpu`): the k3s node inside the microVM sees a **real NVIDIA GPU** so sandbox pods can request `nvidia.com/gpu`, the NVIDIA device plugin, and `nvidia` RuntimeClass behave identically to the Docker path. - -This is **PCI passthrough** (VFIO) of the physical GPU into the guest -- not virtio-gpu / Venus / virgl. - -## Decision record - -### Venus / virgl rejected - -libkrun's virtio-gpu Venus path forwards **Vulkan** API calls, not NVIDIA's proprietary CUDA stack. The guest never loads the NVIDIA kernel driver and has no `/dev/nvidia*` device nodes. This rules out `nvidia-smi`, CUDA workloads, the k8s device plugin, and the NVIDIA container runtime -- all of which the Docker `--gpu` path depends on. - -| Requirement | Venus | VFIO passthrough | -|---|---|---| -| `nvidia-smi` in guest | No (Vulkan only) | Yes (bare-metal driver) | -| CUDA workloads | No | Yes | -| `nvidia.com/gpu` k8s resource | No | Yes | -| NVIDIA container runtime | No | Yes | -| Performance | ~75-80% (forwarding overhead) | ~100% (bare-metal) | -| macOS support | Yes (MoltenVK) | No (Linux IOMMU only) | - -### libkrun VFIO rejected - -libkrun upstream closed the device passthrough request ([containers/libkrun#32](https://github.com/containers/libkrun/issues/32), March 2023). VFIO would require PCI bus emulation and ACPI tables -- outside libkrun's MMIO-only virtio design. No known forks add this. 
- -### VMM selection: dual backend - -| VMM | VFIO | Size | vsock | Rust | macOS | Decision | -|-----|------|------|-------|------|-------|----------| -| **libkrun** v1.17.4 | No | ~5 MB | Yes | Yes | Yes (HVF) | Keep for non-GPU | -| **cloud-hypervisor** | Yes | ~10 MB | Yes | Yes | No | **GPU backend** | -| QEMU | Yes | ~50+ MB | Yes | No (C) | Limited | Rejected: size, C | -| crosvm | Yes | ~15 MB | Yes | Yes | No | Rejected: heavier | -| libkrun fork | Needs patches | ~5 MB | Yes | Yes | Possible | Rejected: maintenance | - -**cloud-hypervisor** is the GPU-only VMM backend. libkrun remains the default for all non-GPU workloads and is the only backend on macOS. - ---- - -## Architecture - -``` - openshell gateway start - | - ┌───────┴───────┐ - │ --gpu flag? │ - └───────┬───────┘ - no / \ yes (Linux only) - / \ - ┌──────┴──────┐ ┌────┴─────────────┐ - │ LibkrunBack │ │ CloudHvBackend │ - │ end │ │ │ - │ │ │ REST API over │ - │ ffi.rs │ │ Unix socket │ - │ (dlopen) │ │ │ - └──────┬──────┘ └────┬─────────────┘ - │ │ - ┌──────┴──────┐ ┌────┴─────────────┐ - │ libkrun VM │ │ cloud-hypervisor │ - │ │ │ VM │ - │ virtio-fs │ │ virtio-fs │ - │ virtio-net │ │ virtio-net │ - │ vsock │ │ vsock │ - │ virtio-blk │ │ virtio-blk │ - │ │ │ VFIO PCI (GPU) │ - └─────────────┘ └──────────────────┘ -``` - -### Shared across both backends - -- **Guest rootfs**: Same directory tree under `~/.local/share/openshell/openshell-vm/{version}/instances//rootfs/`. -- **Init script**: `/srv/openshell-vm-init.sh` runs as PID 1. GPU behavior is gated on `GPU_ENABLED=true`. -- **Exec agent**: `openshell-vm-exec-agent.py` on vsock port 10777. -- **gvproxy**: DNS, DHCP, and port forwarding. Both backends connect to gvproxy's QEMU-mode Unix socket. -- **Host bootstrap**: `bootstrap_gateway()` fetches PKI over the exec agent and stores mTLS creds. 
- -### Per-backend differences - -| Concern | libkrun | cloud-hypervisor | -|---|---|---| -| **Process model** | Library via `dlopen`; `fork()` + `krun_start_enter()` | Subprocess; REST API over Unix socket | -| **Boot model** | `krun_set_root(dir)` + `krun_set_exec(init)` -- kernel in libkrunfw | `--kernel vmlinux` + virtio-fs via virtiofsd -- explicit kernel binary | -| **Networking** | `krun_add_net_unixstream` (Linux) / `krun_add_net_unixgram` (macOS) | `--net socket=/path/to/gvproxy.sock` | -| **vsock** | `krun_add_vsock_port2(port, socket)` per port | `--vsock cid=3,socket=/path/to/vsock.sock` (vhost-vsock) | -| **Block storage** | `krun_add_disk3(id, path, format, ...)` | `--disk path=/path/to/state.raw` | -| **GPU** | N/A | `--device path=/sys/bus/pci/devices/ADDR/` (VFIO) | -| **Console** | `krun_set_console_output(path)` | `--serial file=/path` | -| **Lifecycle** | `krun_free_ctx` in `Drop`; `waitpid` on child | REST: `vm.create` -> `vm.boot` -> `vm.shutdown`; wait on subprocess | -| **macOS** | Yes (HVF) | No (KVM only) | - ---- - -## Host requirements (GPU path) - -### Host kernel - -- `CONFIG_VFIO`, `CONFIG_VFIO_PCI`, `CONFIG_VFIO_IOMMU_TYPE1` -- IOMMU enabled: BIOS (VT-d / AMD-Vi) + kernel params (`intel_iommu=on iommu=pt` or AMD equivalent) - -### Host preparation - -1. Unbind GPU from `nvidia` driver: `echo > /sys/bus/pci/drivers/nvidia/unbind` -2. Bind to `vfio-pci`: `echo > /sys/bus/pci/drivers/vfio-pci/new_id` -3. Verify: `readlink /sys/bus/pci/devices//driver` points to `vfio-pci` -4. 
Ensure `/dev/vfio/vfio` and `/dev/vfio/{group}` are accessible - -### Host preflight state machine - -The stack classifies each NVIDIA PCI device into one of these states: - -| State | Meaning | Action | -|---|---|---| -| `NoNvidiaDevice` | No NVIDIA PCI device found | Error: no GPU to pass through | -| `BoundToNvidia` | Device on `nvidia` driver | Not available until unbound and rebound to `vfio-pci` | -| `VfioBoundDirtyGroup` | On `vfio-pci` but IOMMU group has non-VFIO peers | Report which peers need unbinding | -| `VfioBoundReady` | On `vfio-pci`, IOMMU group clean | Ready for passthrough | - -`probe_host_nvidia_vfio_readiness()` scans sysfs for vendor ID `0x10de`, checks the driver symlink, and inspects `/sys/bus/pci/devices//iommu_group/devices/` for group cleanliness. Returns per-device readiness for multi-GPU hosts. - ---- - -## Guest requirements (GPU path) - -### Guest kernel (`openshell.kconfig` additions) - -| Config | Purpose | -|---|---| -| `CONFIG_PCI`, `CONFIG_PCI_MSI` | PCIe device visibility and interrupts | -| `CONFIG_DRM` | GPU device node creation (`/dev/dri/*`) | -| `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | NVIDIA proprietary driver is a loadable module | -| `CONFIG_FB` / `CONFIG_FRAMEBUFFER_CONSOLE` | Optional: if GPU is the only display device | - -Do **not** enable `CONFIG_VFIO` in the guest (no nested passthrough). - -### Guest rootfs (GPU variant) - -The GPU rootfs extends the base rootfs with: - -- **NVIDIA kernel driver** matching the target GPU hardware generation -- **NVIDIA container toolkit** (`nvidia-container-toolkit`) so `nvidia-container-runtime` is available to containerd/k3s -- **`nvidia-smi`** for health checks - -Distribution: separate `rootfs-gpu.tar.zst` artifact alongside the base `rootfs.tar.zst`. The launcher selects the GPU variant when `--gpu` is passed. - -### Guest init (`openshell-vm-init.sh`) - -When `GPU_ENABLED=true` is set in the environment: - -1. 
Load NVIDIA kernel modules (`nvidia`, `nvidia_uvm`, `nvidia_modeset`) -2. Run `nvidia-smi` -- fail fast with a clear error if the device is not visible -3. Copy `gpu-manifests/*.yaml` (NVIDIA device plugin HelmChart CR) into the k3s auto-deploy directory -4. Verify `nvidia-container-runtime` is registered with containerd - -When `GPU_ENABLED` is unset or false: no GPU paths execute (current behavior). - ---- - -## CLI interface - -### `--gpu` - -``` -openshell gateway start --gpu # Auto-select first VFIO-ready GPU -openshell gateway start --gpu 0000:41:00.0 # Select specific PCI address -``` - -Errors: -- No NVIDIA PCI device found -- GPU not bound to `vfio-pci` (with instructions to bind) -- IOMMU group not clean (lists non-VFIO peers) -- GPU passthrough not supported on macOS -- cloud-hypervisor binary not found in runtime bundle - -### Runtime bundle - -``` -~/.local/share/openshell/vm-runtime/{version}/ -├── libkrun.so # existing -├── libkrunfw.so.5 # existing -├── gvproxy # existing -├── provenance.json # existing -├── cloud-hypervisor # new (GPU path, ~10 MB, Linux only) -├── vmlinux # new (GPU path, ~15 MB, from libkrunfw build) -└── virtiofsd # new (GPU path, ~5 MB) -``` - -`cloud-hypervisor`, `vmlinux`, and `virtiofsd` are only required for `--gpu` launches. Non-GPU launches do not validate their presence. - ---- - -## Security model - -GPU passthrough grants the guest **full device access** -- the same trust model as passing a GPU into the Docker cluster container today. The guest can issue arbitrary PCIe transactions to the device. IOMMU protects host memory from DMA attacks by the device, but the guest has unrestricted control of the GPU itself. 
- ---- - -## Constraints and limitations - -| Constraint | Impact | Mitigation | -|---|---|---| -| **Dual-backend maintenance** | Two VMM code paths for boot, networking, vsock, console | `VmBackend` trait limits blast radius; CI tests for both | -| **Linux-only GPU path** | macOS cannot use VFIO passthrough | macOS uses libkrun exclusively; GPU is out of scope for macOS | -| **NVIDIA FLR quirks** | Consumer GeForce may not reset on VM shutdown | Target data-center GPUs (A100, H100, L40) first; document supported list | -| **Single-GPU display loss** | Binding only GPU to `vfio-pci` removes host display | Document headless operation; recommend secondary GPU | -| **NVIDIA driver coupling** | Guest driver must match GPU generation | Pin driver version in rootfs; test against GPU matrix | -| **IOMMU group granularity** | Some boards group GPU with other devices | Recommend server hardware; document ACS override (unsupported) | -| **BAR size / MMIO** | Large-BAR GPUs need 64-bit MMIO support | Document BIOS settings (Above 4G Decoding, Resizable BAR) | -| **cloud-hypervisor NVIDIA issues** | Some driver failures reported upstream | Target data-center GPUs; pin cloud-hypervisor version | -| **GPU rootfs size** | NVIDIA driver + toolkit adds ~2-3 GB | Separate `rootfs-gpu.tar.zst` artifact | -| **Runtime bundle size** | cloud-hypervisor + vmlinux + virtiofsd add ~30 MB | Only in Linux GPU builds; separate tarball if needed | - ---- - -## Related documents - -- [Custom libkrun VM runtime](architecture/custom-vm-runtime.md) -- microVM layout, build pipeline, networking -- [Cluster bootstrap (Docker)](architecture/gateway-single-node.md) -- existing `--gpu` / `GPU_ENABLED` behavior -- [Implementation plan](vm-gpu-passthrough-implementation.md) -- phased work to build this -- [cloud-hypervisor VFIO docs](https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/vfio.md) -- upstream VFIO reference -- [cloud-hypervisor REST 
API](https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/api.md) -- programmatic VM management -- [rust-vmm/vfio](https://github.com/rust-vmm/vfio) -- VFIO Rust bindings used by cloud-hypervisor From 8591dc402ebb98427b10dd067ea7aff5803263cb Mon Sep 17 00:00:00 2001 From: Vincent Caux-Brisebois Date: Sun, 19 Apr 2026 19:57:40 +0000 Subject: [PATCH 3/5] refactor(vm): extract openshell-vfio crate and harden GPU passthrough lifecycle Move gpu_passthrough module into standalone openshell-vfio crate, reducing openshell-cli's dependency footprint. Key improvements: - MSI-X pre-flight check prevents cloud-hypervisor crash on MSI-only GPUs - Robust VFIO cleanup: nvidia module reload, PCI rescan fallback, independent IOMMU peer restore, sysfs write input validation - Auto-select GPU rootfs with sentinel validation - Extend GpuBindGuard lifetime to CLI process scope - Restore ip_forward to original value on teardown Signed-off-by: Vincent Caux-Brisebois --- Cargo.lock | 11 +- crates/openshell-cli/Cargo.toml | 2 +- crates/openshell-cli/src/run.rs | 11 +- crates/openshell-vfio/Cargo.toml | 20 + .../src/lib.rs} | 385 +++++++++++++++--- .../tests/gpu_passthrough_implementation.rs | 4 +- crates/openshell-vm/Cargo.toml | 1 + crates/openshell-vm/pins.env | 2 +- .../src/backend/cloud_hypervisor.rs | 4 +- crates/openshell-vm/src/lib.rs | 116 +++++- crates/openshell-vm/src/main.rs | 8 +- tasks/scripts/vm/build-rootfs-tarball.sh | 67 +-- tasks/scripts/vm/bundle-vm-runtime.sh | 9 + tasks/scripts/vm/compress-vm-runtime.sh | 22 +- 14 files changed, 540 insertions(+), 122 deletions(-) create mode 100644 crates/openshell-vfio/Cargo.toml rename crates/{openshell-vm/src/gpu_passthrough.rs => openshell-vfio/src/lib.rs} (83%) rename crates/{openshell-vm => openshell-vfio}/tests/gpu_passthrough_implementation.rs (96%) diff --git a/Cargo.lock b/Cargo.lock index d347ff86c..cc1193267 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3028,7 +3028,7 @@ dependencies = [ "openshell-prover", 
"openshell-providers", "openshell-tui", - "openshell-vm", + "openshell-vfio", "owo-colors", "prost-types", "rcgen", @@ -3271,6 +3271,14 @@ dependencies = [ "url", ] +[[package]] +name = "openshell-vfio" +version = "0.0.0" +dependencies = [ + "nix", + "tempfile", +] + [[package]] name = "openshell-vm" version = "0.0.0" @@ -3284,6 +3292,7 @@ dependencies = [ "nix", "openshell-bootstrap", "openshell-core", + "openshell-vfio", "rustls", "rustls-pemfile", "serde", diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index dd8f83bb8..20ba1e5f7 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -21,7 +21,7 @@ openshell-policy = { path = "../openshell-policy" } openshell-providers = { path = "../openshell-providers" } openshell-prover = { path = "../openshell-prover" } openshell-tui = { path = "../openshell-tui" } -openshell-vm = { path = "../openshell-vm" } +openshell-vfio = { path = "../openshell-vfio" } serde = { workspace = true } serde_json = { workspace = true } prost-types = { workspace = true } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 247f41d11..be2527295 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1434,7 +1434,7 @@ pub async fn gateway_admin_deploy( registry_username: Option<&str>, registry_token: Option<&str>, gpu: Vec, -) -> Result> { +) -> Result> { let (gpu, gpu_guard) = prepare_gateway_deploy_gpu(gpu, remote.as_deref())?; let location = if remote.is_some() { "remote" } else { "local" }; @@ -5250,10 +5250,7 @@ fn looks_like_pci_bdf(s: &str) -> bool { fn prepare_gateway_deploy_gpu( gpu: Vec, remote: Option<&str>, -) -> Result<( - Vec, - Option, -)> { +) -> Result<(Vec, Option)> { if gpu.is_empty() { return Ok((gpu, None)); } @@ -5295,8 +5292,8 @@ fn prepare_gateway_deploy_gpu( } /// Bind a GPU for VFIO passthrough and return an RAII guard that restores it on drop. 
-fn check_gpu_readiness(gpu: &[String]) -> Result { - use openshell_vm::gpu_passthrough::{GpuBindGuard, prepare_gpu_for_passthrough}; +fn check_gpu_readiness(gpu: &[String]) -> Result { + use openshell_vfio::{GpuBindGuard, prepare_gpu_for_passthrough}; let requested_addr = gpu .first() diff --git a/crates/openshell-vfio/Cargo.toml b/crates/openshell-vfio/Cargo.toml new file mode 100644 index 000000000..d4c4f32de --- /dev/null +++ b/crates/openshell-vfio/Cargo.toml @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-vfio" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +description = "Host-side NVIDIA GPU VFIO bind/unbind for VM passthrough" + +[dependencies] +nix = { workspace = true } + +[dev-dependencies] +tempfile = "3" + +[lints] +workspace = true diff --git a/crates/openshell-vm/src/gpu_passthrough.rs b/crates/openshell-vfio/src/lib.rs similarity index 83% rename from crates/openshell-vm/src/gpu_passthrough.rs rename to crates/openshell-vfio/src/lib.rs index b835bca89..f6b59d892 100644 --- a/crates/openshell-vm/src/gpu_passthrough.rs +++ b/crates/openshell-vfio/src/lib.rs @@ -1,7 +1,9 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Host-side NVIDIA GPU VFIO readiness probing for VM passthrough. +//! Host-side NVIDIA GPU VFIO bind/unbind for VM passthrough. + +#![allow(unsafe_code)] //! //! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs //! 
(vendor ID `0x10de`), checks their driver binding, and verifies IOMMU @@ -74,6 +76,25 @@ const NVIDIA_VENDOR_ID: &str = "0x10de"; #[cfg(target_os = "linux")] const SYSFS_WRITE_TIMEOUT: Duration = Duration::from_secs(10); +/// Reject sysfs data containing characters outside the safe set for shell +/// interpolation. All legitimate sysfs writes in this crate use PCI BDF +/// addresses, driver names, or single digits — this blocks anything else. +#[cfg(target_os = "linux")] +fn validate_sysfs_data(data: &str) -> Result<(), std::io::Error> { + if data.is_empty() + || data + .bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b'.' || b == b':') + { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("sysfs data contains unexpected characters: {data:?}"), + )) + } +} + #[cfg(target_os = "linux")] fn sysfs_write_with_timeout( path: &std::path::Path, @@ -83,6 +104,8 @@ fn sysfs_write_with_timeout( use std::process::{Command, Stdio}; use std::thread; + validate_sysfs_data(data)?; + let mut child = Command::new("sh") .arg("-c") .arg(format!( @@ -163,6 +186,50 @@ fn sysfs_write_with_timeout( } } +/// Check whether a PCI device supports MSI-X by walking the PCI capability +/// list in the sysfs `config` file. MSI-X is capability ID `0x11`. +/// +/// cloud-hypervisor's VFIO code assumes MSI-X and will panic if the device +/// only has MSI. This pre-flight check prevents a cryptic crash. +#[cfg(target_os = "linux")] +pub fn check_msix_support(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let config_path = sysfs.sys_bus_pci_devices().join(pci_addr).join("config"); + let config = match std::fs::read(&config_path) { + Ok(data) => data, + Err(_) => return false, + }; + + // PCI config space: capability pointer at offset 0x34. + if config.len() < 0x35 { + return false; + } + + // Status register (offset 0x06, bit 4) indicates capability list present. 
+ if config.len() > 0x07 && (config[0x06] & 0x10) == 0 { + return false; + } + + // PCI spec: capability pointers are DWORD-aligned (low 2 bits reserved). + let mut cap_ptr = (config[0x34] & 0xFC) as usize; + // Walk the capability linked list (max 48 iterations to avoid infinite loops). + for _ in 0..48 { + if cap_ptr == 0 || cap_ptr + 1 >= config.len() { + break; + } + let cap_id = config[cap_ptr]; + if cap_id == 0x11 { + return true; + } + cap_ptr = (config[cap_ptr + 1] & 0xFC) as usize; + } + false +} + +#[cfg(not(target_os = "linux"))] +pub fn check_msix_support(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + /// Validates that `addr` matches the PCI BDF format `DDDD:BB:DD.F`. fn validate_pci_addr(addr: &str) -> Result<(), std::io::Error> { let bytes = addr.as_bytes(); @@ -173,7 +240,8 @@ fn validate_pci_addr(addr: &str) -> Result<(), std::io::Error> { && bytes[..4].iter().all(|b| b.is_ascii_hexdigit()) && bytes[5..7].iter().all(|b| b.is_ascii_hexdigit()) && bytes[8..10].iter().all(|b| b.is_ascii_hexdigit()) - && bytes[11].is_ascii_digit(); + && bytes[11] >= b'0' + && bytes[11] <= b'7'; if valid { Ok(()) } else { @@ -278,8 +346,8 @@ fn probe_linux_sysfs() -> Vec<(String, HostNvidiaVfioReadiness)> { /// /// When activated, checks two conditions: /// 1. At least one NVIDIA device reports [`VfioBoundReady`]. -/// 2. The cloud-hypervisor binary exists in the runtime bundle. -pub fn nvidia_gpu_available_for_vm_passthrough() -> bool { +/// 2. The cloud-hypervisor binary exists in `runtime_dir` (if provided). 
+pub fn nvidia_gpu_available_for_vm_passthrough(runtime_dir: Option) -> bool { if std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() != Ok("1") { return false; } @@ -292,16 +360,14 @@ pub fn nvidia_gpu_available_for_vm_passthrough() -> bool { return false; } - let chv_exists = crate::configured_runtime_dir() + runtime_dir .map(|dir| dir.join("cloud-hypervisor").is_file()) - .unwrap_or(false); - - chv_exists + .unwrap_or(false) } /// Sysfs root path, defaulting to "/" in production and a temp dir in tests. #[derive(Debug, Clone)] -pub(crate) struct SysfsRoot(PathBuf); +pub struct SysfsRoot(PathBuf); impl Default for SysfsRoot { fn default() -> Self { @@ -352,7 +418,7 @@ impl SysfsRoot { } #[cfg(target_os = "linux")] -pub(crate) fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool { +pub fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool { use std::fs; let drm_dir = sysfs.sys_class_drm(); @@ -403,7 +469,7 @@ pub(crate) fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool } #[cfg(not(target_os = "linux"))] -pub(crate) fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { +pub fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { false } @@ -411,7 +477,7 @@ pub(crate) fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> boo /// Checks whether any process on the host has an open handle to an NVIDIA GPU /// device (`/dev/nvidia*`). This is a host-wide check across ALL NVIDIA GPUs, /// not scoped to a single PCI address. Returns a list of (pid, comm) pairs. 
-pub(crate) fn check_active_gpu_processes() -> std::io::Result> { +pub fn check_active_gpu_processes() -> std::io::Result> { use std::fs; let mut result = Vec::new(); @@ -459,12 +525,12 @@ pub(crate) fn check_active_gpu_processes() -> std::io::Result } #[cfg(not(target_os = "linux"))] -pub(crate) fn check_active_gpu_processes() -> std::io::Result> { +pub fn check_active_gpu_processes() -> std::io::Result> { Ok(vec![]) } #[cfg(target_os = "linux")] -pub(crate) fn check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool { +pub fn check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool { let iommu_groups = sysfs.sys_kernel_iommu_groups(); if !iommu_groups.is_dir() { return false; @@ -477,22 +543,22 @@ pub(crate) fn check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool { } #[cfg(not(target_os = "linux"))] -pub(crate) fn check_iommu_enabled(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { +pub fn check_iommu_enabled(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { false } #[cfg(target_os = "linux")] -pub(crate) fn check_vfio_modules_loaded(sysfs: &SysfsRoot) -> bool { +pub fn check_vfio_modules_loaded(sysfs: &SysfsRoot) -> bool { sysfs.sys_module("vfio_pci").is_dir() && sysfs.sys_module("vfio_iommu_type1").is_dir() } #[cfg(not(target_os = "linux"))] -pub(crate) fn check_vfio_modules_loaded(_sysfs: &SysfsRoot) -> bool { +pub fn check_vfio_modules_loaded(_sysfs: &SysfsRoot) -> bool { false } #[cfg(target_os = "linux")] -pub(crate) fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool { +pub fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool { use nix::unistd::{AccessFlags, access}; let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); @@ -507,12 +573,12 @@ pub(crate) fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool } #[cfg(not(target_os = "linux"))] -pub(crate) fn check_sysfs_permissions(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { +pub fn check_sysfs_permissions(_sysfs: &SysfsRoot, _pci_addr: 
&str) -> bool { false } #[cfg(target_os = "linux")] -pub(crate) fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option { +pub fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option { let driver_link = sysfs.sys_bus_pci_devices().join(pci_addr).join("driver"); std::fs::read_link(&driver_link) .ok() @@ -520,7 +586,7 @@ pub(crate) fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option Option { +pub fn current_driver(_sysfs: &SysfsRoot, _pci_addr: &str) -> Option { None } @@ -623,11 +689,52 @@ fn nvidia_pre_unbind_prep(pci_addr: &str) { } } +/// Reload nvidia kernel modules so the driver's sysfs bind file exists. +/// +/// Called during restore to ensure `modprobe nvidia` brings back the driver +/// that `nvidia_pre_unbind_prep` may have unloaded. Loads the base `nvidia` +/// module plus its dependent submodules in the correct order. #[cfg(target_os = "linux")] -pub(crate) fn bind_gpu_to_vfio( - sysfs: &SysfsRoot, - pci_addr: &str, -) -> Result { +fn nvidia_reload_modules() { + use std::process::{Command, Stdio}; + + // Load in dependency order: base module first, then dependents. + // If the base "nvidia" module fails, skip submodules (they depend on it). 
+ for (i, module) in ["nvidia", "nvidia_modeset", "nvidia_uvm", "nvidia_drm"] + .iter() + .enumerate() + { + let mut cmd = Command::new("modprobe"); + cmd.arg(module) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU: loaded {module} for restore"); + } + None => { + eprintln!( + "GPU: modprobe {module} timed out after {:.0}s during restore", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + break; + } + Some(s) => { + eprintln!( + "GPU: modprobe {module} exited {} during restore (non-fatal)", + s.code().unwrap_or(-1) + ); + if i == 0 { + break; + } + } + } + } +} + +#[cfg(target_os = "linux")] +pub fn bind_gpu_to_vfio(sysfs: &SysfsRoot, pci_addr: &str) -> Result { validate_pci_addr(pci_addr)?; let drv = current_driver(sysfs, pci_addr); @@ -736,19 +843,36 @@ pub(crate) fn bind_gpu_to_vfio( )); } - Ok(drv.unwrap_or_default()) + // When the device had no driver (e.g. nvidia modules were already unloaded + // from a previous crash), infer "nvidia" from the vendor ID so the restore + // path knows which driver to rebind to. 
+ let original = match drv { + Some(d) if !d.is_empty() => d, + _ => { + let vendor = std::fs::read_to_string(dev_dir.join("vendor")) + .map(|v| v.trim().to_lowercase()) + .unwrap_or_default(); + if vendor == NVIDIA_VENDOR_ID { + eprintln!( + "GPU {pci_addr}: no driver was bound, defaulting restore target to nvidia" + ); + "nvidia".to_string() + } else { + String::new() + } + } + }; + + Ok(original) } #[cfg(not(target_os = "linux"))] -pub(crate) fn bind_gpu_to_vfio( - _sysfs: &SysfsRoot, - _pci_addr: &str, -) -> Result { +pub fn bind_gpu_to_vfio(_sysfs: &SysfsRoot, _pci_addr: &str) -> Result { Ok(String::new()) } #[cfg(target_os = "linux")] -pub(crate) fn rebind_gpu_to_original( +pub fn rebind_gpu_to_original( sysfs: &SysfsRoot, pci_addr: &str, original_driver: &str, @@ -791,21 +915,39 @@ pub(crate) fn rebind_gpu_to_original( })?; if !original_driver.is_empty() && original_driver != "none" { + // The nvidia driver bind path requires the kernel module to be loaded. + // nvidia_pre_unbind_prep may have unloaded it (cascade from submodules), + // or it may have been absent since before we started. Reload it so the + // driver's bind file exists in sysfs. + if original_driver == "nvidia" && sysfs.is_real_sysfs() { + nvidia_reload_modules(); + } + let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); - sysfs.write_sysfs(&bind, pci_addr).map_err(|e| { - let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { - " — run as root" - } else { - "" - }; - std::io::Error::new( - e.kind(), - format!( - "Failed to rebind to {original_driver} at {path}{hint}", - path = bind.display() - ), - ) - })?; + if let Err(e) = sysfs.write_sysfs(&bind, pci_addr) { + eprintln!( + "GPU {pci_addr}: explicit bind to {original_driver} failed ({e}), \ + falling back to PCI rescan" + ); + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + let _ = sysfs.write_sysfs(&rescan, "1"); + // Give the kernel time to re-probe and bind drivers. 
+ std::thread::sleep(Duration::from_secs(1)); + + if current_driver(sysfs, pci_addr).is_none() { + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to restore {pci_addr} to {original_driver}: \ + explicit bind and PCI rescan both failed. \ + Manual fix: sudo modprobe nvidia && echo {pci_addr} | \ + sudo tee /sys/bus/pci/drivers/nvidia/bind" + ), + )); + } + let new_drv = current_driver(sysfs, pci_addr).unwrap_or_default(); + eprintln!("GPU {pci_addr}: PCI rescan bound device to {new_drv}"); + } } else { let rescan = sysfs.0.join("sys/bus/pci/rescan"); let _ = sysfs.write_sysfs(&rescan, "1"); @@ -815,7 +957,7 @@ pub(crate) fn rebind_gpu_to_original( } #[cfg(not(target_os = "linux"))] -pub(crate) fn rebind_gpu_to_original( +pub fn rebind_gpu_to_original( _sysfs: &SysfsRoot, _pci_addr: &str, _original_driver: &str, @@ -824,10 +966,7 @@ pub(crate) fn rebind_gpu_to_original( } #[cfg(target_os = "linux")] -pub(crate) fn iommu_group_peers( - sysfs: &SysfsRoot, - pci_addr: &str, -) -> Result, std::io::Error> { +pub fn iommu_group_peers(sysfs: &SysfsRoot, pci_addr: &str) -> Result, std::io::Error> { validate_pci_addr(pci_addr)?; let iommu_devices = sysfs .sys_bus_pci_devices() @@ -851,7 +990,7 @@ pub(crate) fn iommu_group_peers( } #[cfg(not(target_os = "linux"))] -pub(crate) fn iommu_group_peers( +pub fn iommu_group_peers( _sysfs: &SysfsRoot, _pci_addr: &str, ) -> Result, std::io::Error> { @@ -859,7 +998,7 @@ pub(crate) fn iommu_group_peers( } #[cfg(target_os = "linux")] -pub(crate) fn bind_iommu_group_peers( +pub fn bind_iommu_group_peers( sysfs: &SysfsRoot, pci_addr: &str, ) -> Result, std::io::Error> { @@ -890,7 +1029,7 @@ pub(crate) fn bind_iommu_group_peers( } #[cfg(not(target_os = "linux"))] -pub(crate) fn bind_iommu_group_peers( +pub fn bind_iommu_group_peers( _sysfs: &SysfsRoot, _pci_addr: &str, ) -> Result, std::io::Error> { @@ -898,7 +1037,7 @@ pub(crate) fn bind_iommu_group_peers( } #[cfg(target_os = "linux")] -pub(crate) fn 
rebind_iommu_group_peers( +pub fn rebind_iommu_group_peers( sysfs: &SysfsRoot, peers: &[(String, String)], ) -> Result<(), std::io::Error> { @@ -917,7 +1056,7 @@ pub(crate) fn rebind_iommu_group_peers( } #[cfg(not(target_os = "linux"))] -pub(crate) fn rebind_iommu_group_peers( +pub fn rebind_iommu_group_peers( _sysfs: &SysfsRoot, _peers: &[(String, String)], ) -> Result<(), std::io::Error> { @@ -959,17 +1098,28 @@ impl GpuBindState { self.restore_with_sysfs(&SysfsRoot::default()) } - pub(crate) fn restore_with_sysfs(&self, sysfs: &SysfsRoot) -> Result<(), std::io::Error> { + pub fn restore_with_sysfs(&self, sysfs: &SysfsRoot) -> Result<(), std::io::Error> { if !self.did_bind { return Ok(()); } + + // Always attempt peer restore even if GPU restore fails, so the + // audio companion (and any other IOMMU group peers) aren't left + // stranded on vfio-pci. eprintln!( "GPU: rebinding {} to {}", self.pci_addr, self.original_driver ); - rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver)?; - rebind_iommu_group_peers(sysfs, &self.peer_binds)?; - Ok(()) + let gpu_result = rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver); + let peer_result = rebind_iommu_group_peers(sysfs, &self.peer_binds); + + if let Err(ref gpu_err) = gpu_result { + if let Err(ref peer_err) = peer_result { + eprintln!("GPU: peer restore also failed: {peer_err}"); + } + return Err(std::io::Error::new(gpu_err.kind(), gpu_err.to_string())); + } + peer_result } } @@ -1024,7 +1174,7 @@ pub fn prepare_gpu_for_passthrough( prepare_gpu_with_sysfs(&SysfsRoot::default(), requested_bdf) } -pub(crate) fn prepare_gpu_with_sysfs( +pub fn prepare_gpu_with_sysfs( sysfs: &SysfsRoot, requested_bdf: Option<&str>, ) -> Result { @@ -1064,6 +1214,17 @@ fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result Result { .map_err(|e| std::io::Error::new(e.kind(), format!("cannot verify GPUs are idle — {e}")))?; for addr in &nvidia_addrs { + if !check_msix_support(sysfs, addr) { + 
blocked.push(( + addr.clone(), + "no MSI-X support (required by cloud-hypervisor)".to_string(), + )); + continue; + } + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") { blocked.push((addr.clone(), "IOMMU group not clean".to_string())); continue; @@ -1258,7 +1427,7 @@ mod tests { // SAFETY: test runs single-threaded; no other thread reads this var. unsafe { std::env::remove_var("OPENSHELL_VM_GPU_E2E") }; assert!( - !nvidia_gpu_available_for_vm_passthrough(), + !nvidia_gpu_available_for_vm_passthrough(None), "gate must return false when OPENSHELL_VM_GPU_E2E is unset" ); } @@ -1305,12 +1474,30 @@ mod tests { } } + /// Build a minimal PCI config space (64 bytes) with a capability list + /// containing a single MSI-X entry (cap ID 0x11) so `check_msix_support` + /// sees the device as passthrough-capable. + fn mock_pci_config_with_msix() -> Vec { + let mut cfg = vec![0u8; 64]; + // Status register (offset 0x06): set bit 4 = capabilities list present. + cfg[0x06] = 0x10; + // Capabilities pointer (offset 0x34): first cap at 0x40. + cfg[0x34] = 0x40; + // Extend to include the capability at offset 0x40. + cfg.resize(0x42, 0); + // Cap at 0x40: ID = 0x11 (MSI-X), next = 0x00 (end of list). 
+ cfg[0x40] = 0x11; + cfg[0x41] = 0x00; + cfg + } + fn mock_pci_device(root: &Path, pci_addr: &str, vendor: &str, driver: Option<&str>) { use std::fs; let dev_dir = root.join("sys/bus/pci/devices").join(pci_addr); fs::create_dir_all(&dev_dir).unwrap(); fs::write(dev_dir.join("vendor"), vendor).unwrap(); fs::write(dev_dir.join("class"), "0x030000").unwrap(); + fs::write(dev_dir.join("config"), mock_pci_config_with_msix()).unwrap(); if let Some(drv) = driver { let driver_dir = root.join("sys/bus/pci/drivers").join(drv); fs::create_dir_all(&driver_dir).unwrap(); @@ -1890,6 +2077,82 @@ mod tests { state.restore_with_sysfs(&sysfs).unwrap(); } + #[test] + #[cfg(target_os = "linux")] + fn bind_unbound_nvidia_defaults_to_nvidia_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + // Device with no driver bound (simulating post-crash state). + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + result, "nvidia", + "unbound NVIDIA device should default to nvidia as restore driver" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_detected_in_config() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_absent_msi_only() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + // Config with MSI (cap 0x05) only, no MSI-X (0x11). 
+ let mut cfg = vec![0u8; 0x42]; + cfg[0x06] = 0x10; // capabilities list present + cfg[0x34] = 0x40; // cap pointer + cfg[0x40] = 0x05; // MSI capability + cfg[0x41] = 0x00; // end of list + fs::write(dev_dir.join("config"), &cfg).unwrap(); + assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_empty_cap_list() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + let mut cfg = vec![0u8; 0x40]; + cfg[0x06] = 0x10; // capabilities list present + cfg[0x34] = 0x00; // null cap pointer + fs::write(dev_dir.join("config"), &cfg).unwrap(); + assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_circular_cap_list() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + // Circular: cap at 0x40 points back to 0x40. + let mut cfg = vec![0u8; 0x42]; + cfg[0x06] = 0x10; + cfg[0x34] = 0x40; + cfg[0x40] = 0x05; // MSI (not MSI-X) + cfg[0x41] = 0x40; // points back to self + fs::write(dev_dir.join("config"), &cfg).unwrap(); + // Should terminate via the 48-iteration guard, not hang. 
+ assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + #[test] fn guard_has_pci_addr() { let state = GpuBindState { diff --git a/crates/openshell-vm/tests/gpu_passthrough_implementation.rs b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs similarity index 96% rename from crates/openshell-vm/tests/gpu_passthrough_implementation.rs rename to crates/openshell-vfio/tests/gpu_passthrough_implementation.rs index 4985ba39b..a9bbd7bdc 100644 --- a/crates/openshell-vm/tests/gpu_passthrough_implementation.rs +++ b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs @@ -6,7 +6,7 @@ //! Gated by `OPENSHELL_VM_GPU_E2E=1`. On machines without a real GPU, //! all tests early-return and pass. -use openshell_vm::gpu_passthrough::{ +use openshell_vfio::{ GpuBindGuard, HostNvidiaVfioReadiness, prepare_gpu_for_passthrough, probe_host_nvidia_vfio_readiness, }; @@ -22,7 +22,7 @@ fn nvidia_gpu_passthrough_is_available() { return; } assert!( - openshell_vm::gpu_passthrough::nvidia_gpu_available_for_vm_passthrough(), + openshell_vfio::nvidia_gpu_available_for_vm_passthrough(None), "GPU passthrough gate returned false on a GPU CI runner — \ check VFIO binding and cloud-hypervisor runtime bundle" ); diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml index 388e42351..aa3d85a4a 100644 --- a/crates/openshell-vm/Cargo.toml +++ b/crates/openshell-vm/Cargo.toml @@ -28,6 +28,7 @@ miette = { workspace = true } nix = { workspace = true } openshell-bootstrap = { path = "../openshell-bootstrap" } openshell-core = { path = "../openshell-core" } +openshell-vfio = { path = "../openshell-vfio" } serde = { workspace = true } serde_json = "1" tar = "0.4" diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index d44f044c8..4da05e089 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -71,4 +71,4 @@ VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" # Consumer GPUs (GeForce) may work but are not officially 
supported # for VFIO passthrough. NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" -NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.17.5}" +NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.19.0}" diff --git a/crates/openshell-vm/src/backend/cloud_hypervisor.rs b/crates/openshell-vm/src/backend/cloud_hypervisor.rs index 869b1747d..e6c89a93c 100644 --- a/crates/openshell-vm/src/backend/cloud_hypervisor.rs +++ b/crates/openshell-vm/src/backend/cloud_hypervisor.rs @@ -956,7 +956,9 @@ fn teardown_chv_host_networking(original_ip_forward: &str) { if original_ip_forward != "1" { let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward); } - eprintln!("host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}"); + eprintln!( + "host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}" + ); } /// Start a background TCP proxy that forwards `127.0.0.1:{host_port}` diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 9b70b32cf..27b0ed843 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -18,7 +18,6 @@ pub mod backend; mod embedded; mod exec; mod ffi; -pub mod gpu_passthrough; mod health; use std::ffi::CString; @@ -54,9 +53,12 @@ pub enum VmError { The --gpu flag requires a rootfs built with GPU support (NVIDIA drivers,\n\ nvidia-container-toolkit, and GPU manifests).\n\ Build one with:\n\ - \x20 ./crates/openshell-vm/scripts/build-rootfs.sh --gpu \n\ - Then either:\n\ - \x20 - Copy it to: {path}\n\ + \x20 mise run vm:rootfs -- --base --gpu\n\ + \x20 mise run vm:build\n\ + Or manually:\n\ + \x20 - Place rootfs-gpu.tar.zst in the openshell-vm.runtime/ sidecar directory\n\ + \x20 - Or set OPENSHELL_VM_GPU_ROOTFS_TARBALL=/path/to/rootfs-gpu.tar.zst\n\ + \x20 - Or copy the extracted rootfs to: {path}\n\ \x20 - Or use: openshell-vm --gpu --rootfs " )] GpuRootfsNotFound { path: String }, @@ 
-338,16 +340,108 @@ pub fn named_gpu_rootfs_dir(instance_name: &str) -> Result { /// Ensure a GPU rootfs exists for the named instance. /// -/// Unlike [`ensure_named_rootfs`], there is no embedded GPU rootfs to -/// extract — the user must pre-build it with `build-rootfs.sh --gpu`. +/// When the GPU rootfs directory doesn't exist, looks for a +/// `rootfs-gpu.tar.zst` tarball in these locations (in order): +/// +/// 1. Sidecar runtime dir: `/openshell-vm.runtime/rootfs-gpu.tar.zst` +/// 2. Environment variable: `OPENSHELL_VM_GPU_ROOTFS_TARBALL` +/// +/// If found, extracts to the instance `rootfs-gpu` path. This mirrors the +/// pattern used by [`ensure_named_rootfs`] for the standard rootfs. +/// +/// Validates that the rootfs contains the `.rootfs-gpu` sentinel written +/// by `build-rootfs.sh --gpu`, catching the case where a regular rootfs +/// was accidentally placed at the `rootfs-gpu` path. pub fn ensure_gpu_rootfs(instance_name: &str) -> Result { let gpu_rootfs = named_gpu_rootfs_dir(instance_name)?; - if gpu_rootfs.is_dir() { - return Ok(gpu_rootfs); + if !gpu_rootfs.is_dir() { + if let Some(tarball) = find_gpu_rootfs_tarball() { + extract_gpu_rootfs_tarball(&tarball, &gpu_rootfs)?; + } else { + return Err(VmError::GpuRootfsNotFound { + path: gpu_rootfs.display().to_string(), + }); + } } - Err(VmError::GpuRootfsNotFound { - path: gpu_rootfs.display().to_string(), - }) + + let sentinel = gpu_rootfs.join("opt/openshell/.rootfs-gpu"); + if !sentinel.is_file() { + return Err(VmError::GpuRootfsNotFound { + path: format!( + "{} (directory exists but missing .rootfs-gpu sentinel — \ + was it built with --gpu?)", + gpu_rootfs.display() + ), + }); + } + + eprintln!("GPU rootfs: {}", gpu_rootfs.display()); + Ok(gpu_rootfs) +} + +const GPU_ROOTFS_TARBALL_ENV: &str = "OPENSHELL_VM_GPU_ROOTFS_TARBALL"; +const GPU_ROOTFS_TARBALL_NAME: &str = "rootfs-gpu.tar.zst"; + +/// Search for a GPU rootfs tarball in known locations. 
+fn find_gpu_rootfs_tarball() -> Option { + // 1. Sidecar runtime dir next to the binary + if let Ok(exe) = std::env::current_exe() { + if let Some(exe_dir) = exe.parent() { + let sidecar = exe_dir + .join("openshell-vm.runtime") + .join(GPU_ROOTFS_TARBALL_NAME); + if sidecar.is_file() { + return Some(sidecar); + } + } + } + + // 2. Environment variable override + if let Some(path) = std::env::var_os(GPU_ROOTFS_TARBALL_ENV) { + let path = PathBuf::from(path); + if path.is_file() { + return Some(path); + } + } + + None +} + +/// Extract a `rootfs-gpu.tar.zst` tarball into the given destination directory. +fn extract_gpu_rootfs_tarball(tarball: &Path, dest: &Path) -> Result<(), VmError> { + eprintln!( + "Extracting GPU rootfs...\n source: {}\n dest: {}", + tarball.display(), + dest.display() + ); + + let file = std::fs::File::open(tarball).map_err(|e| { + VmError::HostSetup(format!( + "open GPU rootfs tarball {}: {e}", + tarball.display() + )) + })?; + + let decoder = zstd::Decoder::new(std::io::BufReader::new(file)).map_err(|e| { + VmError::HostSetup(format!( + "create zstd decoder for {}: {e}", + tarball.display() + )) + })?; + + std::fs::create_dir_all(dest).map_err(|e| { + VmError::HostSetup(format!("create GPU rootfs dir {}: {e}", dest.display())) + })?; + + let mut archive = tar::Archive::new(decoder); + archive.unpack(dest).map_err(|e| { + // Clean up partial extraction + let _ = std::fs::remove_dir_all(dest); + VmError::HostSetup(format!("extract GPU rootfs tarball: {e}")) + })?; + + eprintln!(" GPU rootfs extracted to {}", dest.display()); + Ok(()) } /// Ensure a named instance rootfs exists, extracting from the embedded diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index 1b3aa6423..4d201cbe1 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -249,21 +249,21 @@ fn run(cli: Cli) -> Result> { let (gpu_enabled, vfio_device, _gpu_guard) = match cli.gpu { Some(ref addr) if addr != "auto" => { - 
let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(Some(addr))?; + let state = openshell_vfio::prepare_gpu_for_passthrough(Some(addr))?; let bdf = state.pci_addr.clone(); ( true, Some(bdf), - Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + Some(openshell_vfio::GpuBindGuard::new(state)), ) } Some(_) => { - let state = openshell_vm::gpu_passthrough::prepare_gpu_for_passthrough(None)?; + let state = openshell_vfio::prepare_gpu_for_passthrough(None)?; let bdf = state.pci_addr.clone(); ( true, Some(bdf), - Some(openshell_vm::gpu_passthrough::GpuBindGuard::new(state)), + Some(openshell_vfio::GpuBindGuard::new(state)), ) } None => (false, None, None), diff --git a/tasks/scripts/vm/build-rootfs-tarball.sh b/tasks/scripts/vm/build-rootfs-tarball.sh index 76e4f6297..d41b2ff25 100755 --- a/tasks/scripts/vm/build-rootfs-tarball.sh +++ b/tasks/scripts/vm/build-rootfs-tarball.sh @@ -9,36 +9,43 @@ # 2. Compresses it to a zstd tarball for embedding # # Usage: -# ./build-rootfs-tarball.sh [--base] +# ./build-rootfs-tarball.sh [--base] [--gpu] # # Options: # --base Build a base rootfs (~200-300MB) without pre-loaded images. # First boot will be slower but binary size is much smaller. # Default: full rootfs with pre-loaded images (~2GB+). +# --gpu Include NVIDIA drivers and nvidia-container-toolkit for GPU +# passthrough. Only supported on x86_64. # -# The resulting tarball is placed at target/vm-runtime-compressed/rootfs.tar.zst -# for inclusion in the embedded binary build. +# The resulting tarball is placed at: +# target/vm-runtime-compressed/rootfs.tar.zst (standard) +# target/vm-runtime-compressed/rootfs-gpu.tar.zst (--gpu) set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" ROOTFS_BUILD_DIR="${ROOT}/target/rootfs-build" OUTPUT_DIR="${ROOT}/target/vm-runtime-compressed" -OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" # Parse arguments BASE_ONLY=false +GPU=false for arg in "$@"; do case "$arg" in --base) BASE_ONLY=true ;; + --gpu) + GPU=true + ;; --help|-h) - echo "Usage: $0 [--base]" + echo "Usage: $0 [--base] [--gpu]" echo "" echo "Options:" echo " --base Build base rootfs (~200-300MB) without pre-loaded images" echo " First boot will be slower but binary size is much smaller" + echo " --gpu Include NVIDIA drivers for GPU passthrough (x86_64 only)" exit 0 ;; *) @@ -63,28 +70,33 @@ if ! docker info &>/dev/null; then exit 1 fi +ROOTFS_ARGS=() +MODE_DESC="full (pre-loaded images, pre-initialized, ~2GB+)" if [ "$BASE_ONLY" = true ]; then - echo "==> Building BASE rootfs for embedding" - echo " Build dir: ${ROOTFS_BUILD_DIR}" - echo " Output: ${OUTPUT}" - echo " Mode: base (no pre-loaded images, ~200-300MB)" - echo "" - - # Build base rootfs - echo "==> Step 1/2: Building base rootfs..." - "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" --base "${ROOTFS_BUILD_DIR}" + ROOTFS_ARGS+=(--base) + MODE_DESC="base (no pre-loaded images, ~200-300MB)" +fi +if [ "$GPU" = true ]; then + ROOTFS_ARGS+=(--gpu) + MODE_DESC="${MODE_DESC}, GPU (NVIDIA drivers included)" +fi + +# GPU rootfs gets a distinct tarball name so both can coexist in the output dir +if [ "$GPU" = true ]; then + OUTPUT="${OUTPUT_DIR}/rootfs-gpu.tar.zst" else - echo "==> Building FULL rootfs for embedding" - echo " Build dir: ${ROOTFS_BUILD_DIR}" - echo " Output: ${OUTPUT}" - echo " Mode: full (pre-loaded images, pre-initialized, ~2GB+)" - echo "" - - # Build full rootfs - echo "==> Step 1/2: Building full rootfs (this may take 10-15 minutes)..." 
- "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" "${ROOTFS_BUILD_DIR}" + OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" fi +echo "==> Building rootfs for embedding" +echo " Build dir: ${ROOTFS_BUILD_DIR}" +echo " Output: ${OUTPUT}" +echo " Mode: ${MODE_DESC}" +echo "" + +echo "==> Step 1/2: Building rootfs..." +"${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" "${ROOTFS_ARGS[@]}" "${ROOTFS_BUILD_DIR}" + # Compress to tarball echo "" echo "==> Step 2/2: Compressing rootfs to tarball..." @@ -107,10 +119,13 @@ echo "" echo "==> Rootfs tarball created successfully!" echo " Output: ${OUTPUT}" echo " Compressed: $(du -sh "${OUTPUT}" | cut -f1)" +TYPE_DESC="full (first boot ~3-5s, images pre-loaded)" if [ "$BASE_ONLY" = true ]; then - echo " Type: base (first boot ~30-60s, images pulled on demand)" -else - echo " Type: full (first boot ~3-5s, images pre-loaded)" + TYPE_DESC="base (first boot ~30-60s, images pulled on demand)" +fi +if [ "$GPU" = true ]; then + TYPE_DESC="${TYPE_DESC}, GPU" fi +echo " Type: ${TYPE_DESC}" echo "" echo "Next step: mise run vm:build" diff --git a/tasks/scripts/vm/bundle-vm-runtime.sh b/tasks/scripts/vm/bundle-vm-runtime.sh index 6c21e511d..83d53dcac 100755 --- a/tasks/scripts/vm/bundle-vm-runtime.sh +++ b/tasks/scripts/vm/bundle-vm-runtime.sh @@ -46,6 +46,9 @@ TARGETS=( "${ROOT}/target/release" ) +COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed" +GPU_ROOTFS_TARBALL="${COMPRESSED_DIR}/rootfs-gpu.tar.zst" + for target_dir in "${TARGETS[@]}"; do # Only stage if the binary exists (avoid creating orphan runtime dirs) if [ ! -f "${target_dir}/openshell-vm" ] && [ ! 
-f "${target_dir}/openshell-vm.d" ]; then @@ -61,5 +64,11 @@ for target_dir in "${TARGETS[@]}"; do install -m 0755 "$file" "${runtime_dir}/${name}" done + # Stage the GPU rootfs tarball if it was built + if [ -f "${GPU_ROOTFS_TARBALL}" ]; then + install -m 0644 "${GPU_ROOTFS_TARBALL}" "${runtime_dir}/rootfs-gpu.tar.zst" + echo "staged GPU rootfs tarball in ${runtime_dir}" + fi + echo "staged runtime bundle in ${runtime_dir}" done diff --git a/tasks/scripts/vm/compress-vm-runtime.sh b/tasks/scripts/vm/compress-vm-runtime.sh index efada8a2e..82a7c4b8f 100755 --- a/tasks/scripts/vm/compress-vm-runtime.sh +++ b/tasks/scripts/vm/compress-vm-runtime.sh @@ -91,8 +91,8 @@ if [ -z "${VM_RUNTIME_TARBALL:-}" ] && _check_compressed_artifacts "$OUTPUT_DIR" for f in "${OUTPUT_DIR}"/*.zst; do [ -f "$f" ] || continue name="$(basename "${f%.zst}")" - # Skip rootfs tarball — bundle-vm-runtime.sh doesn't need it - [[ "$name" == rootfs.tar ]] && continue + # Skip rootfs tarballs — bundle-vm-runtime.sh doesn't need them + [[ "$name" == rootfs.tar || "$name" == rootfs-gpu.tar ]] && continue zstd -d "$f" -o "${WORK_DIR}/${name}" -f -q chmod 0755 "${WORK_DIR}/${name}" done @@ -126,8 +126,9 @@ if [ -n "${VM_RUNTIME_TARBALL:-}" ]; then echo "" compress_dir "$WORK_DIR" "$OUTPUT_DIR" - # Check for rootfs tarball (built separately) + # Check for rootfs tarballs (built separately) ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs.tar.zst" + GPU_ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs-gpu.tar.zst" if [ -f "$ROOTFS_TARBALL" ]; then echo " rootfs.tar.zst: $(du -h "$ROOTFS_TARBALL" | cut -f1) (pre-built)" else @@ -135,6 +136,9 @@ if [ -n "${VM_RUNTIME_TARBALL:-}" ]; then echo "Note: rootfs.tar.zst not found." 
echo " To build one, run: mise run vm:rootfs -- --base" fi + if [ -f "$GPU_ROOTFS_TARBALL" ]; then + echo " rootfs-gpu.tar.zst: $(du -h "$GPU_ROOTFS_TARBALL" | cut -f1) (pre-built)" + fi echo "" echo "==> Compressed artifacts in ${OUTPUT_DIR}:" @@ -272,16 +276,20 @@ ls -lah "$WORK_DIR" echo "" compress_dir "$WORK_DIR" "$OUTPUT_DIR" -# Check for rootfs tarball (built separately by build-rootfs-tarball.sh) +# Check for rootfs tarballs (built separately by build-rootfs-tarball.sh) ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs.tar.zst" +GPU_ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs-gpu.tar.zst" if [ -f "$ROOTFS_TARBALL" ]; then echo " rootfs.tar.zst: $(du -h "$ROOTFS_TARBALL" | cut -f1) (pre-built)" else echo "" echo "Note: rootfs.tar.zst not found." - echo " To build one, run: mise run vm:rootfs -- --base" - echo " Without it, the binary will still work but require the rootfs" - echo " to be built separately on first run." + echo " To build one, run: mise run vm:rootfs -- --base" + echo " Without it, the binary will still work but require the rootfs" + echo " to be built separately on first run." 
+fi +if [ -f "$GPU_ROOTFS_TARBALL" ]; then + echo " rootfs-gpu.tar.zst: $(du -h "$GPU_ROOTFS_TARBALL" | cut -f1) (pre-built)" fi echo "" From 6ef5a211639be0e7cc36d516f062ecee06b8c23f Mon Sep 17 00:00:00 2001 From: Vincent Caux-Brisebois Date: Thu, 23 Apr 2026 15:06:45 +0000 Subject: [PATCH 4/5] adding qemu VM backend Signed-off-by: Vincent Caux-Brisebois --- architecture/custom-vm-runtime.md | 117 +- architecture/vm-gpu-passthrough.md | 119 +- crates/openshell-vfio/src/lib.rs | 788 +++++++++++-- crates/openshell-vm/build.rs | 94 +- crates/openshell-vm/pins.env | 8 + .../runtime/kernel/openshell.kconfig | 21 + crates/openshell-vm/scripts/build-rootfs.sh | 180 ++- .../openshell-vm/scripts/openshell-vm-init.sh | 106 +- .../src/backend/cloud_hypervisor.rs | 454 ++------ crates/openshell-vm/src/backend/libkrun.rs | 9 +- crates/openshell-vm/src/backend/mod.rs | 423 ++++++- crates/openshell-vm/src/backend/qemu.rs | 1021 +++++++++++++++++ crates/openshell-vm/src/embedded.rs | 11 +- crates/openshell-vm/src/exec.rs | 126 +- crates/openshell-vm/src/lib.rs | 326 +++++- crates/openshell-vm/src/main.rs | 181 ++- crates/openshell-vm/tests/vm_boot_smoke.rs | 112 +- ...-cloud-hypervisor.sh => build-gpu-deps.sh} | 23 +- tasks/scripts/vm/build-libkrun.sh | 25 +- tasks/scripts/vm/build-nvidia-modules.sh | 182 +++ tasks/scripts/vm/compress-vm-runtime.sh | 20 + tasks/scripts/vm/qemu-check.sh | 80 ++ tasks/scripts/vm/vm-setup.sh | 5 + tasks/vm.toml | 25 +- 24 files changed, 3731 insertions(+), 725 deletions(-) create mode 100644 crates/openshell-vm/src/backend/qemu.rs rename tasks/scripts/vm/{build-cloud-hypervisor.sh => build-gpu-deps.sh} (73%) create mode 100755 tasks/scripts/vm/build-nvidia-modules.sh create mode 100755 tasks/scripts/vm/qemu-check.sh diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index 6dac41064..6105187e6 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -4,17 +4,21 @@ ## Overview -The 
OpenShell gateway VM supports two hypervisor backends: +The OpenShell gateway VM supports three hypervisor backends: - **libkrun** (default) — lightweight VMM using Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel is embedded inside `libkrunfw`. Uses virtio-MMIO device transport and gvproxy for user-space networking. - **cloud-hypervisor** — Linux-only KVM-based VMM used for GPU passthrough (VFIO). Uses virtio-PCI device transport, TAP networking, and requires a separate `vmlinux` kernel and - `virtiofsd` for rootfs access. + `virtiofsd` for rootfs access. Requires GPU MSI-X support. +- **QEMU** — Linux-only fallback VMM for GPU passthrough when the GPU lacks MSI-X support. + Uses the same TAP networking, `vmlinux`, and `virtiofsd` as cloud-hypervisor. QEMU binary + is not embedded — it must be installed on the host. -Backend selection is automatic: `--gpu` selects cloud-hypervisor, otherwise libkrun is used. -The `--backend` flag provides explicit control (`auto`, `libkrun`, `cloud-hypervisor`). +Backend selection is automatic: `--gpu` selects cloud-hypervisor (MSI-X GPU) or QEMU +(non-MSI-X GPU), otherwise libkrun is used. The `--backend` flag provides explicit control +(`auto`, `libkrun`, `cloud-hypervisor`, `qemu`). When `--gpu` is passed, `openshell-vm` automatically binds an eligible GPU to `vfio-pci` and restores it to the original driver on shutdown. 
See @@ -38,6 +42,7 @@ graph TD PROV[Runtime provenance logging] GVP[gvproxy networking proxy] CHV_BIN["cloud-hypervisor · virtiofsd · vmlinux\n(GPU runtime bundle)"] + QEMU_BIN["qemu-system-x86_64\n(host-installed, GPU fallback)"] BIN --> EMB BIN -->|extracts to| CACHE @@ -60,6 +65,7 @@ graph TD BIN -- "libkrun: fork + krun_start_enter" --> INIT BIN -- "CHV: cloud-hypervisor API + virtiofsd" --> INIT + BIN -- "QEMU: qemu-system-x86_64 + virtiofsd" --> INIT GVP -- "virtio-net (libkrun only)" --> Guest ``` @@ -117,28 +123,31 @@ mise run vm:build # Rebuild binary with full rootfs ## Backend Comparison -| | libkrun (default) | cloud-hypervisor | -|---|---|---| -| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | -| Device transport | virtio-MMIO | virtio-PCI | -| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | -| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | -| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | -| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | -| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | -| GPU passthrough | Not supported | VFIO PCI passthrough | -| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | -| CLI flag | `--backend libkrun` | `--backend cloud-hypervisor` or `--gpu` | +| | libkrun (default) | cloud-hypervisor | QEMU | +|---|---|---|---| +| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | Linux (KVM) only | +| Device transport | virtio-MMIO | virtio-PCI | virtio-PCI | +| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | TAP (requires root/CAP_NET_ADMIN) | +| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | virtiofsd (virtio-fs daemon) | +| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | Separate `vmlinux` file | +| Console | virtio-console (`hvc0`) | 8250 UART 
(`ttyS0`) | 8250 UART (`ttyS0`) | +| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | ACPI poweroff (`poweroff -f`) | +| GPU passthrough | Not supported | VFIO PCI (requires MSI-X) | VFIO PCI (MSI-X not required) | +| Vsock | libkrun built-in | Unix socket + text protocol | `AF_VSOCK` (kernel `vhost_vsock`) | +| VM control | krun C API | REST API over Unix socket | Command-line args | +| Binary source | Embedded in runtime | Runtime bundle | Host-installed | +| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | Wrapper script with ACPI shutdown | +| CLI flag | `--backend libkrun` | `--backend cloud-hypervisor` or `--gpu` | `--backend qemu` | ### Exec mode differences With libkrun, when `--exec <command>` is used, the command replaces the init process and the VM exits when PID 1 exits. -With cloud-hypervisor, the VM does not automatically exit when PID 1 terminates. A -wrapper init script is dynamically written to the guest rootfs that mounts necessary -filesystems, executes the user command, captures the exit code, and calls -`poweroff -f` to trigger an ACPI shutdown that cloud-hypervisor detects. +With cloud-hypervisor and QEMU, the VM does not automatically exit when PID 1 +terminates. A wrapper init script is dynamically written to the guest rootfs that +mounts necessary filesystems, executes the user command, captures the exit code, +and calls `poweroff -f` to trigger an ACPI shutdown that the hypervisor detects. ## Network Profile @@ -161,6 +170,8 @@ fast with an actionable error if they are missing. - **cloud-hypervisor**: Uses TAP networking (requires root or CAP_NET_ADMIN). When `--net none` is passed, networking is disabled entirely (useful for `--exec` mode tests). gvproxy is not used with cloud-hypervisor. +- **QEMU**: Uses TAP networking (same subnet and setup as cloud-hypervisor). Port + forwarding uses the same userspace TCP proxy. gvproxy is not used with QEMU. 
## Guest Init Script @@ -202,35 +213,46 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end - subgraph CHV["Linux CI (build-cloud-hypervisor.sh)"] - BUILD_CHV["Build cloud-hypervisor + virtiofsd"] + subgraph GPU["Linux CI (build-gpu-deps.sh)"] + BUILD_GPU["Build cloud-hypervisor + virtiofsd\n(shared by CHV and QEMU)"] + end + + subgraph NV["Linux CI (build-nvidia-modules.sh)"] + BUILD_NV["Compile NVIDIA .ko against VM kernel"] + end + + subgraph QEMU["Host-installed"] + QEMU_BIN["qemu-system-x86_64\n(not built — must be on host PATH)"] end subgraph Output["target/libkrun-build/"] LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] - CHV_OUT["cloud-hypervisor + virtiofsd\n(Linux)"] - VMLINUX["vmlinux\n(extracted from libkrunfw)"] + CHV_OUT["cloud-hypervisor (CHV only)\n+ virtiofsd (CHV + QEMU)"] + VMLINUX["vmlinux\n(shared by CHV + QEMU)"] + NV_KO["nvidia-modules/*.ko\n(GPU builds only)"] end KCONF --> BUILD_L BUILD_L --> LIB_SO BUILD_L --> VMLINUX + BUILD_L -->|kernel source tree| BUILD_NV + BUILD_NV --> NV_KO KCONF --> BUILD_M BUILD_M --> LIB_DY - BUILD_CHV --> CHV_OUT + BUILD_GPU --> CHV_OUT ``` -The `vmlinux` kernel is extracted from the libkrunfw build and reused by cloud-hypervisor. -Both backends boot the same kernel — the kconfig fragment includes drivers for both -virtio-MMIO (libkrun) and virtio-PCI (CHV) transports. +The `vmlinux` kernel is extracted from the libkrunfw build and reused by cloud-hypervisor +and QEMU. All three backends boot the same kernel — the kconfig fragment includes drivers +for both virtio-MMIO (libkrun) and virtio-PCI (CHV/QEMU) transports. ## Kernel Config Fragment The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel. A single kernel binary is shared by both libkrun and cloud-hypervisor — -backend-specific drivers coexist safely (the kernel probes whichever transport the -hypervisor provides). +libkrunfw kernel. 
A single kernel binary is shared by all three backends (libkrun, +cloud-hypervisor, QEMU) — backend-specific drivers coexist safely (the kernel probes +whichever transport the hypervisor provides). | Feature | Key Configs | Purpose | |---------|-------------|---------| @@ -291,6 +313,10 @@ commands work the same way they would inside the VM shell. - **cloud-hypervisor**: Uses a vsock exec bridge — a host-side process that connects an AF_VSOCK socket to a Unix domain socket, providing the same interface to the exec agent. +- **QEMU**: Uses `vhost-vsock-pci` with kernel `AF_VSOCK` sockets. The exec + bridge opens a kernel `AF_VSOCK` socket to the guest CID and bridges it to + the same Unix domain socket path used by the other backends. Requires the + `vhost_vsock` kernel module on the host. ## Build Commands @@ -316,9 +342,28 @@ mise run vm:build # Then build embedded binary # Build cloud-hypervisor runtime bundle (Linux only) mise run vm:bundle-runtime # Builds CHV + virtiofsd + extracts vmlinux +# Validate QEMU host prerequisites +mise run vm:qemu-check + +# Install QEMU if not present (Ubuntu/Debian) +sudo apt install qemu-system-x86 + +# Load vhost-vsock kernel module (required for QEMU vsock) +sudo modprobe vhost_vsock +echo "vhost_vsock" | sudo tee /etc/modules-load.d/vhost_vsock.conf + +# Build with GPU support (Linux x86_64 only) +FROM_SOURCE=1 mise run vm:setup # Build kernel from source (module compilation needs it) +mise run vm:nvidia-modules # Compile NVIDIA .ko files against VM kernel +mise run vm:rootfs -- --base --gpu # Build GPU rootfs with injected kernel modules +mise run vm:build # Rebuild binary with GPU rootfs + # Run with cloud-hypervisor backend openshell-vm --backend cloud-hypervisor # Requires runtime bundle -openshell-vm --gpu # Auto-selects CHV with GPU passthrough +openshell-vm --gpu # Auto-selects CHV (MSI-X) or QEMU (no MSI-X) + +# Run with QEMU backend +openshell-vm --backend qemu # Requires qemu-system-x86_64 on host # Wipe everything 
and start over mise run vm:clean @@ -337,8 +382,8 @@ pinned versions change. | Platform | Runner | Build Method | |----------|--------|-------------| -| Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | -| Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-cloud-hypervisor.sh` | +| Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | +| Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | | macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no CHV) | Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, @@ -376,6 +421,10 @@ macOS binaries produced via osxcross are not codesigned. Users must self-sign: codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - ./openshell-vm ``` +> **Note:** QEMU smoke tests (`vm_boot_smoke.rs`) are gated on `OPENSHELL_VM_BACKEND=qemu`. +> These tests require `qemu-system-x86_64` on the runner and are currently manual-only. +> Run `mise run vm:qemu-check` to validate prerequisites before running QEMU tests. + ## Rollout Strategy 1. Custom runtime is embedded by default when building with `mise run vm:build`. diff --git a/architecture/vm-gpu-passthrough.md b/architecture/vm-gpu-passthrough.md index c15fd668b..eda2ef9e3 100644 --- a/architecture/vm-gpu-passthrough.md +++ b/architecture/vm-gpu-passthrough.md @@ -6,7 +6,7 @@ OpenShell's VM backend can pass a physical NVIDIA GPU into a microVM using VFIO (Virtual Function I/O). This gives the guest direct access to GPU hardware, enabling CUDA workloads and `nvidia-smi` inside sandboxes without virtualization overhead. -GPU passthrough uses cloud-hypervisor (instead of the default libkrun backend) to attach a VFIO device to the VM. The guest sees a real PCI GPU device and loads standard NVIDIA drivers. 
+GPU passthrough uses cloud-hypervisor or QEMU (instead of the default libkrun backend) to attach a VFIO device to the VM. The guest sees a real PCI GPU device and loads standard NVIDIA drivers. cloud-hypervisor is preferred; QEMU is used as a fallback when the GPU lacks MSI-X support. ## Architecture @@ -17,7 +17,7 @@ Host │ Guest (microVM) ↕ bound to vfio-pci │ ↕ /dev/vfio/ │ /dev/nvidia* ↕ │ ↕ - cloud-hypervisor (VFIO) ────│→ PCI device visible + CHV or QEMU (VFIO) ────│→ PCI device visible ↕ │ ↕ TAP networking │ k3s + device plugin virtiofsd (rootfs) │ ↕ @@ -29,11 +29,27 @@ Host │ Guest (microVM) | Flag | Backend | GPU attached? | |------|---------|---------------| | (none) | libkrun | No | -| `--gpu` | cloud-hypervisor | Yes (auto-detect and bind) | -| `--gpu 0000:41:00.0` | cloud-hypervisor | Yes (specific PCI device) | +| `--gpu` (MSI-X GPU) | cloud-hypervisor | Yes | +| `--gpu` (non-MSI-X GPU) | QEMU | Yes (fallback) | +| `--gpu 0000:41:00.0` | auto (CHV or QEMU based on MSI-X) | Yes | | `--backend cloud-hypervisor` | cloud-hypervisor | No (force CHV without GPU) | +| `--backend qemu` | QEMU | Optional | -Auto mode (`--backend auto`, the default) selects cloud-hypervisor when `--gpu` is used or a VFIO PCI address is configured. Otherwise libkrun is used. +Auto mode (`--backend auto`, the default) selects cloud-hypervisor when `--gpu` is used with an MSI-X-capable GPU, QEMU when `--gpu` is used with a GPU lacking MSI-X, and libkrun otherwise. + +### QEMU fallback + +QEMU is used when GPU passthrough is requested but the GPU does not support MSI-X (PCI capability `0x11`). cloud-hypervisor's VFIO implementation requires MSI-X; QEMU handles MSI-only devices via its own interrupt remapping layer. 
+ +| Aspect | cloud-hypervisor | QEMU | +|--------|-----------------|------| +| VFIO MSI-X | Required | Not required | +| VM control | REST API over Unix socket | Command-line args + QMP | +| Vsock transport | Unix socket + `CONNECT` text protocol | `AF_VSOCK` (kernel `vhost_vsock`) | +| TAP networking | Built-in TAP creation | `-netdev tap` flag | +| Shutdown | REST `vm.shutdown` | `SIGTERM` or QMP `system_powerdown` | + +The guest kernel, rootfs, init script, and exec agent are identical across both backends. The host requirements differ: QEMU needs `qemu-system-x86_64` installed on the host (not embedded in the runtime bundle) and the `vhost_vsock` kernel module for vsock exec support. ### Automatic GPU binding @@ -49,12 +65,12 @@ When a specific PCI address is given (`--gpu 0000:41:00.0`), the launcher target ### Safety checks -All safety checks are hard failures — if any check fails, the launcher prints an error and exits without binding. There is no `--force` override. +All safety checks are hard failures — if any check fails, the launcher prints an error and exits without binding. The one exception is display-manager-related blocking: when the GPU is held by Xorg or a Wayland compositor, the launcher prompts the user interactively to stop the display manager (see Single-GPU caveats). 
| Check | What it detects | Failure behavior | |-------|----------------|------------------| -| **Display attached** | GPU drives an active DRM framebuffer or is the primary rendering device | Error: "GPU 0000:xx:xx.x has active display outputs — cannot passthrough without losing host display" | -| **Active processes** | Processes holding `/dev/nvidia*` file descriptors (CUDA jobs, monitoring) | Error: "GPU 0000:xx:xx.x is in use by PID(s) — stop these processes first" | +| **Display attached** | GPU drives an active DRM framebuffer or is the primary rendering device | Interactive prompt to stop display-manager; error if declined or non-interactive | +| **Active processes** | Processes holding `/dev/nvidia*` file descriptors (CUDA jobs, monitoring) | Error if non-display processes; interactive prompt if only display servers | | **IOMMU enabled** | `/sys/kernel/iommu_groups/` exists and the GPU has a group assignment | Error: "IOMMU is not enabled — add intel_iommu=on or amd_iommu=on to kernel cmdline" | | **VFIO modules loaded** | `vfio-pci` and `vfio_iommu_type1` kernel modules are loaded | Error: "vfio-pci kernel module not loaded — run: sudo modprobe vfio-pci" | | **Permissions** | Write access to sysfs bind/unbind and `/dev/vfio/` | Error: "insufficient permissions — run as root or with CAP_NET_ADMIN" | @@ -140,9 +156,23 @@ When `--gpu` is passed, the launcher performs the following steps that previousl When the host has only one NVIDIA GPU: -- **Display-attached GPUs are blocked.** The safety checks detect if the GPU drives an active display (DRM framebuffer). If so, the launcher refuses to bind it — this prevents accidentally killing the host desktop. On headless data center servers (the typical deployment), this check passes and the GPU is bound automatically. -- **Recovery is automatic.** When the VM exits (clean shutdown, Ctrl+C, or process crash), the launcher rebinds the GPU to the `nvidia` driver and clears `driver_override`. 
No manual intervention is needed. -- **Process check.** If CUDA processes are using the GPU (visible via `/dev/nvidia*` file descriptors), the launcher refuses to unbind. Stop those processes first. +- **Display manager prompt.** When the GPU drives an active display or is held by a display server (Xorg, Wayland compositor), the launcher detects this and prompts the user interactively: + + ```text + WARNING: GPU 0000:2d:00.0 is in use by the display manager. + Display server processes: Xorg (PID 1234) + Active display outputs are connected to this GPU. + + Stopping the display manager will terminate your graphical session. + You will lose access to any open GUI applications. + + The display manager will be restarted automatically when the VM exits. + Stop display-manager and proceed with GPU passthrough? [y/N] + ``` + + If the user confirms, the launcher runs `systemctl stop display-manager`, waits for Xorg to release the GPU, then proceeds with VFIO binding. A `DisplayManagerGuard` ensures that `systemctl start display-manager` is called when the VM exits (clean shutdown, Ctrl+C, error, or panic). In non-interactive mode (stdin is not a TTY), the prompt is skipped and the launcher exits with an error instructing the user to stop the display manager manually. +- **Recovery is automatic.** When the VM exits (clean shutdown, Ctrl+C, or process crash), the launcher rebinds the GPU to the `nvidia` driver, clears `driver_override`, and restarts the display manager if it was stopped. No manual intervention is needed. +- **Process check.** If non-display CUDA processes are also using the GPU (visible via `/dev/nvidia*` file descriptors), the prompt warns about those processes too. The launcher lists all PIDs and process names so the user can make an informed decision. ## Supported GPUs @@ -158,6 +188,50 @@ GPU passthrough is validated with NVIDIA data center GPUs. 
Consumer GPUs may wor | L40S | Ada Lovelace | 8.9 | Supported | | L4 | Ada Lovelace | 8.9 | Supported | +## GPU build pipeline + +GPU passthrough requires NVIDIA kernel modules compiled against the VM kernel. The full build pipeline is: + +```shell +# 1. Build kernel from source (needed for module compilation) +FROM_SOURCE=1 mise run vm:setup + +# 2. Compile NVIDIA .ko files against the VM kernel +mise run vm:nvidia-modules + +# 3. Build GPU rootfs and inject kernel modules +mise run vm:rootfs -- --base --gpu + +# 4. Compile binary and package runtime +mise run vm:build +``` + +### NVIDIA kernel module build (`vm:nvidia-modules`) + +The `build-nvidia-modules.sh` script clones [NVIDIA/open-gpu-kernel-modules](https://github.com/NVIDIA/open-gpu-kernel-modules) at the tag pinned by `NVIDIA_DRIVER_TAG` in `pins.env` and compiles the open kernel modules against the VM kernel source tree produced by `build-libkrun.sh`. + +The driver tag must match the exact version of `nvidia-headless-570-open` installed in the guest rootfs. A mismatch causes "API mismatch" errors from `nvidia-smi`. The current pin is `570.211.01`. + +The build produces these modules: + +| Module | Purpose | +|--------|---------| +| `nvidia.ko` | Core GPU driver | +| `nvidia-uvm.ko` | Unified Virtual Memory (CUDA managed memory) | +| `nvidia-modeset.ko` | Display mode setting | +| `nvidia-drm.ko` | DRM/KMS integration | +| `nvidia-peermem.ko` | GPUDirect RDMA (optional) | + +### Module injection (`vm:rootfs --gpu`) + +When `build-rootfs.sh` runs with `--gpu`, it: + +1. Reads `kernel-version.txt` (exported by `build-libkrun.sh`) to determine the kernel release string. +2. Copies `.ko` files from `target/libkrun-build/nvidia-modules/` into the rootfs at `/lib/modules/<kernel-release>/kernel/drivers/video/nvidia/`. +3. Runs `depmod` to generate module dependency metadata so `modprobe` works at boot. 
+ +The VM init script loads `nvidia`, `nvidia_uvm`, and `nvidia_modeset` during boot when `GPU_ENABLED=true` is set on the kernel command line. + ## CLI usage ### Auto-select GPU @@ -185,8 +259,10 @@ sudo openshell-vm --gpu 0000:41:00.0 The `--backend` flag controls hypervisor selection independently of `--gpu`: ```shell -sudo openshell-vm --gpu # auto: selects cloud-hypervisor +sudo openshell-vm --gpu # auto: CHV if MSI-X, QEMU otherwise sudo openshell-vm --backend cloud-hypervisor # explicit CHV, no GPU +sudo openshell-vm --backend qemu # explicit QEMU, no GPU +sudo openshell-vm --gpu --backend qemu # force QEMU with GPU sudo openshell-vm --backend libkrun # explicit libkrun (no GPU support) ``` @@ -337,6 +413,10 @@ The launcher caches mTLS certificates on the host after the first successful boo ## Troubleshooting +### "cloud-hypervisor requires MSI-X for VFIO passthrough" + +The GPU lacks MSI-X support and `--backend cloud-hypervisor` was explicitly requested. Either use `--backend qemu` or omit the `--backend` flag to let auto-selection pick QEMU as the fallback. + ### "no NVIDIA PCI device found" The host has no NVIDIA GPU installed, or the PCI device is not visible: @@ -346,19 +426,17 @@ lspci -nn | grep -i nvidia # If empty, the GPU is not detected at the PCI level ``` -### "has active display outputs" +### "has active display outputs" / "in use by display manager" -The GPU drives a DRM framebuffer or is the boot VGA device. This is a hard safety check — the launcher will not unbind a display GPU. Options: +The GPU drives a DRM framebuffer or is held by a display server (Xorg, Wayland compositor). If running interactively, the launcher prompts to stop the display manager. 
If running non-interactively or the user declines, options: - Use a different GPU for the monitor (iGPU, secondary card) -- Stop the display manager first: `sudo systemctl stop gdm` +- Stop the display manager manually: `sudo systemctl stop display-manager` - On headless servers, this should not occur — verify with `ls /sys/class/drm/card*/device` ### "in use by PIDs: ..." -Active processes hold `/dev/nvidia*` file descriptors. The check is host-wide -(across all NVIDIA GPUs, not per-device). The launcher lists the PIDs and -process names. Stop those processes before retrying. +Active non-display processes hold `/dev/nvidia*` file descriptors. The check is host-wide (across all NVIDIA GPUs, not per-device). The launcher lists the PIDs and process names. Stop those processes before retrying. If the only processes are display servers (Xorg, gnome-shell, etc.), the launcher will offer to stop the display manager instead. ### "IOMMU not enabled or device has no IOMMU group" @@ -410,4 +488,7 @@ If you hit this issue repeatedly, check for nvidia driver updates or file a bug - [Custom VM Runtime](custom-vm-runtime.md) — building and customizing the libkrun VM runtime - [System Architecture](system-architecture.md) — overall OpenShell architecture -- Implementation: [`crates/openshell-vm/src/gpu_passthrough.rs`](../crates/openshell-vm/src/gpu_passthrough.rs) +- Implementation: + - [`crates/openshell-vfio/src/lib.rs`](../crates/openshell-vfio/src/lib.rs) — GPU binding and VFIO setup + - [`crates/openshell-vm/src/backend/cloud_hypervisor.rs`](../crates/openshell-vm/src/backend/cloud_hypervisor.rs) — cloud-hypervisor backend + - [`crates/openshell-vm/src/backend/qemu.rs`](../crates/openshell-vm/src/backend/qemu.rs) — QEMU backend diff --git a/crates/openshell-vfio/src/lib.rs b/crates/openshell-vfio/src/lib.rs index f6b59d892..4404624e6 100644 --- a/crates/openshell-vfio/src/lib.rs +++ b/crates/openshell-vfio/src/lib.rs @@ -3,7 +3,6 @@ //! 
Host-side NVIDIA GPU VFIO bind/unbind for VM passthrough. -#![allow(unsafe_code)] //! //! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs //! (vendor ID `0x10de`), checks their driver binding, and verifies IOMMU @@ -104,6 +103,15 @@ fn sysfs_write_with_timeout( use std::process::{Command, Stdio}; use std::thread; + if data.is_empty() { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "sysfs_write_with_timeout called with empty data for {}", + path.display() + ), + )); + } validate_sysfs_data(data)?; let mut child = Command::new("sh") @@ -408,6 +416,36 @@ impl SysfsRoot { #[cfg(target_os = "linux")] fn write_sysfs(&self, path: &std::path::Path, data: &str) -> Result<(), std::io::Error> { if self.is_real_sysfs() { + if data.is_empty() { + // Clearing a sysfs attribute requires a direct write() syscall. + // Shell-based approaches (`printf '%s' '' > file`) produce zero + // bytes of output, and sysfs doesn't support truncation — so the + // kernel store function is never invoked and the attribute keeps + // its old value. A direct write("\n") always works: the kernel + // strips trailing newlines in store functions like + // driver_override_store(), resulting in an empty string that + // clears the attribute. Uses O_WRONLY only (no O_CREAT/O_TRUNC) + // for sysfs compatibility. This path does NOT use the timeout + // wrapper because clearing attributes never hangs — unlike driver + // unbind which can deadlock in nvidia's remove(). 
+ use std::io::Write; + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(path) + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!("failed to open {} for clearing: {e}", path.display()), + ) + })?; + f.write_all(b"\n").map_err(|e| { + std::io::Error::new( + e.kind(), + format!("failed to write newline to {}: {e}", path.display()), + ) + })?; + return Ok(()); + } sysfs_write_with_timeout(path, data, SYSFS_WRITE_TIMEOUT) } else { std::fs::write(path, data).map_err(|e| { @@ -689,6 +727,58 @@ fn nvidia_pre_unbind_prep(pci_addr: &str) { } } +/// Reset a PCI device to clear stale IOMMU state after VFIO passthrough. +/// +/// Tries the device's own `reset` file (FLR) first. If that doesn't exist, +/// locates the parent PCI bridge and triggers a secondary bus reset (SBR). +/// Either reset clears stale IOMMU page table entries that would otherwise +/// cause `RmInitAdapter` failures when the nvidia driver initialises. +#[cfg(target_os = "linux")] +fn pci_reset_device(sysfs: &SysfsRoot, pci_addr: &str) { + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + // Try device-level FLR first. + let device_reset = dev_dir.join("reset"); + if device_reset.exists() { + eprintln!("GPU {pci_addr}: performing PCI function-level reset"); + match sysfs.write_sysfs(&device_reset, "1") { + Ok(()) => { + std::thread::sleep(Duration::from_secs(1)); + eprintln!("GPU {pci_addr}: FLR complete"); + return; + } + Err(e) => { + eprintln!("GPU {pci_addr}: FLR failed ({e}), trying bridge SBR"); + } + } + } + + // Fall back to secondary bus reset on the parent bridge. The sysfs + // device path is a symlink whose real path encodes the PCI topology: + // /sys/devices/pci0000:00/0000:00:03.1/0000:2d:00.0 + // The parent directory (0000:00:03.1) is the bridge. 
+ if let Ok(real) = std::fs::canonicalize(&dev_dir) { + if let Some(bridge_dir) = real.parent() { + let bridge_reset = bridge_dir.join("reset"); + if bridge_reset.exists() { + let bridge_name = bridge_dir + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown"); + eprintln!( + "GPU {pci_addr}: performing secondary bus reset on bridge {bridge_name}" + ); + if let Err(e) = std::fs::write(&bridge_reset, "1") { + eprintln!("GPU {pci_addr}: bridge SBR failed: {e}"); + } else { + std::thread::sleep(Duration::from_secs(1)); + eprintln!("GPU {pci_addr}: SBR complete"); + } + } + } + } +} + /// Reload nvidia kernel modules so the driver's sysfs bind file exists. /// /// Called during restore to ensure `modprobe nvidia` brings back the driver @@ -880,80 +970,165 @@ pub fn rebind_gpu_to_original( validate_pci_addr(pci_addr)?; let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); - if current_driver(sysfs, pci_addr).is_some() { + // Restore is best-effort: attempt every step even if earlier ones fail, + // so a partial failure (e.g. unbind succeeds but driver_override clear + // fails) doesn't leave the device in a worse state than before. Track + // the first error to return at the end. + let mut first_err: Option = None; + + // Step 1: Unbind from the current driver. Without this, modprobe for + // the original driver fails with "No such device" because the kernel + // still considers the PCI slot claimed. 
+ let cur_drv = current_driver(sysfs, pci_addr); + if cur_drv.as_deref() == Some("vfio-pci") { + let vfio_unbind = sysfs.sys_bus_pci_drivers("vfio-pci").join("unbind"); + if let Err(e) = sysfs.write_sysfs(&vfio_unbind, pci_addr) { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + eprintln!( + "GPU {pci_addr}: failed to unbind from vfio-pci at {}{hint} — continuing restore", + vfio_unbind.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind {pci_addr} from vfio-pci at {path}{hint}", + path = vfio_unbind.display() + ), + )); + } + } + } else if cur_drv.is_some() { let unbind = dev_dir.join("driver/unbind"); - sysfs.write_sysfs(&unbind, pci_addr).map_err(|e| { + if let Err(e) = sysfs.write_sysfs(&unbind, pci_addr) { let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { " — run as root" } else { "" }; - std::io::Error::new( - e.kind(), - format!( - "Failed to unbind device at {path}{hint}", - path = unbind.display() - ), - ) - })?; + eprintln!( + "GPU {pci_addr}: failed to unbind from {} at {}{hint} — continuing restore", + cur_drv.as_deref().unwrap_or("unknown"), + unbind.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } } + // Step 2: Clear driver_override so modprobe can claim the device. This + // is required even when the device is already unbound — a killed VM + // process can leave driver_override set to "vfio-pci" with no driver + // actually bound. 
let driver_override = dev_dir.join("driver_override"); - sysfs.write_sysfs(&driver_override, "").map_err(|e| { + if let Err(e) = sysfs.write_sysfs(&driver_override, "") { let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { " — run as root" } else { "" }; - std::io::Error::new( - e.kind(), - format!( - "Failed to clear driver_override at {path}{hint}", - path = driver_override.display() - ), - ) - })?; + eprintln!( + "GPU {pci_addr}: failed to clear driver_override at {}{hint} — continuing restore", + driver_override.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to clear driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + } + + // Step 3: PCI device reset to clear stale IOMMU state. + // After VFIO passthrough (especially on AMD-Vi systems), the GPU may + // retain stale IOMMU page table entries. Without a reset, modprobe + // nvidia fails with RmInitAdapter errors and IO_PAGE_FAULTs. + if sysfs.is_real_sysfs() { + pci_reset_device(sysfs, pci_addr); + } + // Step 4: Reload modules and bind to the original driver. if !original_driver.is_empty() && original_driver != "none" { - // The nvidia driver bind path requires the kernel module to be loaded. - // nvidia_pre_unbind_prep may have unloaded it (cascade from submodules), - // or it may have been absent since before we started. Reload it so the - // driver's bind file exists in sysfs. if original_driver == "nvidia" && sysfs.is_real_sysfs() { nvidia_reload_modules(); } - let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); - if let Err(e) = sysfs.write_sysfs(&bind, pci_addr) { - eprintln!( - "GPU {pci_addr}: explicit bind to {original_driver} failed ({e}), \ - falling back to PCI rescan" - ); - let rescan = sysfs.0.join("sys/bus/pci/rescan"); - let _ = sysfs.write_sysfs(&rescan, "1"); - // Give the kernel time to re-probe and bind drivers. 
- std::thread::sleep(Duration::from_secs(1)); + // modprobe may have auto-bound the device (now that driver_override is + // cleared). Skip the explicit bind if already on the right driver. + let cur = current_driver(sysfs, pci_addr); + if cur.as_deref() == Some(original_driver) { + eprintln!("GPU {pci_addr}: already bound to {original_driver}"); + } else { + let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); + if let Err(e) = sysfs.write_sysfs(&bind, pci_addr) { + eprintln!( + "GPU {pci_addr}: explicit bind to {original_driver} failed ({e}), \ + falling back to PCI rescan" + ); + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + if let Err(rescan_err) = sysfs.write_sysfs(&rescan, "1") { + eprintln!("GPU {pci_addr}: PCI rescan write failed: {rescan_err}"); + } + std::thread::sleep(Duration::from_secs(1)); - if current_driver(sysfs, pci_addr).is_none() { - return Err(std::io::Error::new( - e.kind(), - format!( - "Failed to restore {pci_addr} to {original_driver}: \ - explicit bind and PCI rescan both failed. \ - Manual fix: sudo modprobe nvidia && echo {pci_addr} | \ - sudo tee /sys/bus/pci/drivers/nvidia/bind" - ), - )); + match current_driver(sysfs, pci_addr) { + None => { + let bind_err = std::io::Error::new( + e.kind(), + format!( + "Failed to restore {pci_addr} to {original_driver}: \ + explicit bind and PCI rescan both failed. 
\ + Manual fix:\n \ + echo {pci_addr} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind\n \ + echo | sudo tee /sys/bus/pci/devices/{pci_addr}/driver_override\n \ + sudo modprobe {original_driver}" + ), + ); + if first_err.is_none() { + first_err = Some(bind_err); + } + } + Some(new_drv) => { + eprintln!("GPU {pci_addr}: PCI rescan bound device to {new_drv}"); + } + } } - let new_drv = current_driver(sysfs, pci_addr).unwrap_or_default(); - eprintln!("GPU {pci_addr}: PCI rescan bound device to {new_drv}"); } } else { let rescan = sysfs.0.join("sys/bus/pci/rescan"); - let _ = sysfs.write_sysfs(&rescan, "1"); + if let Err(rescan_err) = sysfs.write_sysfs(&rescan, "1") { + eprintln!("GPU {pci_addr}: PCI rescan write failed: {rescan_err}"); + } } - Ok(()) + if first_err.is_none() { + if current_driver(sysfs, pci_addr).is_none() { + eprintln!( + "GPU {pci_addr}: warning: driver link missing in sysfs after restore \ + (nvidia-smi may still work via character devices). \ + To re-create the sysfs link: echo {pci_addr} | sudo tee /sys/bus/pci/drivers/{original_driver}/bind" + ); + } + } + + match first_err { + Some(e) => Err(e), + None => Ok(()), + } } #[cfg(not(target_os = "linux"))] @@ -1044,6 +1219,7 @@ pub fn rebind_iommu_group_peers( let mut first_err = None; for (peer_addr, original_driver) in peers { if let Err(e) = rebind_gpu_to_original(sysfs, peer_addr, original_driver) { + eprintln!("IOMMU peer {peer_addr}: failed to restore to {original_driver}: {e}"); if first_err.is_none() { first_err = Some(e); } @@ -1090,9 +1266,61 @@ pub struct GpuBindState { pub peer_binds: Vec<(String, String)>, /// Whether this instance performed the bind (false if GPU was already on vfio-pci). pub did_bind: bool, + /// Whether the GPU supports MSI-X (needed by cloud-hypervisor; QEMU works without it). + pub has_msix: bool, } impl GpuBindState { + /// Shell commands to manually restore the GPU and its peers to their + /// original drivers. 
Useful for printing recovery instructions when + /// the process might be force-killed (SIGKILL). + pub fn recovery_commands(&self) -> String { + let mut cmds = Vec::new(); + + cmds.push(format!( + "echo {} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind", + self.pci_addr + )); + + for (peer_addr, _) in &self.peer_binds { + cmds.push(format!( + "echo {} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind", + peer_addr + )); + } + + cmds.push(format!( + "echo | sudo tee /sys/bus/pci/devices/{}/driver_override", + self.pci_addr + )); + + for (peer_addr, _) in &self.peer_binds { + cmds.push(format!( + "echo | sudo tee /sys/bus/pci/devices/{}/driver_override", + peer_addr + )); + } + + if self.original_driver == "nvidia" || self.original_driver.is_empty() { + cmds.push("sudo modprobe nvidia".to_string()); + } + + let mut peer_drivers: Vec<&str> = Vec::new(); + for (_, original_drv) in &self.peer_binds { + if !original_drv.is_empty() + && original_drv != "nvidia" + && !peer_drivers.contains(&original_drv.as_str()) + { + peer_drivers.push(original_drv.as_str()); + } + } + for drv in peer_drivers { + cmds.push(format!("sudo modprobe {drv}")); + } + + cmds.join("\n") + } + /// Restore the GPU and its IOMMU peers to their original drivers. pub fn restore(&self) -> Result<(), std::io::Error> { self.restore_with_sysfs(&SysfsRoot::default()) @@ -1141,6 +1369,11 @@ impl GpuBindGuard { self.state.take() } + /// Access the inner bind state, if present. + pub fn state(&self) -> Option<&GpuBindState> { + self.state.as_ref() + } + /// Get the PCI address of the bound GPU, if any. pub fn pci_addr(&self) -> Option<&str> { self.state.as_ref().map(|s| s.pci_addr.as_str()) @@ -1161,6 +1394,142 @@ impl Drop for GpuBindGuard { } } +/// Known display server process names (matched against `/proc/PID/comm`). 
+const DISPLAY_SERVER_NAMES: &[&str] = &[ + "Xorg", + "X", + "Xwayland", + "gnome-shell", + "kwin_wayland", + "kwin_x11", + "sway", + "weston", + "mutter", +]; + +/// Returns `true` if `comm` matches a known display server process name. +pub fn is_display_server_process(comm: &str) -> bool { + DISPLAY_SERVER_NAMES.contains(&comm) +} + +/// Information about display manager processes blocking GPU passthrough. +/// +/// Returned by [`detect_display_blocker`] when a GPU that would otherwise +/// be eligible for passthrough is held by Xorg or a Wayland compositor. +#[derive(Debug, Clone)] +pub struct DisplayBlockerInfo { + /// PCI address of the GPU blocked by the display manager. + pub pci_addr: String, + /// Display-server processes holding `/dev/nvidia*` device files open. + pub display_processes: Vec<(u32, String)>, + /// Whether the GPU has active display outputs (DRM connectors). + pub has_active_outputs: bool, + /// Non-display processes also holding `/dev/nvidia*` device files. + /// If non-empty, stopping the display manager alone won't free the GPU. + pub other_processes: Vec<(u32, String)>, +} + +/// Detect whether a display manager is blocking GPU passthrough. +/// +/// Returns `Some(info)` when at least one GPU that would otherwise pass +/// safety checks is blocked by display-server processes (Xorg, Wayland +/// compositor) or has active display outputs. The caller can use this to +/// prompt the user to stop the display manager before retrying. +/// +/// Returns `None` when no display-related blocker is detected (GPUs may +/// still be blocked by other issues like missing IOMMU or permissions). 
+pub fn detect_display_blocker(requested_bdf: Option<&str>) -> Option<DisplayBlockerInfo> {
+    detect_display_blocker_with_sysfs(&SysfsRoot::default(), requested_bdf)
+}
+
+#[cfg(target_os = "linux")]
+pub fn detect_display_blocker_with_sysfs(
+    sysfs: &SysfsRoot,
+    requested_bdf: Option<&str>,
+) -> Option<DisplayBlockerInfo> {
+    let addrs: Vec<String> = match requested_bdf {
+        Some(bdf) => {
+            if validate_pci_addr(bdf).is_err() {
+                return None;
+            }
+            vec![bdf.to_string()]
+        }
+        None => find_nvidia_gpu_addrs(sysfs),
+    };
+
+    if addrs.is_empty() {
+        return None;
+    }
+
+    let active_procs = check_active_gpu_processes().unwrap_or_default();
+
+    let display_procs: Vec<(u32, String)> = active_procs
+        .iter()
+        .filter(|(_, comm)| is_display_server_process(comm))
+        .cloned()
+        .collect();
+
+    let other_procs: Vec<(u32, String)> = active_procs
+        .iter()
+        .filter(|(_, comm)| !is_display_server_process(comm))
+        .cloned()
+        .collect();
+
+    for addr in &addrs {
+        if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") {
+            continue;
+        }
+
+        let has_outputs = check_display_attached(sysfs, addr);
+
+        if has_outputs || !display_procs.is_empty() {
+            return Some(DisplayBlockerInfo {
+                pci_addr: addr.clone(),
+                display_processes: display_procs,
+                other_processes: other_procs,
+                has_active_outputs: has_outputs,
+            });
+        }
+    }
+
+    None
+}
+
+#[cfg(not(target_os = "linux"))]
+pub fn detect_display_blocker_with_sysfs(
+    _sysfs: &SysfsRoot,
+    _requested_bdf: Option<&str>,
+) -> Option<DisplayBlockerInfo> {
+    None
+}
+
+/// Find all NVIDIA GPU PCI addresses (class 0x03xxxx) in sysfs.
+#[cfg(target_os = "linux")] +fn find_nvidia_gpu_addrs(sysfs: &SysfsRoot) -> Vec { + let pci_dir = sysfs.sys_bus_pci_devices(); + let Ok(entries) = std::fs::read_dir(&pci_dir) else { + return vec![]; + }; + + let mut addrs = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + let vendor = match std::fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + let class = match std::fs::read_to_string(dev_path.join("class")) { + Ok(c) => c.trim().to_lowercase(), + Err(_) => continue, + }; + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") { + addrs.push(entry.file_name().to_string_lossy().to_string()); + } + } + addrs.sort(); + addrs +} + /// Prepare a GPU for VFIO passthrough: run safety checks, select, and bind. /// /// When `requested_bdf` is Some, targets that specific device. @@ -1214,24 +1583,23 @@ fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result Result = procs + .iter() + .filter(|(_, comm)| is_display_server_process(comm)) + .map(|(_, comm)| comm.as_str()) + .collect(); + let mut msg = format!("GPU {bdf}: in use by PIDs: {}", desc.join(", ")); + if !display_procs.is_empty() { + msg.push_str(&format!( + "\n\n {} {} a display server \ + — stop the display manager to release the GPU:\n \ + sudo systemctl stop display-manager\ + \n\n The display manager will need to be restarted after the VM exits:\n \ + sudo systemctl start display-manager", + display_procs.join(", "), + if display_procs.len() == 1 { "is" } else { "are" }, + )); + } return Err(std::io::Error::new( std::io::ErrorKind::Other, - format!("GPU {bdf}: in use by PIDs: {}", desc.join(", ")), + msg, )); } @@ -1294,6 +1679,7 @@ fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result Result { nvidia_addrs.sort(); + // Phase 1: prefer GPUs already on vfio-pci with clean IOMMU group. + // MSI-X GPUs get priority (cloud-hypervisor has lower overhead than QEMU). 
+ let mut vfio_msix: Option = None; + let mut vfio_no_msix: Option = None; for addr in &nvidia_addrs { if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") && is_iommu_group_clean(sysfs, addr) { - return Ok(GpuBindState { - pci_addr: addr.clone(), - original_driver: "vfio-pci".to_string(), - peer_binds: vec![], - did_bind: false, - }); + if check_msix_support(sysfs, addr) { + if vfio_msix.is_none() { + vfio_msix = Some(addr.clone()); + } + } else if vfio_no_msix.is_none() { + vfio_no_msix = Some(addr.clone()); + } } } + if let Some(addr) = vfio_msix { + eprintln!("GPU {addr}: already on vfio-pci (inherited from previous session), will restore to nvidia on exit"); + return Ok(GpuBindState { + pci_addr: addr, + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: true, + }); + } + if let Some(ref addr) = vfio_no_msix { + eprintln!( + "GPU {addr}: no MSI-X support — QEMU backend will be used \ + (cloud-hypervisor requires MSI-X)" + ); + eprintln!("GPU {addr}: already on vfio-pci (inherited from previous session), will restore to nvidia on exit"); + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: false, + }); + } + // Phase 2: try to bind idle GPUs. Collect eligible candidates, then + // pick the best one (MSI-X preferred over non-MSI-X). 
let mut blocked: Vec<(String, String)> = Vec::new(); + let mut has_display_blocker = false; let active_procs = check_active_gpu_processes() .map_err(|e| std::io::Error::new(e.kind(), format!("cannot verify GPUs are idle — {e}")))?; - for addr in &nvidia_addrs { - if !check_msix_support(sysfs, addr) { - blocked.push(( - addr.clone(), - "no MSI-X support (required by cloud-hypervisor)".to_string(), - )); - continue; - } + let mut idle_candidates: Vec<(String, bool)> = Vec::new(); + for addr in &nvidia_addrs { if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") { blocked.push((addr.clone(), "IOMMU group not clean".to_string())); continue; } if check_display_attached(sysfs, addr) { + has_display_blocker = true; blocked.push((addr.clone(), "has active display outputs".to_string())); continue; } if !active_procs.is_empty() { + let display_names: Vec<&str> = active_procs + .iter() + .filter(|(_, comm)| is_display_server_process(comm)) + .map(|(_, comm)| comm.as_str()) + .collect(); + if !display_names.is_empty() { + has_display_blocker = true; + } let desc: Vec = active_procs .iter() .map(|(pid, comm)| format!("{pid} ({comm})")) @@ -1388,6 +1809,20 @@ fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { continue; } + let has_msix = check_msix_support(sysfs, addr); + idle_candidates.push((addr.clone(), has_msix)); + } + + // Sort: MSI-X candidates first (lower overhead with cloud-hypervisor). 
+ idle_candidates.sort_by_key(|(_, has_msix)| !has_msix); + + for (addr, has_msix) in &idle_candidates { + if !has_msix { + eprintln!( + "GPU {addr}: no MSI-X support — QEMU backend will be used \ + (cloud-hypervisor requires MSI-X)" + ); + } eprintln!("GPU: binding {addr} for VFIO passthrough"); let original_driver = bind_gpu_to_vfio(sysfs, addr)?; let peer_binds = match bind_iommu_group_peers(sysfs, addr) { @@ -1403,6 +1838,7 @@ fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { original_driver, peer_binds, did_bind: true, + has_msix: *has_msix, }); } @@ -1411,6 +1847,15 @@ fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { for (addr, reason) in &blocked { msg.push_str(&format!(" {addr}: {reason}\n")); } + if has_display_blocker { + msg.push_str( + "\n A display server is using the GPU. \ + Stop the display manager to release it:\n \ + sudo systemctl stop display-manager\ + \n\n The display manager will be restarted automatically if you use the --gpu flag,\ + \n or manually with: sudo systemctl start display-manager\n", + ); + } msg.push_str("\n No GPU is available for passthrough."); Err(std::io::Error::new(std::io::ErrorKind::Other, msg)) @@ -1423,6 +1868,7 @@ mod tests { use std::path::Path; #[test] + #[allow(unsafe_code)] fn passthrough_gate_is_false_without_env_var() { // SAFETY: test runs single-threaded; no other thread reads this var. 
unsafe { std::env::remove_var("OPENSHELL_VM_GPU_E2E") }; @@ -1875,7 +2321,8 @@ mod tests { let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); assert_eq!(state.pci_addr, "0000:43:00.0"); - assert!(!state.did_bind); + assert!(state.did_bind, "inherited vfio-pci should set did_bind=true for restore"); + assert_eq!(state.original_driver, "nvidia"); } #[test] @@ -1950,6 +2397,44 @@ mod tests { msg.contains("0000:42:00.0"), "error should list second GPU: {msg}" ); + assert!( + msg.contains("sudo systemctl stop display-manager"), + "error should suggest stopping display-manager: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_blocked_by_display_includes_restart_hint() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:61:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:61:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 20, &["0000:61:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("sudo systemctl stop display-manager"), + "error should include display-manager stop command: {msg}" + ); + assert!( + msg.contains("sudo systemctl start display-manager"), + "error should include display-manager restart command: {msg}" + ); + assert!( + msg.contains("0000:61:00.0"), + "error should list the blocked GPU: {msg}" + ); } #[test] @@ -2065,16 +2550,31 @@ mod tests { #[test] #[cfg(target_os = "linux")] - fn restore_noop_when_did_not_bind() { - let state = GpuBindState { - pci_addr: "0000:43:00.0".to_string(), - original_driver: "vfio-pci".to_string(), - peer_binds: vec![], - did_bind: false, - }; + fn restore_inherited_vfio_rebinds_to_nvidia() { let root = 
tempfile::tempdir().unwrap(); let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root.path(), 17, &["0000:43:00.0"]); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:43:00.0")).unwrap(); + assert!(state.did_bind, "inherited vfio-pci state should set did_bind=true"); + assert_eq!(state.original_driver, "nvidia", "inherited vfio-pci should target nvidia for restore"); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:43:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_driver_dir).unwrap(); + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, "", "driver_override should be cleared after restore"); } #[test] @@ -2160,6 +2660,7 @@ mod tests { original_driver: "nvidia".to_string(), peer_binds: vec![], did_bind: true, + has_msix: true, }; let guard = GpuBindGuard::new(state); assert_eq!(guard.pci_addr(), Some("0000:41:00.0")); @@ -2172,6 +2673,7 @@ mod tests { original_driver: "nvidia".to_string(), peer_binds: vec![], did_bind: true, + has_msix: true, }; let mut guard = GpuBindGuard::new(state); let taken = guard.disarm(); @@ -2186,6 +2688,7 @@ mod tests { original_driver: "nvidia".to_string(), peer_binds: vec![], did_bind: true, + has_msix: true, }; let mut guard = GpuBindGuard::new(state); let _ = guard.disarm(); @@ -2193,6 +2696,27 @@ mod tests { assert!(second.is_none()); } + #[test] + fn 
recovery_commands_includes_gpu_and_peers() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![("0000:41:00.1".to_string(), "snd_hda_intel".to_string())], + did_bind: true, + has_msix: true, + }; + let cmds = state.recovery_commands(); + assert!(cmds.contains("vfio-pci/unbind"), "should unbind GPU from vfio-pci"); + assert!(cmds.contains("0000:41:00.0"), "should reference GPU address"); + assert!(cmds.contains("0000:41:00.1"), "should reference peer address"); + assert!(cmds.contains("driver_override"), "should clear driver_override"); + assert!(cmds.contains("modprobe nvidia"), "should reload nvidia modules"); + assert!( + cmds.contains("modprobe snd_hda_intel"), + "should reload peer original driver" + ); + } + #[test] fn guard_drop_noop_when_did_not_bind() { let state = GpuBindState { @@ -2200,6 +2724,7 @@ mod tests { original_driver: "nvidia".to_string(), peer_binds: vec![], did_bind: false, + has_msix: true, }; let guard = GpuBindGuard::new(state); drop(guard); @@ -2213,10 +2738,109 @@ mod tests { original_driver: "nvidia".to_string(), peer_binds: vec![], did_bind: false, + has_msix: true, }; let _guard = GpuBindGuard::new(state); panic!("test panic"); })); assert!(result.is_err()); } + + #[test] + fn display_server_process_detection() { + assert!(is_display_server_process("Xorg")); + assert!(is_display_server_process("X")); + assert!(is_display_server_process("Xwayland")); + assert!(is_display_server_process("gnome-shell")); + assert!(is_display_server_process("kwin_wayland")); + assert!(is_display_server_process("sway")); + assert!(is_display_server_process("mutter")); + + assert!(!is_display_server_process("firefox")); + assert!(!is_display_server_process("python3")); + assert!(!is_display_server_process("nvidia-smi")); + assert!(!is_display_server_process("cuda_app")); + assert!(!is_display_server_process("")); + } + + #[test] + #[cfg(target_os = "linux")] + fn 
display_blocker_detected_with_active_outputs() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!(info.is_some(), "should detect display blocker"); + let info = info.unwrap(); + assert_eq!(info.pci_addr, "0000:41:00.0"); + assert!(info.has_active_outputs); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_none_when_gpu_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!( + info.is_none(), + "should not detect blocker when GPU is already on vfio-pci" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_none_on_headless_idle_gpu() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!( + info.is_none(), + "headless idle GPU should not trigger display blocker" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_auto_finds_blocked_gpu() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", 
"connected")], + ); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("vfio-pci")); + + let info = detect_display_blocker_with_sysfs(&sysfs, None); + assert!(info.is_some()); + assert_eq!(info.unwrap().pci_addr, "0000:41:00.0"); + } } diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs index f448ed0bc..9a5e04e4c 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -28,6 +28,7 @@ fn main() { "libkrunfw.5.dylib.zst", "gvproxy.zst", "rootfs.tar.zst", + "rootfs-gpu.tar.zst", ] { println!("cargo:rerun-if-changed={dir}/{name}"); } @@ -68,24 +69,30 @@ fn main() { return; } - // Copy compressed files to OUT_DIR - let files = [ + // Copy compressed files to OUT_DIR. + // Core artifacts are required; rootfs has two variants (base and GPU) and + // the presence of either one is sufficient. + let core_files = [ (format!("{libkrun_name}.zst"), format!("{libkrun_name}.zst")), ( format!("{libkrunfw_name}.zst"), format!("{libkrunfw_name}.zst"), ), ("gvproxy.zst".to_string(), "gvproxy.zst".to_string()), - ("rootfs.tar.zst".to_string(), "rootfs.tar.zst".to_string()), ]; let mut all_found = true; - for (src_name, dst_name) in &files { + let mut total_embedded_size: u64 = 0; + + let copy_artifact = |src_name: &str, + dst_name: &str, + compressed_dir: &Path, + out_dir: &Path, + total: &mut u64| + -> bool { let src_path = compressed_dir.join(src_name); let dst_path = out_dir.join(dst_name); - if src_path.exists() { - // Remove existing file first (may be read-only from previous build) if dst_path.exists() { let _ = fs::remove_file(&dst_path); } @@ -98,20 +105,92 @@ fn main() { ) }); let size = fs::metadata(&dst_path).map(|m| m.len()).unwrap_or(0); + *total += size; println!("cargo:warning=Embedded {src_name}: {size} bytes"); + true } else { + false + } + }; + + for (src_name, dst_name) in &core_files { + if !copy_artifact(src_name, dst_name, &compressed_dir, &out_dir, &mut total_embedded_size) { println!( 
"cargo:warning=Missing compressed artifact: {}", - src_path.display() + compressed_dir.join(src_name).display() ); all_found = false; } } + // Rootfs: accept either the base rootfs or the GPU rootfs (or both). + let has_base = copy_artifact( + "rootfs.tar.zst", + "rootfs.tar.zst", + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ); + let has_gpu = copy_artifact( + "rootfs-gpu.tar.zst", + "rootfs-gpu.tar.zst", + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ); + if !has_base && !has_gpu { + println!( + "cargo:warning=Missing rootfs artifact: neither rootfs.tar.zst nor rootfs-gpu.tar.zst found in {}", + compressed_dir.display() + ); + } else if !has_base { + println!( + "cargo:warning=Only rootfs-gpu.tar.zst found (base rootfs.tar.zst absent). \ + This is fine for GPU-only builds; run `mise run vm:setup` to get the base rootfs." + ); + } else if !has_gpu { + println!( + "cargo:warning=Only rootfs.tar.zst found (GPU rootfs-gpu.tar.zst absent). \ + This is fine for non-GPU builds; run `mise run vm:rootfs -- --gpu` to get the GPU rootfs." + ); + } + + // Write empty stubs for any missing rootfs variant so that + // `include_bytes!()` in embedded.rs always resolves. The embedded module + // treats zero-length slices as "not available". + for (found, name) in [(has_base, "rootfs.tar.zst"), (has_gpu, "rootfs-gpu.tar.zst")] { + if !found { + let stub = out_dir.join(name); + if !stub.exists() { + fs::write(&stub, b"") + .unwrap_or_else(|e| panic!("Failed to write stub {name}: {e}")); + } + } + } + if !all_found { println!("cargo:warning=Some artifacts missing. Run: mise run vm:setup"); generate_stub_resources(&out_dir); } + + // Warn when total embedded data approaches the x86_64 small code model limit. + // The default code model uses R_X86_64_PC32 (±2 GiB) relocations; embedding + // blobs that push .rodata past 2 GiB will cause linker failures unless + // RUSTFLAGS="-C code-model=large" is set. 
The vm:build task does this + // automatically, but direct cargo invocations may not. + const LARGE_BLOB_THRESHOLD: u64 = 1_800_000_000; // ~1.8 GiB + if target_arch == "x86_64" && total_embedded_size > LARGE_BLOB_THRESHOLD { + println!( + "cargo:warning=Total embedded data is {total_embedded_size} bytes ({:.1} GiB).", + total_embedded_size as f64 / (1024.0 * 1024.0 * 1024.0) + ); + println!( + "cargo:warning=This exceeds the x86_64 small code model limit (~2 GiB)." + ); + println!( + "cargo:warning=Ensure RUSTFLAGS includes '-C code-model=large' or use `mise run vm:build`." + ); + } } /// Generate stub (empty) resource files so the build can complete. @@ -129,6 +208,7 @@ fn generate_stub_resources(out_dir: &Path) { format!("{libkrunfw_name}.zst"), "gvproxy.zst".to_string(), "rootfs.tar.zst".to_string(), + "rootfs-gpu.tar.zst".to_string(), ]; for name in &stubs { diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index 4da05e089..e6a01dcce 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -72,3 +72,11 @@ VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" # for VFIO passthrough. NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.19.0}" + +# NVIDIA open kernel module source tag (must match nvidia-headless-570-open version). +# Repo: https://github.com/NVIDIA/open-gpu-kernel-modules +# The tag must be the exact driver version so that the compiled kernel modules +# match the userspace libraries installed by nvidia-headless-570-open in the +# rootfs. A mismatch causes "API mismatch" errors from nvidia-smi. 
+# Find the APT version: apt-cache show nvidia-headless-570-open | grep Version +NVIDIA_DRIVER_TAG="${NVIDIA_DRIVER_TAG:-570.211.01}" diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index 5ce14a683..95435b149 100644 --- a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -138,6 +138,27 @@ CONFIG_DRM=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y +# MTRR — required dependency for CONFIG_X86_PAT below. +CONFIG_MTRR=y + +# MMU notifier — required by NVIDIA UVM module for GPU memory management. +CONFIG_MMU_NOTIFIER=y + +# PAT (Page Attribute Table) — required for correct GPU memory mapping. +# Without this, the NVIDIA driver compiles a fallback code path in nv-pat.c +# that calls __flush_tlb(), which was removed in kernel 6.12+. All modern +# x86_64 CPUs support PAT; every distro kernel enables it. +CONFIG_X86_PAT=y + +# ── Firmware loading (required for NVIDIA GSP firmware) ────────────────── +# The NVIDIA open kernel modules use request_firmware() to load GSP firmware +# from /lib/firmware/nvidia//. Without CONFIG_FW_LOADER, the kernel +# has no firmware loading infrastructure and GPU init fails with: +# NVRM: RmFetchGspRmImages: No firmware image found +# On kernel 6.12+, CONFIG_FW_LOADER includes the sysfs loading interface +# (previously CONFIG_FW_LOADER_SYSFS, now merged). +CONFIG_FW_LOADER=y + # ── cloud-hypervisor support ──────────────────────────────────────────── # CHV uses virtio-PCI transport (libkrun uses virtio-MMIO). 
Both drivers # coexist safely — the kernel probes whichever transport the hypervisor diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index 99a301f85..4baade995 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -135,6 +135,67 @@ verify_checksum() { fi } +verify_gpu_rootfs() { + local rootfs_dir="$1" + local kernel_version="$2" + local driver_tag="$3" + local driver_version="$4" + + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${rootfs_dir}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + if ls "${rootfs_dir}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." + fi + if [ -z "${kernel_version}" ]; then + echo "ERROR: VM_KERNEL_VERSION not set — kernel module injection may have been skipped" >&2 + exit 1 + fi + if [ -d "${rootfs_dir}/lib/modules/${kernel_version}" ]; then + local mod_count + mod_count=$(find "${rootfs_dir}/lib/modules/${kernel_version}" -name "nvidia*.ko" | wc -l) + echo " nvidia kernel modules: ${mod_count} found (kernel ${kernel_version})" + if [ "$mod_count" -eq 0 ]; then + echo "ERROR: no nvidia kernel modules in /lib/modules/${kernel_version}/" + echo " Run: mise run vm:nvidia-modules" + exit 1 + fi + else + echo "ERROR: /lib/modules/${kernel_version}/ not found in rootfs" + echo " Run: mise run vm:nvidia-modules" + exit 1 + fi + local fw_dir="${rootfs_dir}/lib/firmware/nvidia/${driver_tag}" + if [ ! 
-d "${fw_dir}" ]; then + fw_dir="${rootfs_dir}/usr/lib/firmware/nvidia/${driver_tag}" + fi + if [ -d "${fw_dir}" ]; then + local fw_count + fw_count=$(ls "${fw_dir}"/gsp_*.bin 2>/dev/null | wc -l) + echo " GSP firmware: ${fw_count} files found" + for fw in "${fw_dir}"/gsp_*.bin; do + [ -f "$fw" ] || continue + echo " $(basename "$fw") ($(du -h "$fw" | cut -f1))" + done + if [ "$fw_count" -eq 0 ]; then + echo "ERROR: No GSP firmware files (gsp_*.bin) in ${fw_dir}" >&2 + echo " nvidia-smi will fail with: RmFetchGspRmImages: No firmware image found" >&2 + exit 1 + fi + else + echo "ERROR: GSP firmware directory not found" >&2 + echo " Checked: ${rootfs_dir}/lib/firmware/nvidia/${driver_tag}/" >&2 + echo " and: ${rootfs_dir}/usr/lib/firmware/nvidia/${driver_tag}/" >&2 + echo " Install: nvidia-firmware-${driver_version}-${driver_tag}" >&2 + exit 1 + fi +} + if [ "$BASE_ONLY" = true ]; then echo "==> Building base openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -248,11 +309,13 @@ if [ "$GPU_BUILD" = true ]; then docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" \ --build-arg "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \ + --build-arg "NVIDIA_DRIVER_TAG=${NVIDIA_DRIVER_TAG}" \ --build-arg "NVIDIA_CONTAINER_TOOLKIT_VERSION=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} ARG NVIDIA_DRIVER_VERSION +ARG NVIDIA_DRIVER_TAG ARG NVIDIA_CONTAINER_TOOLKIT_VERSION RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -260,6 +323,7 @@ RUN apt-get update && \ e2fsprogs \ iptables \ iproute2 \ + kmod \ python3 \ busybox-static \ sqlite3 \ @@ -276,15 +340,28 @@ RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s # Add the NVIDIA package repository and install the open kernel module # flavour of the driver plus nvidia-container-toolkit. The open modules # are required for data-center GPUs (Turing+ / compute capability >= 7.0). 
+# Userspace packages are pinned to $NVIDIA_DRIVER_TAG so they match the +# kernel modules compiled by build-nvidia-modules.sh. RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ > /etc/apt/sources.list.d/nvidia-container-toolkit.list RUN apt-get update && \ + HEADLESS_VER=$(apt-cache madison nvidia-headless-${NVIDIA_DRIVER_VERSION}-open \ + | awk -v tag="${NVIDIA_DRIVER_TAG}" '$3 ~ "^"tag {print $3; exit}') && \ + UTILS_VER=$(apt-cache madison nvidia-utils-${NVIDIA_DRIVER_VERSION} \ + | awk -v tag="${NVIDIA_DRIVER_TAG}" '$3 ~ "^"tag {print $3; exit}') && \ + if [ -z "$HEADLESS_VER" ] || [ -z "$UTILS_VER" ]; then \ + echo "ERROR: No APT package found for driver tag ${NVIDIA_DRIVER_TAG}" >&2; \ + echo " headless: ${HEADLESS_VER:-not found}"; \ + echo " utils: ${UTILS_VER:-not found}"; \ + exit 1; \ + fi && \ + echo "Pinning NVIDIA packages: headless=${HEADLESS_VER} utils=${UTILS_VER}" && \ apt-get install -y --no-install-recommends \ - nvidia-headless-${NVIDIA_DRIVER_VERSION}-open \ - nvidia-utils-${NVIDIA_DRIVER_VERSION} \ + nvidia-headless-${NVIDIA_DRIVER_VERSION}-open=${HEADLESS_VER} \ + nvidia-utils-${NVIDIA_DRIVER_VERSION}=${UTILS_VER} \ nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}-1 \ && rm -rf /var/lib/apt/lists/* # Configure the NVIDIA container runtime as the default for containerd. @@ -319,11 +396,19 @@ echo "==> Creating container..." docker create --platform "${DOCKER_PLATFORM}" --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true echo "==> Exporting filesystem..." -# Previous builds may leave overlayfs work/ dirs with permissions that -# prevent rm on macOS. Force-fix permissions before removing. 
+# Previous builds (especially VM pre-init) may leave root-owned files +# (k3s data, CNI, kubelet) that prevent non-root removal. Try normal +# cleanup first, fall back to sudo if needed. if [ -d "${ROOTFS_DIR}" ]; then + if [ -z "${ROOTFS_DIR}" ] || [ "${ROOTFS_DIR}" = "/" ]; then + echo "ERROR: ROOTFS_DIR is empty or root — refusing to rm -rf" >&2 + exit 1 + fi chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true - rm -rf "${ROOTFS_DIR}" + if ! rm -rf "${ROOTFS_DIR}" 2>/dev/null; then + echo " Root-owned files detected in ${ROOTFS_DIR}, using sudo to clean..." + sudo rm -rf "${ROOTFS_DIR}" + fi fi mkdir -p "${ROOTFS_DIR}" docker export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - @@ -455,6 +540,49 @@ if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_SRC}" ]; then fi fi +# ── Inject NVIDIA kernel modules (GPU rootfs only) ──────────────────── +# The kernel modules are compiled separately by build-nvidia-modules.sh +# against the VM kernel source tree. We inject them here so modprobe +# can load nvidia.ko at VM boot time. +if [ "$GPU_BUILD" = true ]; then + NVIDIA_MODULES_DIR="${PROJECT_ROOT}/target/libkrun-build/nvidia-modules" + + # Read the kernel version exported by build-libkrun.sh. + KERNEL_VERSION_FILE="${PROJECT_ROOT}/target/libkrun-build/kernel-version.txt" + if [ -f "$KERNEL_VERSION_FILE" ]; then + VM_KERNEL_VERSION="$(cat "$KERNEL_VERSION_FILE")" + else + echo "ERROR: kernel-version.txt not found at ${KERNEL_VERSION_FILE}" >&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 + fi + + MODULE_DEST="${ROOTFS_DIR}/lib/modules/${VM_KERNEL_VERSION}/kernel/drivers/video/nvidia" + + if [ -d "${NVIDIA_MODULES_DIR}" ] && ls "${NVIDIA_MODULES_DIR}"/*.ko >/dev/null 2>&1; then + echo "==> Injecting NVIDIA kernel modules (kernel ${VM_KERNEL_VERSION})..." 
+ mkdir -p "${MODULE_DEST}" + cp "${NVIDIA_MODULES_DIR}"/*.ko "${MODULE_DEST}/" + for mod in "${MODULE_DEST}"/*.ko; do + echo " $(basename "$mod") ($(du -h "$mod" | cut -f1))" + done + + # Generate module dependency metadata so modprobe works. + KERNEL_DIR_NAME="$(grep '^KERNEL_VERSION' "${PROJECT_ROOT}/target/libkrun-build/libkrunfw/Makefile" | head -1 | awk '{print $3}')" + SYSTEM_MAP="${PROJECT_ROOT}/target/libkrun-build/libkrunfw/${KERNEL_DIR_NAME}/System.map" + if [ -f "$SYSTEM_MAP" ]; then + depmod -a -b "${ROOTFS_DIR}" -F "$SYSTEM_MAP" "${VM_KERNEL_VERSION}" + else + depmod -a -b "${ROOTFS_DIR}" "${VM_KERNEL_VERSION}" + fi + echo " depmod: module dependencies generated" + else + echo "ERROR: NVIDIA kernel modules not found at ${NVIDIA_MODULES_DIR}" >&2 + echo " Run: tasks/scripts/vm/build-nvidia-modules.sh" >&2 + exit 1 + fi +fi + # ── Base mode: mark rootfs type and skip pre-loading ─────────────────── if [ "$BASE_ONLY" = true ]; then @@ -477,22 +605,11 @@ if [ "$BASE_ONLY" = true ]; then fi if [ "$GPU_BUILD" = true ]; then - echo "==> Verifying GPU components in rootfs..." - if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then - echo "ERROR: nvidia-smi not found in rootfs." - exit 1 - fi if [ ! -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ]; then echo "ERROR: GPU sentinel file not found in rootfs." exit 1 fi - echo " nvidia-smi: found" - # nvidia-container-runtime is installed via nvidia-container-toolkit. - if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then - echo " nvidia-container-runtime: found" - else - echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." - fi + verify_gpu_rootfs "${ROOTFS_DIR}" "${VM_KERNEL_VERSION:-}" "${NVIDIA_DRIVER_TAG}" "${NVIDIA_DRIVER_VERSION}" fi echo "" @@ -713,6 +830,7 @@ else fi # Pre-initialize directly on virtio-fs. Runtime boots attach a separate # block-backed state disk and seed it from the rootfs on first launch. 
+rm -f "${ROOTFS_DIR}-console.log" 2>/dev/null || sudo rm -f "${ROOTFS_DIR}-console.log" 2>/dev/null || true OPENSHELL_VM_DISABLE_STATE_DISK=1 "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" --reset & VM_PID=$! @@ -723,6 +841,13 @@ cleanup_vm() { kill "${VM_PID}" 2>/dev/null || true wait "${VM_PID}" 2>/dev/null || true fi + # Kill orphaned gvproxy processes left by the VM (holds port 30051). + local gvproxy_pids + gvproxy_pids=$(pgrep -f "gvproxy.*listen-qemu" 2>/dev/null || true) + if [ -n "$gvproxy_pids" ]; then + echo " Killing orphaned gvproxy: $gvproxy_pids" + kill $gvproxy_pids 2>/dev/null || true + fi } trap cleanup_vm EXIT @@ -740,15 +865,16 @@ for i in $(seq 1 120); do sleep 1 done -# Wait for containerd to be ready. +# Wait for containerd to be ready. The first boot after a --reset may +# need extra time for k3s to extract its data dir and start containerd. echo " Waiting for containerd..." -for i in $(seq 1 60); do +for i in $(seq 1 180); do if vm_exec k3s ctr version >/dev/null 2>&1; then echo " Containerd ready (${i}s)" break fi - if [ "$i" -eq 60 ]; then - echo "ERROR: containerd did not become ready in 60s" + if [ "$i" -eq 180 ]; then + echo "ERROR: containerd did not become ready in 180s" exit 1 fi sleep 1 @@ -867,17 +993,7 @@ fi # ── GPU verification (full mode) ────────────────────────────────────── if [ "$GPU_BUILD" = true ]; then - echo "==> Verifying GPU components in rootfs..." - if [ ! -f "${ROOTFS_DIR}/usr/bin/nvidia-smi" ]; then - echo "ERROR: nvidia-smi not found in rootfs." - exit 1 - fi - echo " nvidia-smi: found" - if ls "${ROOTFS_DIR}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then - echo " nvidia-container-runtime: found" - else - echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
- fi + verify_gpu_rootfs "${ROOTFS_DIR}" "${VM_KERNEL_VERSION:-}" "${NVIDIA_DRIVER_TAG}" "${NVIDIA_DRIVER_VERSION}" fi echo "" diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index 222bcc641..f9e4c228f 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -281,10 +281,10 @@ rm -f /var/lib/rancher/k3s/server/cred/node-passwd 2>/dev/null || true # Clean stale containerd runtime state from previous boots. # -# The rootfs persists across VM restarts via virtio-fs. The overlayfs -# snapshotter now lives on the host-backed state disk when present, so -# snapshot data and meta.db persist across boots. We only clean runtime -# state (shim PIDs, sockets) that becomes stale when the VM restarts. +# The rootfs persists across VM restarts via virtio-fs. The snapshotter +# (overlayfs on state disk, native on virtiofs) persists across boots, +# so snapshot data and meta.db survive. We only clean runtime state +# (shim PIDs, sockets) that becomes stale when the VM restarts. if [ -d "$CONTAINERD_DIR" ]; then # Remove runtime task state (stale shim PIDs, sockets from dead processes). rm -rf "${CONTAINERD_DIR}/io.containerd.runtime.v2.task" 2>/dev/null || true @@ -296,24 +296,27 @@ if [ -d "$CONTAINERD_DIR" ]; then # Clean stale ingest temp files from the content store. rm -rf "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" 2>/dev/null || true mkdir -p "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" - # meta.db and overlayfs snapshots persist across boots on virtio-fs. - # No need to delete meta.db — snapshot metadata remains valid since - # the snapshotter directory is no longer backed by volatile tmpfs. + # meta.db and snapshots persist across boots. ts "cleaned containerd runtime state (meta.db + snapshots preserved)" fi rm -rf /run/k3s 2>/dev/null || true -# Ensure the overlayfs snapshotter directory exists. 
The snapshotter -# runs directly on virtio-fs, so layer data and snapshot metadata -# persist across VM restarts. This eliminates the need to re-import -# image tarballs and re-extract layers on every boot, significantly -# reducing sandbox creation time. -OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" -mkdir -p "$OVERLAYFS_DIR" +# Select snapshotter based on the backing filesystem. overlayfs requires +# filesystem features (redirect_dir xattrs) that virtiofs does not +# support. When containerd lives on the block-backed state disk (ext4), +# overlayfs works and provides efficient layer sharing. On virtiofs +# (no state disk), fall back to the native snapshotter which uses +# simple directory copies and works on any POSIX filesystem. if [ "$STATE_DISK_ACTIVE" = true ]; then - ts "overlayfs snapshotter on block-backed containerd state" + SNAPSHOTTER="overlayfs" + OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" + mkdir -p "$OVERLAYFS_DIR" + ts "snapshotter: overlayfs on block-backed containerd state" else - ts "overlayfs snapshotter on virtio-fs (persistent)" + SNAPSHOTTER="native" + NATIVE_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.native" + mkdir -p "$NATIVE_DIR" + ts "snapshotter: native on virtio-fs (overlayfs unsupported on virtiofs)" fi ts "stale artifacts cleaned" @@ -406,6 +409,12 @@ fi if [ "${GPU_ENABLED:-false}" = "true" ]; then ts "GPU mode enabled — loading NVIDIA drivers" + if ! 
command -v modprobe >/dev/null 2>&1; then + echo "FATAL: modprobe not found — the kmod package is missing from the GPU rootfs" >&2 + echo "Fix: add 'kmod' to the apt-get install list in build-rootfs.sh and rebuild" >&2 + exit 1 + fi + modprobe nvidia || { echo "FATAL: failed to load nvidia kernel module" >&2; exit 1; } modprobe nvidia_uvm || { echo "FATAL: failed to load nvidia_uvm kernel module" >&2; exit 1; } modprobe nvidia_modeset || { echo "FATAL: failed to load nvidia_modeset kernel module" >&2; exit 1; } @@ -822,7 +831,7 @@ K3S_ARGS=( --resolv-conf=/etc/resolv.conf --tls-san="localhost,127.0.0.1,10.0.2.15,192.168.127.2,$NODE_IP" --flannel-backend=none - --snapshotter=overlayfs + --snapshotter="$SNAPSHOTTER" --kube-proxy-arg=proxy-mode=nftables --kube-proxy-arg=nodeport-addresses=0.0.0.0/0 # virtio-fs passthrough reports the host disk usage, which is @@ -838,7 +847,7 @@ K3S_ARGS=( # container create after an image import may still be slow if # containerd needs to extract layers. 10m is a conservative safety # margin; typical operations complete much faster with persistent - # overlayfs snapshots. + # snapshots (overlayfs on state disk, native on virtiofs). --kubelet-arg=runtime-request-timeout=10m ) @@ -886,30 +895,51 @@ setsid sh -c ' ' & fi -# ── Clear stale kine bootstrap lock ───────────────────────────────────── -# k3s uses kine with a SQLite backend at state.db. When k3s starts, kine -# sets a bootstrap lock row; if k3s is killed before completing bootstrap -# (SIGKILL, host crash, power loss), the lock persists and the next k3s -# instance hangs forever on: -# "Bootstrap key already locked — waiting for data to be populated by -# another server" +# ── Kine database health check ─────────────────────────────────────────── +# k3s uses kine with a SQLite backend at state.db. Two failure modes: # -# We clear the lock row before starting k3s so that a warm boot with -# persistent state.db succeeds. 
If state.db doesn't exist (first boot or -# --reset), this is a harmless no-op. If state.db is corrupt, sqlite3 -# fails silently (|| true) and the host-side corruption check in exec.rs -# will have already removed the file. +# 1. Page-level corruption (SQLITE_CORRUPT) — from a killed VM mid-write. +# Detected via PRAGMA quick_check; the DB is removed so k3s starts fresh. +# The host-side recover_corrupt_kine_db() in exec.rs only checks the +# virtiofs path, so it misses corruption on the state disk (--gpu). +# This in-VM check is the authoritative corruption gate. +# +# 2. Stale bootstrap lock — kine sets a lock row on startup; if k3s is +# killed before completing bootstrap, the lock persists and the next +# instance hangs on "Bootstrap key already locked". Cleared via DELETE. KINE_DB="/var/lib/rancher/k3s/server/db/state.db" if [ -f "$KINE_DB" ]; then - ts "clearing stale kine bootstrap lock (if any)" - # If sqlite3 fails (corrupt DB, missing binary), log the failure. - # The host-side corruption check in exec.rs handles the corrupt case, - # but we should still know about it. - if ! sqlite3 "$KINE_DB" "DELETE FROM kine WHERE name LIKE '/bootstrap/%';" 2>/dev/null; then - ts "WARNING: failed to clear kine bootstrap lock — k3s may hang if DB is corrupt" + # When the state disk is in use, the kine DB lives on the block device, + # not on the virtiofs rootfs. The host-side recover_corrupt_kine_db() + # in exec.rs can only check the virtiofs path, so it misses corruption + # on the state disk. Run a quick_check here inside the VM where the + # bind-mount is active and the DB is at its final runtime path. + _kine_corrupt=false + if command -v sqlite3 >/dev/null 2>&1; then + _qc_result=$(sqlite3 "$KINE_DB" "PRAGMA quick_check;" 2>&1) || _kine_corrupt=true + if [ "$_kine_corrupt" = false ] && [ "$_qc_result" != "ok" ]; then + _kine_corrupt=true + fi + else + # No sqlite3 binary — can't verify, try to proceed. 
+ ts "WARNING: sqlite3 not available, skipping kine DB integrity check" fi - if ! sqlite3 "$KINE_DB" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null; then - ts "WARNING: failed to checkpoint kine WAL" + + if [ "$_kine_corrupt" = true ]; then + ts "WARNING: kine database is corrupt ($_qc_result), removing for clean boot" + rm -f "$KINE_DB" "${KINE_DB}-wal" "${KINE_DB}-shm" + ts "corrupt kine DB removed — k3s will recreate from manifests" + else + ts "clearing stale kine bootstrap lock (if any)" + if ! sqlite3 "$KINE_DB" "DELETE FROM kine WHERE name LIKE '/bootstrap/%';" 2>/dev/null; then + ts "WARNING: failed to clear kine bootstrap lock — removing DB for safety" + rm -f "$KINE_DB" "${KINE_DB}-wal" "${KINE_DB}-shm" + fi + if [ -f "$KINE_DB" ]; then + if ! sqlite3 "$KINE_DB" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null; then + ts "WARNING: failed to checkpoint kine WAL" + fi + fi fi fi diff --git a/crates/openshell-vm/src/backend/cloud_hypervisor.rs b/crates/openshell-vm/src/backend/cloud_hypervisor.rs index e6c89a93c..13a73be03 100644 --- a/crates/openshell-vm/src/backend/cloud_hypervisor.rs +++ b/crates/openshell-vm/src/backend/cloud_hypervisor.rs @@ -10,10 +10,15 @@ use std::io::{Read, Write}; use std::os::unix::net::UnixStream; +use std::os::unix::process::CommandExt; use std::path::{Path, PathBuf}; use std::time::{Duration, Instant}; -use super::VmBackend; +use super::{ + GUEST_MAC, TAP_GUEST_IP, TAP_HOST_IP, TAP_NETMASK, VmBackend, bridge_bidirectional, + build_kernel_cmdline, setup_tap_host_networking, shell_escape, start_tcp_port_forwarder, + teardown_tap_host_networking, wait_for_socket, +}; use crate::exec::{ VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, }; @@ -165,26 +170,6 @@ fn http_request_unix( Ok((status_code, body_str)) } -/// Wait for a Unix socket to appear on the filesystem. 
-fn wait_for_socket(socket_path: &Path, label: &str, timeout: Duration) -> Result<(), VmError> { - let deadline = Instant::now() + timeout; - let mut interval = Duration::from_millis(10); - - while !socket_path.exists() { - if Instant::now() >= deadline { - return Err(VmError::HostSetup(format!( - "{label} socket did not appear within {}s: {}", - timeout.as_secs(), - socket_path.display(), - ))); - } - std::thread::sleep(interval); - interval = (interval * 2).min(Duration::from_millis(200)); - } - - Ok(()) -} - /// Create the VM via the cloud-hypervisor REST API. fn api_vm_create(socket_path: &Path, payload: &str) -> Result<(), VmError> { let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.create", Some(payload)) @@ -272,40 +257,7 @@ fn build_vm_create_payload( ) -> Result { let mem_bytes = u64::from(config.mem_mib) * 1024 * 1024; - let mut cmdline_parts = vec![ - "console=ttyS0".to_string(), - "root=rootfs".to_string(), - "rootfstype=virtiofs".to_string(), - "rw".to_string(), - "panic=-1".to_string(), - format!("init={effective_exec_path}"), - ]; - - // Pass environment variables via kernel cmdline. Unrecognised kernel - // parameters are forwarded to init as env vars. Only simple KEY=VALUE - // pairs without spaces are safe (cmdline is space-delimited, ~4096 B). 
- if config.gpu_enabled && config.vfio_device.is_some() { - cmdline_parts.push("GPU_ENABLED=true".to_string()); - } - if let Some(state_disk) = &config.state_disk { - cmdline_parts.push(format!( - "OPENSHELL_VM_STATE_DISK_DEVICE={}", - state_disk.guest_device - )); - } - for var in &config.env { - if var.contains('=') && !var.contains(' ') && !var.contains('"') { - cmdline_parts.push(var.clone()); - } - } - - if use_tap_net { - cmdline_parts.push(format!("VM_NET_IP={CHV_TAP_GUEST_IP}")); - cmdline_parts.push(format!("VM_NET_GW={CHV_TAP_HOST_IP}")); - cmdline_parts.push(format!("VM_NET_DNS={}", host_dns_server())); - } - - let cmdline = cmdline_parts.join(" "); + let cmdline = build_kernel_cmdline(config, effective_exec_path, use_tap_net); let mut payload = serde_json::json!({ "cpus": { @@ -352,9 +304,9 @@ fn build_vm_create_payload( // elevated privileges, so TAP access is expected. if use_tap_net { payload["net"] = serde_json::json!([{ - "mac": "5a:94:ef:e4:0c:ee", - "ip": CHV_TAP_HOST_IP, - "mask": CHV_TAP_NETMASK, + "mac": GUEST_MAC, + "ip": TAP_HOST_IP, + "mask": TAP_NETMASK, }]); } @@ -411,13 +363,21 @@ fn launch_cloud_hypervisor( let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) .map_err(|e| VmError::Fork(format!("create virtiofsd log: {e}")))?; - let mut virtiofsd_child = std::process::Command::new(&backend.virtiofsd) + let mut virtiofsd_cmd = std::process::Command::new(&backend.virtiofsd); + virtiofsd_cmd .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) .arg(format!("--shared-dir={}", config.rootfs.display())) .arg("--cache=always") .stdout(std::process::Stdio::null()) - .stderr(virtiofsd_log_file) - .spawn() + .stderr(virtiofsd_log_file); + #[allow(unsafe_code)] + unsafe { + virtiofsd_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut virtiofsd_child = virtiofsd_cmd.spawn() .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; eprintln!( @@ -438,7 +398,7 @@ fn 
launch_cloud_hypervisor( // Unlike libkrun (which exits when init terminates), cloud-hypervisor // keeps running after PID 1 exits (kernel panics). A wrapper init script // runs the command then calls `poweroff -f` for a clean ACPI shutdown. - let is_exec_mode = config.exec_path != "/srv/openshell-vm-init.sh"; + let is_exec_mode = config.is_exec_mode(); let wrapper_path = config.rootfs.join("tmp/chv-exec-wrapper.sh"); let effective_exec_path; if is_exec_mode { @@ -507,12 +467,20 @@ fn launch_cloud_hypervisor( let chv_log_file = std::fs::File::create(&chv_log) .map_err(|e| VmError::Fork(format!("create cloud-hypervisor log: {e}")))?; - let mut chv_child = std::process::Command::new(&backend.chv_binary) + let mut chv_cmd = std::process::Command::new(&backend.chv_binary); + chv_cmd .arg("--api-socket") .arg(&api_sock_path) .stdout(std::process::Stdio::null()) - .stderr(chv_log_file) - .spawn() + .stderr(chv_log_file); + #[allow(unsafe_code)] + unsafe { + chv_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut chv_child = chv_cmd.spawn() .map_err(|e| VmError::Fork(format!("start cloud-hypervisor: {e}")))?; let chv_pid = chv_child.id() as i32; @@ -549,7 +517,7 @@ fn launch_cloud_hypervisor( // so the guest can reach the internet through the host. 
let mut original_ip_forward: Option = None; if use_tap_net { - match setup_chv_host_networking() { + match setup_tap_host_networking() { Ok(orig) => original_ip_forward = Some(orig), Err(e) => { eprintln!("WARNING: host networking setup failed: {e}"); @@ -558,8 +526,9 @@ fn launch_cloud_hypervisor( } } - // Write runtime state (vsock_bridge: true — CHV uses AF_VSOCK bridging) - if config.exec_path == "/srv/openshell-vm-init.sh" { + // Write runtime state (vsock_bridge: true — CHV uses Unix socket vsock + // bridging with a text protocol, not kernel AF_VSOCK) + if !config.is_exec_mode() { if let Err(err) = write_vm_runtime_state(&config.rootfs, chv_pid, &console_log, None, true) { let _ = api_vm_shutdown(&api_sock_path); @@ -568,13 +537,14 @@ fn launch_cloud_hypervisor( let _ = virtiofsd_child.kill(); let _ = virtiofsd_child.wait(); if let Some(ref orig) = original_ip_forward { - teardown_chv_host_networking(orig); + teardown_tap_host_networking(orig); } clear_vm_runtime_state(&config.rootfs); return Err(err); } } + let exec_socket = vm_exec_socket_path(&config.rootfs); // CHV TAP networking doesn't provide built-in port forwarding like // gvproxy. Start a TCP proxy for each port mapping so the host can // reach guest services (e.g., the gateway health check on :30051). 
@@ -583,7 +553,19 @@ fn launch_cloud_hypervisor( let parts: Vec<&str> = pm.split(':').collect(); if parts.len() == 2 { if let (Ok(hp), Ok(gp)) = (parts[0].parse::(), parts[1].parse::()) { - start_tcp_port_forwarder(hp, CHV_TAP_GUEST_IP, gp)?; + if let Err(e) = start_tcp_port_forwarder(hp, TAP_GUEST_IP, gp) { + let _ = chv_child.kill(); + let _ = chv_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } } } } @@ -598,20 +580,33 @@ fn launch_cloud_hypervisor( // Start vsock exec bridge (exec Unix socket → CHV vsock Unix socket). // The bridge allows `openshell-vm exec` and bootstrap to communicate // with the guest exec agent over the standard exec socket path. - let exec_socket = vm_exec_socket_path(&config.rootfs); start_vsock_exec_bridge(&exec_socket, &vsock_sock_path, VM_EXEC_VSOCK_PORT)?; // Gateway bootstrap and health check (mirrors libkrun backend). 
- if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { + if !config.is_exec_mode() && !config.port_map.is_empty() { let gateway_port = crate::gateway_host_port(config); - crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; + if let Err(e) = crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port) + .and_then(|_| crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)) + { + let _ = chv_child.kill(); + let _ = chv_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } } eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); eprintln!("Press Ctrl+C to stop."); // Signal forwarding: SIGINT/SIGTERM -> graceful shutdown + crate::CHILD_PID.store(chv_pid, std::sync::atomic::Ordering::Relaxed); unsafe { libc::signal( libc::SIGINT, @@ -621,21 +616,21 @@ fn launch_cloud_hypervisor( libc::SIGTERM, crate::forward_signal as *const () as libc::sighandler_t, ); - crate::CHILD_PID.store(chv_pid, std::sync::atomic::Ordering::Relaxed); } // Wait for cloud-hypervisor to exit let status = chv_child .wait() .map_err(|e| VmError::HostSetup(format!("wait for cloud-hypervisor: {e}")))?; + crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); // Clean up host networking rules if let Some(ref orig) = original_ip_forward { - teardown_chv_host_networking(orig); + teardown_tap_host_networking(orig); } // Cleanup - if config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.is_exec_mode() { clear_vm_runtime_state(&config.rootfs); } let _ = virtiofsd_child.kill(); @@ -654,31 +649,11 @@ fn launch_cloud_hypervisor( Ok(code) } 
-/// Escape a string for use in a shell script. Wraps in single quotes. -fn shell_escape(s: &str) -> String { - if s.is_empty() { - return "''".to_string(); - } - if !s.contains('\'') && !s.contains(' ') && !s.contains('"') && !s.contains('\\') { - return s.to_string(); - } - format!("'{}'", s.replace('\'', "'\\''")) -} - // ── Vsock exec bridge ─────────────────────────────────────────────────── /// Guest CID assigned in the cloud-hypervisor vsock config. const VSOCK_GUEST_CID: u32 = 3; -// ── CHV TAP networking constants ──────────────────────────────────────── -// cloud-hypervisor defaults to 192.168.249.1/24 on the host side of the -// TAP device. The guest uses .2 with the host as its gateway. - -const CHV_TAP_HOST_IP: &str = "192.168.249.1"; -const CHV_TAP_GUEST_IP: &str = "192.168.249.2"; -const CHV_TAP_SUBNET: &str = "192.168.249.0/24"; -const CHV_TAP_NETMASK: &str = "255.255.255.0"; - /// Start a background bridge: exec Unix socket → CHV vsock Unix socket. /// /// cloud-hypervisor exposes guest vsock via a host-side Unix socket with a @@ -808,215 +783,6 @@ fn chv_vsock_connect(chv_vsock_socket: &Path, port: u32) -> std::io::Result String { - content - .lines() - .filter(|line| line.starts_with("nameserver")) - .filter_map(|line| line.split_whitespace().nth(1)) - .find(|ip| !ip.starts_with("127.")) - .map(String::from) - .unwrap_or_else(|| "8.8.8.8".to_string()) -} - -/// Read the host's primary DNS server. -/// -/// Checks `/etc/resolv.conf` first. If every nameserver there is a loopback -/// address (e.g. systemd-resolved's `127.0.0.53`), falls back to the -/// upstream resolv.conf at `/run/systemd/resolve/resolv.conf` which -/// contains the real upstream nameservers. Final fallback is `8.8.8.8`. 
-fn host_dns_server() -> String { - for path in &["/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"] { - if let Ok(content) = std::fs::read_to_string(path) { - let server = parse_dns_server(&content); - if server != "8.8.8.8" { - return server; - } - } - } - "8.8.8.8".to_string() -} - -/// Run a command, returning an error if it fails. -fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), VmError> { - let output = std::process::Command::new(cmd) - .args(args) - .output() - .map_err(|e| VmError::HostSetup(format!("{cmd}: {e}")))?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - return Err(VmError::HostSetup(format!( - "{cmd} {}: {}", - args.join(" "), - stderr.trim() - ))); - } - - Ok(()) -} - -/// Set up host-side networking so the CHV guest can reach the internet. -/// -/// 1. Enable IP forwarding (saving the original value for teardown) -/// 2. MASQUERADE outbound traffic from the VM subnet -/// 3. Allow forwarding to/from the VM subnet -/// -/// Returns the original value of `ip_forward` so the caller can restore it. 
-fn setup_chv_host_networking() -> Result { - let original_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward") - .map(|s| s.trim().to_string()) - .unwrap_or_else(|_| "0".to_string()); - - std::fs::write("/proc/sys/net/ipv4/ip_forward", "1") - .map_err(|e| VmError::HostSetup(format!("enable IP forwarding: {e}")))?; - - run_cmd( - "iptables", - &[ - "-t", - "nat", - "-A", - "POSTROUTING", - "-s", - CHV_TAP_SUBNET, - "!", - "-d", - CHV_TAP_SUBNET, - "-j", - "MASQUERADE", - ], - )?; - - run_cmd( - "iptables", - &["-A", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"], - )?; - - run_cmd( - "iptables", - &["-A", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"], - )?; - - eprintln!("host networking: IP forwarding + NAT masquerade for {CHV_TAP_SUBNET}"); - Ok(original_ip_forward) -} - -/// Remove the iptables rules added by [`setup_chv_host_networking`] and -/// restore the original `ip_forward` sysctl value. -fn teardown_chv_host_networking(original_ip_forward: &str) { - let _ = run_cmd( - "iptables", - &[ - "-t", - "nat", - "-D", - "POSTROUTING", - "-s", - CHV_TAP_SUBNET, - "!", - "-d", - CHV_TAP_SUBNET, - "-j", - "MASQUERADE", - ], - ); - let _ = run_cmd( - "iptables", - &["-D", "FORWARD", "-s", CHV_TAP_SUBNET, "-j", "ACCEPT"], - ); - let _ = run_cmd( - "iptables", - &["-D", "FORWARD", "-d", CHV_TAP_SUBNET, "-j", "ACCEPT"], - ); - if original_ip_forward != "1" { - let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward); - } - eprintln!( - "host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}" - ); -} - -/// Start a background TCP proxy that forwards `127.0.0.1:{host_port}` -/// to `{guest_ip}:{guest_port}`. -/// -/// Each accepted connection spawns two threads for bidirectional copy. -/// The listener thread runs until the process exits. 
-fn start_tcp_port_forwarder( - host_port: u16, - guest_ip: &str, - guest_port: u16, -) -> Result<(), VmError> { - use std::net::{TcpListener, TcpStream}; - - let listener = TcpListener::bind(("127.0.0.1", host_port)) - .map_err(|e| VmError::HostSetup(format!("bind port forwarder on :{host_port}: {e}")))?; - - let guest_addr = format!("{guest_ip}:{guest_port}"); - eprintln!("port forwarder: 127.0.0.1:{host_port} -> {guest_addr}"); - - std::thread::spawn(move || { - for stream in listener.incoming() { - let client = match stream { - Ok(s) => s, - Err(_) => continue, - }; - - let addr = guest_addr.clone(); - std::thread::spawn(move || { - if let Ok(remote) = TcpStream::connect(&addr) { - forward_tcp_bidirectional(client, remote); - } - }); - } - }); - - Ok(()) -} - -/// Copy data bidirectionally between two TCP streams until either side closes. -fn forward_tcp_bidirectional(client: std::net::TcpStream, remote: std::net::TcpStream) { - let Ok(mut client_r) = client.try_clone() else { - return; - }; - let mut client_w = client; - let Ok(mut remote_r) = remote.try_clone() else { - return; - }; - let mut remote_w = remote; - - std::thread::spawn(move || { - let _ = std::io::copy(&mut client_r, &mut remote_w); - }); - std::thread::spawn(move || { - let _ = std::io::copy(&mut remote_r, &mut client_w); - }); -} - #[cfg(test)] mod tests { use super::*; @@ -1071,6 +837,7 @@ mod tests { gateway_name: "test".into(), state_disk: None, gpu_enabled: true, + gpu_has_msix: true, vfio_device: Some("0000:41:00.0".into()), backend: crate::VmBackendChoice::CloudHypervisor, }; @@ -1129,6 +896,7 @@ mod tests { gateway_name: "test".into(), state_disk: None, gpu_enabled: false, + gpu_has_msix: true, vfio_device: None, backend: crate::VmBackendChoice::Auto, }; @@ -1185,6 +953,7 @@ mod tests { gateway_name: "test".into(), state_disk: None, gpu_enabled: true, + gpu_has_msix: true, vfio_device: Some("0000:41:00.0".into()), backend: crate::VmBackendChoice::CloudHypervisor, }; @@ -1251,6 +1020,7 
@@ mod tests { gateway_name: "test".into(), state_disk: None, gpu_enabled: false, + gpu_has_msix: true, vfio_device: None, backend: crate::VmBackendChoice::Auto, }; @@ -1315,6 +1085,7 @@ mod tests { gateway_name: "test".into(), state_disk: None, gpu_enabled: false, + gpu_has_msix: true, vfio_device: None, backend: crate::VmBackendChoice::CloudHypervisor, }; @@ -1340,7 +1111,7 @@ mod tests { let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); let net = &json["net"][0]; - assert_eq!(net["mac"], "5a:94:ef:e4:0c:ee"); + assert_eq!(net["mac"], GUEST_MAC); assert_eq!(net["ip"], "192.168.249.1"); assert_eq!(net["mask"], "255.255.255.0"); } @@ -1368,6 +1139,7 @@ mod tests { gateway_name: "test".into(), state_disk: None, gpu_enabled: true, + gpu_has_msix: true, vfio_device: Some("0000:41:00.0".into()), backend: crate::VmBackendChoice::CloudHypervisor, }; @@ -1407,72 +1179,4 @@ mod tests { assert_eq!(json["net"][0]["ip"], "192.168.249.1"); } - // ── parse_dns_server tests ────────────────────────────────────────── - - #[test] - fn parse_dns_server_returns_first_non_loopback() { - let content = "nameserver 10.0.0.1\nnameserver 8.8.8.8\n"; - assert_eq!(parse_dns_server(content), "10.0.0.1"); - } - - #[test] - fn parse_dns_server_skips_systemd_resolved() { - let content = "nameserver 127.0.0.53\nnameserver 1.1.1.1\n"; - assert_eq!(parse_dns_server(content), "1.1.1.1"); - } - - #[test] - fn parse_dns_server_skips_all_loopback_variants() { - let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 172.16.0.1\n"; - assert_eq!(parse_dns_server(content), "172.16.0.1"); - } - - #[test] - fn parse_dns_server_falls_back_when_only_loopback() { - let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\n"; - assert_eq!(parse_dns_server(content), "8.8.8.8"); - } - - #[test] - fn parse_dns_server_handles_empty_content() { - assert_eq!(parse_dns_server(""), "8.8.8.8"); - } - - #[test] - fn parse_dns_server_ignores_comments_and_other_lines() { - let content 
= "# Generated by NetworkManager\nsearch example.com\nnameserver 10.1.2.3\n"; - assert_eq!(parse_dns_server(content), "10.1.2.3"); - } - - // ── shell_escape tests ────────────────────────────────────────────── - - #[test] - fn shell_escape_empty_string() { - assert_eq!(shell_escape(""), "''"); - } - - #[test] - fn shell_escape_simple_string() { - assert_eq!(shell_escape("hello"), "hello"); - } - - #[test] - fn shell_escape_string_with_single_quotes() { - assert_eq!(shell_escape("it's"), "'it'\\''s'"); - } - - #[test] - fn shell_escape_string_with_spaces() { - assert_eq!(shell_escape("hello world"), "'hello world'"); - } - - #[test] - fn shell_escape_string_with_double_quotes() { - assert_eq!(shell_escape(r#"say "hi""#), r#"'say "hi"'"#); - } - - #[test] - fn shell_escape_string_with_backslash() { - assert_eq!(shell_escape("path\\to"), "'path\\to'"); - } } diff --git a/crates/openshell-vm/src/backend/libkrun.rs b/crates/openshell-vm/src/backend/libkrun.rs index 1f077563a..0a27c4c71 100644 --- a/crates/openshell-vm/src/backend/libkrun.rs +++ b/crates/openshell-vm/src/backend/libkrun.rs @@ -389,7 +389,7 @@ fn launch_libkrun(config: &VmConfig) -> Result { std::process::exit(1); } _ => { - if config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.is_exec_mode() { let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); if let Err(err) = write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid, false) @@ -416,7 +416,7 @@ fn launch_libkrun(config: &VmConfig) -> Result { setup_gvproxy_port_forwarding(api_sock, &config.port_map)?; } - if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { + if !config.is_exec_mode() && !config.port_map.is_empty() { let gateway_port = gateway_host_port(config); bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; @@ -425,6 +425,7 @@ fn launch_libkrun(config: &VmConfig) -> Result { 
eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); eprintln!("Press Ctrl+C to stop."); + crate::CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); unsafe { libc::signal( libc::SIGINT, @@ -434,15 +435,15 @@ fn launch_libkrun(config: &VmConfig) -> Result { libc::SIGTERM, crate::forward_signal as *const () as libc::sighandler_t, ); - crate::CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); } let mut status: libc::c_int = 0; unsafe { libc::waitpid(pid, &raw mut status, 0); } + crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); - if config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.is_exec_mode() { clear_vm_runtime_state(&config.rootfs); } if let Some(mut guard) = gvproxy_guard diff --git a/crates/openshell-vm/src/backend/mod.rs b/crates/openshell-vm/src/backend/mod.rs index 9c2167fc5..c08d4e4b0 100644 --- a/crates/openshell-vm/src/backend/mod.rs +++ b/crates/openshell-vm/src/backend/mod.rs @@ -5,20 +5,22 @@ //! //! Defines the [`VmBackend`] trait that all hypervisor backends implement, //! and shared infrastructure (gvproxy startup, networking helpers) used by -//! both the libkrun and cloud-hypervisor backends. +//! the libkrun, cloud-hypervisor, and QEMU backends. pub mod cloud_hypervisor; pub mod libkrun; +pub mod qemu; +use std::os::unix::net::UnixStream; use std::path::{Path, PathBuf}; -use std::time::Instant; +use std::time::{Duration, Instant}; use crate::{ GvproxyGuard, NetBackend, VmConfig, VmError, gvproxy_expose, gvproxy_socket_dir, kill_stale_gvproxy, kill_stale_gvproxy_by_port, pick_gvproxy_ssh_port, vm_rootfs_key, }; -/// Trait implemented by each hypervisor backend (libkrun, cloud-hypervisor). +/// Trait implemented by each hypervisor backend (libkrun, cloud-hypervisor, QEMU). pub trait VmBackend { /// Launch a VM with the given configuration. 
/// @@ -115,8 +117,8 @@ pub(crate) fn start_gvproxy( ); { - let deadline = Instant::now() + std::time::Duration::from_secs(5); - let mut interval = std::time::Duration::from_millis(5); + let deadline = Instant::now() + Duration::from_secs(5); + let mut interval = Duration::from_millis(5); while !net_sock.exists() { if Instant::now() >= deadline { return Err(VmError::Fork( @@ -124,7 +126,7 @@ pub(crate) fn start_gvproxy( )); } std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(100)); + interval = (interval * 2).min(Duration::from_millis(100)); } } @@ -144,15 +146,15 @@ pub(crate) fn setup_gvproxy_port_forwarding( ) -> Result<(), VmError> { let fwd_start = Instant::now(); { - let deadline = Instant::now() + std::time::Duration::from_secs(2); - let mut interval = std::time::Duration::from_millis(5); + let deadline = Instant::now() + Duration::from_secs(2); + let mut interval = Duration::from_millis(5); while !api_sock.exists() { if Instant::now() >= deadline { eprintln!("warning: gvproxy API socket not ready after 2s, attempting anyway"); break; } std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(200)); + interval = (interval * 2).min(Duration::from_millis(200)); } } @@ -174,8 +176,8 @@ pub(crate) fn setup_gvproxy_port_forwarding( ); let mut expose_ok = false; - let mut retry_interval = std::time::Duration::from_millis(100); - let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); + let mut retry_interval = Duration::from_millis(100); + let expose_deadline = Instant::now() + Duration::from_secs(10); loop { match gvproxy_expose(api_sock, &expose_body) { Ok(()) => { @@ -189,7 +191,7 @@ pub(crate) fn setup_gvproxy_port_forwarding( break; } std::thread::sleep(retry_interval); - retry_interval = (retry_interval * 2).min(std::time::Duration::from_secs(1)); + retry_interval = (retry_interval * 2).min(Duration::from_secs(1)); } } } @@ -206,3 +208,400 @@ pub(crate) fn 
setup_gvproxy_port_forwarding( Ok(()) } + +// ── TAP networking constants ──────────────────────────────────────────── +// cloud-hypervisor defaults to 192.168.249.1/24 on the host side of the +// TAP device. The guest uses .2 with the host as its gateway. + +/// Fixed MAC for the guest TAP interface. Only one VM runs per host. +pub(crate) const GUEST_MAC: &str = "5a:94:ef:e4:0c:ee"; + +pub(crate) const TAP_HOST_IP: &str = "192.168.249.1"; +pub(crate) const TAP_GUEST_IP: &str = "192.168.249.2"; +pub(crate) const TAP_SUBNET: &str = "192.168.249.0/24"; +pub(crate) const TAP_NETMASK: &str = "255.255.255.0"; + +/// Wait for a Unix socket to appear on the filesystem. +pub(crate) fn wait_for_socket( + socket_path: &Path, + label: &str, + timeout: Duration, +) -> Result<(), VmError> { + let deadline = Instant::now() + timeout; + let mut interval = Duration::from_millis(10); + + while !socket_path.exists() { + if Instant::now() >= deadline { + return Err(VmError::HostSetup(format!( + "{label} socket did not appear within {}s: {}", + timeout.as_secs(), + socket_path.display(), + ))); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + + Ok(()) +} + +/// Run a command, returning an error if it fails. +pub(crate) fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), VmError> { + let output = std::process::Command::new(cmd) + .args(args) + .output() + .map_err(|e| VmError::HostSetup(format!("{cmd}: {e}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(VmError::HostSetup(format!( + "{cmd} {}: {}", + args.join(" "), + stderr.trim() + ))); + } + + Ok(()) +} + +/// Escape a string for use in a shell script. +/// +/// Uses an allowlist of safe characters; anything outside the list gets +/// single-quoted. Single quotes inside the value are escaped with the +/// standard `'\''` idiom. 
pub(crate) fn shell_escape(s: &str) -> String {
    if s.is_empty() {
        return "''".to_string();
    }
    // Allowlisted bytes never need quoting in POSIX sh; return unchanged.
    if s.bytes().all(|b| matches!(b,
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9'
        | b'_' | b'-' | b'.' | b'/' | b':' | b'@' | b'='
    )) {
        return s.to_string();
    }
    // Standard sh idiom: close the quote, emit an escaped quote, reopen.
    format!("'{}'", s.replace('\'', "'\\''"))
}

/// Parse a DNS server from resolv.conf content.
///
/// Returns the first nameserver that is not a loopback address
/// (`127.x.x.x` or `::1`), or `8.8.8.8` if none is found.
pub(crate) fn parse_dns_server(content: &str) -> String {
    content
        .lines()
        .filter_map(|line| {
            // Match `nameserver` as an exact first token, not a prefix, so
            // lines like `nameserver-foo ...` or run-together junk are not
            // misread as nameserver entries.
            let mut fields = line.split_whitespace();
            if fields.next() == Some("nameserver") {
                fields.next()
            } else {
                None
            }
        })
        // Skip loopback resolvers (e.g. systemd-resolved's 127.0.0.53 and
        // the IPv6 loopback ::1) — they are unreachable from the guest.
        .find(|ip| !ip.starts_with("127.") && *ip != "::1")
        .map(String::from)
        .unwrap_or_else(|| "8.8.8.8".to_string())
}

/// Read the host's primary DNS server.
///
/// Checks `/etc/resolv.conf` first. If every nameserver there is a loopback
/// address (e.g. systemd-resolved's `127.0.0.53`), falls back to the
/// upstream resolv.conf at `/run/systemd/resolve/resolv.conf` which
/// contains the real upstream nameservers. Final fallback is `8.8.8.8`.
pub(crate) fn host_dns_server() -> String {
    for path in &["/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"] {
        if let Ok(content) = std::fs::read_to_string(path) {
            let server = parse_dns_server(&content);
            // `8.8.8.8` doubles as the "nothing usable found" sentinel, so
            // only a non-sentinel answer stops the search.
            if server != "8.8.8.8" {
                return server;
            }
        }
    }
    "8.8.8.8".to_string()
}

// ── Kernel command line ─────────────────────────────────────────────────

// Build the kernel command line shared by all backends that use virtiofs
// rootfs and the standard init path.
+pub(crate) fn build_kernel_cmdline( + config: &VmConfig, + effective_exec_path: &str, + use_tap_net: bool, +) -> String { + let mut parts = vec![ + "console=ttyS0".to_string(), + "root=rootfs".to_string(), + "rootfstype=virtiofs".to_string(), + "rw".to_string(), + "panic=-1".to_string(), + format!("init={effective_exec_path}"), + ]; + + if config.gpu_enabled && config.vfio_device.is_some() { + parts.push("GPU_ENABLED=true".to_string()); + } + if let Some(state_disk) = &config.state_disk { + parts.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + for var in &config.env { + if var.contains('=') && !var.contains(' ') && !var.contains('"') { + parts.push(var.clone()); + } + } + + if use_tap_net { + parts.push(format!("VM_NET_IP={TAP_GUEST_IP}")); + parts.push(format!("VM_NET_GW={TAP_HOST_IP}")); + parts.push(format!("VM_NET_DNS={}", host_dns_server())); + } + + parts.join(" ") +} + +// ── TAP host networking ───────────────────────────────────────────────── + +/// Set up host-side networking so the guest can reach the internet via TAP. +/// +/// 1. Enable IP forwarding (saving the original value for teardown) +/// 2. MASQUERADE outbound traffic from the VM subnet +/// 3. Allow forwarding to/from the VM subnet +/// +/// Returns the original value of `ip_forward` so the caller can restore it. 
// NOTE(review): the extraction stripped this function's generic return
// parameters; reconstructed as Result<String, VmError> from its callers
// (teardown_tap_host_networking takes &str; errors here are VmError) —
// confirm against the crate before merging.
pub(crate) fn setup_tap_host_networking() -> Result<String, VmError> {
    // Capture the current ip_forward value so teardown can restore it;
    // if the sysctl cannot be read, assume the conventional default "0".
    let original_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward")
        .map(|s| s.trim().to_string())
        .unwrap_or_else(|_| "0".to_string());

    std::fs::write("/proc/sys/net/ipv4/ip_forward", "1")
        .map_err(|e| VmError::HostSetup(format!("enable IP forwarding: {e}")))?;

    // Delete-then-append keeps each rule idempotent across restarts: the
    // `-D` is best-effort (the rule may not exist yet) and is ignored.
    let _ = run_cmd(
        "iptables",
        &[
            "-t", "nat", "-D", "POSTROUTING",
            "-s", TAP_SUBNET, "!", "-d", TAP_SUBNET,
            "-j", "MASQUERADE",
        ],
    );
    // NAT: rewrite the source of guest traffic leaving for the internet
    // (anything NOT destined for the VM subnet itself).
    run_cmd(
        "iptables",
        &[
            "-t",
            "nat",
            "-A",
            "POSTROUTING",
            "-s",
            TAP_SUBNET,
            "!",
            "-d",
            TAP_SUBNET,
            "-j",
            "MASQUERADE",
        ],
    )?;

    // Allow forwarded traffic originating from the VM subnet…
    let _ = run_cmd(
        "iptables",
        &["-D", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"],
    );
    run_cmd(
        "iptables",
        &["-A", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"],
    )?;

    // …and return traffic destined for it.
    let _ = run_cmd(
        "iptables",
        &["-D", "FORWARD", "-d", TAP_SUBNET, "-j", "ACCEPT"],
    );
    run_cmd(
        "iptables",
        &["-A", "FORWARD", "-d", TAP_SUBNET, "-j", "ACCEPT"],
    )?;

    eprintln!("host networking: IP forwarding + NAT masquerade for {TAP_SUBNET}");
    Ok(original_ip_forward)
}

/// Remove the iptables rules added by [`setup_tap_host_networking`] and
/// restore the original `ip_forward` sysctl value.
+pub(crate) fn teardown_tap_host_networking(original_ip_forward: &str) { + let _ = run_cmd( + "iptables", + &[ + "-t", + "nat", + "-D", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", + ], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-d", TAP_SUBNET, "-j", "ACCEPT"], + ); + if original_ip_forward != "1" { + let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward); + } + eprintln!( + "host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}" + ); +} + +// ── TCP port forwarding ───────────────────────────────────────────────── + +/// Start a background TCP proxy that forwards `127.0.0.1:{host_port}` +/// to `{guest_ip}:{guest_port}`. +/// +/// Each accepted connection spawns two threads for bidirectional copy. +/// The listener thread runs until the process exits. +pub(crate) fn start_tcp_port_forwarder( + host_port: u16, + guest_ip: &str, + guest_port: u16, +) -> Result<(), VmError> { + use std::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind(("127.0.0.1", host_port)) + .map_err(|e| VmError::HostSetup(format!("bind port forwarder on :{host_port}: {e}")))?; + + let guest_addr = format!("{guest_ip}:{guest_port}"); + eprintln!("port forwarder: 127.0.0.1:{host_port} -> {guest_addr}"); + + std::thread::spawn(move || { + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(_) => continue, + }; + + let addr = guest_addr.clone(); + std::thread::spawn(move || { + if let Ok(remote) = TcpStream::connect(&addr) { + forward_tcp_bidirectional(client, remote); + } + }); + } + }); + + Ok(()) +} + +/// Copy data bidirectionally between two TCP streams until either side closes. 
fn forward_tcp_bidirectional(
    client: std::net::TcpStream,
    remote: std::net::TcpStream,
) {
    // try_clone yields an independent handle over the same socket so one
    // direction can read while the other writes. If cloning fails, both
    // streams are dropped (closing the connection) and we bail out.
    let Ok(mut client_r) = client.try_clone() else {
        return;
    };
    let mut client_w = client;
    let Ok(mut remote_r) = remote.try_clone() else {
        return;
    };
    let mut remote_w = remote;

    // One detached thread per direction; each ends when its source side
    // reaches EOF or errors, dropping its write half and closing that leg.
    std::thread::spawn(move || {
        let _ = std::io::copy(&mut client_r, &mut remote_w);
    });
    std::thread::spawn(move || {
        let _ = std::io::copy(&mut remote_r, &mut client_w);
    });
}

// ── Bidirectional Unix stream bridge ──────────────────────────────────── 

/// Spawn two threads that copy data between two Unix streams.
pub(crate) fn bridge_bidirectional(client: UnixStream, guest: UnixStream) {
    // Same clone-and-split pattern as `forward_tcp_bidirectional`, but for
    // AF_UNIX sockets (used by the vsock exec bridge).
    let Ok(mut client_r) = client.try_clone() else {
        return;
    };
    let mut client_w = client;
    let Ok(mut guest_r) = guest.try_clone() else {
        return;
    };
    let mut guest_w = guest;

    std::thread::spawn(move || {
        let _ = std::io::copy(&mut client_r, &mut guest_w);
    });
    std::thread::spawn(move || {
        let _ = std::io::copy(&mut guest_r, &mut client_w);
    });
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_dns_server_returns_first_non_loopback() {
        let content = "nameserver 10.0.0.1\nnameserver 8.8.8.8\n";
        assert_eq!(parse_dns_server(content), "10.0.0.1");
    }

    #[test]
    fn parse_dns_server_skips_systemd_resolved() {
        let content = "nameserver 127.0.0.53\nnameserver 1.1.1.1\n";
        assert_eq!(parse_dns_server(content), "1.1.1.1");
    }

    #[test]
    fn parse_dns_server_skips_all_loopback_variants() {
        let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 172.16.0.1\n";
        assert_eq!(parse_dns_server(content), "172.16.0.1");
    }

    #[test]
    fn parse_dns_server_falls_back_when_only_loopback() {
        let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\n";
        assert_eq!(parse_dns_server(content), "8.8.8.8");
    }

    #[test]
    fn parse_dns_server_handles_empty_content() {
        assert_eq!(parse_dns_server(""), "8.8.8.8");
    }

#[test] + fn parse_dns_server_ignores_comments_and_other_lines() { + let content = "# Generated by NetworkManager\nsearch example.com\nnameserver 10.1.2.3\n"; + assert_eq!(parse_dns_server(content), "10.1.2.3"); + } + + #[test] + fn shell_escape_empty_string() { + assert_eq!(shell_escape(""), "''"); + } + + #[test] + fn shell_escape_simple_string() { + assert_eq!(shell_escape("hello"), "hello"); + } + + #[test] + fn shell_escape_string_with_single_quotes() { + assert_eq!(shell_escape("it's"), "'it'\\''s'"); + } + + #[test] + fn shell_escape_string_with_spaces() { + assert_eq!(shell_escape("hello world"), "'hello world'"); + } + + #[test] + fn shell_escape_string_with_double_quotes() { + assert_eq!(shell_escape(r#"say "hi""#), r#"'say "hi"'"#); + } + + #[test] + fn shell_escape_string_with_backslash() { + assert_eq!(shell_escape("path\\to"), "'path\\to'"); + } +} diff --git a/crates/openshell-vm/src/backend/qemu.rs b/crates/openshell-vm/src/backend/qemu.rs new file mode 100644 index 000000000..f3fe1f40a --- /dev/null +++ b/crates/openshell-vm/src/backend/qemu.rs @@ -0,0 +1,1021 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! QEMU backend for GPU passthrough VMs (devices without MSI-X support). +//! +//! Uses QEMU's command-line interface with KVM acceleration and VFIO device +//! passthrough. This backend is Linux-only and requires a separate kernel +//! image (`vmlinux`) and `virtiofsd` for the root filesystem. +//! +//! Unlike cloud-hypervisor, QEMU handles VFIO devices that lack MSI-X +//! capability by falling back to legacy interrupt emulation. 
+ +use std::os::unix::net::UnixStream; +use std::os::unix::process::CommandExt; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use super::{ + GUEST_MAC, TAP_GUEST_IP, TAP_HOST_IP, VmBackend, bridge_bidirectional, build_kernel_cmdline, + run_cmd, setup_tap_host_networking, shell_escape, start_tcp_port_forwarder, + teardown_tap_host_networking, wait_for_socket, +}; +use crate::exec::{ + VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, +}; +use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; + +const VSOCK_GUEST_CID: u32 = 3; +const QEMU_BINARY_NAME: &str = "qemu-system-x86_64"; + +/// QEMU hypervisor backend for GPU passthrough (non-MSI-X devices). +pub struct QemuBackend { + qemu_binary: PathBuf, + vmlinux: PathBuf, + virtiofsd: PathBuf, +} + +impl QemuBackend { + /// Create a new QEMU backend, validating required binaries. + pub fn new() -> Result { + let runtime_dir = crate::configured_runtime_dir()?; + + let qemu_binary = { + let bundled = runtime_dir.join(QEMU_BINARY_NAME); + if bundled.is_file() { + bundled + } else { + find_in_path(QEMU_BINARY_NAME).ok_or_else(|| VmError::BinaryNotFound { + path: bundled.display().to_string(), + hint: "QEMU backend requires qemu-system-x86_64. Install QEMU or set OPENSHELL_VM_RUNTIME_DIR".to_string(), + })? + } + }; + + let vmlinux = runtime_dir.join("vmlinux"); + if !vmlinux.is_file() { + return Err(VmError::BinaryNotFound { + path: vmlinux.display().to_string(), + hint: "QEMU backend requires a vmlinux kernel. Run the GPU build pipeline" + .to_string(), + }); + } + + let virtiofsd = runtime_dir.join("virtiofsd"); + if !virtiofsd.is_file() { + return Err(VmError::BinaryNotFound { + path: virtiofsd.display().to_string(), + hint: "QEMU backend requires virtiofsd. Run the GPU build pipeline".to_string(), + }); + } + + // Verify vhost-vsock is available. QEMU's vhost-vsock-pci device + // needs /dev/vhost-vsock (provided by the vhost_vsock kernel module). 
+ // A plain AF_VSOCK socket() can succeed with just the vsock module, + // but connect() will fail with ENODEV if vhost_vsock isn't loaded. + if !Path::new("/dev/vhost-vsock").exists() { + return Err(VmError::HostSetup( + "/dev/vhost-vsock not found.\n\ + QEMU backend requires the vhost_vsock kernel module.\n\ + Fix: sudo modprobe vhost_vsock" + .to_string(), + )); + } + { + let fd = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_STREAM, 0) }; + if fd < 0 { + let err = std::io::Error::last_os_error(); + return Err(VmError::HostSetup(format!( + "AF_VSOCK socket creation failed: {err}\n\ + QEMU backend requires the vhost_vsock kernel module.\n\ + Fix: sudo modprobe vhost_vsock" + ))); + } + unsafe { libc::close(fd) }; + } + + Ok(Self { + qemu_binary, + vmlinux, + virtiofsd, + }) + } +} + +impl VmBackend for QemuBackend { + fn launch(&self, config: &VmConfig) -> Result { + launch_qemu(self, config) + } +} + +/// Search `$PATH` for a binary by name. +fn find_in_path(name: &str) -> Option { + let path_var = std::env::var_os("PATH")?; + for dir in std::env::split_paths(&path_var) { + let candidate = dir.join(name); + if candidate.is_file() { + return Some(candidate); + } + } + None +} + +const TAP_DEVICE_NAME: &str = "vmtap0"; + +/// Create and configure the TAP device before QEMU starts. +/// +/// Unlike cloud-hypervisor (which creates its own TAP via the `net` config), +/// QEMU with `script=no` expects the TAP device to already exist. +fn setup_tap_device() -> Result<(), VmError> { + // Clean up stale TAP device from a previous crashed run. 
+ if Path::new(&format!("/sys/class/net/{TAP_DEVICE_NAME}")).exists() { + eprintln!("TAP device {TAP_DEVICE_NAME} already exists, removing stale device"); + let _ = run_cmd("ip", &["link", "delete", TAP_DEVICE_NAME]); + } + run_cmd("ip", &["tuntap", "add", "dev", TAP_DEVICE_NAME, "mode", "tap"])?; + run_cmd( + "ip", + &[ + "addr", "add", + &format!("{TAP_HOST_IP}/24"), + "dev", TAP_DEVICE_NAME, + ], + )?; + run_cmd("ip", &["link", "set", TAP_DEVICE_NAME, "up"])?; + eprintln!("TAP device {TAP_DEVICE_NAME} created with {TAP_HOST_IP}"); + Ok(()) +} + +/// Remove the TAP device created by [`setup_tap_device`]. +fn teardown_tap_device() { + let _ = run_cmd("ip", &["link", "delete", TAP_DEVICE_NAME]); + eprintln!("TAP device {TAP_DEVICE_NAME} removed"); +} + +// ── Build QEMU command-line arguments ─────────────────────────────────── + +fn build_qemu_args( + backend: &QemuBackend, + config: &VmConfig, + effective_exec_path: &str, + vfio_device: Option<&str>, + virtiofsd_sock: &Path, + state_disk_path: Option<&Path>, + use_tap_net: bool, + guest_cid: u32, + console_log: &Path, +) -> Vec { + let mut args = Vec::new(); + + // Machine, CPU, resources + args.extend([ + "-machine".into(), + "q35,accel=kvm".into(), + "-cpu".into(), + "host".into(), + "-smp".into(), + config.vcpus.to_string(), + "-m".into(), + format!("{}M", config.mem_mib), + ]); + + // Kernel + args.extend([ + "-kernel".into(), + backend.vmlinux.display().to_string(), + ]); + + // Kernel cmdline (shared builder with CHV) + let cmdline = build_kernel_cmdline(config, effective_exec_path, use_tap_net); + args.extend(["-append".into(), cmdline]); + + // virtiofs rootfs + args.extend([ + "-chardev".into(), + format!("socket,id=vfsock,path={}", virtiofsd_sock.display()), + "-device".into(), + "vhost-user-fs-pci,chardev=vfsock,tag=rootfs".into(), + "-object".into(), + format!( + "memory-backend-file,id=mem,size={}M,mem-path=/dev/shm,share=on", + config.mem_mib + ), + "-numa".into(), + "node,memdev=mem".into(), + ]); 
+ + // State disk + if let Some(disk_path) = state_disk_path { + args.extend([ + "-drive".into(), + format!( + "file={},format=raw,if=virtio", + disk_path.display() + ), + ]); + } + + // PCIe root ports — Q35's pcie.0 root bus does not support + // hotplugging. VFIO and vhost-vsock-pci need dedicated root ports + // to initialize correctly under the Q35 PCIe topology. + // virtio-net-pci and vhost-user-fs-pci are QEMU-emulated devices + // that work directly on the root bus without dedicated root ports. + const PCIE_SLOT_VFIO: u8 = 1; + const PCIE_SLOT_VSOCK: u8 = 2; + + // VFIO device passthrough + if let Some(bdf) = vfio_device { + args.extend([ + "-device".into(), + format!("pcie-root-port,id=vfio-rp,chassis={PCIE_SLOT_VFIO},slot={PCIE_SLOT_VFIO}"), + "-device".into(), + format!("vfio-pci,host={bdf},bus=vfio-rp"), + ]); + } + + // vsock + args.extend([ + "-device".into(), + format!("pcie-root-port,id=vsock-rp,chassis={PCIE_SLOT_VSOCK},slot={PCIE_SLOT_VSOCK}"), + "-device".into(), + format!("vhost-vsock-pci,guest-cid={guest_cid},bus=vsock-rp"), + ]); + + // TAP networking + if use_tap_net { + args.extend([ + "-netdev".into(), + "tap,id=net0,ifname=vmtap0,script=no,downscript=no".into(), + "-device".into(), + format!("virtio-net-pci,netdev=net0,mac={GUEST_MAC}"), + ]); + } + + // Console / display — disable monitor explicitly to prevent + // stdin from being interpreted as monitor commands. 
+ args.extend([ + "-serial".into(), + format!("file:{}", console_log.display()), + "-display".into(), + "none".into(), + "-monitor".into(), + "none".into(), + "-no-reboot".into(), + ]); + + args +} + +// ── Launch ────────────────────────────────────────────────────────────── + +#[allow(clippy::similar_names)] +fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result { + let launch_start = Instant::now(); + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + + let sock_dir = PathBuf::from(format!("/tmp/ovm-qemu-{}", std::process::id())); + if let Ok(entries) = std::fs::read_dir("/tmp") { + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if name.starts_with("ovm-qemu-") && entry.path() != sock_dir { + let is_stale = name + .strip_prefix("ovm-qemu-") + .and_then(|pid_str| pid_str.parse::().ok()) + .map(|pid| unsafe { libc::kill(pid, 0) } != 0) + .unwrap_or(true); + if is_stale { + let _ = std::fs::remove_dir_all(entry.path()); + } + } + } + } + std::fs::create_dir_all(&sock_dir).map_err(|e| { + VmError::HostSetup(format!("create socket dir {}: {e}", sock_dir.display())) + })?; + + let virtiofsd_sock_path = sock_dir.join("virtiofsd.sock"); + let console_log = config + .console_output + .clone() + .unwrap_or_else(|| run_dir.join(format!("{rootfs_key}-console.log"))); + + let _ = std::fs::remove_file(&virtiofsd_sock_path); + + // Start virtiofsd + eprintln!("Starting virtiofsd: {}", backend.virtiofsd.display()); + let virtiofsd_log = run_dir.join(format!("{rootfs_key}-virtiofsd.log")); + let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) + .map_err(|e| VmError::Fork(format!("create virtiofsd log: {e}")))?; + + let mut virtiofsd_cmd = std::process::Command::new(&backend.virtiofsd); + virtiofsd_cmd + .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) + .arg(format!("--shared-dir={}", 
config.rootfs.display())) + .arg("--cache=always") + .stdout(std::process::Stdio::null()) + .stderr(virtiofsd_log_file); + #[allow(unsafe_code)] + unsafe { + virtiofsd_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut virtiofsd_child = virtiofsd_cmd.spawn() + .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; + + eprintln!( + "virtiofsd started (pid {}) [{:.1}s]", + virtiofsd_child.id(), + launch_start.elapsed().as_secs_f64() + ); + + wait_for_socket(&virtiofsd_sock_path, "virtiofsd", Duration::from_secs(5))?; + + let use_tap_net = !matches!(config.net, NetBackend::None); + + // Build exec wrapper (same pattern as CHV) + let is_exec_mode = config.is_exec_mode(); + let wrapper_path = config.rootfs.join("tmp/qemu-exec-wrapper.sh"); + let effective_exec_path; + if is_exec_mode { + let args_str = config + .args + .iter() + .map(|a| shell_escape(a)) + .collect::>() + .join(" "); + + let env_str = config + .env + .iter() + .map(|v| format!("export {}", shell_escape(v))) + .collect::>() + .join("\n"); + + let wrapper = format!( + "#!/bin/sh\n\ + mount -t proc proc /proc 2>/dev/null\n\ + mount -t sysfs sysfs /sys 2>/dev/null\n\ + mount -t devtmpfs devtmpfs /dev 2>/dev/null\n\ + {env_str}\n\ + cd {workdir}\n\ + {exec} {args}\n\ + RC=$?\n\ + if command -v poweroff >/dev/null 2>&1; then\n\ + poweroff -f\n\ + elif [ -x /usr/bin/busybox ]; then\n\ + /usr/bin/busybox poweroff -f\n\ + else\n\ + echo o > /proc/sysrq-trigger\n\ + fi\n\ + exit $RC\n", + env_str = env_str, + workdir = shell_escape(&config.workdir), + exec = shell_escape(&config.exec_path), + args = args_str, + ); + + if let Some(parent) = wrapper_path.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| VmError::HostSetup(format!("create wrapper dir: {e}")))?; + } + std::fs::write(&wrapper_path, &wrapper) + .map_err(|e| VmError::HostSetup(format!("write exec wrapper: {e}")))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = 
std::fs::set_permissions(&wrapper_path, std::fs::Permissions::from_mode(0o755)); + } + effective_exec_path = "/tmp/qemu-exec-wrapper.sh".to_string(); + } else { + effective_exec_path = config.exec_path.clone(); + } + + // Build QEMU command line + let state_disk_path = config.state_disk.as_ref().map(|sd| sd.path.as_path()); + let qemu_args = build_qemu_args( + backend, + config, + &effective_exec_path, + config.vfio_device.as_deref(), + &virtiofsd_sock_path, + state_disk_path, + use_tap_net, + VSOCK_GUEST_CID, + &console_log, + ); + + // Create TAP device before QEMU starts (QEMU with script=no expects it). + if use_tap_net { + setup_tap_device()?; + } + + // Spawn QEMU + eprintln!("Starting QEMU: {}", backend.qemu_binary.display()); + let qemu_log = run_dir.join(format!("{rootfs_key}-qemu.log")); + let qemu_log_file = std::fs::File::create(&qemu_log) + .map_err(|e| VmError::Fork(format!("create QEMU log: {e}")))?; + + let mut qemu_cmd = std::process::Command::new(&backend.qemu_binary); + qemu_cmd + .args(&qemu_args) + .stdout(std::process::Stdio::null()) + .stderr(qemu_log_file); + #[allow(unsafe_code)] + unsafe { + qemu_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut qemu_child = qemu_cmd.spawn() + .map_err(|e| VmError::Fork(format!("start QEMU: {e}")))?; + + let qemu_pid = qemu_child.id() as i32; + eprintln!( + "QEMU started (pid {qemu_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + // Set up host-side TAP networking + let mut original_ip_forward: Option = None; + if use_tap_net { + match setup_tap_host_networking() { + Ok(orig) => original_ip_forward = Some(orig), + Err(e) => { + eprintln!("WARNING: host networking setup failed: {e}"); + eprintln!(" The VM may not have internet access."); + } + } + } + + // Start AF_VSOCK exec bridge + let exec_socket = vm_exec_socket_path(&config.rootfs); + start_vsock_exec_bridge_af_vsock( + &exec_socket, + VSOCK_GUEST_CID, + VM_EXEC_VSOCK_PORT, + 
qemu_child.id(), + )?; + + // Write runtime state (vsock_bridge: true — uses AF_VSOCK bridging) + if !config.is_exec_mode() { + if let Err(err) = + write_vm_runtime_state(&config.rootfs, qemu_pid, &console_log, None, true) + { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + + // TCP port forwarding (same pattern as CHV) + if use_tap_net { + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + if parts.len() == 2 { + if let (Ok(hp), Ok(gp)) = (parts[0].parse::(), parts[1].parse::()) { + if let Err(e) = start_tcp_port_forwarder(hp, TAP_GUEST_IP, gp) { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } + } + } + } + } + + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + // Gateway bootstrap and health check + if !config.is_exec_mode() && !config.port_map.is_empty() { + let gateway_port = crate::gateway_host_port(config); + if let Err(e) = crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port) + .and_then(|_| crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)) + { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let 
Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } + } + + eprintln!( + "Ready [{:.1}s total]", + launch_start.elapsed().as_secs_f64() + ); + eprintln!("Press Ctrl+C to stop."); + + // Signal forwarding + crate::CHILD_PID.store(qemu_pid, std::sync::atomic::Ordering::Relaxed); + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + } + + // Wait for QEMU to exit + let status = qemu_child + .wait() + .map_err(|e| VmError::HostSetup(format!("wait for QEMU: {e}")))?; + crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + + // Clean up host networking rules + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + + // Cleanup + if !config.is_exec_mode() { + clear_vm_runtime_state(&config.rootfs); + } + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + eprintln!("virtiofsd stopped"); + + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + if is_exec_mode { + let _ = std::fs::remove_file(&wrapper_path); + } + + let code = status.code().unwrap_or(1); + eprintln!("VM exited with code {code}"); + Ok(code) +} + +// ── AF_VSOCK exec bridge ──────────────────────────────────────────────── + +/// Start a background bridge: exec Unix socket → guest AF_VSOCK. +/// +/// QEMU uses kernel `vhost-vsock-pci` which exposes guest vsock via the +/// kernel's `AF_VSOCK` address family. This is different from +/// cloud-hypervisor's text protocol — here we connect directly to the +/// guest CID and port using raw `AF_VSOCK` sockets. 
+fn start_vsock_exec_bridge_af_vsock( + exec_socket: &Path, + guest_cid: u32, + guest_port: u32, + qemu_pid: u32, +) -> Result<(), VmError> { + use std::os::unix::net::UnixListener; + + if let Some(parent) = exec_socket.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::HostSetup(format!("create exec bridge dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(exec_socket); + + let listener = UnixListener::bind(exec_socket).map_err(|e| { + VmError::HostSetup(format!( + "bind vsock exec bridge {}: {e}", + exec_socket.display() + )) + })?; + + eprintln!( + "vsock exec bridge (AF_VSOCK): {} → CID {} port {}", + exec_socket.display(), + guest_cid, + guest_port, + ); + + std::thread::spawn(move || { + af_vsock_bridge_accept_loop(listener, guest_cid, guest_port, qemu_pid); + }); + + Ok(()) +} + +/// Connect to a guest vsock port via kernel AF_VSOCK. +/// +/// Returns the connected socket wrapped as a `UnixStream`. The `UnixStream` +/// type is used solely for its `Read`/`Write` trait impls which delegate to +/// raw `read()`/`write()` syscalls — address-family-specific methods like +/// `peer_addr()` must not be called on the returned stream. +fn connect_af_vsock(cid: u32, port: u32) -> std::io::Result { + use std::os::unix::io::FromRawFd; + + let fd = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_STREAM, 0) }; + if fd < 0 { + return Err(std::io::Error::last_os_error()); + } + + let addr = libc::sockaddr_vm { + svm_family: libc::AF_VSOCK as u16, + svm_reserved1: 0, + svm_port: port, + svm_cid: cid, + svm_zero: [0; 4], + }; + + let ret = unsafe { + libc::connect( + fd, + std::ptr::from_ref(&addr).cast::(), + size_of::() as libc::socklen_t, + ) + }; + + if ret < 0 { + let err = std::io::Error::last_os_error(); + unsafe { libc::close(fd) }; + return Err(err); + } + + // SAFETY: fd is a valid, connected socket. We wrap it as UnixStream + // purely for Read/Write access used by bridge_bidirectional(). 
+ Ok(unsafe { UnixStream::from_raw_fd(fd) }) +} + +/// Whether a vsock connect error is transient (expected during VM boot). +/// +/// The guest exec agent takes time to start, and the vhost-vsock transport +/// may not be fully initialized when QEMU first launches. These errors +/// resolve on their own once the guest is ready. +fn is_transient_vsock_error(e: &std::io::Error) -> bool { + if e.kind() == std::io::ErrorKind::ConnectionRefused { + return true; + } + match e.raw_os_error() { + Some(code) => { + code == libc::ENODEV // vsock transport not ready + || code == libc::EHOSTUNREACH // guest CID not reachable yet + || code == libc::ECONNRESET // connection reset during startup + || code == libc::ETIMEDOUT // connect timed out + } + None => false, + } +} + +/// Accept loop for the AF_VSOCK bridge background thread. +/// +/// Connection failures during boot are expected — the guest exec agent +/// isn't listening yet. We keep retrying since the bootstrap caller has +/// its own 120s timeout. If the QEMU process exits, we stop immediately +/// rather than retrying against a dead CID for 120s. +fn af_vsock_bridge_accept_loop( + listener: std::os::unix::net::UnixListener, + guest_cid: u32, + port: u32, + qemu_pid: u32, +) { + // Give QEMU time to initialize the vhost-vsock-pci device and register + // the CID with the kernel transport before accepting connections. 
+ std::thread::sleep(Duration::from_secs(2)); + + let mut fatal_failures: u32 = 0; + let mut logged_transient = false; + + for stream in listener.incoming() { + if !is_process_alive(qemu_pid) { + eprintln!("vsock bridge: QEMU (pid {qemu_pid}) exited, stopping bridge"); + return; + } + + let client = match stream { + Ok(s) => s, + Err(e) => { + eprintln!("vsock bridge: accept: {e}"); + continue; + } + }; + + match connect_af_vsock(guest_cid, port) { + Ok(guest) => { + fatal_failures = 0; + bridge_bidirectional(client, guest); + } + Err(e) if is_transient_vsock_error(&e) => { + if !is_process_alive(qemu_pid) { + eprintln!( + "vsock bridge: QEMU (pid {qemu_pid}) exited — \ + check console log for VM boot errors" + ); + return; + } + if !logged_transient { + eprintln!( + "vsock bridge: guest not ready on CID {guest_cid} port {port} ({e}), \ + will keep retrying..." + ); + logged_transient = true; + } + std::thread::sleep(Duration::from_secs(1)); + } + Err(e) => { + fatal_failures += 1; + if fatal_failures <= 2 { + eprintln!("vsock bridge: AF_VSOCK connect failed: {e}"); + } + if fatal_failures >= 5 { + eprintln!("vsock bridge: too many AF_VSOCK failures, stopping bridge"); + return; + } + std::thread::sleep(Duration::from_secs(1)); + } + } + } +} + +fn is_process_alive(pid: u32) -> bool { + unsafe { libc::kill(pid as i32, 0) == 0 } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_backend() -> QemuBackend { + QemuBackend { + qemu_binary: "/usr/bin/qemu-system-x86_64".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + } + } + + fn base_config() -> VmConfig { + VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, 
+ gpu_has_msix: false, + vfio_device: None, + backend: crate::VmBackendChoice::Qemu, + } + } + + #[test] + fn build_qemu_args_basic() { + let backend = test_backend(); + let config = base_config(); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!(args.contains(&"-machine".to_string())); + assert!(args.contains(&"q35,accel=kvm".to_string())); + assert!(args.contains(&"-cpu".to_string())); + assert!(args.contains(&"host".to_string())); + assert!(args.contains(&"-smp".to_string())); + assert!(args.contains(&"4".to_string())); + assert!(args.contains(&"-m".to_string())); + assert!(args.contains(&"8192M".to_string())); + assert!(args.contains(&"-monitor".to_string())); + assert!(args.contains(&"none".to_string())); + assert!(args.contains(&"-no-reboot".to_string())); + assert!(!args.iter().any(|a| a.contains("vfio-pci"))); + assert!(!args.iter().any(|a| a.contains("tap"))); + assert!( + args.iter() + .any(|a| a.contains("pcie-root-port,id=vsock-rp")), + "args should contain PCIe root port for vsock: {args:?}" + ); + assert!( + args.iter() + .any(|a| a.contains("vhost-vsock-pci,guest-cid=3,bus=vsock-rp")), + "args should contain vsock on root port: {args:?}" + ); + } + + #[test] + fn build_qemu_args_with_vfio() { + let backend = test_backend(); + let mut config = base_config(); + config.gpu_enabled = true; + config.vfio_device = Some("0000:41:00.0".into()); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + args.iter() + .any(|a| a.contains("vfio-pci,host=0000:41:00.0,bus=vfio-rp")), + "args should contain VFIO device on root port: {args:?}" + ); + assert!( + args.iter().any(|a| a.contains("pcie-root-port,id=vfio-rp")), + "args should 
contain PCIe root port for VFIO: {args:?}" + ); + } + + #[test] + fn build_qemu_args_with_tap_net() { + let backend = test_backend(); + let mut config = base_config(); + config.net = NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }; + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + true, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + args.iter().any(|a| a.contains("tap,id=net0")), + "args should contain TAP netdev: {args:?}" + ); + assert!( + args.iter() + .any(|a| a.contains("virtio-net-pci,netdev=net0")), + "args should contain virtio-net device: {args:?}" + ); + } + + #[test] + fn build_qemu_args_without_net() { + let backend = test_backend(); + let config = base_config(); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + !args.iter().any(|a| a.contains("tap")), + "args should not contain TAP: {args:?}" + ); + assert!( + !args.iter().any(|a| a.contains("virtio-net")), + "args should not contain virtio-net: {args:?}" + ); + } + + #[test] + fn build_qemu_args_gpu_enabled_cmdline() { + let backend = test_backend(); + let mut config = base_config(); + config.gpu_enabled = true; + config.vfio_device = Some("0000:41:00.0".into()); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + let append_idx = args.iter().position(|a| a == "-append").unwrap(); + let cmdline = &args[append_idx + 1]; + assert!( + cmdline.contains("GPU_ENABLED=true"), + "cmdline should contain GPU_ENABLED=true: {cmdline}" + ); + } + + #[test] + fn transient_vsock_errors_classified_correctly() { + // Kind-based: ConnectionRefused + let refused = 
std::io::Error::from(std::io::ErrorKind::ConnectionRefused); + assert!( + is_transient_vsock_error(&refused), + "ConnectionRefused should be transient" + ); + + // OS-error-based transient codes + let enodev = std::io::Error::from_raw_os_error(libc::ENODEV); + assert!( + is_transient_vsock_error(&enodev), + "ENODEV should be transient" + ); + + let ehostunreach = std::io::Error::from_raw_os_error(libc::EHOSTUNREACH); + assert!( + is_transient_vsock_error(&ehostunreach), + "EHOSTUNREACH should be transient" + ); + + let econnreset = std::io::Error::from_raw_os_error(libc::ECONNRESET); + assert!( + is_transient_vsock_error(&econnreset), + "ECONNRESET should be transient" + ); + + let etimedout = std::io::Error::from_raw_os_error(libc::ETIMEDOUT); + assert!( + is_transient_vsock_error(&etimedout), + "ETIMEDOUT should be transient" + ); + + // Non-transient errors + let eperm = std::io::Error::from_raw_os_error(libc::EPERM); + assert!( + !is_transient_vsock_error(&eperm), + "EPERM should not be transient" + ); + + let eacces = std::io::Error::from_raw_os_error(libc::EACCES); + assert!( + !is_transient_vsock_error(&eacces), + "EACCES should not be transient" + ); + + let other = std::io::Error::new(std::io::ErrorKind::Other, "something else"); + assert!( + !is_transient_vsock_error(&other), + "ErrorKind::Other should not be transient" + ); + } +} diff --git a/crates/openshell-vm/src/embedded.rs b/crates/openshell-vm/src/embedded.rs index 731f34b10..6a4a2d3f6 100644 --- a/crates/openshell-vm/src/embedded.rs +++ b/crates/openshell-vm/src/embedded.rs @@ -26,6 +26,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.5.dylib.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = 
"libkrun.dylib"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.5.dylib"; } @@ -36,6 +37,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.so"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; } @@ -46,6 +48,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.so"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; } @@ -61,6 +64,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = &[]; pub const GVPROXY: &[u8] = &[]; pub const ROOTFS: &[u8] = &[]; + pub const ROOTFS_GPU: &[u8] = &[]; pub const LIBKRUN_NAME: &str = "libkrun"; pub const LIBKRUNFW_NAME: &str = "libkrunfw"; } @@ -232,11 +236,16 @@ pub fn cleanup_old_rootfs() -> Result<(), VmError> { cleanup_old_versions_in_base(&base, ¤t_version_dir) } -/// Check if the rootfs is embedded (non-empty). +/// Check if the base rootfs is embedded (non-empty). pub fn has_embedded_rootfs() -> bool { !resources::ROOTFS.is_empty() } +/// Check if the GPU rootfs is embedded (non-empty). 
+pub fn has_embedded_gpu_rootfs() -> bool { + !resources::ROOTFS_GPU.is_empty() +} + // ── Internal helpers ─────────────────────────────────────────────────────── /// Build a cache key that combines the version string with a short content diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index 1f8ad03fe..ea1fac718 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -51,16 +51,17 @@ pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; /// How to connect to the VM exec agent. /// /// libkrun bridges each guest vsock port to a host Unix socket via -/// `krun_add_vsock_port2`. cloud-hypervisor uses standard vhost-vsock -/// with CID-based addressing — the host connects via `AF_VSOCK` or a -/// vsock-proxy/socat bridge. +/// `krun_add_vsock_port2`. cloud-hypervisor exposes guest vsock through +/// a host-side Unix socket with a text protocol (`CONNECT \n` / +/// `OK \n`), not kernel `AF_VSOCK` or standard `vhost-vsock`. #[derive(Debug, Clone)] pub enum VsockConnectMode { /// Connect via a host Unix socket (libkrun per-port bridging). UnixSocket(PathBuf), /// Connect via a vsock proxy bridge (cloud-hypervisor). - /// The path points to a socat-bridged Unix socket that forwards - /// to guest CID 3, port [`VM_EXEC_VSOCK_PORT`]. + /// The path points to a bridged Unix socket that performs the CHV + /// text-protocol handshake and forwards to guest CID 3, + /// port [`VM_EXEC_VSOCK_PORT`]. 
VsockBridge(PathBuf), } @@ -176,8 +177,10 @@ pub fn write_vm_runtime_state( pub fn clear_vm_runtime_state(rootfs: &Path) { let state_path = vm_state_path(rootfs); + let lock_path = vm_lock_path(rootfs); let socket_path = vm_exec_socket_path(rootfs); let _ = fs::remove_file(state_path); + let _ = fs::remove_file(lock_path); let _ = fs::remove_file(socket_path); } @@ -307,6 +310,13 @@ pub fn reset_runtime_state(rootfs: &Path, gateway_name: &str) -> Result<(), VmEr /// create a fresh database on startup and cluster state will be re-applied from /// the auto-deploy manifests in `server/manifests/`. /// +/// **Limitation — state disk:** When a state disk is configured (common with +/// `--gpu`), the kine DB lives inside the raw disk image, not on the virtiofs +/// rootfs. This host-side check only sees the virtiofs path and cannot detect +/// corruption on the state disk. The init script (`openshell-vm-init.sh`) runs +/// `PRAGMA quick_check` inside the VM where the state disk is mounted, catching +/// corruption that this function misses. +/// /// **Stale bootstrap locks** (a kine application-level issue where a killed k3s /// server leaves a lock row that causes the next instance to hang) are handled /// separately by the init script (`openshell-vm-init.sh`), which runs @@ -380,6 +390,10 @@ fn remove_kine_db_files(db_path: &Path) -> Result<(), VmError> { /// automatically. This provides a reliable guard against two VM processes /// sharing the same rootfs — even if the state file is deleted. /// +/// When the lock file already contains a PID from a previous holder that +/// is no longer alive, a warning is logged and any stale VM state files +/// are cleaned up proactively. +/// /// Returns `Ok(File)` on success. The caller must keep the `File` alive /// for as long as the VM is running. 
pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { @@ -405,14 +419,13 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { if rc != 0 { let err = std::io::Error::last_os_error(); if err.raw_os_error() == Some(libc::EWOULDBLOCK) { - // Another process holds the lock — read its PID for diagnostics. + // Another process holds the flock. Read the PID recorded in + // the file for diagnostics — but verify it's still alive, + // because the file may contain a stale PID from a crashed + // predecessor while a different process now holds the flock. let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); + return Err(stale_lock_error(rootfs, holder_pid, &lock_path)); } return Err(VmError::RuntimeState(format!( "lock rootfs {}: {err}", @@ -420,7 +433,11 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { ))); } - // Lock acquired — write our PID (truncate first, then write). + // Lock acquired — check for stale state from a crashed predecessor. + // Read the previous PID before we overwrite it. + cleanup_stale_state_on_lock_acquire(rootfs, &lock_path); + + // Write our PID (truncate first, then write). // This is informational only; the flock is the real guard. let _ = file.set_len(0); { @@ -431,6 +448,58 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { Ok(file) } +/// Build an appropriate error when flock returns EWOULDBLOCK. +/// +/// If the PID recorded in the lock file is dead, the flock holder is a +/// different (unknown) process — provide enhanced diagnostics so the user +/// isn't misled by a stale PID. 
+fn stale_lock_error(rootfs: &Path, recorded_pid: &str, _lock_path: &Path) -> VmError { + if let Ok(pid) = recorded_pid.parse::() { + if pid > 0 && !process_alive(pid) { + return VmError::RuntimeState(format!( + "rootfs {} is locked, but the recorded holder (pid {pid}) is dead. \ + A different openshell-vm process likely holds the lock. \ + Check for running openshell-vm processes (`ps aux | grep openshell-vm`) \ + and stop them before retrying.", + rootfs.display(), + )); + } + } + VmError::RuntimeState(format!( + "another process (pid {recorded_pid}) is using rootfs {}. \ + Stop the running VM first", + rootfs.display() + )) +} + +/// After successfully acquiring the flock, check whether the lock file +/// contained a PID from a dead process (crash recovery). If so, log a +/// warning and clean up stale VM state/socket files. +fn cleanup_stale_state_on_lock_acquire(rootfs: &Path, lock_path: &Path) { + let prev_contents = fs::read_to_string(lock_path).unwrap_or_default(); + let Ok(prev_pid) = prev_contents.trim().parse::() else { + return; + }; + if prev_pid <= 0 || process_alive(prev_pid) { + return; + } + + eprintln!( + "Warning: cleaning up stale lock from dead process (pid {prev_pid})" + ); + + let state_path = vm_state_path(rootfs); + if let Ok(bytes) = fs::read(&state_path) { + if let Ok(state) = serde_json::from_slice::(&bytes) { + if !process_alive(state.pid) { + eprintln!(" Removing stale VM state (pid {})", state.pid); + let _ = fs::remove_file(&state_path); + let _ = fs::remove_file(vm_exec_socket_path(rootfs)); + } + } + } +} + /// Check whether the rootfs lock file is currently held by another process. 
/// /// Returns `Ok(())` if the lock is free (or can be acquired), and an @@ -453,11 +522,7 @@ fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { if err.raw_os_error() == Some(libc::EWOULDBLOCK) { let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); + return Err(stale_lock_error(rootfs, holder_pid, &lock_path)); } } else { // We acquired the lock — release it immediately since we're only probing. @@ -468,27 +533,16 @@ fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { } pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { - // Primary guard: check the flock. This works even if the state file - // has been deleted, because the kernel holds the lock until the - // owning process exits. + // The flock is the definitive guard: the kernel releases it + // automatically when the owning process exits (even via SIGKILL). + // If this succeeds, no VM process holds the rootfs. check_rootfs_lock_free(rootfs)?; - // Secondary guard: check the state file for any stale state. - match load_vm_runtime_state(Some(rootfs)) { - Ok(state) => Err(VmError::RuntimeState(format!( - "VM is already running (pid {}) with exec socket {}", - state.pid, - state.socket_path.display() - ))), - Err(VmError::RuntimeState(message)) - if message.starts_with("read VM runtime state") - || message.starts_with("VM is not running") => - { - clear_vm_runtime_state(rootfs); - Ok(()) - } - Err(err) => Err(err), - } + // Flock is free — no VM process holds the rootfs lock. Any remaining + // state file is stale (from a killed/crashed VM or PID reuse by an + // unrelated process). Clean it up unconditionally. 
+ clear_vm_runtime_state(rootfs); + Ok(()) } pub fn exec_running_vm(options: VmExecOptions) -> Result { diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 27b0ed843..c4d5ccb31 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -23,7 +23,7 @@ mod health; use std::ffi::CString; use std::path::{Path, PathBuf}; use std::ptr; -use std::time::Instant; +use std::time::{Duration, Instant}; pub use exec::{ VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, VsockConnectMode, acquire_rootfs_lock, @@ -126,6 +126,8 @@ pub enum VmBackendChoice { Libkrun, /// Force the cloud-hypervisor backend (even without GPU/VFIO). CloudHypervisor, + /// Force the QEMU backend (Linux-only, supports VFIO without MSI-X). + Qemu, } /// Networking backend for the microVM. @@ -236,6 +238,11 @@ pub struct VmConfig { /// Whether GPU passthrough is enabled for this VM. pub gpu_enabled: bool, + /// Whether the GPU supports MSI-X (needed for cloud-hypervisor VFIO). + /// When `false` and `Auto` backend is selected with GPU enabled, + /// QEMU is used instead of cloud-hypervisor. + pub gpu_has_msix: bool, + /// VFIO PCI device address for GPU passthrough (e.g. `0000:41:00.0`). /// When set, the cloud-hypervisor backend is used instead of libkrun. pub vfio_device: Option, @@ -245,6 +252,11 @@ pub struct VmConfig { } impl VmConfig { + /// Returns true when the VM runs in exec mode (one-shot command) rather than gateway mode. + pub(crate) fn is_exec_mode(&self) -> bool { + self.exec_path != "/srv/openshell-vm-init.sh" + } + /// Default gateway configuration: boots k3s server inside the VM. 
/// /// Runs `/srv/openshell-vm-init.sh` which mounts essential filesystems, @@ -286,6 +298,7 @@ impl VmConfig { gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"), state_disk: Some(state_disk), gpu_enabled: false, + gpu_has_msix: true, vfio_device: None, backend: VmBackendChoice::Auto, } @@ -893,23 +906,54 @@ pub(crate) fn kill_stale_gvproxy_by_port(port: u16) { fn kill_gvproxy_pid(gvproxy_pid: u32) { let pid_i32 = gvproxy_pid as libc::pid_t; let is_alive = unsafe { libc::kill(pid_i32, 0) } == 0; - if is_alive { - // Verify the process is actually gvproxy before killing. - // Without this check, PID reuse could cause us to kill an - // unrelated process. - if !is_process_named(pid_i32, "gvproxy") { - eprintln!( - "Stale gvproxy pid {gvproxy_pid} is no longer gvproxy (PID reused), skipping kill" - ); + if !is_alive { + return; + } + + if !is_process_named(pid_i32, "gvproxy") { + eprintln!( + "Stale gvproxy pid {gvproxy_pid} is no longer gvproxy (PID reused), skipping kill" + ); + return; + } + + unsafe { + libc::kill(pid_i32, libc::SIGTERM); + } + eprintln!("Killing stale gvproxy process (pid {gvproxy_pid})..."); + + // Wait up to 2 seconds for graceful shutdown, then escalate to SIGKILL. + let deadline = Instant::now() + Duration::from_secs(2); + loop { + std::thread::sleep(Duration::from_millis(50)); + if unsafe { libc::kill(pid_i32, 0) } != 0 { + eprintln!("Stale gvproxy (pid {gvproxy_pid}) terminated"); + std::thread::sleep(Duration::from_millis(100)); return; } - unsafe { - libc::kill(pid_i32, libc::SIGTERM); + if Instant::now() >= deadline { + break; } - eprintln!("Killed stale gvproxy process (pid {gvproxy_pid})"); - // Brief pause for the port to be released. - std::thread::sleep(std::time::Duration::from_millis(200)); } + + eprintln!("gvproxy (pid {gvproxy_pid}) did not exit after SIGTERM, sending SIGKILL"); + unsafe { + libc::kill(pid_i32, libc::SIGKILL); + } + + // Wait for the process to be reaped (up to 2 more seconds). 
+ let kill_deadline = Instant::now() + Duration::from_secs(2); + loop { + std::thread::sleep(Duration::from_millis(50)); + if unsafe { libc::kill(pid_i32, 0) } != 0 { + break; + } + if Instant::now() >= kill_deadline { + eprintln!("warning: gvproxy (pid {gvproxy_pid}) still alive after SIGKILL"); + break; + } + } + std::thread::sleep(Duration::from_millis(100)); } /// Check whether a process with the given PID has the expected name. @@ -1066,12 +1110,13 @@ fn secure_socket_base(subdir: &str) -> Result { dir.display() ))); } - // Verify ownership matches current user. + // Verify ownership matches current user. Root (uid 0) can safely + // use any directory, so skip this check under sudo / as root. #[cfg(unix)] { use std::os::unix::fs::MetadataExt as _; - let uid = unsafe { libc::getuid() }; - if meta.uid() != uid { + let uid = unsafe { libc::geteuid() }; + if uid != 0 && meta.uid() != uid { return Err(VmError::HostSetup(format!( "socket directory {} is owned by uid {} but we are uid {} — refusing to use it", dir.display(), @@ -1126,12 +1171,17 @@ fn validate_vfio_address(addr: &str) -> Result<(), VmError> { } pub(crate) fn gateway_host_port(config: &VmConfig) -> u16 { - config - .port_map - .first() - .and_then(|pm| pm.split(':').next()) - .and_then(|port| port.parse::().ok()) - .unwrap_or(DEFAULT_GATEWAY_PORT) + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + if parts.len() == 2 { + if let Ok(guest) = parts[1].parse::() { + if guest == GUEST_GATEWAY_NODEPORT { + return parts[0].parse::().unwrap_or(DEFAULT_GATEWAY_PORT); + } + } + } + } + DEFAULT_GATEWAY_PORT } pub(crate) fn pick_gvproxy_ssh_port() -> Result { @@ -1199,7 +1249,7 @@ pub fn launch(config: &VmConfig) -> Result { #[cfg(target_os = "linux")] check_kvm_access()?; - if config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.is_exec_mode() { ensure_vm_not_running(&config.rootfs)?; } @@ -1208,7 +1258,7 @@ pub fn launch(config: &VmConfig) -> Result { // is killed (even 
SIGKILL), the OS releases the lock automatically. // This prevents a second launch or rootfs rebuild from corrupting a // running VM's filesystem via virtio-fs. - let _rootfs_lock = if config.exec_path == "/srv/openshell-vm-init.sh" { + let _rootfs_lock = if !config.is_exec_mode() { Some(acquire_rootfs_lock(&config.rootfs)?) } else { None @@ -1220,7 +1270,7 @@ pub fn launch(config: &VmConfig) -> Result { // every normal boot (not --reset, which wipes k3s/server/ entirely). // Must happen after the lock so we know no other VM process is using // the rootfs. - if !config.reset && config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.reset && !config.is_exec_mode() { recover_corrupt_kine_db(&config.rootfs)?; } @@ -1270,44 +1320,88 @@ pub fn launch(config: &VmConfig) -> Result { // ── Dispatch to the appropriate backend ───────────────────────── - let use_chv = match config.backend { - VmBackendChoice::CloudHypervisor => true, - VmBackendChoice::Libkrun => false, - VmBackendChoice::Auto => config.gpu_enabled || config.vfio_device.is_some(), - }; + enum SelectedBackend { + Libkrun, + CloudHypervisor, + Qemu, + } - if use_chv { - #[cfg(not(target_os = "linux"))] - return Err(VmError::HostSetup( - "cloud-hypervisor backend requires Linux with KVM".into(), - )); + let selected = match config.backend { + VmBackendChoice::CloudHypervisor => { + if config.gpu_enabled && !config.gpu_has_msix { + return Err(VmError::HostSetup( + "cloud-hypervisor requires MSI-X for VFIO passthrough, but this GPU \ + lacks MSI-X support. Use --backend auto or --backend qemu." 
+ .into(), + )); + } + SelectedBackend::CloudHypervisor + } + VmBackendChoice::Libkrun => SelectedBackend::Libkrun, + VmBackendChoice::Qemu => SelectedBackend::Qemu, + VmBackendChoice::Auto => { + if config.gpu_enabled { + if config.gpu_has_msix { + SelectedBackend::CloudHypervisor + } else { + SelectedBackend::Qemu + } + } else if config.vfio_device.is_some() { + SelectedBackend::CloudHypervisor + } else { + SelectedBackend::Libkrun + } + } + }; - #[cfg(target_os = "linux")] - { - if let Some(ref addr) = config.vfio_device { - validate_vfio_address(addr)?; + match selected { + SelectedBackend::CloudHypervisor => { + #[cfg(not(target_os = "linux"))] + return Err(VmError::HostSetup( + "cloud-hypervisor backend requires Linux with KVM".into(), + )); + + #[cfg(target_os = "linux")] + { + if let Some(ref addr) = config.vfio_device { + validate_vfio_address(addr)?; + } + let chv_backend = backend::cloud_hypervisor::CloudHypervisorBackend::new()?; + backend::VmBackend::launch(&chv_backend, config) } - let chv_backend = backend::cloud_hypervisor::CloudHypervisorBackend::new()?; - return backend::VmBackend::launch(&chv_backend, config); + } + SelectedBackend::Qemu => { + #[cfg(not(target_os = "linux"))] + return Err(VmError::HostSetup( + "QEMU backend requires Linux with KVM".into(), + )); + + #[cfg(target_os = "linux")] + { + if let Some(ref addr) = config.vfio_device { + validate_vfio_address(addr)?; + } + let qemu_backend = backend::qemu::QemuBackend::new()?; + backend::VmBackend::launch(&qemu_backend, config) + } + } + SelectedBackend::Libkrun => { + let runtime_gvproxy = resolve_runtime_bundle()?; + let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { + VmError::HostSetup(format!( + "runtime bundle file has no parent directory: {}", + runtime_gvproxy.display() + )) + })?; + configure_runtime_loader_env(runtime_dir)?; + + let _ = ffi::libkrun()?; + log_runtime_provenance(runtime_dir); + + let libkrun_backend = backend::libkrun::LibkrunBackend; + 
backend::VmBackend::launch(&libkrun_backend, config) } } - - // libkrun path: resolve the embedded runtime bundle and load libkrun. - // Cloud-hypervisor resolves its own binaries in CloudHypervisorBackend::new(). - let runtime_gvproxy = resolve_runtime_bundle()?; - let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { - VmError::HostSetup(format!( - "runtime bundle file has no parent directory: {}", - runtime_gvproxy.display() - )) - })?; - configure_runtime_loader_env(runtime_dir)?; - - let _ = ffi::libkrun()?; - log_runtime_provenance(runtime_dir); - - let libkrun_backend = backend::libkrun::LibkrunBackend; - backend::VmBackend::launch(&libkrun_backend, config) } // ── Post-boot bootstrap ──────────────────────────────────────────────── @@ -1315,6 +1409,9 @@ pub fn launch(config: &VmConfig) -> Result { /// Default gateway port: host port mapped to the `OpenShell` `NodePort` (30051). const DEFAULT_GATEWAY_PORT: u16 = 30051; +/// The NodePort the OpenShell gateway listens on inside the VM. +pub const GUEST_GATEWAY_NODEPORT: u16 = 30051; + /// Bootstrap the `OpenShell` control plane after k3s is ready. /// /// Two paths: @@ -1364,7 +1461,7 @@ pub(crate) fn bootstrap_gateway( // drift check and the host already has valid certs. If the agent // isn't reachable we skip silently rather than blocking boot for // 30s. - match fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(5)) { + match fetch_pki_over_exec(&exec_socket, Duration::from_secs(5)) { Ok(bundle) => { if let Err(e) = sync_host_certs_if_stale(gateway_name, &bundle) { eprintln!("Warning: cert sync check failed: {e}"); @@ -1391,7 +1488,7 @@ pub(crate) fn bootstrap_gateway( // We poll the exec agent with `cat ` for each PEM file until they // exist, retrying to handle the window between VM boot and PKI generation. 
eprintln!("Waiting for VM to generate PKI..."); - let pki_bundle = fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(120)) + let pki_bundle = fetch_pki_over_exec(&exec_socket, Duration::from_secs(120)) .map_err(|e| VmError::Bootstrap(format!("VM did not produce PKI within 120s: {e}")))?; eprintln!("PKI ready — storing client certs on host..."); @@ -1432,7 +1529,7 @@ const PKI_FILES: &[(&str, &str)] = &[ /// and PKI generation completing. fn fetch_pki_over_exec( exec_socket: &Path, - timeout: std::time::Duration, + timeout: Duration, ) -> Result { let deadline = Instant::now() + timeout; @@ -1440,7 +1537,7 @@ fn fetch_pki_over_exec( match try_read_pki_files(exec_socket) { Ok(bundle) => return Ok(bundle), Err(_) if Instant::now() < deadline => { - std::thread::sleep(std::time::Duration::from_millis(500)); + std::thread::sleep(Duration::from_millis(500)); } Err(e) => { return Err(VmError::Bootstrap(format!( @@ -1710,4 +1807,107 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + + #[test] + fn auto_selects_qemu_when_gpu_no_msix() { + enum SelectedBackend { + Libkrun, + CloudHypervisor, + Qemu, + } + + let select = |backend: VmBackendChoice, gpu_enabled: bool, gpu_has_msix: bool| { + match backend { + VmBackendChoice::CloudHypervisor => SelectedBackend::CloudHypervisor, + VmBackendChoice::Libkrun => SelectedBackend::Libkrun, + VmBackendChoice::Qemu => SelectedBackend::Qemu, + VmBackendChoice::Auto => { + if gpu_enabled { + if gpu_has_msix { + SelectedBackend::CloudHypervisor + } else { + SelectedBackend::Qemu + } + } else { + SelectedBackend::Libkrun + } + } + } + }; + + assert!(matches!( + select(VmBackendChoice::Auto, true, false), + SelectedBackend::Qemu + )); + assert!(matches!( + select(VmBackendChoice::Auto, true, true), + SelectedBackend::CloudHypervisor + )); + assert!(matches!( + select(VmBackendChoice::Auto, false, true), + SelectedBackend::Libkrun + )); + assert!(matches!( + select(VmBackendChoice::Auto, false, false), + 
SelectedBackend::Libkrun + )); + assert!(matches!( + select(VmBackendChoice::Qemu, false, true), + SelectedBackend::Qemu + )); + } + + fn config_with_port_map(port_map: Vec) -> VmConfig { + VmConfig { + rootfs: PathBuf::from("/tmp/fake-rootfs"), + vcpus: 1, + mem_mib: 512, + exec_path: "/bin/true".to_string(), + args: vec![], + env: vec![], + workdir: "/".to_string(), + port_map, + vsock_ports: vec![], + log_level: 0, + console_output: None, + net: NetBackend::Tsi, + reset: false, + gateway_name: "test".to_string(), + state_disk: None, + gpu_enabled: false, + gpu_has_msix: false, + vfio_device: None, + backend: VmBackendChoice::Auto, + } + } + + #[test] + fn gateway_host_port_default_mapping() { + let cfg = config_with_port_map(vec!["30051:30051".to_string()]); + assert_eq!(gateway_host_port(&cfg), 30051); + } + + #[test] + fn gateway_host_port_no_gateway_mapping_returns_default() { + let cfg = config_with_port_map(vec![ + "6443:6443".to_string(), + "8080:8080".to_string(), + ]); + assert_eq!(gateway_host_port(&cfg), DEFAULT_GATEWAY_PORT); + } + + #[test] + fn gateway_host_port_finds_remapped_gateway() { + let cfg = config_with_port_map(vec![ + "6443:6443".to_string(), + "9999:30051".to_string(), + ]); + assert_eq!(gateway_host_port(&cfg), 9999); + } + + #[test] + fn gateway_host_port_empty_port_map() { + let cfg = config_with_port_map(vec![]); + assert_eq!(gateway_host_port(&cfg), DEFAULT_GATEWAY_PORT); + } } diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index 4d201cbe1..cafc18763 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -17,8 +17,9 @@ //! codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/openshell-vm //! 
``` -use std::io::IsTerminal; +use std::io::{BufRead, IsTerminal}; use std::path::PathBuf; +use std::time::Duration; use clap::{Parser, Subcommand, ValueHint}; @@ -98,8 +99,9 @@ struct Cli { #[arg(long, num_args = 0..=1, default_missing_value = "auto")] gpu: Option, - /// Hypervisor backend: "auto" (default), "libkrun", or "cloud-hypervisor". - /// Auto selects cloud-hypervisor when --gpu is set, libkrun otherwise. + /// Hypervisor backend: "auto" (default), "libkrun", "cloud-hypervisor", or "qemu". + /// Auto selects cloud-hypervisor when --gpu is set (with MSI-X), qemu + /// when --gpu is set without MSI-X, and libkrun otherwise. #[arg(long, default_value = "auto")] backend: String, } @@ -168,6 +170,19 @@ fn main() { } } + #[cfg(target_os = "linux")] + { + #[allow(unsafe_code)] + let ret = unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) }; + if ret != 0 { + eprintln!( + "warning: prctl(PR_SET_PDEATHSIG) failed: {} — \ + signal propagation through sudo may not work", + std::io::Error::last_os_error() + ); + } + } + tracing_subscriber::fmt::init(); let cli = Cli::parse(); @@ -185,6 +200,102 @@ fn main() { } } +/// RAII guard that restarts the display manager when dropped. +/// +/// Created when the user confirms stopping the display manager for GPU +/// passthrough. On drop (normal exit, error, or panic), restarts the +/// service so the user's graphical session is restored. +struct DisplayManagerGuard; + +impl DisplayManagerGuard { + fn stop_display_manager() -> Result> { + eprintln!("Stopping display-manager..."); + let status = std::process::Command::new("systemctl") + .args(["stop", "display-manager"]) + .status()?; + if !status.success() { + return Err(format!( + "failed to stop display-manager (exit {})", + status.code().unwrap_or(-1) + ) + .into()); + } + eprintln!("display-manager stopped"); + // Give Xorg time to release GPU device handles. 
+ std::thread::sleep(Duration::from_secs(2)); + Ok(Self) + } +} + +impl Drop for DisplayManagerGuard { + fn drop(&mut self) { + eprintln!("Restarting display-manager..."); + match std::process::Command::new("systemctl") + .args(["start", "display-manager"]) + .status() + { + Ok(s) if s.success() => eprintln!("display-manager restarted"), + Ok(s) => eprintln!( + "warning: display-manager restart failed (exit {})", + s.code().unwrap_or(-1) + ), + Err(e) => eprintln!("warning: could not restart display-manager: {e}"), + } + } +} + +/// Prompt the user to stop the display manager for GPU passthrough. +/// +/// Returns `true` if the user confirms. Always returns `false` when stdin +/// is not a terminal (non-interactive mode). +fn prompt_display_manager_stop(info: &openshell_vfio::DisplayBlockerInfo) -> bool { + if !std::io::stdin().is_terminal() { + return false; + } + + eprintln!(); + eprintln!( + "WARNING: GPU {} is in use by the display manager.", + info.pci_addr + ); + if !info.display_processes.is_empty() { + let procs: Vec = info + .display_processes + .iter() + .map(|(pid, comm)| format!("{comm} (PID {pid})")) + .collect(); + eprintln!(" Display server processes: {}", procs.join(", ")); + } + if info.has_active_outputs { + eprintln!(" Active display outputs are connected to this GPU."); + } + eprintln!(); + eprintln!("Stopping the display manager will terminate your graphical session."); + eprintln!("You will lose access to any open GUI applications."); + if !info.other_processes.is_empty() { + let procs: Vec = info + .other_processes + .iter() + .map(|(pid, comm)| format!("{comm} (PID {pid})")) + .collect(); + eprintln!(); + eprintln!( + "Other non-display processes are also using the GPU: {}", + procs.join(", ") + ); + eprintln!("These will also lose GPU access."); + } + eprintln!(); + eprintln!("The display manager will be restarted automatically when the VM exits."); + eprint!("Stop display-manager and proceed with GPU passthrough? 
[y/N] "); + + let mut input = String::new(); + if std::io::stdin().lock().read_line(&mut input).is_err() { + return false; + } + matches!(input.trim().to_lowercase().as_str(), "y" | "yes") +} + fn run(cli: Cli) -> Result> { if let Some(GatewayCommand::PrepareRootfs { force }) = &cli.command { let rootfs = openshell_vm::prepare_rootfs(cli.rootfs.clone(), &cli.name, *force)?; @@ -247,35 +358,79 @@ fn run(cli: Cli) -> Result> { let gateway_name = openshell_vm::gateway_name(&cli.name)?; - let (gpu_enabled, vfio_device, _gpu_guard) = match cli.gpu { + // Check if the display manager is blocking GPU passthrough and offer + // to stop it interactively. The guard restarts display-manager on exit. + let _display_manager_guard: Option = if cli.gpu.is_some() { + let requested_bdf = match cli.gpu.as_deref() { + Some(addr) if addr != "auto" => Some(addr), + _ => None, + }; + + if let Some(blocker) = openshell_vfio::detect_display_blocker(requested_bdf) { + if prompt_display_manager_stop(&blocker) { + Some(DisplayManagerGuard::stop_display_manager()?) 
+ } else { + return Err(format!( + "GPU passthrough aborted: GPU {} is in use by the display manager.\n\ + To proceed, stop it manually before launching the VM:\n \ + sudo systemctl stop display-manager", + blocker.pci_addr + ) + .into()); + } + } else { + None + } + } else { + None + }; + + let (gpu_enabled, vfio_device, gpu_has_msix, _gpu_guard) = match cli.gpu { Some(ref addr) if addr != "auto" => { let state = openshell_vfio::prepare_gpu_for_passthrough(Some(addr))?; let bdf = state.pci_addr.clone(); + let has_msix = state.has_msix; ( true, Some(bdf), + has_msix, Some(openshell_vfio::GpuBindGuard::new(state)), ) } Some(_) => { let state = openshell_vfio::prepare_gpu_for_passthrough(None)?; let bdf = state.pci_addr.clone(); + let has_msix = state.has_msix; ( true, Some(bdf), + has_msix, Some(openshell_vfio::GpuBindGuard::new(state)), ) } - None => (false, None, None), + None => (false, None, true, None), }; + if let Some(ref guard) = _gpu_guard { + if let Some(state) = guard.state() { + if state.did_bind { + eprintln!( + "\nGPU recovery: if this process is force-killed (kill -9), \ + restore your GPU with:\n{}", + state.recovery_commands() + ); + } + } + } + let backend_choice = match cli.backend.as_str() { "cloud-hypervisor" | "chv" => openshell_vm::VmBackendChoice::CloudHypervisor, + "qemu" => openshell_vm::VmBackendChoice::Qemu, "libkrun" => { if gpu_enabled { return Err( "--backend libkrun is incompatible with --gpu (libkrun does not support \ - VFIO passthrough). Use --backend auto or --backend cloud-hypervisor." + VFIO passthrough). Use --backend auto, --backend cloud-hypervisor, or --backend qemu." 
.into(), ); } @@ -284,7 +439,7 @@ fn run(cli: Cli) -> Result> { "auto" => openshell_vm::VmBackendChoice::Auto, other => { return Err(format!( - "unknown --backend: {other} (expected: auto, libkrun, cloud-hypervisor)" + "unknown --backend: {other} (expected: auto, libkrun, cloud-hypervisor, qemu)" ) .into()); } @@ -308,6 +463,7 @@ fn run(cli: Cli) -> Result> { gateway_name, state_disk: None, gpu_enabled, + gpu_has_msix, vfio_device, backend: backend_choice, } @@ -315,6 +471,16 @@ fn run(cli: Cli) -> Result> { let mut c = openshell_vm::VmConfig::gateway(rootfs); if !cli.port.is_empty() { c.port_map = cli.port; + let has_gateway = c.port_map.iter().any(|pm| { + pm.split(':').nth(1).and_then(|p| p.parse::().ok()) + == Some(openshell_vm::GUEST_GATEWAY_NODEPORT) + }); + if !has_gateway { + eprintln!( + "warning: no port mapping targets guest port 30051 (gateway NodePort); \ + health check will use default port 30051" + ); + } } if let Some(v) = cli.vcpus { c.vcpus = v; @@ -326,6 +492,7 @@ fn run(cli: Cli) -> Result> { c.reset = cli.reset; c.gateway_name = gateway_name; c.gpu_enabled = gpu_enabled; + c.gpu_has_msix = gpu_has_msix; c.vfio_device = vfio_device; c.backend = backend_choice; if state_disk_disabled() { diff --git a/crates/openshell-vm/tests/vm_boot_smoke.rs b/crates/openshell-vm/tests/vm_boot_smoke.rs index ffdb16595..876c2a5b4 100644 --- a/crates/openshell-vm/tests/vm_boot_smoke.rs +++ b/crates/openshell-vm/tests/vm_boot_smoke.rs @@ -1,23 +1,24 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Non-GPU cloud-hypervisor boot smoke test. +//! Non-GPU boot smoke tests for cloud-hypervisor and QEMU backends. //! -//! Boots a cloud-hypervisor VM **without** VFIO/GPU passthrough and verifies -//! the kernel boots and init runs. This catches backend regressions on regular -//! CI runners that lack GPU hardware. +//! 
Boots a VM **without** VFIO/GPU passthrough and verifies the kernel boots +//! and init runs. This catches backend regressions on regular CI runners +//! that lack GPU hardware. //! -//! Gated on `OPENSHELL_VM_BACKEND=cloud-hypervisor` — skipped when the env -//! var is absent or set to a different backend. +//! Gated on `OPENSHELL_VM_BACKEND` — set to `cloud-hypervisor` or `qemu` to +//! run the corresponding tests. Skipped when the env var is absent. //! -//! Requires the VM runtime bundle (cloud-hypervisor, vmlinux, virtiofsd, -//! rootfs) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run +//! Requires the VM runtime bundle (vmlinux, virtiofsd, rootfs, and the +//! backend binary) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run //! `mise run vm:bundle-runtime` first. //! //! Run explicitly: //! //! ```sh //! OPENSHELL_VM_BACKEND=cloud-hypervisor cargo test -p openshell-vm --test vm_boot_smoke +//! OPENSHELL_VM_BACKEND=qemu cargo test -p openshell-vm --test vm_boot_smoke //! 
``` #![allow(unsafe_code)] @@ -146,6 +147,101 @@ fn cloud_hypervisor_boots_without_gpu() { ); } +fn skip_unless_qemu() -> bool { + if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("qemu") { + eprintln!("OPENSHELL_VM_BACKEND != qemu — skipping"); + return true; + } + false +} + +#[test] +fn qemu_exec_exits_cleanly() { + if skip_unless_qemu() { + return; + } + require_bundle(); + + let mut child = Command::new(GATEWAY) + .args([ + "--backend", + "qemu", + "--net", + "none", + "--exec", + "/bin/true", + ]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let timeout = Duration::from_secs(30); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + assert!( + status.success(), + "qemu --exec /bin/true exited with {status}" + ); + return; + } + Ok(None) => { + if start.elapsed() > timeout { + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGKILL) }; + let _ = child.wait(); + panic!("QEMU VM did not exit within {timeout:?}"); + } + std::thread::sleep(Duration::from_millis(500)); + } + Err(e) => panic!("error waiting for openshell-vm: {e}"), + } + } +} + +#[test] +fn qemu_boots_without_gpu() { + if skip_unless_qemu() { + return; + } + require_bundle(); + + if !nix_is_root() { + eprintln!("skipping full gateway boot — requires root for TAP networking"); + return; + } + + let mut child = Command::new(GATEWAY) + .args(["--backend", "qemu"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let addr: std::net::SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = std::time::Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + let _ = unsafe { 
libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "QEMU VM service on port 30051 not reachable within {timeout:?}" + ); +} + fn nix_is_root() -> bool { unsafe { libc::geteuid() == 0 } } diff --git a/tasks/scripts/vm/build-cloud-hypervisor.sh b/tasks/scripts/vm/build-gpu-deps.sh similarity index 73% rename from tasks/scripts/vm/build-cloud-hypervisor.sh rename to tasks/scripts/vm/build-gpu-deps.sh index af0c913b1..db109e558 100755 --- a/tasks/scripts/vm/build-cloud-hypervisor.sh +++ b/tasks/scripts/vm/build-gpu-deps.sh @@ -2,14 +2,24 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Download pre-built cloud-hypervisor and virtiofsd binaries for GPU passthrough. +# Build GPU passthrough dependencies shared by CHV and QEMU backends. # -# These are only needed on Linux for VFIO GPU passthrough via the -# cloud-hypervisor backend. The binaries are downloaded from their -# respective GitHub release pages. +# Downloads pre-built cloud-hypervisor and builds virtiofsd from source. +# These are only needed on Linux for VFIO GPU passthrough. +# +# Artifacts produced: +# cloud-hypervisor — CHV backend binary (not needed by QEMU) +# virtiofsd — shared by both CHV and QEMU backends +# +# The vmlinux kernel (shared by CHV and QEMU) is extracted separately +# by build-libkrun.sh during the kernel build step. +# +# QEMU's own binary (qemu-system-x86_64) must be installed on the host +# separately — it is not built or downloaded by this script. +# Run `mise run vm:qemu-check` to validate QEMU prerequisites. # # Usage: -# ./build-cloud-hypervisor.sh [--output-dir ] +# ./build-gpu-deps.sh [--output-dir ] set -euo pipefail @@ -64,6 +74,9 @@ CARGO_CMD="cargo" if command -v mise &>/dev/null; then CARGO_CMD="mise exec -- cargo" fi +# Prevent external CARGO_TARGET_DIR from redirecting build output away from +# the local temp directory (e.g. 
Cursor sandbox sets this globally). +unset CARGO_TARGET_DIR $CARGO_CMD build --release --manifest-path "${VIRTIOFSD_SRC}/Cargo.toml" cp "${VIRTIOFSD_SRC}/target/release/virtiofsd" "${OUTPUT_DIR}/virtiofsd" chmod +x "${OUTPUT_DIR}/virtiofsd" diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 621332366..cbd505f1c 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -210,9 +210,25 @@ if [ -f openshell.kconfig ]; then # Re-run olddefconfig to fill in any new symbols introduced by the fragment. make -C "${KERNEL_SOURCES}" ARCH="${KARCH}" olddefconfig + # Force-enable hidden Kconfig bools required by out-of-tree NVIDIA modules. + # CONFIG_MMU_NOTIFIER is a hidden bool (no prompt) that can only be + # activated via "select" from another in-tree option. olddefconfig and + # syncconfig both strip it if nothing selects it. NVIDIA UVM needs it for + # GPU memory management. We patch the DRM Kconfig (already enabled as + # CONFIG_DRM=y) to select MMU_NOTIFIER, then re-run olddefconfig so the + # dependency chain (INTERVAL_TREE) is resolved properly. + if ! grep -q "select MMU_NOTIFIER" "${KERNEL_SOURCES}/drivers/gpu/drm/Kconfig"; then + sed -i '/^menuconfig DRM$/,/^[[:space:]]*select VIDEO/ { + /^[[:space:]]*select VIDEO/a\ +\tselect MMU_NOTIFIER + }' "${KERNEL_SOURCES}/drivers/gpu/drm/Kconfig" + echo " Patched DRM Kconfig to select MMU_NOTIFIER" + fi + make -C "${KERNEL_SOURCES}" ARCH="${KARCH}" olddefconfig + # Verify that the key options were actually applied. 
all_ok=true - for opt in CONFIG_BRIDGE CONFIG_NETFILTER CONFIG_NF_NAT; do + for opt in CONFIG_BRIDGE CONFIG_NETFILTER CONFIG_NF_NAT CONFIG_X86_PAT CONFIG_MMU_NOTIFIER CONFIG_FW_LOADER; do val="$(grep "^${opt}=" "${KERNEL_SOURCES}/.config" 2>/dev/null || true)" if [ -n "$val" ]; then echo " ${opt}: ${val#*=}" @@ -251,6 +267,13 @@ else echo " Warning: vmlinux not found in kernel build tree (GPU passthrough will not be available)" >&2 fi +# Export kernel release string for downstream scripts (nvidia modules, rootfs). +# Uses kernelrelease (includes CONFIG_LOCALVERSION) so that module vermagic, +# rootfs module path, and the kernel's uname -r all agree. +KERNEL_RELEASE="$(make -s -C "${KERNEL_SOURCES}" kernelrelease)" +echo "${KERNEL_RELEASE}" > "${OUTPUT_DIR}/kernel-version.txt" +echo " Exported kernel version: ${KERNEL_RELEASE}" + cd "$BUILD_DIR" # ── Build libkrun (VMM) ───────────────────────────────────────────────── diff --git a/tasks/scripts/vm/build-nvidia-modules.sh b/tasks/scripts/vm/build-nvidia-modules.sh new file mode 100755 index 000000000..064c4bb0c --- /dev/null +++ b/tasks/scripts/vm/build-nvidia-modules.sh @@ -0,0 +1,182 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build NVIDIA open kernel modules against the VM kernel source tree. +# +# Clones the NVIDIA open-gpu-kernel-modules repo at a pinned driver tag +# and compiles the kernel modules against the kernel built by +# build-libkrun.sh. The resulting .ko files are placed in the output +# directory for injection into the GPU rootfs by build-rootfs.sh. 
+# +# Prerequisites: +# - Kernel source tree built by build-libkrun.sh +# (target/libkrun-build/libkrunfw/linux-/) +# - Build tools: make, gcc +# +# Usage: +# ./build-nvidia-modules.sh [--output-dir ] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/_lib.sh" +ROOT="$(vm_lib_root)" + +source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true + +NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" + +BUILD_DIR="${ROOT}/target/libkrun-build" +OUTPUT_DIR="${BUILD_DIR}/nvidia-modules" + +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + *) echo "Unknown argument: $1" >&2; exit 1 ;; + esac +done + +if [ "$(uname -s)" != "Linux" ]; then + echo "Error: NVIDIA GPU module build is Linux-only" >&2 + exit 1 +fi + +HOST_ARCH="$(uname -m)" +if [ "$HOST_ARCH" != "x86_64" ]; then + echo "Error: NVIDIA GPU passthrough is only supported on x86_64 (got: ${HOST_ARCH})" >&2 + exit 1 +fi + +# ── Locate the kernel source tree ──────────────────────────────────────── + +LIBKRUNFW_DIR="${BUILD_DIR}/libkrunfw" +if [ ! -f "${LIBKRUNFW_DIR}/Makefile" ]; then + echo "ERROR: libkrunfw not found at ${LIBKRUNFW_DIR}" >&2 + echo " The GPU module build requires the kernel source tree." >&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 +fi + +KERNEL_DIR_NAME="$(grep '^KERNEL_VERSION' "${LIBKRUNFW_DIR}/Makefile" | head -1 | awk '{print $3}')" +KERNEL_SOURCES="${LIBKRUNFW_DIR}/${KERNEL_DIR_NAME}" + +if [ ! -f "${KERNEL_SOURCES}/.config" ]; then + echo "ERROR: Kernel source tree not found at ${KERNEL_SOURCES}" >&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 +fi + +if [ ! -f "${KERNEL_SOURCES}/Module.symvers" ]; then + echo "ERROR: Kernel tree at ${KERNEL_SOURCES} is missing Module.symvers." >&2 + echo " The kernel must have been fully built." 
>&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 +fi + +# Use kernelrelease to get the full version string (includes CONFIG_LOCALVERSION). +KERNEL_VERSION="$(make -s -C "${KERNEL_SOURCES}" kernelrelease)" +echo "==> Building NVIDIA ${NVIDIA_DRIVER_VERSION} kernel modules for kernel ${KERNEL_VERSION}" +echo " Kernel source: ${KERNEL_SOURCES}" +echo " Output: ${OUTPUT_DIR}" +echo "" + +# ── Prepare kernel tree for out-of-tree module builds ──────────────────── + +echo "==> Preparing kernel tree for external module builds..." +make -C "${KERNEL_SOURCES}" modules_prepare -j"$(nproc)" + +# ── Clone or reuse NVIDIA open-gpu-kernel-modules ──────────────────────── + +NVIDIA_DRIVER_TAG="${NVIDIA_DRIVER_TAG:-}" +if [ -z "${NVIDIA_DRIVER_TAG}" ]; then + echo "ERROR: NVIDIA_DRIVER_TAG not set in pins.env or environment." >&2 + echo " This must be the exact driver version tag matching the" >&2 + echo " nvidia-headless-${NVIDIA_DRIVER_VERSION}-open APT package." >&2 + echo " Find it: apt-cache show nvidia-headless-${NVIDIA_DRIVER_VERSION}-open | grep Version" >&2 + echo " Example: NVIDIA_DRIVER_TAG=570.86.16" >&2 + exit 1 +fi + +NVIDIA_SRC="${BUILD_DIR}/open-gpu-kernel-modules" + +if [ -d "${NVIDIA_SRC}" ]; then + EXISTING_TAG="$(git -C "${NVIDIA_SRC}" describe --tags --exact-match HEAD 2>/dev/null || true)" + if [ "${EXISTING_TAG}" = "${NVIDIA_DRIVER_TAG}" ]; then + echo "==> Using cached NVIDIA source (tag ${NVIDIA_DRIVER_TAG})" + else + echo "==> NVIDIA source tag mismatch (have: ${EXISTING_TAG:-unknown}, want: ${NVIDIA_DRIVER_TAG}), re-cloning..." + rm -rf "${NVIDIA_SRC}" + fi +fi + +if [ ! -d "${NVIDIA_SRC}" ]; then + echo "==> Cloning NVIDIA open-gpu-kernel-modules (tag ${NVIDIA_DRIVER_TAG})..." 
+ git clone --depth 1 --branch "${NVIDIA_DRIVER_TAG}" \ + https://github.com/NVIDIA/open-gpu-kernel-modules.git "${NVIDIA_SRC}" +fi + +# ── Build the kernel modules ───────────────────────────────────────────── + +echo "" +echo "==> Compiling NVIDIA kernel modules (this may take 2-5 minutes)..." +make -C "${NVIDIA_SRC}" -j"$(nproc)" modules \ + SYSSRC="${KERNEL_SOURCES}" \ + KERNEL_UNAME="${KERNEL_VERSION}" + +# ── Collect built modules ──────────────────────────────────────────────── + +mkdir -p "${OUTPUT_DIR}" + +# The NVIDIA kbuild produces modules at deterministic paths under kernel-open/. +declare -A MODULE_PATHS=( + [nvidia.ko]="kernel-open/nvidia.ko" + [nvidia-uvm.ko]="kernel-open/nvidia-uvm.ko" + [nvidia-modeset.ko]="kernel-open/nvidia-modeset.ko" + [nvidia-drm.ko]="kernel-open/nvidia-drm.ko" + [nvidia-peermem.ko]="kernel-open/nvidia-peermem.ko" +) + +EXPECTED_MODULES=(nvidia.ko nvidia-uvm.ko nvidia-modeset.ko nvidia-drm.ko nvidia-peermem.ko) + +for mod in "${EXPECTED_MODULES[@]}"; do + src_path="${NVIDIA_SRC}/${MODULE_PATHS[$mod]}" + if [ -f "$src_path" ]; then + cp "$src_path" "${OUTPUT_DIR}/" + echo " Built: $mod ($(du -h "$src_path" | cut -f1))" + fi +done + +# Normalize permissions. +chmod 644 "${OUTPUT_DIR}"/*.ko 2>/dev/null || true + +# nvidia-peermem.ko is optional (GPUDirect RDMA); the other four are required. +REQUIRED_MODULES=(nvidia.ko nvidia-uvm.ko nvidia-modeset.ko nvidia-drm.ko) +for mod in "${REQUIRED_MODULES[@]}"; do + if [ ! -f "${OUTPUT_DIR}/${mod}" ]; then + echo "ERROR: Required module ${mod} was not produced by the build." >&2 + echo " Check build output above for compilation errors." >&2 + exit 1 + fi +done + +echo "" +echo "==> NVIDIA modules ready at ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/"*.ko + +# Verify module vermagic matches the kernel. +echo "" +echo "==> Verifying module compatibility..." 
+if command -v modinfo &>/dev/null; then + VERMAGIC="$(modinfo -F vermagic "${OUTPUT_DIR}/nvidia.ko" 2>/dev/null || true)" + if [ -n "$VERMAGIC" ]; then + echo " vermagic: ${VERMAGIC}" + if echo "$VERMAGIC" | grep -q "^${KERNEL_VERSION} "; then + echo " OK: modules match kernel ${KERNEL_VERSION}" + else + echo " ERROR: vermagic does not start with ${KERNEL_VERSION}" >&2 + echo " Modules will fail to load in the VM." >&2 + exit 1 + fi + fi +fi diff --git a/tasks/scripts/vm/compress-vm-runtime.sh b/tasks/scripts/vm/compress-vm-runtime.sh index 82a7c4b8f..3c0240ffb 100755 --- a/tasks/scripts/vm/compress-vm-runtime.sh +++ b/tasks/scripts/vm/compress-vm-runtime.sh @@ -96,6 +96,18 @@ if [ -z "${VM_RUNTIME_TARBALL:-}" ] && _check_compressed_artifacts "$OUTPUT_DIR" zstd -d "$f" -o "${WORK_DIR}/${name}" -f -q chmod 0755 "${WORK_DIR}/${name}" done + # GPU passthrough binaries live in libkrun-build but are not part of the + # core compressed set. Copy them into WORK_DIR so bundle-vm-runtime.sh + # stages them alongside the core libraries. 
+ _BUILD_DIR="${ROOT}/target/libkrun-build" + for gpu_bin in vmlinux cloud-hypervisor virtiofsd; do + if [ -f "${_BUILD_DIR}/${gpu_bin}" ]; then + cp "${_BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" + chmod 0755 "${WORK_DIR}/${gpu_bin}" + echo " Included GPU binary: ${gpu_bin}" + fi + done + echo " Decompressed files:" ls -lah "$WORK_DIR" @@ -260,6 +272,14 @@ case "$(uname -s)-$(uname -m)" in "https://github.com/containers/gvisor-tap-vsock/releases/download/${GVPROXY_VERSION}/gvproxy-linux-${GVPROXY_ARCH}" chmod +x "$WORK_DIR/gvproxy" fi + + # GPU passthrough binaries (optional — included when present in libkrun-build) + for gpu_bin in vmlinux cloud-hypervisor virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" + echo " Included GPU binary: ${gpu_bin}" + fi + done ;; *) diff --git a/tasks/scripts/vm/qemu-check.sh b/tasks/scripts/vm/qemu-check.sh new file mode 100755 index 000000000..8629ff276 --- /dev/null +++ b/tasks/scripts/vm/qemu-check.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Validate QEMU host prerequisites for GPU passthrough. +# +# Checks that qemu-system-x86_64, vhost-vsock support, and required +# runtime artifacts (vmlinux, virtiofsd) are available. 
+#
+# Usage:
+#   ./qemu-check.sh

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/_lib.sh"
ROOT="$(vm_lib_root)"

RUNTIME_DIR="${ROOT}/target/libkrun-build"

pass=0
fail=0

# NOTE: plain assignment, not ((pass++)) — a post-increment whose prior value is 0
# returns exit status 1, which aborts the whole script under `set -e`.
ok() { echo "  [OK]   $1"; pass=$((pass + 1)); }
miss() { echo "  [MISS] $1"; fail=$((fail + 1)); }

echo "==> QEMU host prerequisite check"
echo ""

# ── qemu-system-x86_64 ──────────────────────────────────────────────────

echo "--- QEMU binary ---"
if command -v qemu-system-x86_64 &>/dev/null; then
  version="$(qemu-system-x86_64 --version | head -n1)"
  ok "qemu-system-x86_64 found: ${version}"
else
  miss "qemu-system-x86_64 not found (install: sudo apt install qemu-system-x86)"
fi

# ── vhost-vsock ──────────────────────────────────────────────────────────

echo "--- vhost-vsock ---"
if [ -e /dev/vhost-vsock ]; then
  ok "/dev/vhost-vsock exists"
elif lsmod 2>/dev/null | grep -q vhost_vsock; then
  ok "vhost_vsock module loaded (but /dev/vhost-vsock missing — check permissions)"
else
  miss "vhost_vsock not loaded (hint: sudo modprobe vhost_vsock)"
fi

# ── Runtime artifacts ──────────────────────────────────────────────────── 

echo "--- Runtime artifacts (${RUNTIME_DIR}) ---"

if [ -f "${RUNTIME_DIR}/vmlinux" ]; then
  ok "vmlinux found"
else
  miss "vmlinux not found (run: FROM_SOURCE=1 mise run vm:setup)"
fi

if [ -f "${RUNTIME_DIR}/virtiofsd" ]; then
  ok "virtiofsd found"
else
  miss "virtiofsd not found (run: mise run vm:gpu-deps)"
fi

# ── Summary ────────────────────────────────────────────────────────────── 

echo ""
echo "==> Summary: ${pass} passed, ${fail} missing"

if [ "$fail" -gt 0 ]; then
  echo ""
  echo "Fix the missing prerequisites above before running QEMU GPU passthrough."
  exit 1
fi

echo ""
echo "All QEMU prerequisites satisfied."
+exit 0 diff --git a/tasks/scripts/vm/vm-setup.sh b/tasks/scripts/vm/vm-setup.sh index e7ae06d08..8afd3883d 100755 --- a/tasks/scripts/vm/vm-setup.sh +++ b/tasks/scripts/vm/vm-setup.sh @@ -81,6 +81,11 @@ if [ "$FROM_SOURCE" = "1" ]; then linux-*) # Linux: build both libkrunfw and libkrun in one go "${ROOT}/tasks/scripts/vm/build-libkrun.sh" + if [ "${GPU:-0}" = "1" ]; then + echo "" + echo "==> Building GPU passthrough dependencies..." + "${ROOT}/tasks/scripts/vm/build-gpu-deps.sh" + fi ;; esac echo "" diff --git a/tasks/vm.toml b/tasks/vm.toml index ca06b08c1..a61adec59 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -5,6 +5,10 @@ # # Workflow: # mise run vm:setup # one-time: download pre-built runtime (~30s) +# # (with FROM_SOURCE=1: builds kernel + libkrun + GPU deps) +# mise run vm:gpu-deps # (standalone) build GPU passthrough binaries separately +# mise run vm:nvidia-modules # (GPU only) build NVIDIA kernel modules +# mise run vm:rootfs -- --base --gpu # build GPU rootfs with NVIDIA drivers # mise run vm # build + run the VM # mise run vm:clean # wipe everything and start over # @@ -26,7 +30,14 @@ run = [ description = "Build the openshell-vm binary with embedded runtime" run = [ "tasks/scripts/vm/compress-vm-runtime.sh", - "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", + """ + # The embedded rootfs.tar.zst can exceed 2 GiB, which overflows x86_64's + # default small code model (R_X86_64_PC32 ±2 GiB limit). Use the large + # code model so include_bytes!() blobs of any size link correctly. 
+ RUSTFLAGS="${RUSTFLAGS:-} -C code-model=large" \ + OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed \ + cargo build -p openshell-vm + """, "tasks/scripts/vm/codesign-openshell-vm.sh", "tasks/scripts/vm/bundle-vm-runtime.sh", ] @@ -39,6 +50,18 @@ run = "tasks/scripts/vm/vm-setup.sh" description = "Build the VM rootfs tarball (use -- --base for lightweight)" run = "tasks/scripts/vm/build-rootfs-tarball.sh" +["vm:nvidia-modules"] +description = "Build NVIDIA kernel modules for GPU VM rootfs (requires FROM_SOURCE=1 vm:setup)" +run = "tasks/scripts/vm/build-nvidia-modules.sh" + +["vm:gpu-deps"] +description = "Build GPU passthrough dependencies (cloud-hypervisor, virtiofsd) shared by CHV and QEMU backends" +run = "tasks/scripts/vm/build-gpu-deps.sh" + +["vm:qemu-check"] +description = "Validate QEMU host prerequisites for GPU passthrough" +run = "tasks/scripts/vm/qemu-check.sh" + ["vm:clean"] description = "Remove all VM cached artifacts (runtime, rootfs, builds)" run = "tasks/scripts/vm/vm-clean.sh" From 8dbb413c8d5e75c3f0f30bd4098ec01507d93710 Mon Sep 17 00:00:00 2001 From: Vincent Caux-Brisebois Date: Thu, 23 Apr 2026 22:24:31 +0000 Subject: [PATCH 5/5] remove cloud-hypervisor Signed-off-by: Vincent Caux-Brisebois --- architecture/README.md | 2 +- architecture/custom-vm-runtime.md | 135 +- architecture/vm-gpu-passthrough.md | 58 +- crates/openshell-bootstrap/src/docker.rs | 25 +- crates/openshell-cli/src/run.rs | 6 +- crates/openshell-vfio/src/lib.rs | 195 ++- .../tests/gpu_passthrough_implementation.rs | 2 +- crates/openshell-vm/build.rs | 17 +- crates/openshell-vm/pins.env | 6 +- .../runtime/kernel/openshell.kconfig | 24 +- crates/openshell-vm/scripts/build-rootfs.sh | 4 +- .../openshell-vm/scripts/openshell-vm-init.sh | 40 +- .../src/backend/cloud_hypervisor.rs | 1182 ----------------- crates/openshell-vm/src/backend/libkrun.rs | 2 +- crates/openshell-vm/src/backend/mod.rs | 51 +- crates/openshell-vm/src/backend/qemu.rs | 137 +- 
crates/openshell-vm/src/exec.rs | 18 +- crates/openshell-vm/src/health.rs | 96 +- crates/openshell-vm/src/lib.rs | 122 +- crates/openshell-vm/src/main.rs | 23 +- crates/openshell-vm/tests/vm_boot_smoke.rs | 118 +- deploy/docker/Dockerfile.images | 3 + tasks/scripts/vm/build-gpu-deps.sh | 30 +- tasks/scripts/vm/build-libkrun.sh | 6 +- tasks/scripts/vm/compress-vm-runtime.sh | 4 +- tasks/scripts/vm/package-vm-runtime.sh | 2 +- tasks/vm.toml | 2 +- 27 files changed, 600 insertions(+), 1710 deletions(-) delete mode 100644 crates/openshell-vm/src/backend/cloud_hypervisor.rs diff --git a/architecture/README.md b/architecture/README.md index 45457d37c..008836fca 100644 --- a/architecture/README.md +++ b/architecture/README.md @@ -301,6 +301,6 @@ This opens an interactive SSH session into the sandbox, with all provider creden | [Inference Routing](inference-routing.md) | Transparent interception and sandbox-local routing of AI inference API calls to configured backends. | | [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. | | [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. | -| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / cloud-hypervisor), kernel configuration, and build pipeline. | +| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / QEMU), kernel configuration, and build pipeline. | | [VM GPU Passthrough](vm-gpu-passthrough.md) | VFIO GPU passthrough for VMs: host preparation, safety checks, nvidia driver hardening, and troubleshooting. | | [TUI](tui.md) | Terminal user interface for sandbox interaction. 
| diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index 6105187e6..9963edea8 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -4,21 +4,17 @@ ## Overview -The OpenShell gateway VM supports three hypervisor backends: +The OpenShell gateway VM supports two hypervisor backends: - **libkrun** (default) — lightweight VMM using Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel is embedded inside `libkrunfw`. Uses virtio-MMIO device transport and gvproxy for user-space networking. -- **cloud-hypervisor** — Linux-only KVM-based VMM used for GPU passthrough (VFIO). Uses - virtio-PCI device transport, TAP networking, and requires a separate `vmlinux` kernel and - `virtiofsd` for rootfs access. Requires GPU MSI-X support. -- **QEMU** — Linux-only fallback VMM for GPU passthrough when the GPU lacks MSI-X support. - Uses the same TAP networking, `vmlinux`, and `virtiofsd` as cloud-hypervisor. QEMU binary - is not embedded — it must be installed on the host. +- **QEMU** — Linux-only VMM used for GPU passthrough (VFIO). Uses virtio-PCI device transport, + TAP networking, and requires a separate `vmlinux` kernel and `virtiofsd` for rootfs access. + QEMU binary is not embedded — it must be installed on the host. -Backend selection is automatic: `--gpu` selects cloud-hypervisor (MSI-X GPU) or QEMU -(non-MSI-X GPU), otherwise libkrun is used. The `--backend` flag provides explicit control -(`auto`, `libkrun`, `cloud-hypervisor`, `qemu`). +Backend selection is automatic: `--gpu` selects QEMU, otherwise libkrun is used. The `--backend` +flag provides explicit control (`auto`, `libkrun`, `qemu`). When `--gpu` is passed, `openshell-vm` automatically binds an eligible GPU to `vfio-pci` and restores it to the original driver on shutdown. See @@ -29,7 +25,7 @@ and rootfs. The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or conntrack support. 
This is insufficient for Kubernetes pod networking. The custom kconfig -adds bridge CNI, iptables/nftables, conntrack, and cloud-hypervisor compatibility. +adds bridge CNI, iptables/nftables, conntrack, and QEMU compatibility. ## Architecture @@ -41,8 +37,7 @@ graph TD CACHE["~/.local/share/openshell/vm-runtime/{version}/"] PROV[Runtime provenance logging] GVP[gvproxy networking proxy] - CHV_BIN["cloud-hypervisor · virtiofsd · vmlinux\n(GPU runtime bundle)"] - QEMU_BIN["qemu-system-x86_64\n(host-installed, GPU fallback)"] + QEMU_BIN["qemu-system-x86_64 · virtiofsd · vmlinux\n(GPU runtime bundle)"] BIN --> EMB BIN -->|extracts to| CACHE @@ -64,7 +59,6 @@ graph TD end BIN -- "libkrun: fork + krun_start_enter" --> INIT - BIN -- "CHV: cloud-hypervisor API + virtiofsd" --> INIT BIN -- "QEMU: qemu-system-x86_64 + virtiofsd" --> INIT GVP -- "virtio-net (libkrun only)" --> Guest ``` @@ -88,18 +82,17 @@ these to XDG cache directories with progress bars: └── ... ``` -When using cloud-hypervisor, an additional runtime bundle is required alongside the -binary: +When using QEMU for GPU passthrough, an additional runtime bundle is required alongside +the binary: ``` target/debug/openshell-vm.runtime/ (or alongside the installed binary) -├── cloud-hypervisor # CHV binary ├── virtiofsd # virtio-fs daemon └── vmlinux # extracted guest kernel ``` This bundle is built with `mise run vm:bundle-runtime` and is separate from the -embedded runtime because CHV and virtiofsd are Linux-only and not embedded in the +embedded runtime because virtiofsd is Linux-only and not embedded in the self-extracting binary. 
This eliminates the need for separate bundles or downloads for the default (libkrun) @@ -123,31 +116,31 @@ mise run vm:build # Rebuild binary with full rootfs ## Backend Comparison -| | libkrun (default) | cloud-hypervisor | QEMU | -|---|---|---|---| -| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | Linux (KVM) only | -| Device transport | virtio-MMIO | virtio-PCI | virtio-PCI | -| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | TAP (requires root/CAP_NET_ADMIN) | -| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | virtiofsd (virtio-fs daemon) | -| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | Separate `vmlinux` file | -| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | 8250 UART (`ttyS0`) | -| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | ACPI poweroff (`poweroff -f`) | -| GPU passthrough | Not supported | VFIO PCI (requires MSI-X) | VFIO PCI (MSI-X not required) | -| Vsock | libkrun built-in | Unix socket + text protocol | `AF_VSOCK` (kernel `vhost_vsock`) | -| VM control | krun C API | REST API over Unix socket | Command-line args | -| Binary source | Embedded in runtime | Runtime bundle | Host-installed | -| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | Wrapper script with ACPI shutdown | -| CLI flag | `--backend libkrun` | `--backend cloud-hypervisor` or `--gpu` | `--backend qemu` | +| | libkrun (default) | QEMU | +|---|---|---| +| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | +| Device transport | virtio-MMIO | virtio-PCI | +| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | +| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | +| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | +| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | +| Shutdown | Automatic on PID 1 exit | 
ACPI poweroff (`poweroff -f`) | +| GPU passthrough | Not supported | VFIO PCI | +| Vsock | libkrun built-in | `AF_VSOCK` (kernel `vhost_vsock`) | +| VM control | krun C API | Command-line args | +| Binary source | Embedded in runtime | Host-installed | +| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | +| CLI flag | `--backend libkrun` | `--backend qemu` or `--gpu` | ### Exec mode differences With libkrun, when `--exec ` is used, the command replaces the init process and the VM exits when PID 1 exits. -With cloud-hypervisor and QEMU, the VM does not automatically exit when PID 1 -terminates. A wrapper init script is dynamically written to the guest rootfs that -mounts necessary filesystems, executes the user command, captures the exit code, -and calls `poweroff -f` to trigger an ACPI shutdown that the hypervisor detects. +With QEMU, the VM does not automatically exit when PID 1 terminates. A wrapper init +script is dynamically written to the guest rootfs that mounts necessary filesystems, +executes the user command, captures the exit code, and calls `poweroff -f` to trigger +an ACPI shutdown that the hypervisor detects. ## Network Profile @@ -167,11 +160,9 @@ fast with an actionable error if they are missing. - **libkrun**: Uses gvproxy for user-space virtio-net networking. No root privileges needed. Port forwarding is handled via gvproxy configuration. -- **cloud-hypervisor**: Uses TAP networking (requires root or CAP_NET_ADMIN). When - `--net none` is passed, networking is disabled entirely (useful for `--exec` mode - tests). gvproxy is not used with cloud-hypervisor. -- **QEMU**: Uses TAP networking (same subnet and setup as cloud-hypervisor). Port - forwarding uses the same userspace TCP proxy. gvproxy is not used with QEMU. +- **QEMU**: Uses TAP networking (requires root or CAP_NET_ADMIN). When `--net none` + is passed, networking is disabled entirely (useful for `--exec` mode tests). gvproxy + is not used with QEMU. 
## Guest Init Script @@ -181,7 +172,7 @@ The init script (`openshell-vm-init.sh`) runs as PID 1 in the guest. After mount 2. **Cgroup v2 controller enablement** — enables `cpu`, `cpuset`, `memory`, `pids`, and `io` controllers in the root cgroup hierarchy (`cgroup.subtree_control`). k3s/kubelet requires these controllers; the `cpu` controller depends on `CONFIG_CGROUP_SCHED` in the kernel. -3. **Networking** — detects `eth0` and attempts DHCP (via `udhcpc`). On failure, falls back to static IP configuration using `VM_NET_IP` and `VM_NET_GW` from the kernel cmdline (set by the CHV backend for TAP networking). DNS is configured from `VM_NET_DNS` if set, overriding any stale `/etc/resolv.conf` entries. +3. **Networking** — detects `eth0` and attempts DHCP (via `udhcpc`). On failure, falls back to static IP configuration using `VM_NET_IP` and `VM_NET_GW` from the kernel cmdline (set by the QEMU backend for TAP networking). DNS is configured from `VM_NET_DNS` if set, overriding any stale `/etc/resolv.conf` entries. 4. **Capability validation** — verifies required kernel features (bridge networking, netfilter, cgroups) and fails fast with actionable errors if missing. 
@@ -214,7 +205,7 @@ graph LR end subgraph GPU["Linux CI (build-gpu-deps.sh)"] - BUILD_GPU["Build cloud-hypervisor + virtiofsd\n(shared by CHV and QEMU)"] + BUILD_GPU["Build virtiofsd\n(for QEMU backend)"] end subgraph NV["Linux CI (build-nvidia-modules.sh)"] @@ -228,8 +219,8 @@ graph LR subgraph Output["target/libkrun-build/"] LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] - CHV_OUT["cloud-hypervisor (CHV only)\n+ virtiofsd (CHV + QEMU)"] - VMLINUX["vmlinux\n(shared by CHV + QEMU)"] + VIRTIOFSD["virtiofsd\n(QEMU backend)"] + VMLINUX["vmlinux\n(shared by QEMU)"] NV_KO["nvidia-modules/*.ko\n(GPU builds only)"] end @@ -240,19 +231,19 @@ graph LR BUILD_NV --> NV_KO KCONF --> BUILD_M BUILD_M --> LIB_DY - BUILD_GPU --> CHV_OUT + BUILD_GPU --> VIRTIOFSD ``` -The `vmlinux` kernel is extracted from the libkrunfw build and reused by cloud-hypervisor -and QEMU. All three backends boot the same kernel — the kconfig fragment includes drivers -for both virtio-MMIO (libkrun) and virtio-PCI (CHV/QEMU) transports. +The `vmlinux` kernel is extracted from the libkrunfw build and reused by QEMU. +Both backends boot the same kernel — the kconfig fragment includes drivers for +both virtio-MMIO (libkrun) and virtio-PCI (QEMU) transports. ## Kernel Config Fragment The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel. A single kernel binary is shared by all three backends (libkrun, -cloud-hypervisor, QEMU) — backend-specific drivers coexist safely (the kernel probes -whichever transport the hypervisor provides). +libkrunfw kernel. A single kernel binary is shared by both backends (libkrun and +QEMU) — backend-specific drivers coexist safely (the kernel probes whichever +transport the hypervisor provides). | Feature | Key Configs | Purpose | |---------|-------------|---------| @@ -276,10 +267,10 @@ whichever transport the hypervisor provides). 
| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | | PCI / GPU | `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM` | GPU passthrough via VFIO | | Kernel modules | `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | Loading NVIDIA drivers in guest | -| virtio-PCI transport | `CONFIG_VIRTIO_PCI` | cloud-hypervisor device bus (libkrun uses MMIO) | -| Serial console | `CONFIG_SERIAL_8250`, `CONFIG_SERIAL_8250_CONSOLE` | cloud-hypervisor console (`ttyS0`) | -| ACPI | `CONFIG_ACPI` | cloud-hypervisor power management / clean shutdown | -| x2APIC | `CONFIG_X86_X2APIC` | Multi-vCPU support (CHV uses x2APIC MADT entries) | +| virtio-PCI transport | `CONFIG_VIRTIO_PCI` | QEMU device bus (libkrun uses MMIO) | +| Serial console | `CONFIG_SERIAL_8250`, `CONFIG_SERIAL_8250_CONSOLE` | QEMU console (`ttyS0`) | +| ACPI | `CONFIG_ACPI` | QEMU power management / clean shutdown | +| x2APIC | `CONFIG_X86_X2APIC` | Multi-vCPU support (QEMU uses x2APIC MADT entries) | See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with inline comments explaining why each option is needed. @@ -310,12 +301,9 @@ commands work the same way they would inside the VM shell. - **libkrun**: Uses libkrun's built-in vsock port mapping, which transparently bridges the guest vsock port to a host Unix socket. -- **cloud-hypervisor**: Uses a vsock exec bridge — a host-side process that - connects an AF_VSOCK socket to a Unix domain socket, providing the same - interface to the exec agent. - **QEMU**: Uses `vhost-vsock-pci` with kernel `AF_VSOCK` sockets. The exec bridge opens a kernel `AF_VSOCK` socket to the guest CID and bridges it to - the same Unix domain socket path used by the other backends. Requires the + the same Unix domain socket path used by the other backend. Requires the `vhost_vsock` kernel module on the host. 
## Build Commands @@ -339,8 +327,8 @@ mise run vm:build # Rebuild binary FROM_SOURCE=1 mise run vm:setup # Build runtime from source mise run vm:build # Then build embedded binary -# Build cloud-hypervisor runtime bundle (Linux only) -mise run vm:bundle-runtime # Builds CHV + virtiofsd + extracts vmlinux +# Build GPU runtime bundle (Linux only) +mise run vm:bundle-runtime # Builds virtiofsd + extracts vmlinux # Validate QEMU host prerequisites mise run vm:qemu-check @@ -358,12 +346,9 @@ mise run vm:nvidia-modules # Compile NVIDIA .ko files against VM mise run vm:rootfs -- --base --gpu # Build GPU rootfs with injected kernel modules mise run vm:build # Rebuild binary with GPU rootfs -# Run with cloud-hypervisor backend -openshell-vm --backend cloud-hypervisor # Requires runtime bundle -openshell-vm --gpu # Auto-selects CHV (MSI-X) or QEMU (no MSI-X) - # Run with QEMU backend openshell-vm --backend qemu # Requires qemu-system-x86_64 on host +openshell-vm --gpu # Auto-selects QEMU for GPU passthrough # Wipe everything and start over mise run vm:clean @@ -376,23 +361,23 @@ rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), gvproxy, cloud-hypervisor, -and virtiofsd for all supported platforms. Runs on-demand or when the kernel config / -pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), gvproxy, and virtiofsd +for all supported platforms. Runs on-demand or when the kernel config / pinned versions +change. 
| Platform | Runner | Build Method | |----------|--------|-------------| | Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | | Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | -| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no CHV) | +| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no GPU support) | Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, -and provenance metadata. Linux artifacts additionally include cloud-hypervisor, -virtiofsd, and the extracted `vmlinux` kernel. +and provenance metadata. Linux artifacts additionally include virtiofsd and the +extracted `vmlinux` kernel. Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. cloud-hypervisor and virtiofsd -are Linux-only (macOS does not support VFIO/KVM passthrough). +libkrunfw is always Linux regardless of host platform. Virtiofsd is +Linux-only (macOS does not support VFIO/KVM passthrough). ### VM Binary (`release-vm-dev.yml`) diff --git a/architecture/vm-gpu-passthrough.md b/architecture/vm-gpu-passthrough.md index eda2ef9e3..621e27c0c 100644 --- a/architecture/vm-gpu-passthrough.md +++ b/architecture/vm-gpu-passthrough.md @@ -6,7 +6,7 @@ OpenShell's VM backend can pass a physical NVIDIA GPU into a microVM using VFIO (Virtual Function I/O). This gives the guest direct access to GPU hardware, enabling CUDA workloads and `nvidia-smi` inside sandboxes without virtualization overhead. -GPU passthrough uses cloud-hypervisor or QEMU (instead of the default libkrun backend) to attach a VFIO device to the VM. The guest sees a real PCI GPU device and loads standard NVIDIA drivers. cloud-hypervisor is preferred; QEMU is used as a fallback when the GPU lacks MSI-X support. 
+GPU passthrough uses QEMU (instead of the default libkrun backend) to attach a VFIO device to the VM. The guest sees a real PCI GPU device and loads standard NVIDIA drivers. ## Architecture @@ -17,7 +17,7 @@ Host │ Guest (microVM) ↕ bound to vfio-pci │ ↕ /dev/vfio/ │ /dev/nvidia* ↕ │ ↕ - CHV or QEMU (VFIO) ────│→ PCI device visible + QEMU (VFIO) ────│→ PCI device visible ↕ │ ↕ TAP networking │ k3s + device plugin virtiofsd (rootfs) │ ↕ @@ -29,27 +29,11 @@ Host │ Guest (microVM) | Flag | Backend | GPU attached? | |------|---------|---------------| | (none) | libkrun | No | -| `--gpu` (MSI-X GPU) | cloud-hypervisor | Yes | -| `--gpu` (non-MSI-X GPU) | QEMU | Yes (fallback) | -| `--gpu 0000:41:00.0` | auto (CHV or QEMU based on MSI-X) | Yes | -| `--backend cloud-hypervisor` | cloud-hypervisor | No (force CHV without GPU) | +| `--gpu` | QEMU | Yes | +| `--gpu 0000:41:00.0` | QEMU | Yes | | `--backend qemu` | QEMU | Optional | -Auto mode (`--backend auto`, the default) selects cloud-hypervisor when `--gpu` is used with an MSI-X-capable GPU, QEMU when `--gpu` is used with a GPU lacking MSI-X, and libkrun otherwise. - -### QEMU fallback - -QEMU is used when GPU passthrough is requested but the GPU does not support MSI-X (PCI capability `0x11`). cloud-hypervisor's VFIO implementation requires MSI-X; QEMU handles MSI-only devices via its own interrupt remapping layer. - -| Aspect | cloud-hypervisor | QEMU | -|--------|-----------------|------| -| VFIO MSI-X | Required | Not required | -| VM control | REST API over Unix socket | Command-line args + QMP | -| Vsock transport | Unix socket + `CONNECT` text protocol | `AF_VSOCK` (kernel `vhost_vsock`) | -| TAP networking | Built-in TAP creation | `-netdev tap` flag | -| Shutdown | REST `vm.shutdown` | `SIGTERM` or QMP `system_powerdown` | - -The guest kernel, rootfs, init script, and exec agent are identical across both backends. 
The host requirements differ: QEMU needs `qemu-system-x86_64` installed on the host (not embedded in the runtime bundle) and the `vhost_vsock` kernel module for vsock exec support. +Auto mode (`--backend auto`, the default) selects QEMU when `--gpu` is used, and libkrun otherwise. ### Automatic GPU binding @@ -58,7 +42,7 @@ When `--gpu` is passed (with or without a specific PCI address), the launcher au 1. **Probe** — scans `/sys/bus/pci/devices` for NVIDIA devices (vendor `0x10de`). 2. **Safety checks** — for each candidate GPU, verifies it is safe to claim (see below). If any check fails, the launcher refuses to proceed and exits with an actionable error. 3. **Bind** — unbinds the selected GPU from the `nvidia` driver and binds it to `vfio-pci`. Also binds any IOMMU group peers to `vfio-pci` for group cleanliness. -4. **Launch** — starts cloud-hypervisor with the VFIO device attached and sets `GPU_ENABLED=true` in the guest kernel cmdline. +4. **Launch** — starts QEMU with the VFIO device attached and sets `GPU_ENABLED=true` in the guest kernel cmdline. 5. **Rebind on shutdown** — when the VM exits (clean shutdown, Ctrl+C, or crash), the launcher rebinds the GPU back to the `nvidia` driver and clears `driver_override`, restoring host GPU access. Cleanup is guaranteed by a `GpuBindGuard` RAII guard that calls restore on drop, covering normal exit, early return, and panic. Only `SIGKILL` (kill -9) bypasses the guard — see Troubleshooting below for manual recovery. When a specific PCI address is given (`--gpu 0000:41:00.0`), the launcher targets that exact device. When `--gpu` is used without an address (`auto` mode), the launcher selects the best available GPU using the multi-GPU selection strategy. 
@@ -259,15 +243,12 @@ sudo openshell-vm --gpu 0000:41:00.0 The `--backend` flag controls hypervisor selection independently of `--gpu`: ```shell -sudo openshell-vm --gpu # auto: CHV if MSI-X, QEMU otherwise -sudo openshell-vm --backend cloud-hypervisor # explicit CHV, no GPU +sudo openshell-vm --gpu # auto: selects QEMU for GPU sudo openshell-vm --backend qemu # explicit QEMU, no GPU sudo openshell-vm --gpu --backend qemu # force QEMU with GPU sudo openshell-vm --backend libkrun # explicit libkrun (no GPU support) ``` -The `chv` alias is accepted as shorthand for `cloud-hypervisor`. - ### Diagnostics When `--gpu` is passed, the launcher runs safety checks before unbinding. If @@ -304,9 +285,9 @@ GPU: restoring 0000:41:00.0 (cleanup) GPU: rebinding 0000:41:00.0 to nvidia ``` -## VM Networking (Cloud Hypervisor) +## VM Networking (QEMU) -Cloud Hypervisor uses TAP-based networking instead of the gvproxy user-mode networking used by the libkrun backend. This has several implications for connectivity and port forwarding. +QEMU uses TAP-based networking instead of the gvproxy user-mode networking used by the libkrun backend. This has several implications for connectivity and port forwarding. ### Network topology @@ -325,11 +306,11 @@ Host Guest (microVM) ### How it works -The CHV backend configures networking in three layers: +The QEMU backend configures networking in three layers: **1. TAP device and guest IP assignment** -Cloud Hypervisor creates a TAP device on the host side with IP `192.168.249.1/24`. The guest is assigned `192.168.249.2/24` via kernel command line parameters (`VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). The init script reads these from `/proc/cmdline` and uses them as the static fallback when DHCP is unavailable (CHV does not run a DHCP server). +QEMU creates a TAP device on the host side with IP `192.168.249.1/24`. The guest is assigned `192.168.249.2/24` via kernel command line parameters (`VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). 
The init script reads these from `/proc/cmdline` and uses them as the static fallback when DHCP is unavailable. **2. Host-side NAT and IP forwarding** @@ -342,7 +323,7 @@ This gives the guest internet access through the host. Rules are cleaned up on V **3. TCP port forwarding** -Unlike gvproxy (which provides built-in port forwarding), CHV TAP networking requires explicit port forwarding. The launcher starts a userspace TCP proxy for each port mapping (e.g., `30051:30051`). The proxy binds to `127.0.0.1:{host_port}` and forwards connections to `192.168.249.2:{guest_port}`. +Unlike gvproxy (which provides built-in port forwarding), TAP networking requires explicit port forwarding. The launcher starts a userspace TCP proxy for each port mapping (e.g., `30051:30051`). The proxy binds to `127.0.0.1:{host_port}` and forwards connections to `192.168.249.2:{guest_port}`. ### DNS resolution @@ -358,14 +339,14 @@ The resolved DNS server is passed to the guest via `VM_NET_DNS=` on the kernel c | Constant | Value | Purpose | |----------|-------|---------| -| `CHV_TAP_HOST_IP` | `192.168.249.1` | Host side of the TAP device | -| `CHV_TAP_GUEST_IP` | `192.168.249.2` | Guest static IP | -| `CHV_TAP_SUBNET` | `192.168.249.0/24` | Subnet for iptables rules | -| `CHV_TAP_NETMASK` | `255.255.255.0` | Subnet mask in VM payload | +| `TAP_HOST_IP` | `192.168.249.1` | Host side of the TAP device | +| `TAP_GUEST_IP` | `192.168.249.2` | Guest static IP | +| `TAP_SUBNET` | `192.168.249.0/24` | Subnet for iptables rules | +| `TAP_NETMASK` | `255.255.255.0` | Subnet mask in VM payload | ### Differences from libkrun/gvproxy networking -| Feature | libkrun + gvproxy | CHV + TAP | +| Feature | libkrun + gvproxy | QEMU + TAP | |---------|------------------|-----------| | Network mode | User-mode (SLIRP-like) | Kernel TAP device | | DHCP | Built-in (gvproxy) | None (static IP via cmdline) | @@ -413,10 +394,6 @@ The launcher caches mTLS certificates on the host after the first successful boo ## 
Troubleshooting -### "cloud-hypervisor requires MSI-X for VFIO passthrough" - -The GPU lacks MSI-X support and `--backend cloud-hypervisor` was explicitly requested. Either use `--backend qemu` or omit the `--backend` flag to let auto-selection pick QEMU as the fallback. - ### "no NVIDIA PCI device found" The host has no NVIDIA GPU installed, or the PCI device is not visible: @@ -490,5 +467,4 @@ If you hit this issue repeatedly, check for nvidia driver updates or file a bug - [System Architecture](system-architecture.md) — overall OpenShell architecture - Implementation: - [`crates/openshell-vfio/src/lib.rs`](../crates/openshell-vfio/src/lib.rs) — GPU binding and VFIO setup - - [`crates/openshell-vm/src/backend/cloud_hypervisor.rs`](../crates/openshell-vm/src/backend/cloud_hypervisor.rs) — cloud-hypervisor backend - [`crates/openshell-vm/src/backend/qemu.rs`](../crates/openshell-vm/src/backend/qemu.rs) — QEMU backend diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index be086e534..e62a6e13d 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -26,12 +26,13 @@ const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell"; /// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a /// concrete device ID based on whether CDI is enabled on the daemon. 
/// -/// | Input | Output | -/// |--------------|--------------------------------------------------------------| -/// | `[]` | `[]` — no GPU | -/// | `["legacy"]` | `["legacy"]` — pass through to the non-CDI fallback path | -/// | `["auto"]` | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]` | -/// | `[cdi-ids…]` | unchanged | +/// | Input | Output | +/// |---------------------|--------------------------------------------------------------| +/// | `[]` | `[]` — no GPU | +/// | `["vm-passthrough"]`| `["vm-passthrough"]` — GPU via QEMU/VFIO, no Docker device | +/// | `["legacy"]` | `["legacy"]` — pass through to the non-CDI fallback path | +/// | `["auto"]` | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]` | +/// | `[cdi-ids…]` | unchanged | pub(crate) fn resolve_gpu_device_ids(gpu: &[String], cdi_enabled: bool) -> Vec { match gpu { [] => vec![], @@ -622,6 +623,11 @@ pub async fn ensure_container( // Docker resolves them against the host CDI spec at /etc/cdi/ match device_ids { [] => {} + [id] if id == "vm-passthrough" => { + // GPU passthrough is handled by QEMU/VFIO inside the container, + // not by Docker. No DeviceRequest needed — GPU_ENABLED=true + // (set below) deploys the NVIDIA device plugin in k3s. 
+ } [id] if id == "legacy" => { host_config.device_requests = Some(vec![DeviceRequest { driver: Some("nvidia".to_string()), @@ -1436,6 +1442,13 @@ mod tests { ); } + #[test] + fn resolve_gpu_vm_passthrough() { + let ids = vec!["vm-passthrough".to_string()]; + assert_eq!(resolve_gpu_device_ids(&ids, true), ids); + assert_eq!(resolve_gpu_device_ids(&ids, false), ids); + } + #[test] fn resolve_gpu_cdi_ids_passthrough() { let ids = vec!["nvidia.com/gpu=all".to_string()]; diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index be2527295..a104ace4d 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -5258,8 +5258,10 @@ fn prepare_gateway_deploy_gpu( if gateway_deploy_uses_vm_backend() { if remote.is_none() { let guard = check_gpu_readiness(&gpu)?; - let selected_bdf = guard.pci_addr().unwrap_or("auto").to_string(); - let updated_gpu = vec![selected_bdf]; + // Signal that GPU is enabled but passthrough is handled by QEMU/VFIO, + // not by Docker CDI. The bootstrap sets GPU_ENABLED=true for the + // k3s NVIDIA device plugin but skips Docker DeviceRequests. + let updated_gpu = vec!["vm-passthrough".to_string()]; return Ok((updated_gpu, Some(guard))); } else { eprintln!( diff --git a/crates/openshell-vfio/src/lib.rs b/crates/openshell-vfio/src/lib.rs index 4404624e6..675928db7 100644 --- a/crates/openshell-vfio/src/lib.rs +++ b/crates/openshell-vfio/src/lib.rs @@ -7,7 +7,7 @@ //! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs //! (vendor ID `0x10de`), checks their driver binding, and verifies IOMMU //! group cleanliness — the prerequisites for passing a physical GPU into -//! a cloud-hypervisor VM via VFIO. +//! a VM via VFIO. //! //! Returns per-device readiness for multi-GPU hosts. //! @@ -197,8 +197,8 @@ fn sysfs_write_with_timeout( /// Check whether a PCI device supports MSI-X by walking the PCI capability /// list in the sysfs `config` file. MSI-X is capability ID `0x11`. 
/// -/// cloud-hypervisor's VFIO code assumes MSI-X and will panic if the device -/// only has MSI. This pre-flight check prevents a cryptic crash. +/// MSI-X support is tracked for informational purposes. QEMU handles +/// devices with or without MSI-X via legacy interrupt emulation fallback. #[cfg(target_os = "linux")] pub fn check_msix_support(sysfs: &SysfsRoot, pci_addr: &str) -> bool { let config_path = sysfs.sys_bus_pci_devices().join(pci_addr).join("config"); @@ -354,7 +354,7 @@ fn probe_linux_sysfs() -> Vec<(String, HostNvidiaVfioReadiness)> { /// /// When activated, checks two conditions: /// 1. At least one NVIDIA device reports [`VfioBoundReady`]. -/// 2. The cloud-hypervisor binary exists in `runtime_dir` (if provided). +/// 2. The QEMU binary (`qemu-system-x86_64`) exists in `runtime_dir` (if provided) or on the host `PATH`. pub fn nvidia_gpu_available_for_vm_passthrough(runtime_dir: Option<PathBuf>) -> bool { if std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() != Ok("1") { return false; @@ -368,9 +368,17 @@ pub fn nvidia_gpu_available_for_vm_passthrough(runtime_dir: Option<PathBuf>) -> return false; } - runtime_dir - .map(|dir| dir.join("cloud-hypervisor").is_file()) - .unwrap_or(false) + let has_qemu = runtime_dir + .map(|dir| dir.join("qemu-system-x86_64").is_file()) + .unwrap_or(false); + let has_qemu_on_path = std::process::Command::new("qemu-system-x86_64") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok(); + + has_qemu || has_qemu_on_path } /// Sysfs root path, defaulting to "/" in production and a temp dir in tests.
@@ -765,9 +773,7 @@ fn pci_reset_device(sysfs: &SysfsRoot, pci_addr: &str) { .file_name() .and_then(|n| n.to_str()) .unwrap_or("unknown"); - eprintln!( - "GPU {pci_addr}: performing secondary bus reset on bridge {bridge_name}" - ); + eprintln!("GPU {pci_addr}: performing secondary bus reset on bridge {bridge_name}"); if let Err(e) = std::fs::write(&bridge_reset, "1") { eprintln!("GPU {pci_addr}: bridge SBR failed: {e}"); } else { @@ -934,15 +940,24 @@ pub fn bind_gpu_to_vfio(sysfs: &SysfsRoot, pci_addr: &str) -> Result d, _ => { let vendor = std::fs::read_to_string(dev_dir.join("vendor")) .map(|v| v.trim().to_lowercase()) .unwrap_or_default(); - if vendor == NVIDIA_VENDOR_ID { + let class = std::fs::read_to_string(dev_dir.join("class")) + .map(|c| c.trim().to_lowercase()) + .unwrap_or_default(); + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x0403") { + // NVIDIA HDA audio companion (HDMI/DP audio) + eprintln!( + "GPU {pci_addr}: no driver was bound, defaulting restore target to snd_hda_intel (audio device)" + ); + "snd_hda_intel".to_string() + } else if vendor == NVIDIA_VENDOR_ID { eprintln!( "GPU {pci_addr}: no driver was bound, defaulting restore target to nvidia" ); @@ -1065,6 +1080,10 @@ pub fn rebind_gpu_to_original( if !original_driver.is_empty() && original_driver != "none" { if original_driver == "nvidia" && sysfs.is_real_sysfs() { nvidia_reload_modules(); + } else if sysfs.is_real_sysfs() { + let _ = std::process::Command::new("modprobe") + .arg(original_driver) + .output(); } // modprobe may have auto-bound the device (now that driver_override is @@ -1255,6 +1274,40 @@ fn is_iommu_group_clean(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { false } +/// Discover IOMMU group peers already on vfio-pci (inherited from a previous +/// session) and infer their original driver from the PCI class code so they +/// can be restored on exit. 
+#[cfg(target_os = "linux")] +fn inherited_peer_binds(sysfs: &SysfsRoot, gpu_addr: &str) -> Vec<(String, String)> { + iommu_group_peers(sysfs, gpu_addr) + .unwrap_or_default() + .into_iter() + .filter(|peer| peer != gpu_addr) + .filter_map(|peer| { + if current_driver(sysfs, &peer).as_deref() != Some("vfio-pci") { + return None; + } + let class = + std::fs::read_to_string(sysfs.sys_bus_pci_devices().join(&peer).join("class")) + .unwrap_or_default() + .trim() + .to_lowercase(); + // 0x0403xx = multimedia audio controller — typically snd_hda_intel + let orig = if class.starts_with("0x0403") { + "snd_hda_intel" + } else { + "nvidia" + }; + Some((peer, orig.to_string())) + }) + .collect() +} + +#[cfg(not(target_os = "linux"))] +fn inherited_peer_binds(_sysfs: &SysfsRoot, _gpu_addr: &str) -> Vec<(String, String)> { + vec![] +} + /// Captures the bind state for a GPU so it can be restored on shutdown. #[derive(Debug)] pub struct GpuBindState { @@ -1266,7 +1319,7 @@ pub struct GpuBindState { pub peer_binds: Vec<(String, String)>, /// Whether this instance performed the bind (false if GPU was already on vfio-pci). pub did_bind: bool, - /// Whether the GPU supports MSI-X (needed by cloud-hypervisor; QEMU works without it). + /// Whether the GPU supports MSI-X (informational; QEMU handles both cases). pub has_msix: bool, } @@ -1331,20 +1384,22 @@ impl GpuBindState { return Ok(()); } - // Always attempt peer restore even if GPU restore fails, so the - // audio companion (and any other IOMMU group peers) aren't left - // stranded on vfio-pci. + // Restore IOMMU peers (e.g. the HDA audio companion) BEFORE the GPU. + // nvidia_reload_modules() during GPU restore can claim peer devices + // through nvidia-modeset/nvidia-drm if they're still unbound, racing + // with the snd_hda_intel rebind. Restoring peers first avoids this. 
+ let peer_result = rebind_iommu_group_peers(sysfs, &self.peer_binds); + if let Err(ref e) = peer_result { + eprintln!("GPU: peer restore failed: {e}"); + } + eprintln!( "GPU: rebinding {} to {}", self.pci_addr, self.original_driver ); let gpu_result = rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver); - let peer_result = rebind_iommu_group_peers(sysfs, &self.peer_binds); if let Err(ref gpu_err) = gpu_result { - if let Err(ref peer_err) = peer_result { - eprintln!("GPU: peer restore also failed: {peer_err}"); - } return Err(std::io::Error::new(gpu_err.kind(), gpu_err.to_string())); } peer_result @@ -1585,19 +1640,21 @@ fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result Result Result { nvidia_addrs.sort(); // Phase 1: prefer GPUs already on vfio-pci with clean IOMMU group. - // MSI-X GPUs get priority (cloud-hypervisor has lower overhead than QEMU). + // MSI-X GPUs get slight priority (better interrupt performance). let mut vfio_msix: Option = None; let mut vfio_no_msix: Option = None; for addr in &nvidia_addrs { @@ -1732,25 +1790,32 @@ fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { } } if let Some(addr) = vfio_msix { - eprintln!("GPU {addr}: already on vfio-pci (inherited from previous session), will restore to nvidia on exit"); + let peer_binds = inherited_peer_binds(sysfs, &addr); + eprintln!( + "GPU {addr}: already on vfio-pci (inherited from previous session), \ + will restore to nvidia on exit ({} peer(s) also tracked)", + peer_binds.len() + ); return Ok(GpuBindState { pci_addr: addr, original_driver: "nvidia".to_string(), - peer_binds: vec![], + peer_binds, did_bind: true, has_msix: true, }); } if let Some(ref addr) = vfio_no_msix { + let peer_binds = inherited_peer_binds(sysfs, addr); + eprintln!("GPU {addr}: no MSI-X support (QEMU will use legacy interrupt emulation)"); eprintln!( - "GPU {addr}: no MSI-X support — QEMU backend will be used \ - (cloud-hypervisor requires MSI-X)" + "GPU {addr}: already on vfio-pci 
(inherited from previous session), \ + will restore to nvidia on exit ({} peer(s) also tracked)", + peer_binds.len() ); - eprintln!("GPU {addr}: already on vfio-pci (inherited from previous session), will restore to nvidia on exit"); return Ok(GpuBindState { pci_addr: addr.clone(), original_driver: "nvidia".to_string(), - peer_binds: vec![], + peer_binds, did_bind: true, has_msix: false, }); @@ -1813,15 +1878,12 @@ fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result { idle_candidates.push((addr.clone(), has_msix)); } - // Sort: MSI-X candidates first (lower overhead with cloud-hypervisor). + // Sort: MSI-X candidates first (better interrupt performance). idle_candidates.sort_by_key(|(_, has_msix)| !has_msix); for (addr, has_msix) in &idle_candidates { if !has_msix { - eprintln!( - "GPU {addr}: no MSI-X support — QEMU backend will be used \ - (cloud-hypervisor requires MSI-X)" - ); + eprintln!("GPU {addr}: no MSI-X support (QEMU will use legacy interrupt emulation)"); } eprintln!("GPU: binding {addr} for VFIO passthrough"); let original_driver = bind_gpu_to_vfio(sysfs, addr)?; @@ -2321,7 +2383,10 @@ mod tests { let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); assert_eq!(state.pci_addr, "0000:43:00.0"); - assert!(state.did_bind, "inherited vfio-pci should set did_bind=true for restore"); + assert!( + state.did_bind, + "inherited vfio-pci should set did_bind=true for restore" + ); assert_eq!(state.original_driver, "nvidia"); } @@ -2560,8 +2625,14 @@ mod tests { fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:43:00.0")).unwrap(); - assert!(state.did_bind, "inherited vfio-pci state should set did_bind=true"); - assert_eq!(state.original_driver, "nvidia", "inherited vfio-pci should target nvidia for restore"); + assert!( + state.did_bind, + "inherited vfio-pci state should set did_bind=true" + ); + assert_eq!( + state.original_driver, "nvidia", + "inherited vfio-pci should target 
nvidia for restore" + ); let dev_dir = root.path().join("sys/bus/pci/devices/0000:43:00.0"); let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); @@ -2574,7 +2645,10 @@ mod tests { state.restore_with_sysfs(&sysfs).unwrap(); let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); - assert_eq!(override_content, "", "driver_override should be cleared after restore"); + assert_eq!( + override_content, "", + "driver_override should be cleared after restore" + ); } #[test] @@ -2706,11 +2780,26 @@ mod tests { has_msix: true, }; let cmds = state.recovery_commands(); - assert!(cmds.contains("vfio-pci/unbind"), "should unbind GPU from vfio-pci"); - assert!(cmds.contains("0000:41:00.0"), "should reference GPU address"); - assert!(cmds.contains("0000:41:00.1"), "should reference peer address"); - assert!(cmds.contains("driver_override"), "should clear driver_override"); - assert!(cmds.contains("modprobe nvidia"), "should reload nvidia modules"); + assert!( + cmds.contains("vfio-pci/unbind"), + "should unbind GPU from vfio-pci" + ); + assert!( + cmds.contains("0000:41:00.0"), + "should reference GPU address" + ); + assert!( + cmds.contains("0000:41:00.1"), + "should reference peer address" + ); + assert!( + cmds.contains("driver_override"), + "should clear driver_override" + ); + assert!( + cmds.contains("modprobe nvidia"), + "should reload nvidia modules" + ); assert!( cmds.contains("modprobe snd_hda_intel"), "should reload peer original driver" diff --git a/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs index a9bbd7bdc..08c658f7a 100644 --- a/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs +++ b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs @@ -24,7 +24,7 @@ fn nvidia_gpu_passthrough_is_available() { assert!( openshell_vfio::nvidia_gpu_available_for_vm_passthrough(None), "GPU passthrough gate returned false on a GPU CI 
runner — \ - check VFIO binding and cloud-hypervisor runtime bundle" + check VFIO binding and VM runtime bundle" ); } diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs index 9a5e04e4c..7c709defd 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -114,7 +114,13 @@ fn main() { }; for (src_name, dst_name) in &core_files { - if !copy_artifact(src_name, dst_name, &compressed_dir, &out_dir, &mut total_embedded_size) { + if !copy_artifact( + src_name, + dst_name, + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ) { println!( "cargo:warning=Missing compressed artifact: {}", compressed_dir.join(src_name).display() @@ -158,7 +164,10 @@ fn main() { // Write empty stubs for any missing rootfs variant so that // `include_bytes!()` in embedded.rs always resolves. The embedded module // treats zero-length slices as "not available". - for (found, name) in [(has_base, "rootfs.tar.zst"), (has_gpu, "rootfs-gpu.tar.zst")] { + for (found, name) in [ + (has_base, "rootfs.tar.zst"), + (has_gpu, "rootfs-gpu.tar.zst"), + ] { if !found { let stub = out_dir.join(name); if !stub.exists() { @@ -184,9 +193,7 @@ fn main() { "cargo:warning=Total embedded data is {total_embedded_size} bytes ({:.1} GiB).", total_embedded_size as f64 / (1024.0 * 1024.0 * 1024.0) ); - println!( - "cargo:warning=This exceeds the x86_64 small code model limit (~2 GiB)." - ); + println!("cargo:warning=This exceeds the x86_64 small code model limit (~2 GiB)."); println!( "cargo:warning=Ensure RUSTFLAGS includes '-C code-model=large' or use `mise run vm:build`." 
); diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index e6a01dcce..cedc15d85 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -43,11 +43,7 @@ GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # Pinned: 2026-03-27 (main branch HEAD at time of pinning) LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" -# ── cloud-hypervisor (GPU passthrough VMM) ────────────────────────────── -# Repo: https://github.com/cloud-hypervisor/cloud-hypervisor -CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" - -# ── virtiofsd (virtio-fs daemon for cloud-hypervisor rootfs) ──────────── +# ── virtiofsd (virtio-fs daemon for QEMU rootfs) ──────────────────────── # Repo: https://gitlab.com/virtio-fs/virtiofsd VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index 95435b149..d1244ad32 100644 --- a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -159,21 +159,29 @@ CONFIG_X86_PAT=y # (previously CONFIG_FW_LOADER_SYSFS, now merged). CONFIG_FW_LOADER=y -# ── cloud-hypervisor support ──────────────────────────────────────────── -# CHV uses virtio-PCI transport (libkrun uses virtio-MMIO). Both drivers +# ── Compressed firmware support ────────────────────────────────────────── +# NVIDIA driver packages (570.x+) ship GSP firmware as compressed files +# (gsp_*.bin.xz). Without decompression support, request_firmware() fails +# to find the firmware even when the files exist in /lib/firmware/. +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_LOADER_COMPRESS_XZ=y +CONFIG_FW_LOADER_COMPRESS_ZSTD=y + +# ── QEMU backend support ───────────────────────────────────────────────── +# QEMU uses virtio-PCI transport (libkrun uses virtio-MMIO). Both drivers # coexist safely — the kernel probes whichever transport the hypervisor # provides. 
CONFIG_VIRTIO_PCI=y -# Serial console for cloud-hypervisor (8250/16550 UART). libkrun uses -# virtio-console which is already enabled in the base config. +# Serial console for QEMU (8250/16550 UART). libkrun uses virtio-console +# which is already enabled in the base config. CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -# ACPI support for cloud-hypervisor power management. Required for -# `poweroff -f` to trigger a clean ACPI shutdown that CHV detects. +# ACPI support for QEMU power management. Required for `poweroff -f` +# to trigger a clean ACPI shutdown that QEMU detects. CONFIG_ACPI=y -# x2APIC support — Cloud Hypervisor uses x2APIC MADT entries for -# multi-vCPU VMs. Without this, only the bootstrap CPU is activated. +# x2APIC support — QEMU uses x2APIC MADT entries for multi-vCPU VMs. +# Without this, only the bootstrap CPU is activated. CONFIG_X86_X2APIC=y diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index 4baade995..efcf7ed10 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -919,8 +919,8 @@ done # per-boot layer extraction that previously added ~3-5s per container. echo " Pre-unpacking container images..." 
for img in \ - "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" \ - "ghcr.io/nvidia/openshell/gateway:latest"; do + "${COMMUNITY_SANDBOX_IMAGE}" \ + "${SERVER_IMAGE}"; do if vm_exec k3s ctr -n k8s.io images ls -q 2>/dev/null | grep -qF "$img"; then echo " unpacking: $img" vm_exec k3s ctr -n k8s.io run --rm "$img" "pre-unpack-$(date +%s)" true 2>/dev/null || true diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index f9e4c228f..ab871e334 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -46,8 +46,8 @@ mkdir -p /sys/fs/cgroup mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & wait -# ── Parse kernel cmdline for env vars (cloud-hypervisor path) ──────── -# cloud-hypervisor passes environment variables via kernel cmdline +# ── Parse kernel cmdline for env vars ───────────────────────────────── +# The QEMU backend passes environment variables via kernel cmdline # (KEY=VALUE tokens). These are not automatically exported to init. # Must run after /proc is mounted. if [ -f /proc/cmdline ]; then @@ -279,6 +279,14 @@ find /run -name '*.sock' -delete 2>/dev/null || true # start; clear it so k3s doesn't fail node re-registration validation. rm -f /var/lib/rancher/k3s/server/cred/node-passwd 2>/dev/null || true +# Clean stale k3s TLS certificates from previous boots. If k3s crashes +# mid-write it can leave partially-written (0-byte or non-PEM) cert files +# that cause "tls: failed to find any PEM data in certificate input" on +# restart. Wiping the TLS directory forces k3s to regenerate self-signed +# certs on startup. This is safe for both cold and warm boots — the certs +# are ephemeral per-cluster and recreated automatically by k3s. +rm -rf /var/lib/rancher/k3s/server/tls 2>/dev/null || true + # Clean stale containerd runtime state from previous boots. # # The rootfs persists across VM restarts via virtio-fs. 
The snapshotter @@ -415,11 +423,39 @@ if [ "${GPU_ENABLED:-false}" = "true" ]; then exit 1 fi + # ── Stage NVIDIA GSP firmware onto tmpfs for reliable loading ───── + # The kernel's request_firmware() calls kernel_read_file_from_path_initns() + # which must read the full firmware blob (64MB+ for GSP) through the VFS + # layer. On virtiofs (FUSE-based), each read is a round-trip through the + # virtio ring to virtiofsd. This can fail or stall on non-DAX virtiofs + # configurations (QEMU vhost-user-fs-pci without cache-size). + # + # Copying firmware to /run (tmpfs) eliminates the FUSE path entirely — + # kernel_read_file() reads directly from page cache backed by RAM. + NVIDIA_FW_SRC="/lib/firmware/nvidia" + NVIDIA_FW_TMPFS="/run/firmware/nvidia" + if [ -d "$NVIDIA_FW_SRC" ]; then + mkdir -p "/run/firmware" + cp -a "$NVIDIA_FW_SRC" "/run/firmware/" + ts "staged NVIDIA firmware to tmpfs ($(du -sh "$NVIDIA_FW_TMPFS" | cut -f1))" + + if [ -f /sys/module/firmware_class/parameters/path ]; then + echo -n "/run/firmware" > /sys/module/firmware_class/parameters/path + ts "firmware_class.path set to /run/firmware" + fi + else + echo "WARNING: NVIDIA firmware directory not found at $NVIDIA_FW_SRC" >&2 + echo " modprobe nvidia will likely fail with: RmFetchGspRmImages: No firmware image found" >&2 + fi + modprobe nvidia || { echo "FATAL: failed to load nvidia kernel module" >&2; exit 1; } modprobe nvidia_uvm || { echo "FATAL: failed to load nvidia_uvm kernel module" >&2; exit 1; } modprobe nvidia_modeset || { echo "FATAL: failed to load nvidia_modeset kernel module" >&2; exit 1; } ts "NVIDIA kernel modules loaded" + # Firmware is now in kernel memory; free the tmpfs copy. + rm -rf /run/firmware 2>/dev/null || true + if ! 
nvidia-smi > /dev/null 2>&1; then echo "FATAL: GPU_ENABLED=true but nvidia-smi failed — GPU not visible to guest" >&2 echo "Check: VFIO passthrough, IOMMU groups, guest kernel modules" >&2 diff --git a/crates/openshell-vm/src/backend/cloud_hypervisor.rs b/crates/openshell-vm/src/backend/cloud_hypervisor.rs deleted file mode 100644 index 13a73be03..000000000 --- a/crates/openshell-vm/src/backend/cloud_hypervisor.rs +++ /dev/null @@ -1,1182 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! cloud-hypervisor backend for GPU passthrough VMs. -//! -//! Uses the cloud-hypervisor REST API over a Unix socket to manage VMs -//! with VFIO device passthrough. This backend is Linux-only and requires -//! a separate kernel image (`vmlinux`) and `virtiofsd` for the root -//! filesystem. - -use std::io::{Read, Write}; -use std::os::unix::net::UnixStream; -use std::os::unix::process::CommandExt; -use std::path::{Path, PathBuf}; -use std::time::{Duration, Instant}; - -use super::{ - GUEST_MAC, TAP_GUEST_IP, TAP_HOST_IP, TAP_NETMASK, VmBackend, bridge_bidirectional, - build_kernel_cmdline, setup_tap_host_networking, shell_escape, start_tcp_port_forwarder, - teardown_tap_host_networking, wait_for_socket, -}; -use crate::exec::{ - VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, -}; -use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; - -/// cloud-hypervisor hypervisor backend for GPU passthrough. -pub struct CloudHypervisorBackend { - /// Path to the cloud-hypervisor binary. - chv_binary: PathBuf, - /// Path to the vmlinux kernel image. - vmlinux: PathBuf, - /// Path to the virtiofsd binary. - virtiofsd: PathBuf, -} - -impl CloudHypervisorBackend { - /// Create a new cloud-hypervisor backend, validating required binaries. 
- pub fn new() -> Result { - let runtime_dir = crate::configured_runtime_dir()?; - - let chv_binary = runtime_dir.join("cloud-hypervisor"); - if !chv_binary.is_file() { - return Err(VmError::BinaryNotFound { - path: chv_binary.display().to_string(), - hint: "GPU passthrough requires cloud-hypervisor. Run the GPU build pipeline or set OPENSHELL_VM_RUNTIME_DIR".to_string(), - }); - } - - let vmlinux = runtime_dir.join("vmlinux"); - if !vmlinux.is_file() { - return Err(VmError::BinaryNotFound { - path: vmlinux.display().to_string(), - hint: "GPU passthrough requires a vmlinux kernel. Run the GPU build pipeline" - .to_string(), - }); - } - - let virtiofsd = runtime_dir.join("virtiofsd"); - if !virtiofsd.is_file() { - return Err(VmError::BinaryNotFound { - path: virtiofsd.display().to_string(), - hint: "GPU passthrough requires virtiofsd. Run the GPU build pipeline".to_string(), - }); - } - - Ok(Self { - chv_binary, - vmlinux, - virtiofsd, - }) - } -} - -impl VmBackend for CloudHypervisorBackend { - fn launch(&self, config: &VmConfig) -> Result { - launch_cloud_hypervisor(self, config) - } -} - -// ── REST API client ───────────────────────────────────────────────────── - -/// Send a raw HTTP/1.1 request over a Unix socket and return the response body. -/// -/// Parses the response headers to determine Content-Length so we read exactly -/// the right number of bytes without relying on EOF or Connection: close. 
-fn http_request_unix( - socket_path: &Path, - method: &str, - path: &str, - body: Option<&str>, -) -> Result<(u16, String), String> { - use std::io::BufRead; - - let stream = UnixStream::connect(socket_path) - .map_err(|e| format!("connect to cloud-hypervisor API: {e}"))?; - - stream - .set_read_timeout(Some(Duration::from_secs(30))) - .map_err(|e| format!("set read timeout: {e}"))?; - - let request = if let Some(body) = body { - format!( - "{method} {path} HTTP/1.1\r\n\ - Host: localhost\r\n\ - Content-Type: application/json\r\n\ - Content-Length: {}\r\n\ - \r\n\ - {body}", - body.len(), - ) - } else { - format!( - "{method} {path} HTTP/1.1\r\n\ - Host: localhost\r\n\ - \r\n" - ) - }; - - { - let mut writer = &stream; - writer - .write_all(request.as_bytes()) - .map_err(|e| format!("write to cloud-hypervisor API: {e}"))?; - } - - let mut reader = std::io::BufReader::new(&stream); - - // Read status line - let mut status_line = String::new(); - reader - .read_line(&mut status_line) - .map_err(|e| format!("read status line: {e}"))?; - - let status_code = status_line - .split_whitespace() - .nth(1) - .and_then(|code| code.parse::().ok()) - .unwrap_or(0); - - // Read headers to find Content-Length - let mut content_length: usize = 0; - loop { - let mut header_line = String::new(); - reader - .read_line(&mut header_line) - .map_err(|e| format!("read header: {e}"))?; - if header_line.trim().is_empty() { - break; - } - if let Some(val) = header_line - .strip_prefix("Content-Length:") - .or_else(|| header_line.strip_prefix("content-length:")) - { - if let Ok(len) = val.trim().parse::() { - content_length = len; - } - } - } - - // Read body based on Content-Length - let mut body_bytes = vec![0u8; content_length]; - if content_length > 0 { - reader - .read_exact(&mut body_bytes) - .map_err(|e| format!("read body ({content_length} bytes): {e}"))?; - } - - let body_str = String::from_utf8_lossy(&body_bytes).to_string(); - Ok((status_code, body_str)) -} - -/// Create the VM 
via the cloud-hypervisor REST API. -fn api_vm_create(socket_path: &Path, payload: &str) -> Result<(), VmError> { - let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.create", Some(payload)) - .map_err(|e| VmError::HostSetup(format!("vm.create: {e}")))?; - - if status >= 200 && status < 300 { - Ok(()) - } else { - Err(VmError::HostSetup(format!( - "vm.create returned HTTP {status}: {body}" - ))) - } -} - -/// Boot the VM. -fn api_vm_boot(socket_path: &Path) -> Result<(), VmError> { - let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.boot", None) - .map_err(|e| VmError::HostSetup(format!("vm.boot: {e}")))?; - - if status >= 200 && status < 300 { - Ok(()) - } else { - Err(VmError::HostSetup(format!( - "vm.boot returned HTTP {status}: {body}" - ))) - } -} - -/// Request a graceful shutdown. -fn api_vm_shutdown(socket_path: &Path) -> Result<(), VmError> { - let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.shutdown", None) - .map_err(|e| VmError::HostSetup(format!("vm.shutdown: {e}")))?; - - if status >= 200 && status < 300 { - Ok(()) - } else { - Err(VmError::HostSetup(format!( - "vm.shutdown returned HTTP {status}: {body}" - ))) - } -} - -/// Query VM info/status. -#[allow(dead_code)] -fn api_vm_info(socket_path: &Path) -> Result { - let (status, body) = http_request_unix(socket_path, "GET", "/api/v1/vm.info", None) - .map_err(|e| VmError::HostSetup(format!("vm.info: {e}")))?; - - if status >= 200 && status < 300 { - Ok(body) - } else { - Err(VmError::HostSetup(format!( - "vm.info returned HTTP {status}: {body}" - ))) - } -} - -/// Delete the VM. 
-#[allow(dead_code)] -fn api_vm_delete(socket_path: &Path) -> Result<(), VmError> { - let (status, body) = http_request_unix(socket_path, "PUT", "/api/v1/vm.delete", None) - .map_err(|e| VmError::HostSetup(format!("vm.delete: {e}")))?; - - if status >= 200 && status < 300 { - Ok(()) - } else { - Err(VmError::HostSetup(format!( - "vm.delete returned HTTP {status}: {body}" - ))) - } -} - -// ── Build the VM create payload ───────────────────────────────────────── - -fn build_vm_create_payload( - backend: &CloudHypervisorBackend, - config: &VmConfig, - effective_exec_path: &str, - vfio_device: Option<&str>, - virtiofsd_sock: &Path, - state_disk_path: Option<&Path>, - use_tap_net: bool, - vsock_sock: &Path, - console_log: &Path, -) -> Result { - let mem_bytes = u64::from(config.mem_mib) * 1024 * 1024; - - let cmdline = build_kernel_cmdline(config, effective_exec_path, use_tap_net); - - let mut payload = serde_json::json!({ - "cpus": { - "boot_vcpus": config.vcpus, - "max_vcpus": config.vcpus, - }, - "memory": { - "size": mem_bytes, - "shared": true, - }, - "payload": { - "kernel": backend.vmlinux.display().to_string(), - "cmdline": cmdline, - }, - "fs": [{ - "tag": "rootfs", - "socket": virtiofsd_sock.display().to_string(), - "num_queues": 1, - "queue_size": 1024, - }], - "vsock": { - "cid": VSOCK_GUEST_CID, - "socket": vsock_sock.display().to_string(), - }, - "serial": { - "mode": "File", - "file": console_log.display().to_string(), - }, - "console": { - "mode": "Off", - }, - }); - - if let Some(disk_path) = state_disk_path { - payload["disks"] = serde_json::json!([{ - "path": disk_path.display().to_string(), - "readonly": false, - }]); - } - - // Cloud-hypervisor uses TAP devices for networking (requires root or - // CAP_NET_ADMIN). The gvproxy QEMU-style socket protocol is not - // compatible with CHV's NetConfig. GPU passthrough already requires - // elevated privileges, so TAP access is expected. 
- if use_tap_net { - payload["net"] = serde_json::json!([{ - "mac": GUEST_MAC, - "ip": TAP_HOST_IP, - "mask": TAP_NETMASK, - }]); - } - - if let Some(vfio_path) = vfio_device { - payload["devices"] = serde_json::json!([{ - "path": format!("/sys/bus/pci/devices/{vfio_path}/"), - }]); - } - - serde_json::to_string(&payload) - .map_err(|e| VmError::HostSetup(format!("serialize vm.create payload: {e}"))) -} - -// ── Launch ────────────────────────────────────────────────────────────── - -#[allow(clippy::similar_names)] -fn launch_cloud_hypervisor( - backend: &CloudHypervisorBackend, - config: &VmConfig, -) -> Result { - let launch_start = Instant::now(); - - let run_dir = config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .to_path_buf(); - let rootfs_key = vm_rootfs_key(&config.rootfs); - - // Unix domain sockets are limited to 108 characters (SUN_LEN). - // Instance rootfs paths can be deeply nested, so place sockets - // under /tmp to stay within the limit. - let sock_dir = PathBuf::from(format!("/tmp/ovm-chv-{}", std::process::id())); - std::fs::create_dir_all(&sock_dir).map_err(|e| { - VmError::HostSetup(format!("create socket dir {}: {e}", sock_dir.display())) - })?; - - let api_sock_path = sock_dir.join("api.sock"); - let vsock_sock_path = sock_dir.join("vsock.sock"); - let virtiofsd_sock_path = sock_dir.join("virtiofsd.sock"); - let console_log = config - .console_output - .clone() - .unwrap_or_else(|| run_dir.join(format!("{rootfs_key}-console.log"))); - - // Clean stale sockets - let _ = std::fs::remove_file(&api_sock_path); - let _ = std::fs::remove_file(&vsock_sock_path); - let _ = std::fs::remove_file(&virtiofsd_sock_path); - - // Start virtiofsd for the rootfs - eprintln!("Starting virtiofsd: {}", backend.virtiofsd.display()); - let virtiofsd_log = run_dir.join(format!("{rootfs_key}-virtiofsd.log")); - let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) - .map_err(|e| VmError::Fork(format!("create virtiofsd log: {e}")))?; - - let mut 
virtiofsd_cmd = std::process::Command::new(&backend.virtiofsd); - virtiofsd_cmd - .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) - .arg(format!("--shared-dir={}", config.rootfs.display())) - .arg("--cache=always") - .stdout(std::process::Stdio::null()) - .stderr(virtiofsd_log_file); - #[allow(unsafe_code)] - unsafe { - virtiofsd_cmd.pre_exec(|| { - libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); - Ok(()) - }); - } - let mut virtiofsd_child = virtiofsd_cmd.spawn() - .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; - - eprintln!( - "virtiofsd started (pid {}) [{:.1}s]", - virtiofsd_child.id(), - launch_start.elapsed().as_secs_f64() - ); - - // Wait for virtiofsd socket - wait_for_socket(&virtiofsd_sock_path, "virtiofsd", Duration::from_secs(5))?; - - // CHV uses TAP networking (requires root/CAP_NET_ADMIN). The gvproxy - // QEMU-style socket protocol is not compatible with cloud-hypervisor's - // NetConfig. GPU passthrough already requires elevated privileges. - let use_tap_net = !matches!(config.net, NetBackend::None); - - // For --exec mode: wrap the command so the VM powers off after it exits. - // Unlike libkrun (which exits when init terminates), cloud-hypervisor - // keeps running after PID 1 exits (kernel panics). A wrapper init script - // runs the command then calls `poweroff -f` for a clean ACPI shutdown. 
- let is_exec_mode = config.is_exec_mode(); - let wrapper_path = config.rootfs.join("tmp/chv-exec-wrapper.sh"); - let effective_exec_path; - if is_exec_mode { - let args_str = config - .args - .iter() - .map(|a| shell_escape(a)) - .collect::>() - .join(" "); - - let env_str = config - .env - .iter() - .map(|v| format!("export {}", shell_escape(v))) - .collect::>() - .join("\n"); - - let wrapper = format!( - "#!/bin/sh\n\ - mount -t proc proc /proc 2>/dev/null\n\ - mount -t sysfs sysfs /sys 2>/dev/null\n\ - mount -t devtmpfs devtmpfs /dev 2>/dev/null\n\ - {env_str}\n\ - cd {workdir}\n\ - {exec} {args}\n\ - RC=$?\n\ - # Trigger ACPI power-off so cloud-hypervisor exits cleanly.\n\ - # The rootfs may not have a `poweroff` binary, so try multiple methods.\n\ - if command -v poweroff >/dev/null 2>&1; then\n\ - poweroff -f\n\ - elif [ -x /usr/bin/busybox ]; then\n\ - /usr/bin/busybox poweroff -f\n\ - else\n\ - echo o > /proc/sysrq-trigger\n\ - fi\n\ - exit $RC\n", - env_str = env_str, - workdir = shell_escape(&config.workdir), - exec = shell_escape(&config.exec_path), - args = args_str, - ); - - if let Some(parent) = wrapper_path.parent() { - std::fs::create_dir_all(parent) - .map_err(|e| VmError::HostSetup(format!("create wrapper dir: {e}")))?; - } - std::fs::write(&wrapper_path, &wrapper) - .map_err(|e| VmError::HostSetup(format!("write exec wrapper: {e}")))?; - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let _ = std::fs::set_permissions(&wrapper_path, std::fs::Permissions::from_mode(0o755)); - } - effective_exec_path = "/tmp/chv-exec-wrapper.sh".to_string(); - } else { - effective_exec_path = config.exec_path.clone(); - } - - // Start cloud-hypervisor process - eprintln!( - "Starting cloud-hypervisor: {}", - backend.chv_binary.display() - ); - - let chv_log = run_dir.join(format!("{rootfs_key}-cloud-hypervisor.log")); - let chv_log_file = std::fs::File::create(&chv_log) - .map_err(|e| VmError::Fork(format!("create cloud-hypervisor log: {e}")))?; - - let 
mut chv_cmd = std::process::Command::new(&backend.chv_binary); - chv_cmd - .arg("--api-socket") - .arg(&api_sock_path) - .stdout(std::process::Stdio::null()) - .stderr(chv_log_file); - #[allow(unsafe_code)] - unsafe { - chv_cmd.pre_exec(|| { - libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); - Ok(()) - }); - } - let mut chv_child = chv_cmd.spawn() - .map_err(|e| VmError::Fork(format!("start cloud-hypervisor: {e}")))?; - - let chv_pid = chv_child.id() as i32; - eprintln!( - "cloud-hypervisor started (pid {chv_pid}) [{:.1}s]", - launch_start.elapsed().as_secs_f64() - ); - - // Wait for API socket - wait_for_socket(&api_sock_path, "cloud-hypervisor", Duration::from_secs(10))?; - - // Build and send VM create payload - let state_disk_path = config.state_disk.as_ref().map(|sd| sd.path.as_path()); - let payload = build_vm_create_payload( - backend, - config, - &effective_exec_path, - config.vfio_device.as_deref(), - &virtiofsd_sock_path, - state_disk_path, - use_tap_net, - &vsock_sock_path, - &console_log, - )?; - - api_vm_create(&api_sock_path, &payload)?; - eprintln!("VM created [{:.1}s]", launch_start.elapsed().as_secs_f64()); - - api_vm_boot(&api_sock_path)?; - let boot_start = Instant::now(); - eprintln!("VM booting [{:.1}s]", launch_start.elapsed().as_secs_f64()); - - // Set up host-side networking for TAP (NAT, IP forwarding, masquerade) - // so the guest can reach the internet through the host. 
- let mut original_ip_forward: Option = None; - if use_tap_net { - match setup_tap_host_networking() { - Ok(orig) => original_ip_forward = Some(orig), - Err(e) => { - eprintln!("WARNING: host networking setup failed: {e}"); - eprintln!(" The VM may not have internet access."); - } - } - } - - // Write runtime state (vsock_bridge: true — CHV uses Unix socket vsock - // bridging with a text protocol, not kernel AF_VSOCK) - if !config.is_exec_mode() { - if let Err(err) = write_vm_runtime_state(&config.rootfs, chv_pid, &console_log, None, true) - { - let _ = api_vm_shutdown(&api_sock_path); - let _ = chv_child.kill(); - let _ = chv_child.wait(); - let _ = virtiofsd_child.kill(); - let _ = virtiofsd_child.wait(); - if let Some(ref orig) = original_ip_forward { - teardown_tap_host_networking(orig); - } - clear_vm_runtime_state(&config.rootfs); - return Err(err); - } - } - - let exec_socket = vm_exec_socket_path(&config.rootfs); - // CHV TAP networking doesn't provide built-in port forwarding like - // gvproxy. Start a TCP proxy for each port mapping so the host can - // reach guest services (e.g., the gateway health check on :30051). 
- if use_tap_net { - for pm in &config.port_map { - let parts: Vec<&str> = pm.split(':').collect(); - if parts.len() == 2 { - if let (Ok(hp), Ok(gp)) = (parts[0].parse::(), parts[1].parse::()) { - if let Err(e) = start_tcp_port_forwarder(hp, TAP_GUEST_IP, gp) { - let _ = chv_child.kill(); - let _ = chv_child.wait(); - let _ = virtiofsd_child.kill(); - let _ = virtiofsd_child.wait(); - if let Some(ref orig) = original_ip_forward { - teardown_tap_host_networking(orig); - } - clear_vm_runtime_state(&config.rootfs); - let _ = std::fs::remove_dir_all(&sock_dir); - let _ = std::fs::remove_file(&exec_socket); - return Err(e); - } - } - } - } - } - - for pm in &config.port_map { - let host_port = pm.split(':').next().unwrap_or(pm); - eprintln!(" port {pm} -> http://localhost:{host_port}"); - } - eprintln!("Console output: {}", console_log.display()); - - // Start vsock exec bridge (exec Unix socket → CHV vsock Unix socket). - // The bridge allows `openshell-vm exec` and bootstrap to communicate - // with the guest exec agent over the standard exec socket path. - start_vsock_exec_bridge(&exec_socket, &vsock_sock_path, VM_EXEC_VSOCK_PORT)?; - - // Gateway bootstrap and health check (mirrors libkrun backend). 
- if !config.is_exec_mode() && !config.port_map.is_empty() { - let gateway_port = crate::gateway_host_port(config); - if let Err(e) = crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port) - .and_then(|_| crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)) - { - let _ = chv_child.kill(); - let _ = chv_child.wait(); - let _ = virtiofsd_child.kill(); - let _ = virtiofsd_child.wait(); - if let Some(ref orig) = original_ip_forward { - teardown_tap_host_networking(orig); - } - clear_vm_runtime_state(&config.rootfs); - let _ = std::fs::remove_dir_all(&sock_dir); - let _ = std::fs::remove_file(&exec_socket); - return Err(e); - } - } - - eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); - eprintln!("Press Ctrl+C to stop."); - - // Signal forwarding: SIGINT/SIGTERM -> graceful shutdown - crate::CHILD_PID.store(chv_pid, std::sync::atomic::Ordering::Relaxed); - unsafe { - libc::signal( - libc::SIGINT, - crate::forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - crate::forward_signal as *const () as libc::sighandler_t, - ); - } - - // Wait for cloud-hypervisor to exit - let status = chv_child - .wait() - .map_err(|e| VmError::HostSetup(format!("wait for cloud-hypervisor: {e}")))?; - crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); - - // Clean up host networking rules - if let Some(ref orig) = original_ip_forward { - teardown_tap_host_networking(orig); - } - - // Cleanup - if !config.is_exec_mode() { - clear_vm_runtime_state(&config.rootfs); - } - let _ = virtiofsd_child.kill(); - let _ = virtiofsd_child.wait(); - eprintln!("virtiofsd stopped"); - - // Clean up sockets and wrapper - let _ = std::fs::remove_dir_all(&sock_dir); - let _ = std::fs::remove_file(&exec_socket); - if is_exec_mode { - let _ = std::fs::remove_file(&wrapper_path); - } - - let code = status.code().unwrap_or(1); - eprintln!("VM exited with code {code}"); - Ok(code) -} - -// ── Vsock exec 
bridge ─────────────────────────────────────────────────── - -/// Guest CID assigned in the cloud-hypervisor vsock config. -const VSOCK_GUEST_CID: u32 = 3; - -/// Start a background bridge: exec Unix socket → CHV vsock Unix socket. -/// -/// cloud-hypervisor exposes guest vsock via a host-side Unix socket with a -/// text protocol: connect to the socket, send `CONNECT \n`, read -/// back `OK \n`, then the stream is a raw bidirectional channel to -/// the guest vsock port. This is different from kernel `AF_VSOCK` (which -/// `vhost-vsock` uses) — CHV manages its own transport. -/// -/// This bridge creates a Unix socket at `exec_socket` and, for each -/// incoming connection, opens a connection to the CHV vsock socket, -/// performs the CONNECT handshake, and forwards data bidirectionally. -fn start_vsock_exec_bridge( - exec_socket: &Path, - chv_vsock_socket: &Path, - guest_port: u32, -) -> Result<(), VmError> { - use std::os::unix::net::UnixListener; - - if let Some(parent) = exec_socket.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::HostSetup(format!("create exec bridge dir {}: {e}", parent.display())) - })?; - } - let _ = std::fs::remove_file(exec_socket); - - let listener = UnixListener::bind(exec_socket).map_err(|e| { - VmError::HostSetup(format!( - "bind vsock exec bridge {}: {e}", - exec_socket.display() - )) - })?; - - let chv_vsock = chv_vsock_socket.to_path_buf(); - eprintln!( - "vsock exec bridge: {} → {} port {}", - exec_socket.display(), - chv_vsock.display(), - guest_port, - ); - - std::thread::spawn(move || { - vsock_bridge_accept_loop(listener, &chv_vsock, guest_port); - }); - - Ok(()) -} - -/// Accept loop for the vsock bridge background thread. -/// -/// "CONNECT rejected" (empty response) is normal during boot — the guest -/// exec agent isn't listening yet. We keep retrying those indefinitely -/// since the bootstrap caller has its own 120s timeout. Only fatal errors -/// (socket gone = VM died) cause the bridge to give up. 
-fn vsock_bridge_accept_loop( - listener: std::os::unix::net::UnixListener, - chv_vsock_socket: &Path, - port: u32, -) { - let mut fatal_failures: u32 = 0; - let mut logged_transient = false; - - for stream in listener.incoming() { - let client = match stream { - Ok(s) => s, - Err(e) => { - eprintln!("vsock bridge: accept: {e}"); - continue; - } - }; - - match chv_vsock_connect(chv_vsock_socket, port) { - Ok(guest) => { - fatal_failures = 0; - bridge_bidirectional(client, guest); - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - fatal_failures += 1; - if fatal_failures <= 2 { - eprintln!("vsock bridge: CHV socket gone (VM exited?): {e}"); - } - if fatal_failures >= 3 { - eprintln!("vsock bridge: CHV socket not found, stopping bridge"); - return; - } - } - Err(e) => { - if !logged_transient { - eprintln!( - "vsock bridge: guest not ready on port {port} ({e}), \ - will keep retrying..." - ); - logged_transient = true; - } - } - } - } -} - -/// Connect to a guest vsock port via cloud-hypervisor's Unix socket protocol. -/// -/// CHV exposes guest vsock through a host Unix socket. The protocol is: -/// 1. Connect to the CHV vsock Unix socket -/// 2. Send: `CONNECT \n` -/// 3. Read: `OK \n` on success -/// 4. 
The stream is now a raw bidirectional channel to the guest port -fn chv_vsock_connect(chv_vsock_socket: &Path, port: u32) -> std::io::Result { - let mut stream = UnixStream::connect(chv_vsock_socket)?; - stream.set_read_timeout(Some(Duration::from_secs(5)))?; - stream.set_write_timeout(Some(Duration::from_secs(5)))?; - - let connect_msg = format!("CONNECT {port}\n"); - stream.write_all(connect_msg.as_bytes())?; - - let mut buf = [0u8; 64]; - let n = stream.read(&mut buf)?; - let response = std::str::from_utf8(&buf[..n]).unwrap_or(""); - - if !response.starts_with("OK") { - return Err(std::io::Error::new( - std::io::ErrorKind::ConnectionRefused, - format!("CHV vsock CONNECT rejected: {}", response.trim()), - )); - } - - stream.set_read_timeout(None)?; - stream.set_write_timeout(None)?; - Ok(stream) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn http_request_format_with_body() { - let payload = r#"{"cpus":{"boot_vcpus":4}}"#; - let request = format!( - "PUT /api/v1/vm.create HTTP/1.1\r\n\ - Host: localhost\r\n\ - Content-Type: application/json\r\n\ - Content-Length: {}\r\n\ - Connection: close\r\n\ - \r\n\ - {payload}", - payload.len(), - ); - assert!(request.contains("Content-Length: 25")); - assert!(request.contains("boot_vcpus")); - } - - #[test] - fn http_request_format_without_body() { - let request = format!( - "GET /api/v1/vm.info HTTP/1.1\r\n\ - Host: localhost\r\n\ - Connection: close\r\n\ - \r\n" - ); - assert!(request.contains("GET /api/v1/vm.info")); - assert!(!request.contains("Content-Length")); - } - - #[test] - fn build_payload_includes_vfio_device() { - use crate::{NetBackend, VmConfig}; - - let config = VmConfig { - rootfs: "/tmp/rootfs".into(), - vcpus: 4, - mem_mib: 8192, - exec_path: "/srv/openshell-vm-init.sh".into(), - args: vec![], - env: vec![], - workdir: "/".into(), - port_map: vec![], - vsock_ports: vec![], - log_level: 1, - console_output: None, - net: NetBackend::None, - reset: false, - gateway_name: "test".into(), - 
state_disk: None, - gpu_enabled: true, - gpu_has_msix: true, - vfio_device: Some("0000:41:00.0".into()), - backend: crate::VmBackendChoice::CloudHypervisor, - }; - - let backend = CloudHypervisorBackend { - chv_binary: "/usr/bin/cloud-hypervisor".into(), - vmlinux: "/boot/vmlinux".into(), - virtiofsd: "/usr/bin/virtiofsd".into(), - }; - - let payload = build_vm_create_payload( - &backend, - &config, - &config.exec_path, - config.vfio_device.as_deref(), - Path::new("/tmp/virtiofsd.sock"), - None, - false, - Path::new("/tmp/vsock.sock"), - Path::new("/tmp/console.log"), - ) - .unwrap(); - - assert!( - payload.contains("0000:41:00.0"), - "payload should contain VFIO device" - ); - assert!( - payload.contains("boot_vcpus"), - "payload should contain vcpus config" - ); - assert!( - payload.contains("GPU_ENABLED=true"), - "payload should contain GPU_ENABLED in cmdline" - ); - } - - #[test] - fn build_payload_without_vfio() { - use crate::{NetBackend, VmConfig}; - - let config = VmConfig { - rootfs: "/tmp/rootfs".into(), - vcpus: 2, - mem_mib: 4096, - exec_path: "/srv/openshell-vm-init.sh".into(), - args: vec![], - env: vec![], - workdir: "/".into(), - port_map: vec![], - vsock_ports: vec![], - log_level: 1, - console_output: None, - net: NetBackend::None, - reset: false, - gateway_name: "test".into(), - state_disk: None, - gpu_enabled: false, - gpu_has_msix: true, - vfio_device: None, - backend: crate::VmBackendChoice::Auto, - }; - - let backend = CloudHypervisorBackend { - chv_binary: "/usr/bin/cloud-hypervisor".into(), - vmlinux: "/boot/vmlinux".into(), - virtiofsd: "/usr/bin/virtiofsd".into(), - }; - - let payload = build_vm_create_payload( - &backend, - &config, - &config.exec_path, - None, - Path::new("/tmp/virtiofsd.sock"), - None, - false, - Path::new("/tmp/vsock.sock"), - Path::new("/tmp/console.log"), - ) - .unwrap(); - - assert!( - !payload.contains("devices"), - "payload without VFIO should not have devices key" - ); - assert!( - 
!payload.contains("GPU_ENABLED"), - "payload should not contain GPU_ENABLED" - ); - } - - #[test] - fn build_payload_with_tap_net_includes_ip_and_cmdline() { - use crate::{NetBackend, VmConfig}; - - let config = VmConfig { - rootfs: "/tmp/rootfs".into(), - vcpus: 4, - mem_mib: 8192, - exec_path: "/srv/openshell-vm-init.sh".into(), - args: vec![], - env: vec![], - workdir: "/".into(), - port_map: vec!["30051:30051".into()], - vsock_ports: vec![], - log_level: 1, - console_output: None, - net: NetBackend::Gvproxy { - binary: "/usr/bin/gvproxy".into(), - }, - reset: false, - gateway_name: "test".into(), - state_disk: None, - gpu_enabled: true, - gpu_has_msix: true, - vfio_device: Some("0000:41:00.0".into()), - backend: crate::VmBackendChoice::CloudHypervisor, - }; - - let backend = CloudHypervisorBackend { - chv_binary: "/usr/bin/cloud-hypervisor".into(), - vmlinux: "/boot/vmlinux".into(), - virtiofsd: "/usr/bin/virtiofsd".into(), - }; - - let payload = build_vm_create_payload( - &backend, - &config, - &config.exec_path, - config.vfio_device.as_deref(), - Path::new("/tmp/virtiofsd.sock"), - None, - true, // use_tap_net - Path::new("/tmp/vsock.sock"), - Path::new("/tmp/console.log"), - ) - .unwrap(); - - assert!( - payload.contains("192.168.249.1"), - "net should contain TAP host IP" - ); - assert!( - payload.contains("255.255.255.0"), - "net should contain TAP netmask" - ); - assert!( - payload.contains("VM_NET_IP=192.168.249.2"), - "cmdline should contain guest IP" - ); - assert!( - payload.contains("VM_NET_GW=192.168.249.1"), - "cmdline should contain gateway IP" - ); - assert!( - payload.contains("VM_NET_DNS="), - "cmdline should contain DNS server" - ); - } - - #[test] - fn build_payload_tap_net_false_omits_net_and_vm_net_vars() { - use crate::{NetBackend, VmConfig}; - - let config = VmConfig { - rootfs: "/tmp/rootfs".into(), - vcpus: 2, - mem_mib: 4096, - exec_path: "/srv/openshell-vm-init.sh".into(), - args: vec![], - env: vec![], - workdir: "/".into(), - 
port_map: vec![], - vsock_ports: vec![], - log_level: 1, - console_output: None, - net: NetBackend::None, - reset: false, - gateway_name: "test".into(), - state_disk: None, - gpu_enabled: false, - gpu_has_msix: true, - vfio_device: None, - backend: crate::VmBackendChoice::Auto, - }; - - let backend = CloudHypervisorBackend { - chv_binary: "/usr/bin/cloud-hypervisor".into(), - vmlinux: "/boot/vmlinux".into(), - virtiofsd: "/usr/bin/virtiofsd".into(), - }; - - let payload = build_vm_create_payload( - &backend, - &config, - &config.exec_path, - None, - Path::new("/tmp/virtiofsd.sock"), - None, - false, - Path::new("/tmp/vsock.sock"), - Path::new("/tmp/console.log"), - ) - .unwrap(); - - assert!( - !payload.contains("\"net\""), - "no-tap payload should not contain net section" - ); - assert!( - !payload.contains("VM_NET_IP"), - "no-tap payload should not contain VM_NET_IP" - ); - assert!( - !payload.contains("VM_NET_GW"), - "no-tap payload should not contain VM_NET_GW" - ); - assert!( - !payload.contains("VM_NET_DNS"), - "no-tap payload should not contain VM_NET_DNS" - ); - } - - #[test] - fn build_payload_tap_net_has_correct_mac_ip_mask() { - use crate::{NetBackend, VmConfig}; - - let config = VmConfig { - rootfs: "/tmp/rootfs".into(), - vcpus: 2, - mem_mib: 4096, - exec_path: "/srv/openshell-vm-init.sh".into(), - args: vec![], - env: vec![], - workdir: "/".into(), - port_map: vec![], - vsock_ports: vec![], - log_level: 1, - console_output: None, - net: NetBackend::Gvproxy { - binary: "/usr/bin/gvproxy".into(), - }, - reset: false, - gateway_name: "test".into(), - state_disk: None, - gpu_enabled: false, - gpu_has_msix: true, - vfio_device: None, - backend: crate::VmBackendChoice::CloudHypervisor, - }; - - let backend = CloudHypervisorBackend { - chv_binary: "/usr/bin/cloud-hypervisor".into(), - vmlinux: "/boot/vmlinux".into(), - virtiofsd: "/usr/bin/virtiofsd".into(), - }; - - let payload = build_vm_create_payload( - &backend, - &config, - &config.exec_path, - None, - 
Path::new("/tmp/virtiofsd.sock"), - None, - true, - Path::new("/tmp/vsock.sock"), - Path::new("/tmp/console.log"), - ) - .unwrap(); - - let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); - let net = &json["net"][0]; - assert_eq!(net["mac"], GUEST_MAC); - assert_eq!(net["ip"], "192.168.249.1"); - assert_eq!(net["mask"], "255.255.255.0"); - } - - #[test] - fn build_payload_vfio_and_tap_net_coexist() { - use crate::{NetBackend, VmConfig}; - - let config = VmConfig { - rootfs: "/tmp/rootfs".into(), - vcpus: 4, - mem_mib: 8192, - exec_path: "/srv/openshell-vm-init.sh".into(), - args: vec![], - env: vec![], - workdir: "/".into(), - port_map: vec![], - vsock_ports: vec![], - log_level: 1, - console_output: None, - net: NetBackend::Gvproxy { - binary: "/usr/bin/gvproxy".into(), - }, - reset: false, - gateway_name: "test".into(), - state_disk: None, - gpu_enabled: true, - gpu_has_msix: true, - vfio_device: Some("0000:41:00.0".into()), - backend: crate::VmBackendChoice::CloudHypervisor, - }; - - let backend = CloudHypervisorBackend { - chv_binary: "/usr/bin/cloud-hypervisor".into(), - vmlinux: "/boot/vmlinux".into(), - virtiofsd: "/usr/bin/virtiofsd".into(), - }; - - let payload = build_vm_create_payload( - &backend, - &config, - &config.exec_path, - config.vfio_device.as_deref(), - Path::new("/tmp/virtiofsd.sock"), - None, - true, - Path::new("/tmp/vsock.sock"), - Path::new("/tmp/console.log"), - ) - .unwrap(); - - let json: serde_json::Value = serde_json::from_str(&payload).unwrap(); - assert!( - json["devices"].is_array(), - "devices section should exist for VFIO" - ); - assert!(json["net"].is_array(), "net section should exist for TAP"); - assert!( - json["devices"][0]["path"] - .as_str() - .unwrap() - .contains("0000:41:00.0"), - "VFIO device path should be present" - ); - assert_eq!(json["net"][0]["ip"], "192.168.249.1"); - } - -} diff --git a/crates/openshell-vm/src/backend/libkrun.rs b/crates/openshell-vm/src/backend/libkrun.rs index 
0a27c4c71..3ab2d6631 100644 --- a/crates/openshell-vm/src/backend/libkrun.rs +++ b/crates/openshell-vm/src/backend/libkrun.rs @@ -419,7 +419,7 @@ fn launch_libkrun(config: &VmConfig) -> Result { if !config.is_exec_mode() && !config.port_map.is_empty() { let gateway_port = gateway_host_port(config); bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; + health::wait_for_gateway_ready(gateway_port, &config.gateway_name, config.gpu_enabled)?; } eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); diff --git a/crates/openshell-vm/src/backend/mod.rs b/crates/openshell-vm/src/backend/mod.rs index c08d4e4b0..0fe4abab1 100644 --- a/crates/openshell-vm/src/backend/mod.rs +++ b/crates/openshell-vm/src/backend/mod.rs @@ -5,9 +5,8 @@ //! //! Defines the [`VmBackend`] trait that all hypervisor backends implement, //! and shared infrastructure (gvproxy startup, networking helpers) used by -//! the libkrun, cloud-hypervisor, and QEMU backends. +//! the libkrun and QEMU backends. -pub mod cloud_hypervisor; pub mod libkrun; pub mod qemu; @@ -20,7 +19,7 @@ use crate::{ kill_stale_gvproxy, kill_stale_gvproxy_by_port, pick_gvproxy_ssh_port, vm_rootfs_key, }; -/// Trait implemented by each hypervisor backend (libkrun, cloud-hypervisor, QEMU). +/// Trait implemented by each hypervisor backend (libkrun, QEMU). pub trait VmBackend { /// Launch a VM with the given configuration. /// @@ -37,9 +36,9 @@ pub(crate) struct GvproxySetup { /// Start gvproxy for the given configuration. /// -/// Shared between libkrun and cloud-hypervisor backends. Handles stale -/// process cleanup, socket setup, and process spawning with exponential -/// backoff waiting for the network socket. +/// Shared between libkrun and QEMU backends. Handles stale process +/// cleanup, socket setup, and process spawning with exponential backoff +/// waiting for the network socket. 
pub(crate) fn start_gvproxy( config: &VmConfig, launch_start: Instant, @@ -210,8 +209,8 @@ pub(crate) fn setup_gvproxy_port_forwarding( } // ── TAP networking constants ──────────────────────────────────────────── -// cloud-hypervisor defaults to 192.168.249.1/24 on the host side of the -// TAP device. The guest uses .2 with the host as its gateway. +// The QEMU backend uses 192.168.249.1/24 on the host side of the TAP +// device. The guest uses .2 with the host as its gateway. /// Fixed MAC for the guest TAP interface. Only one VM runs per host. pub(crate) const GUEST_MAC: &str = "5a:94:ef:e4:0c:ee"; @@ -219,7 +218,6 @@ pub(crate) const GUEST_MAC: &str = "5a:94:ef:e4:0c:ee"; pub(crate) const TAP_HOST_IP: &str = "192.168.249.1"; pub(crate) const TAP_GUEST_IP: &str = "192.168.249.2"; pub(crate) const TAP_SUBNET: &str = "192.168.249.0/24"; -pub(crate) const TAP_NETMASK: &str = "255.255.255.0"; /// Wait for a Unix socket to appear on the filesystem. pub(crate) fn wait_for_socket( @@ -273,10 +271,12 @@ pub(crate) fn shell_escape(s: &str) -> String { if s.is_empty() { return "''".to_string(); } - if s.bytes().all(|b| matches!(b, - b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' - | b'_' | b'-' | b'.' | b'/' | b':' | b'@' | b'=' - )) { + if s.bytes().all(|b| { + matches!(b, + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' + | b'_' | b'-' | b'.' | b'/' | b':' | b'@' | b'=' + ) + }) { return s.to_string(); } format!("'{}'", s.replace('\'', "'\\''")) @@ -333,6 +333,12 @@ pub(crate) fn build_kernel_cmdline( if config.gpu_enabled && config.vfio_device.is_some() { parts.push("GPU_ENABLED=true".to_string()); + // Tell the kernel firmware loader to search /lib/firmware explicitly. + // The init script stages firmware to tmpfs and overrides this via + // sysfs, but the cmdline provides an early fallback so + // request_firmware() can find GSP blobs on the virtiofs rootfs even + // before init runs the staging logic. 
+ parts.push("firmware_class.path=/lib/firmware".to_string()); } if let Some(state_disk) = &config.state_disk { parts.push(format!( @@ -375,9 +381,17 @@ pub(crate) fn setup_tap_host_networking() -> Result { let _ = run_cmd( "iptables", &[ - "-t", "nat", "-D", "POSTROUTING", - "-s", TAP_SUBNET, "!", "-d", TAP_SUBNET, - "-j", "MASQUERADE", + "-t", + "nat", + "-D", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", ], ); run_cmd( @@ -494,10 +508,7 @@ pub(crate) fn start_tcp_port_forwarder( } /// Copy data bidirectionally between two TCP streams until either side closes. -fn forward_tcp_bidirectional( - client: std::net::TcpStream, - remote: std::net::TcpStream, -) { +fn forward_tcp_bidirectional(client: std::net::TcpStream, remote: std::net::TcpStream) { let Ok(mut client_r) = client.try_clone() else { return; }; diff --git a/crates/openshell-vm/src/backend/qemu.rs b/crates/openshell-vm/src/backend/qemu.rs index f3fe1f40a..10a9d7149 100644 --- a/crates/openshell-vm/src/backend/qemu.rs +++ b/crates/openshell-vm/src/backend/qemu.rs @@ -1,14 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! QEMU backend for GPU passthrough VMs (devices without MSI-X support). +//! QEMU backend for GPU passthrough VMs. //! //! Uses QEMU's command-line interface with KVM acceleration and VFIO device //! passthrough. This backend is Linux-only and requires a separate kernel //! image (`vmlinux`) and `virtiofsd` for the root filesystem. //! -//! Unlike cloud-hypervisor, QEMU handles VFIO devices that lack MSI-X -//! capability by falling back to legacy interrupt emulation. +//! QEMU handles VFIO devices with or without MSI-X capability, falling +//! back to legacy interrupt emulation when MSI-X is unavailable. 
use std::os::unix::net::UnixStream; use std::os::unix::process::CommandExt; @@ -28,7 +28,7 @@ use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; const VSOCK_GUEST_CID: u32 = 3; const QEMU_BINARY_NAME: &str = "qemu-system-x86_64"; -/// QEMU hypervisor backend for GPU passthrough (non-MSI-X devices). +/// QEMU hypervisor backend for GPU passthrough. pub struct QemuBackend { qemu_binary: PathBuf, vmlinux: PathBuf, @@ -124,7 +124,6 @@ const TAP_DEVICE_NAME: &str = "vmtap0"; /// Create and configure the TAP device before QEMU starts. /// -/// Unlike cloud-hypervisor (which creates its own TAP via the `net` config), /// QEMU with `script=no` expects the TAP device to already exist. fn setup_tap_device() -> Result<(), VmError> { // Clean up stale TAP device from a previous crashed run. @@ -132,13 +131,18 @@ fn setup_tap_device() -> Result<(), VmError> { eprintln!("TAP device {TAP_DEVICE_NAME} already exists, removing stale device"); let _ = run_cmd("ip", &["link", "delete", TAP_DEVICE_NAME]); } - run_cmd("ip", &["tuntap", "add", "dev", TAP_DEVICE_NAME, "mode", "tap"])?; + run_cmd( + "ip", + &["tuntap", "add", "dev", TAP_DEVICE_NAME, "mode", "tap"], + )?; run_cmd( "ip", &[ - "addr", "add", + "addr", + "add", &format!("{TAP_HOST_IP}/24"), - "dev", TAP_DEVICE_NAME, + "dev", + TAP_DEVICE_NAME, ], )?; run_cmd("ip", &["link", "set", TAP_DEVICE_NAME, "up"])?; @@ -180,12 +184,8 @@ fn build_qemu_args( ]); // Kernel - args.extend([ - "-kernel".into(), - backend.vmlinux.display().to_string(), - ]); + args.extend(["-kernel".into(), backend.vmlinux.display().to_string()]); - // Kernel cmdline (shared builder with CHV) let cmdline = build_kernel_cmdline(config, effective_exec_path, use_tap_net); args.extend(["-append".into(), cmdline]); @@ -208,10 +208,7 @@ fn build_qemu_args( if let Some(disk_path) = state_disk_path { args.extend([ "-drive".into(), - format!( - "file={},format=raw,if=virtio", - disk_path.display() - ), + format!("file={},format=raw,if=virtio", 
disk_path.display()), ]); } @@ -327,12 +324,15 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result Ok(()) }); } - let mut virtiofsd_child = virtiofsd_cmd.spawn() + let mut virtiofsd_child = virtiofsd_cmd + .spawn() .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; + let virtiofsd_pid = virtiofsd_child.id() as i32; + crate::VIRTIOFSD_PID.store(virtiofsd_pid, std::sync::atomic::Ordering::Relaxed); + eprintln!( - "virtiofsd started (pid {}) [{:.1}s]", - virtiofsd_child.id(), + "virtiofsd started (pid {virtiofsd_pid}) [{:.1}s]", launch_start.elapsed().as_secs_f64() ); @@ -340,7 +340,7 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result let use_tap_net = !matches!(config.net, NetBackend::None); - // Build exec wrapper (same pattern as CHV) + // Build exec wrapper for --exec mode let is_exec_mode = config.is_exec_mode(); let wrapper_path = config.rootfs.join("tmp/qemu-exec-wrapper.sh"); let effective_exec_path; @@ -435,7 +435,8 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result Ok(()) }); } - let mut qemu_child = qemu_cmd.spawn() + let mut qemu_child = qemu_cmd + .spawn() .map_err(|e| VmError::Fork(format!("start QEMU: {e}")))?; let qemu_pid = qemu_child.id() as i32; @@ -444,6 +445,25 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result launch_start.elapsed().as_secs_f64() ); + // Install signal handlers immediately so SIGTERM during the long + // gateway bootstrap (30-120s) forwards to QEMU instead of killing + // the parent via the default handler (which skips Drop and leaves + // the GPU bound to vfio-pci). + // + // We use sigaction with SA_RESTART so that the wait() syscall in the + // main thread auto-restarts after the handler returns, rather than + // failing with EINTR. This prevents a second signal from killing the + // process before cleanup runs. 
+ crate::CHILD_PID.store(qemu_pid, std::sync::atomic::Ordering::Relaxed); + unsafe { + let mut sa: libc::sigaction = std::mem::zeroed(); + sa.sa_sigaction = crate::forward_signal as *const () as libc::sighandler_t; + sa.sa_flags = libc::SA_RESTART; + libc::sigemptyset(&raw mut sa.sa_mask); + libc::sigaction(libc::SIGTERM, &sa, std::ptr::null_mut()); + libc::sigaction(libc::SIGINT, &sa, std::ptr::null_mut()); + } + // Set up host-side TAP networking let mut original_ip_forward: Option = None; if use_tap_net { @@ -467,8 +487,7 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result // Write runtime state (vsock_bridge: true — uses AF_VSOCK bridging) if !config.is_exec_mode() { - if let Err(err) = - write_vm_runtime_state(&config.rootfs, qemu_pid, &console_log, None, true) + if let Err(err) = write_vm_runtime_state(&config.rootfs, qemu_pid, &console_log, None, true) { let _ = qemu_child.kill(); let _ = qemu_child.wait(); @@ -485,7 +504,7 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result } } - // TCP port forwarding (same pattern as CHV) + // TCP port forwarding for TAP networking if use_tap_net { for pm in &config.port_map { let parts: Vec<&str> = pm.split(':').collect(); @@ -522,7 +541,7 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result if !config.is_exec_mode() && !config.port_map.is_empty() { let gateway_port = crate::gateway_host_port(config); if let Err(e) = crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port) - .and_then(|_| crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name)) + .and_then(|_| crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name, config.gpu_enabled)) { let _ = qemu_child.kill(); let _ = qemu_child.wait(); @@ -541,47 +560,52 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result } } - eprintln!( - "Ready [{:.1}s total]", - launch_start.elapsed().as_secs_f64() - ); + eprintln!("Ready [{:.1}s total]", 
launch_start.elapsed().as_secs_f64()); eprintln!("Press Ctrl+C to stop."); - // Signal forwarding - crate::CHILD_PID.store(qemu_pid, std::sync::atomic::Ordering::Relaxed); - unsafe { - libc::signal( - libc::SIGINT, - crate::forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - crate::forward_signal as *const () as libc::sighandler_t, - ); - } - - // Wait for QEMU to exit + // Wait for QEMU to exit. SA_RESTART ensures the wait() syscall + // auto-restarts after our signal handler runs, so QEMU gets a + // chance to shut down gracefully before we proceed to cleanup. let status = qemu_child .wait() .map_err(|e| VmError::HostSetup(format!("wait for QEMU: {e}")))?; + + // Clear all signal-related atomics now that QEMU has exited. crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + crate::VIRTIOFSD_PID.store(0, std::sync::atomic::Ordering::Relaxed); - // Clean up host networking rules - if let Some(ref orig) = original_ip_forward { - teardown_tap_host_networking(orig); + let was_shutdown = crate::SHUTDOWN_REQUESTED.load(std::sync::atomic::Ordering::Relaxed); + if was_shutdown { + eprintln!("Shutdown signal received, running explicit cleanup..."); } + + // ── Explicit cleanup (does NOT rely on Drop) ────────────────── + // + // This runs whether QEMU exited normally or was signalled. The + // signal handler forwarded SIGTERM to the process group, but we + // still need to clean up host-side state. + + // 1. Kill virtiofsd + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + eprintln!("virtiofsd stopped"); + + // 2. Tear down TAP device if use_tap_net { teardown_tap_device(); } - // Cleanup + // 3. Tear down host networking (iptables) + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + + // 4. 
Clean up runtime state files if !config.is_exec_mode() { clear_vm_runtime_state(&config.rootfs); } - let _ = virtiofsd_child.kill(); - let _ = virtiofsd_child.wait(); - eprintln!("virtiofsd stopped"); + // 5. Clean up socket directories and temporary files let _ = std::fs::remove_dir_all(&sock_dir); let _ = std::fs::remove_file(&exec_socket); if is_exec_mode { @@ -598,9 +622,8 @@ fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result /// Start a background bridge: exec Unix socket → guest AF_VSOCK. /// /// QEMU uses kernel `vhost-vsock-pci` which exposes guest vsock via the -/// kernel's `AF_VSOCK` address family. This is different from -/// cloud-hypervisor's text protocol — here we connect directly to the -/// guest CID and port using raw `AF_VSOCK` sockets. +/// kernel's `AF_VSOCK` address family. We connect directly to the guest +/// CID and port using raw `AF_VSOCK` sockets. fn start_vsock_exec_bridge_af_vsock( exec_socket: &Path, guest_cid: u32, @@ -692,7 +715,7 @@ fn is_transient_vsock_error(e: &std::io::Error) -> bool { code == libc::ENODEV // vsock transport not ready || code == libc::EHOSTUNREACH // guest CID not reachable yet || code == libc::ECONNRESET // connection reset during startup - || code == libc::ETIMEDOUT // connect timed out + || code == libc::ETIMEDOUT // connect timed out } None => false, } @@ -963,6 +986,10 @@ mod tests { cmdline.contains("GPU_ENABLED=true"), "cmdline should contain GPU_ENABLED=true: {cmdline}" ); + assert!( + cmdline.contains("firmware_class.path=/lib/firmware"), + "cmdline should contain firmware_class.path for GPU: {cmdline}" + ); } #[test] diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index ea1fac718..e7fe27e12 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -51,17 +51,15 @@ pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; /// How to connect to the VM exec agent. 
/// /// libkrun bridges each guest vsock port to a host Unix socket via -/// `krun_add_vsock_port2`. cloud-hypervisor exposes guest vsock through -/// a host-side Unix socket with a text protocol (`CONNECT \n` / -/// `OK \n`), not kernel `AF_VSOCK` or standard `vhost-vsock`. +/// `krun_add_vsock_port2`. QEMU uses kernel AF_VSOCK via vhost-vsock-pci, +/// bridged through a host Unix socket by the exec bridge thread. #[derive(Debug, Clone)] pub enum VsockConnectMode { /// Connect via a host Unix socket (libkrun per-port bridging). UnixSocket(PathBuf), - /// Connect via a vsock proxy bridge (cloud-hypervisor). - /// The path points to a bridged Unix socket that performs the CHV - /// text-protocol handshake and forwards to guest CID 3, - /// port [`VM_EXEC_VSOCK_PORT`]. + /// Connect via a vsock proxy bridge (QEMU AF_VSOCK). + /// The path points to a bridged Unix socket that connects to + /// guest CID 3, port [`VM_EXEC_VSOCK_PORT`]. VsockBridge(PathBuf), } @@ -89,7 +87,7 @@ pub struct VmRuntimeState { /// PID of the gvproxy process (if networking uses gvproxy). #[serde(default, skip_serializing_if = "Option::is_none")] pub gvproxy_pid: Option, - /// Whether this VM uses vsock-bridge mode (cloud-hypervisor) vs + /// Whether this VM uses vsock-bridge mode (QEMU AF_VSOCK) vs /// Unix socket mode (libkrun). Defaults to false for backward compat. 
#[serde(default, skip_serializing_if = "std::ops::Not::not")] pub vsock_bridge: bool, @@ -484,9 +482,7 @@ fn cleanup_stale_state_on_lock_acquire(rootfs: &Path, lock_path: &Path) { return; } - eprintln!( - "Warning: cleaning up stale lock from dead process (pid {prev_pid})" - ); + eprintln!("Warning: cleaning up stale lock from dead process (pid {prev_pid})"); let state_path = vm_state_path(rootfs); if let Ok(bytes) = fs::read(&state_path) { diff --git a/crates/openshell-vm/src/health.rs b/crates/openshell-vm/src/health.rs index 096a35d1f..ce9a10169 100644 --- a/crates/openshell-vm/src/health.rs +++ b/crates/openshell-vm/src/health.rs @@ -76,20 +76,60 @@ async fn grpc_health_check(gateway_port: u16, gateway_name: &str) -> Result<(), } } +/// Default health check timeout for standard (non-GPU) VMs. +const DEFAULT_HEALTH_TIMEOUT_SECS: u64 = 90; + +/// Extended health check timeout for GPU-enabled VMs. +/// +/// Cold boot with GPU passthrough involves pulling container images (no layer +/// cache on a fresh state disk) and loading NVIDIA drivers/firmware, which +/// legitimately takes longer than a standard VM boot. +const GPU_HEALTH_TIMEOUT_SECS: u64 = 240; + +/// Initial poll interval between health check attempts. +const INITIAL_POLL_INTERVAL_SECS: u64 = 2; + +/// Maximum poll interval (exponential backoff cap). +const MAX_POLL_INTERVAL_SECS: u64 = 10; + +/// How often to emit a progress log line during the health check wait. +const PROGRESS_LOG_INTERVAL_SECS: u64 = 15; + /// Wait for the gateway service to be fully ready by polling the gRPC health endpoint. /// /// This replaces the TCP-only probe with a proper gRPC health check that verifies /// the service is actually responding to requests, not just accepting connections. /// +/// When `gpu_enabled` is true, the timeout is extended to accommodate cold-boot +/// scenarios where container image pulls and NVIDIA driver/firmware loading push +/// total startup well past the standard 90-second window. 
+/// +/// Uses exponential backoff between retry attempts (2s initial, 10s cap) to +/// avoid hammering the endpoint while still detecting readiness promptly. +/// /// Returns `Ok(())` when the gateway is confirmed healthy, or `Err` if the health /// check fails or times out. Falls back to TCP probe if mTLS materials aren't /// available yet. -pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<(), VmError> { +pub fn wait_for_gateway_ready( + gateway_port: u16, + gateway_name: &str, + gpu_enabled: bool, +) -> Result<(), VmError> { let start = std::time::Instant::now(); - let timeout = Duration::from_secs(90); - let poll_interval = Duration::from_secs(1); + let timeout_secs = if gpu_enabled { + GPU_HEALTH_TIMEOUT_SECS + } else { + DEFAULT_HEALTH_TIMEOUT_SECS + }; + let timeout = Duration::from_secs(timeout_secs); + let mut poll_interval = Duration::from_secs(INITIAL_POLL_INTERVAL_SECS); + let max_poll_interval = Duration::from_secs(MAX_POLL_INTERVAL_SECS); + let progress_interval = Duration::from_secs(PROGRESS_LOG_INTERVAL_SECS); - eprintln!("Waiting for gateway gRPC health check..."); + eprintln!( + "Waiting for gateway gRPC health check (timeout {timeout_secs}s{})...", + if gpu_enabled { ", GPU mode" } else { "" } + ); // Create a runtime for async health checks let rt = match tokio::runtime::Builder::new_current_thread() @@ -103,7 +143,16 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( } }; + let mut attempt: u32 = 0; + let mut last_progress_log = start; + // The initial value is never read (overwritten on each loop iteration before + // the progress log), but we need a valid String to satisfy the borrow checker. 
+ #[allow(unused_assignments)] + let mut last_error = String::new(); + loop { + attempt += 1; + // Try gRPC health check let result = rt.block_on(async { tokio::time::timeout( @@ -119,26 +168,40 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( return Ok(()); } Ok(Err(e)) => { + last_error = e.clone(); // gRPC call completed but failed if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway health check failed after {:.0}s: {e}", + "gateway health check failed after {:.0}s (attempt {attempt}): {e}", timeout.as_secs_f64() ))); } } Err(_) => { + last_error = "health probe timed out".to_string(); // Timeout on the health check itself if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway health check timed out after {:.0}s", + "gateway health check timed out after {:.0}s (attempt {attempt})", timeout.as_secs_f64() ))); } } } + // Periodic progress logging so operators know the check is still running + if last_progress_log.elapsed() >= progress_interval { + eprintln!( + " health check: attempt {attempt}, elapsed {:.0}s/{timeout_secs}s ({last_error})", + start.elapsed().as_secs_f64() + ); + last_progress_log = std::time::Instant::now(); + } + std::thread::sleep(poll_interval); + + // Exponential backoff: double the interval up to the cap + poll_interval = std::cmp::min(poll_interval * 2, max_poll_interval); } } @@ -146,11 +209,18 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( fn wait_for_tcp_only( gateway_port: u16, timeout: Duration, - poll_interval: Duration, + mut poll_interval: Duration, ) -> Result<(), VmError> { let start = std::time::Instant::now(); + let max_poll_interval = Duration::from_secs(MAX_POLL_INTERVAL_SECS); + let progress_interval = Duration::from_secs(PROGRESS_LOG_INTERVAL_SECS); + let timeout_secs = timeout.as_secs(); + let mut attempt: u32 = 0; + let mut last_progress_log = start; loop { + attempt += 1; + if 
host_tcp_probe(gateway_port) { eprintln!( "Service reachable (TCP) [{:.1}s]", @@ -161,12 +231,22 @@ fn wait_for_tcp_only( if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway TCP probe failed after {:.0}s", + "gateway TCP probe failed after {:.0}s (attempt {attempt})", timeout.as_secs_f64() ))); } + // Periodic progress logging + if last_progress_log.elapsed() >= progress_interval { + eprintln!( + " TCP probe: attempt {attempt}, elapsed {:.0}s/{timeout_secs}s", + start.elapsed().as_secs_f64() + ); + last_progress_log = std::time::Instant::now(); + } + std::thread::sleep(poll_interval); + poll_interval = std::cmp::min(poll_interval * 2, max_poll_interval); } } diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index c4d5ccb31..15e2cbde6 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -119,14 +119,12 @@ fn check(ret: i32, func: &'static str) -> Result<(), VmError> { /// Hypervisor backend selection. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub enum VmBackendChoice { - /// Auto-select: cloud-hypervisor when a VFIO device is configured, libkrun otherwise. + /// Auto-select: QEMU when a VFIO device is configured, libkrun otherwise. #[default] Auto, /// Force the libkrun backend. Libkrun, - /// Force the cloud-hypervisor backend (even without GPU/VFIO). - CloudHypervisor, - /// Force the QEMU backend (Linux-only, supports VFIO without MSI-X). + /// Force the QEMU backend (Linux-only, supports VFIO GPU passthrough). Qemu, } @@ -238,13 +236,12 @@ pub struct VmConfig { /// Whether GPU passthrough is enabled for this VM. pub gpu_enabled: bool, - /// Whether the GPU supports MSI-X (needed for cloud-hypervisor VFIO). - /// When `false` and `Auto` backend is selected with GPU enabled, - /// QEMU is used instead of cloud-hypervisor. + /// Whether the GPU supports MSI-X. Retained for informational purposes + /// but no longer affects backend selection (QEMU handles both cases). 
pub gpu_has_msix: bool, /// VFIO PCI device address for GPU passthrough (e.g. `0000:41:00.0`). - /// When set, the cloud-hypervisor backend is used instead of libkrun. + /// When set, the QEMU backend is used instead of libkrun. pub vfio_device: Option, /// Hypervisor backend override. Defaults to [`VmBackendChoice::Auto`]. @@ -1322,32 +1319,15 @@ pub fn launch(config: &VmConfig) -> Result { enum SelectedBackend { Libkrun, - CloudHypervisor, Qemu, } let selected = match config.backend { - VmBackendChoice::CloudHypervisor => { - if config.gpu_enabled && !config.gpu_has_msix { - return Err(VmError::HostSetup( - "cloud-hypervisor requires MSI-X for VFIO passthrough, but this GPU \ - lacks MSI-X support. Use --backend auto or --backend qemu." - .into(), - )); - } - SelectedBackend::CloudHypervisor - } VmBackendChoice::Libkrun => SelectedBackend::Libkrun, VmBackendChoice::Qemu => SelectedBackend::Qemu, VmBackendChoice::Auto => { - if config.gpu_enabled { - if config.gpu_has_msix { - SelectedBackend::CloudHypervisor - } else { - SelectedBackend::Qemu - } - } else if config.vfio_device.is_some() { - SelectedBackend::CloudHypervisor + if config.gpu_enabled || config.vfio_device.is_some() { + SelectedBackend::Qemu } else { SelectedBackend::Libkrun } @@ -1355,21 +1335,6 @@ pub fn launch(config: &VmConfig) -> Result { }; match selected { - SelectedBackend::CloudHypervisor => { - #[cfg(not(target_os = "linux"))] - return Err(VmError::HostSetup( - "cloud-hypervisor backend requires Linux with KVM".into(), - )); - - #[cfg(target_os = "linux")] - { - if let Some(ref addr) = config.vfio_device { - validate_vfio_address(addr)?; - } - let chv_backend = backend::cloud_hypervisor::CloudHypervisorBackend::new()?; - backend::VmBackend::launch(&chv_backend, config) - } - } SelectedBackend::Qemu => { #[cfg(not(target_os = "linux"))] return Err(VmError::HostSetup( @@ -1683,13 +1648,36 @@ fn sync_host_certs_if_stale( pub(crate) static CHILD_PID: std::sync::atomic::AtomicI32 = 
std::sync::atomic::AtomicI32::new(0); +pub(crate) static VIRTIOFSD_PID: std::sync::atomic::AtomicI32 = + std::sync::atomic::AtomicI32::new(0); + +/// Set to `true` by the signal handler when a shutdown signal (SIGTERM/SIGINT) +/// is received. The main thread checks this after `qemu_child.wait()` returns +/// to ensure cleanup runs even if the wait was interrupted. +pub(crate) static SHUTDOWN_REQUESTED: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(false); + +/// Signal handler that forwards SIGTERM to child processes and sets the +/// shutdown flag. Only calls async-signal-safe functions (libc::kill, +/// atomic stores). No heap allocation, no println, no mutex. pub(crate) extern "C" fn forward_signal(_sig: libc::c_int) { + SHUTDOWN_REQUESTED.store(true, std::sync::atomic::Ordering::Relaxed); + + // Always send SIGTERM to each child individually. The process-group + // approach (kill(-pgid)) is unreliable because setpgid() in QEMU's + // pre_exec silently fails — QEMU stays in its parent's group. 
let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); if pid > 0 { unsafe { libc::kill(pid, libc::SIGTERM); } } + let vfsd_pid = VIRTIOFSD_PID.load(std::sync::atomic::Ordering::Relaxed); + if vfsd_pid > 0 { + unsafe { + libc::kill(vfsd_pid, libc::SIGTERM); + } + } } #[cfg(test)] @@ -1809,50 +1797,34 @@ mod tests { } #[test] - fn auto_selects_qemu_when_gpu_no_msix() { + fn auto_selects_qemu_for_gpu() { enum SelectedBackend { Libkrun, - CloudHypervisor, Qemu, } - let select = |backend: VmBackendChoice, gpu_enabled: bool, gpu_has_msix: bool| { - match backend { - VmBackendChoice::CloudHypervisor => SelectedBackend::CloudHypervisor, - VmBackendChoice::Libkrun => SelectedBackend::Libkrun, - VmBackendChoice::Qemu => SelectedBackend::Qemu, - VmBackendChoice::Auto => { - if gpu_enabled { - if gpu_has_msix { - SelectedBackend::CloudHypervisor - } else { - SelectedBackend::Qemu - } - } else { - SelectedBackend::Libkrun - } + let select = |backend: VmBackendChoice, gpu_enabled: bool| match backend { + VmBackendChoice::Libkrun => SelectedBackend::Libkrun, + VmBackendChoice::Qemu => SelectedBackend::Qemu, + VmBackendChoice::Auto => { + if gpu_enabled { + SelectedBackend::Qemu + } else { + SelectedBackend::Libkrun } } }; assert!(matches!( - select(VmBackendChoice::Auto, true, false), + select(VmBackendChoice::Auto, true), SelectedBackend::Qemu )); assert!(matches!( - select(VmBackendChoice::Auto, true, true), - SelectedBackend::CloudHypervisor - )); - assert!(matches!( - select(VmBackendChoice::Auto, false, true), - SelectedBackend::Libkrun - )); - assert!(matches!( - select(VmBackendChoice::Auto, false, false), + select(VmBackendChoice::Auto, false), SelectedBackend::Libkrun )); assert!(matches!( - select(VmBackendChoice::Qemu, false, true), + select(VmBackendChoice::Qemu, false), SelectedBackend::Qemu )); } @@ -1889,19 +1861,13 @@ mod tests { #[test] fn gateway_host_port_no_gateway_mapping_returns_default() { - let cfg = config_with_port_map(vec![ - 
"6443:6443".to_string(), - "8080:8080".to_string(), - ]); + let cfg = config_with_port_map(vec!["6443:6443".to_string(), "8080:8080".to_string()]); assert_eq!(gateway_host_port(&cfg), DEFAULT_GATEWAY_PORT); } #[test] fn gateway_host_port_finds_remapped_gateway() { - let cfg = config_with_port_map(vec![ - "6443:6443".to_string(), - "9999:30051".to_string(), - ]); + let cfg = config_with_port_map(vec!["6443:6443".to_string(), "9999:30051".to_string()]); assert_eq!(gateway_host_port(&cfg), 9999); } diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index cafc18763..9241db908 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -95,13 +95,12 @@ struct Cli { reset: bool, /// Enable GPU passthrough. Optionally specify a PCI address - /// (e.g. `0000:41:00.0`). Uses cloud-hypervisor backend with VFIO. + /// (e.g. `0000:41:00.0`). Uses QEMU backend with VFIO. #[arg(long, num_args = 0..=1, default_missing_value = "auto")] gpu: Option, - /// Hypervisor backend: "auto" (default), "libkrun", "cloud-hypervisor", or "qemu". - /// Auto selects cloud-hypervisor when --gpu is set (with MSI-X), qemu - /// when --gpu is set without MSI-X, and libkrun otherwise. + /// Hypervisor backend: "auto" (default), "libkrun", or "qemu". + /// Auto selects QEMU when --gpu is set, and libkrun otherwise. #[arg(long, default_value = "auto")] backend: String, } @@ -424,13 +423,12 @@ fn run(cli: Cli) -> Result> { } let backend_choice = match cli.backend.as_str() { - "cloud-hypervisor" | "chv" => openshell_vm::VmBackendChoice::CloudHypervisor, "qemu" => openshell_vm::VmBackendChoice::Qemu, "libkrun" => { if gpu_enabled { return Err( "--backend libkrun is incompatible with --gpu (libkrun does not support \ - VFIO passthrough). Use --backend auto, --backend cloud-hypervisor, or --backend qemu." + VFIO passthrough). Use --backend auto or --backend qemu." 
.into(), ); } @@ -438,10 +436,9 @@ fn run(cli: Cli) -> Result> { } "auto" => openshell_vm::VmBackendChoice::Auto, other => { - return Err(format!( - "unknown --backend: {other} (expected: auto, libkrun, cloud-hypervisor, qemu)" - ) - .into()); + return Err( + format!("unknown --backend: {other} (expected: auto, libkrun, qemu)").into(), + ); } }; @@ -476,9 +473,11 @@ fn run(cli: Cli) -> Result> { == Some(openshell_vm::GUEST_GATEWAY_NODEPORT) }); if !has_gateway { + let gw_port = openshell_vm::GUEST_GATEWAY_NODEPORT; + c.port_map.push(format!("{gw_port}:{gw_port}")); eprintln!( - "warning: no port mapping targets guest port 30051 (gateway NodePort); \ - health check will use default port 30051" + "Auto-added gateway port mapping {gw_port}:{gw_port} \ + (required for health check and CLI access)" ); } } diff --git a/crates/openshell-vm/tests/vm_boot_smoke.rs b/crates/openshell-vm/tests/vm_boot_smoke.rs index 876c2a5b4..f16027129 100644 --- a/crates/openshell-vm/tests/vm_boot_smoke.rs +++ b/crates/openshell-vm/tests/vm_boot_smoke.rs @@ -1,14 +1,14 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! Non-GPU boot smoke tests for cloud-hypervisor and QEMU backends. +//! Non-GPU boot smoke tests for the QEMU backend. //! //! Boots a VM **without** VFIO/GPU passthrough and verifies the kernel boots //! and init runs. This catches backend regressions on regular CI runners //! that lack GPU hardware. //! -//! Gated on `OPENSHELL_VM_BACKEND` — set to `cloud-hypervisor` or `qemu` to -//! run the corresponding tests. Skipped when the env var is absent. +//! Gated on `OPENSHELL_VM_BACKEND` — set to `qemu` to run the tests. +//! Skipped when the env var is absent. //! //! Requires the VM runtime bundle (vmlinux, virtiofsd, rootfs, and the //! backend binary) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run @@ -17,7 +17,6 @@ //! Run explicitly: //! //! ```sh -//! 
OPENSHELL_VM_BACKEND=cloud-hypervisor cargo test -p openshell-vm --test vm_boot_smoke //! OPENSHELL_VM_BACKEND=qemu cargo test -p openshell-vm --test vm_boot_smoke //! ``` @@ -35,14 +34,6 @@ fn runtime_bundle_dir() -> std::path::PathBuf { .join("openshell-vm.runtime") } -fn skip_unless_chv() -> bool { - if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("cloud-hypervisor") { - eprintln!("OPENSHELL_VM_BACKEND != cloud-hypervisor — skipping"); - return true; - } - false -} - fn require_bundle() { let bundle = runtime_bundle_dir(); if !bundle.is_dir() { @@ -53,100 +44,6 @@ fn require_bundle() { } } -#[test] -fn cloud_hypervisor_exec_exits_cleanly() { - if skip_unless_chv() { - return; - } - require_bundle(); - - // Boot with --exec /bin/true --net none. The cloud-hypervisor backend - // wraps the exec command in a script that calls `poweroff -f` after - // completion, causing a clean ACPI shutdown. - let mut child = Command::new(GATEWAY) - .args([ - "--backend", - "cloud-hypervisor", - "--net", - "none", - "--exec", - "/bin/true", - ]) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - .expect("failed to start openshell-vm"); - - // The VM should boot, run /bin/true, and exit within ~5s. - // Give 30s for slow CI. 
- let timeout = Duration::from_secs(30); - let start = std::time::Instant::now(); - - loop { - match child.try_wait() { - Ok(Some(status)) => { - assert!( - status.success(), - "cloud-hypervisor --exec /bin/true exited with {status}" - ); - return; - } - Ok(None) => { - if start.elapsed() > timeout { - let _ = unsafe { libc::kill(child.id() as i32, libc::SIGKILL) }; - let _ = child.wait(); - panic!("cloud-hypervisor VM did not exit within {timeout:?}"); - } - std::thread::sleep(Duration::from_millis(500)); - } - Err(e) => panic!("error waiting for openshell-vm: {e}"), - } - } -} - -#[test] -fn cloud_hypervisor_boots_without_gpu() { - if skip_unless_chv() { - return; - } - require_bundle(); - - // Full gateway boot requires TAP networking (root/CAP_NET_ADMIN). - // Skip unless running as root. - if !nix_is_root() { - eprintln!("skipping full gateway boot — requires root for TAP networking"); - return; - } - - let mut child = Command::new(GATEWAY) - .args(["--backend", "cloud-hypervisor"]) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - .expect("failed to start openshell-vm"); - - let addr: std::net::SocketAddr = ([127, 0, 0, 1], 30051).into(); - let timeout = Duration::from_secs(180); - let start = std::time::Instant::now(); - let mut reachable = false; - - while start.elapsed() < timeout { - if std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { - reachable = true; - break; - } - std::thread::sleep(Duration::from_secs(2)); - } - - let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; - let _ = child.wait(); - - assert!( - reachable, - "cloud-hypervisor VM service on port 30051 not reachable within {timeout:?}" - ); -} - fn skip_unless_qemu() -> bool { if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("qemu") { eprintln!("OPENSHELL_VM_BACKEND != qemu — skipping"); @@ -163,14 +60,7 @@ fn qemu_exec_exits_cleanly() { require_bundle(); let mut child = Command::new(GATEWAY) - .args([ - "--backend", - "qemu", - 
"--net", - "none", - "--exec", - "/bin/true", - ]) + .args(["--backend", "qemu", "--net", "none", "--exec", "/bin/true"]) .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index b7e854677..59b133629 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -56,6 +56,7 @@ COPY crates/openshell-router/Cargo.toml crates/openshell-router/Cargo.toml COPY crates/openshell-sandbox/Cargo.toml crates/openshell-sandbox/Cargo.toml COPY crates/openshell-server/Cargo.toml crates/openshell-server/Cargo.toml COPY crates/openshell-tui/Cargo.toml crates/openshell-tui/Cargo.toml +COPY crates/openshell-vfio/Cargo.toml crates/openshell-vfio/Cargo.toml COPY crates/openshell-vm/Cargo.toml crates/openshell-vm/Cargo.toml COPY crates/openshell-core/build.rs crates/openshell-core/build.rs COPY proto/ proto/ @@ -73,6 +74,7 @@ RUN mkdir -p \ crates/openshell-sandbox/src \ crates/openshell-server/src \ crates/openshell-tui/src \ + crates/openshell-vfio/src \ crates/openshell-vm/src && \ touch crates/openshell-bootstrap/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-cli/src/main.rs && \ @@ -89,6 +91,7 @@ RUN mkdir -p \ touch crates/openshell-server/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-server/src/main.rs && \ touch crates/openshell-tui/src/lib.rs && \ + touch crates/openshell-vfio/src/lib.rs && \ touch crates/openshell-vm/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-vm/src/main.rs diff --git a/tasks/scripts/vm/build-gpu-deps.sh b/tasks/scripts/vm/build-gpu-deps.sh index db109e558..7265a06c3 100755 --- a/tasks/scripts/vm/build-gpu-deps.sh +++ b/tasks/scripts/vm/build-gpu-deps.sh @@ -2,17 +2,16 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Build GPU passthrough dependencies shared by CHV and QEMU backends. 
+# Build GPU passthrough dependencies for the QEMU backend. # -# Downloads pre-built cloud-hypervisor and builds virtiofsd from source. +# Builds virtiofsd from source. # These are only needed on Linux for VFIO GPU passthrough. # # Artifacts produced: -# cloud-hypervisor — CHV backend binary (not needed by QEMU) -# virtiofsd — shared by both CHV and QEMU backends +# virtiofsd — filesystem daemon used by the QEMU backend # -# The vmlinux kernel (shared by CHV and QEMU) is extracted separately -# by build-libkrun.sh during the kernel build step. +# The vmlinux kernel is extracted separately by build-libkrun.sh during +# the kernel build step. # # QEMU's own binary (qemu-system-x86_64) must be installed on the host # separately — it is not built or downloaded by this script. @@ -29,7 +28,6 @@ ROOT="$(vm_lib_root)" source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true -CLOUD_HYPERVISOR_VERSION="${CLOUD_HYPERVISOR_VERSION:-v42.0}" VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" OUTPUT_DIR="${ROOT}/target/libkrun-build" @@ -41,7 +39,7 @@ while [[ $# -gt 0 ]]; do done if [ "$(uname -s)" != "Linux" ]; then - echo "Error: cloud-hypervisor GPU passthrough is Linux-only" >&2 + echo "Error: GPU passthrough is Linux-only" >&2 exit 1 fi @@ -49,21 +47,11 @@ mkdir -p "$OUTPUT_DIR" HOST_ARCH="$(uname -m)" case "$HOST_ARCH" in - aarch64) CHV_ARCH="aarch64"; VIRTIOFSD_ARCH="aarch64" ;; - x86_64) CHV_ARCH="x86_64"; VIRTIOFSD_ARCH="x86_64" ;; + aarch64) VIRTIOFSD_ARCH="aarch64" ;; + x86_64) VIRTIOFSD_ARCH="x86_64" ;; *) echo "Error: Unsupported architecture: ${HOST_ARCH}" >&2; exit 1 ;; esac -echo "==> Downloading cloud-hypervisor ${CLOUD_HYPERVISOR_VERSION} for ${HOST_ARCH}..." 
-CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static" -if [ "$CHV_ARCH" = "aarch64" ]; then - CHV_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CLOUD_HYPERVISOR_VERSION}/cloud-hypervisor-static-aarch64" -fi - -curl -fsSL -o "${OUTPUT_DIR}/cloud-hypervisor" "$CHV_URL" -chmod +x "${OUTPUT_DIR}/cloud-hypervisor" -echo " Downloaded: cloud-hypervisor" - echo "==> Building virtiofsd ${VIRTIOFSD_VERSION} from source..." VIRTIOFSD_SRC="$(mktemp -d)" VIRTIOFSD_TARBALL_URL="https://gitlab.com/virtio-fs/virtiofsd/-/archive/${VIRTIOFSD_VERSION}/virtiofsd-${VIRTIOFSD_VERSION}.tar.gz" @@ -85,4 +73,4 @@ echo " Built: virtiofsd" echo "" echo "==> GPU passthrough binaries ready in ${OUTPUT_DIR}" -ls -lah "${OUTPUT_DIR}/cloud-hypervisor" "${OUTPUT_DIR}/virtiofsd" 2>/dev/null || true +ls -lah "${OUTPUT_DIR}/virtiofsd" 2>/dev/null || true diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index cbd505f1c..c2a1a6d76 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -255,14 +255,14 @@ make -j"$(nproc)" cp libkrunfw.so* "$OUTPUT_DIR/" echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" -# Copy vmlinux kernel image for cloud-hypervisor GPU passthrough. +# Copy vmlinux kernel image for QEMU GPU passthrough. # This is the uncompressed kernel built by libkrunfw's kernel build. 
if [ -f "${KERNEL_SOURCES}/vmlinux" ]; then cp "${KERNEL_SOURCES}/vmlinux" "$OUTPUT_DIR/vmlinux" - echo " Copied vmlinux for cloud-hypervisor GPU passthrough" + echo " Copied vmlinux for QEMU GPU passthrough" elif [ -f "vmlinux" ]; then cp "vmlinux" "$OUTPUT_DIR/vmlinux" - echo " Copied vmlinux for cloud-hypervisor GPU passthrough" + echo " Copied vmlinux for QEMU GPU passthrough" else echo " Warning: vmlinux not found in kernel build tree (GPU passthrough will not be available)" >&2 fi diff --git a/tasks/scripts/vm/compress-vm-runtime.sh b/tasks/scripts/vm/compress-vm-runtime.sh index 3c0240ffb..69e1d5658 100755 --- a/tasks/scripts/vm/compress-vm-runtime.sh +++ b/tasks/scripts/vm/compress-vm-runtime.sh @@ -100,7 +100,7 @@ if [ -z "${VM_RUNTIME_TARBALL:-}" ] && _check_compressed_artifacts "$OUTPUT_DIR" # core compressed set. Copy them into WORK_DIR so bundle-vm-runtime.sh # stages them alongside the core libraries. _BUILD_DIR="${ROOT}/target/libkrun-build" - for gpu_bin in vmlinux cloud-hypervisor virtiofsd; do + for gpu_bin in vmlinux virtiofsd; do if [ -f "${_BUILD_DIR}/${gpu_bin}" ]; then cp "${_BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" chmod 0755 "${WORK_DIR}/${gpu_bin}" @@ -274,7 +274,7 @@ case "$(uname -s)-$(uname -m)" in fi # GPU passthrough binaries (optional — included when present in libkrun-build) - for gpu_bin in vmlinux cloud-hypervisor virtiofsd; do + for gpu_bin in vmlinux virtiofsd; do if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then cp "${BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" echo " Included GPU binary: ${gpu_bin}" diff --git a/tasks/scripts/vm/package-vm-runtime.sh b/tasks/scripts/vm/package-vm-runtime.sh index 8b09c91ba..7f5e908c6 100755 --- a/tasks/scripts/vm/package-vm-runtime.sh +++ b/tasks/scripts/vm/package-vm-runtime.sh @@ -85,7 +85,7 @@ case "$PLATFORM" in [ -n "$versioned" ] && cp "$versioned" "${PACKAGE_DIR}/libkrunfw.so.5" fi # GPU passthrough binaries (optional — only included if present) - for gpu_bin in cloud-hypervisor vmlinux virtiofsd; do + for 
gpu_bin in vmlinux virtiofsd; do if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then cp "${BUILD_DIR}/${gpu_bin}" "${PACKAGE_DIR}/" echo " Included GPU passthrough binary: ${gpu_bin}" diff --git a/tasks/vm.toml b/tasks/vm.toml index a61adec59..8c5fd1afc 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -55,7 +55,7 @@ description = "Build NVIDIA kernel modules for GPU VM rootfs (requires FROM_SOUR run = "tasks/scripts/vm/build-nvidia-modules.sh" ["vm:gpu-deps"] -description = "Build GPU passthrough dependencies (cloud-hypervisor, virtiofsd) shared by CHV and QEMU backends" +description = "Build GPU passthrough dependencies (virtiofsd) for the QEMU backend" run = "tasks/scripts/vm/build-gpu-deps.sh" ["vm:qemu-check"]