diff --git a/.github/workflows/test-gpu.yml b/.github/workflows/test-gpu.yml index df953b5d3..6dd98b1cd 100644 --- a/.github/workflows/test-gpu.yml +++ b/.github/workflows/test-gpu.yml @@ -22,7 +22,7 @@ jobs: - id: get_pr_info if: github.event_name == 'push' continue-on-error: true - uses: nv-gha-runners/get-pr-info@main + uses: nv-gha-runners/get-pr-info@090577647b8ddc4e06e809e264f7881650ecdccf - id: gate shell: bash diff --git a/Cargo.lock b/Cargo.lock index e4057f75c..cc1193267 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3028,6 +3028,7 @@ dependencies = [ "openshell-prover", "openshell-providers", "openshell-tui", + "openshell-vfio", "owo-colors", "prost-types", "rcgen", @@ -3270,6 +3271,14 @@ dependencies = [ "url", ] +[[package]] +name = "openshell-vfio" +version = "0.0.0" +dependencies = [ + "nix", + "tempfile", +] + [[package]] name = "openshell-vm" version = "0.0.0" @@ -3283,11 +3292,13 @@ dependencies = [ "nix", "openshell-bootstrap", "openshell-core", + "openshell-vfio", "rustls", "rustls-pemfile", "serde", "serde_json", "tar", + "tempfile", "thiserror 2.0.18", "tokio", "tokio-rustls", diff --git a/architecture/README.md b/architecture/README.md index 570fce660..008836fca 100644 --- a/architecture/README.md +++ b/architecture/README.md @@ -301,4 +301,6 @@ This opens an interactive SSH session into the sandbox, with all provider creden | [Inference Routing](inference-routing.md) | Transparent interception and sandbox-local routing of AI inference API calls to configured backends. | | [System Architecture](system-architecture.md) | Top-level system architecture diagram with all deployable components and communication flows. | | [Gateway Settings Channel](gateway-settings.md) | Runtime settings channel: two-tier key-value configuration, global policy override, settings registry, CLI/TUI commands. | +| [Custom VM Runtime](custom-vm-runtime.md) | Dual-backend VM runtime (libkrun / QEMU), kernel configuration, and build pipeline. 
| +| [VM GPU Passthrough](vm-gpu-passthrough.md) | VFIO GPU passthrough for VMs: host preparation, safety checks, nvidia driver hardening, and troubleshooting. | | [TUI](tui.md) | Terminal user interface for sandbox interaction. | diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index ce4d0bf39..9963edea8 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,18 +1,31 @@ -# Custom libkrunfw VM Runtime +# Custom VM Runtime > Status: Experimental and work in progress (WIP). VM support is under active development and may change. ## Overview -The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a -lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel -is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. +The OpenShell gateway VM supports two hypervisor backends: -The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or -conntrack support. This is insufficient for Kubernetes pod networking. +- **libkrun** (default) — lightweight VMM using Apple Hypervisor.framework (macOS) or KVM + (Linux). The kernel is embedded inside `libkrunfw`. Uses virtio-MMIO device transport and + gvproxy for user-space networking. +- **QEMU** — Linux-only VMM used for GPU passthrough (VFIO). Uses virtio-PCI device transport, + TAP networking, and requires a separate `vmlinux` kernel and `virtiofsd` for rootfs access. + QEMU binary is not embedded — it must be installed on the host. + +Backend selection is automatic: `--gpu` selects QEMU, otherwise libkrun is used. The `--backend` +flag provides explicit control (`auto`, `libkrun`, `qemu`). + +When `--gpu` is passed, `openshell-vm` automatically binds an eligible GPU to `vfio-pci` +and restores it to the original driver on shutdown. See +[vm-gpu-passthrough.md](vm-gpu-passthrough.md) for the full lifecycle description. 
-The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to -the VM kernel, enabling standard Kubernetes networking. +Both backends share the same guest kernel (built from a single `openshell.kconfig` fragment) +and rootfs. + +The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or +conntrack support. This is insufficient for Kubernetes pod networking. The custom kconfig +adds bridge CNI, iptables/nftables, conntrack, and QEMU compatibility. ## Architecture @@ -20,10 +33,11 @@ the VM kernel, enabling standard Kubernetes networking. graph TD subgraph Host["Host (macOS / Linux)"] BIN[openshell-vm binary] - EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"] + EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy · rootfs"] CACHE["~/.local/share/openshell/vm-runtime/{version}/"] PROV[Runtime provenance logging] GVP[gvproxy networking proxy] + QEMU_BIN["qemu-system-x86_64 · virtiofsd · vmlinux\n(GPU runtime bundle)"] BIN --> EMB BIN -->|extracts to| CACHE @@ -44,8 +58,9 @@ graph TD INIT --> VAL --> CNI --> EXECA --> PKI --> K3S end - BIN -- "fork + krun_start_enter" --> INIT - GVP -- "virtio-net" --> Guest + BIN -- "libkrun: fork + krun_start_enter" --> INIT + BIN -- "QEMU: qemu-system-x86_64 + virtiofsd" --> INIT + GVP -- "virtio-net (libkrun only)" --> Guest ``` ## Embedded Runtime @@ -67,9 +82,22 @@ these to XDG cache directories with progress bars: └── ... ``` -This eliminates the need for separate bundles or downloads - a single ~120MB binary -provides everything needed to run the VM. Old cache versions are automatically -cleaned up when a new version is extracted. 
+When using QEMU for GPU passthrough, an additional runtime bundle is required alongside +the binary: + +``` +target/debug/openshell-vm.runtime/ (or alongside the installed binary) +├── virtiofsd # virtio-fs daemon +└── vmlinux # extracted guest kernel +``` + +This bundle is built with `mise run vm:bundle-runtime` and is separate from the +embedded runtime because virtiofsd is Linux-only and not embedded in the +self-extracting binary. + +This eliminates the need for separate bundles or downloads for the default (libkrun) +path — a single ~120MB binary provides everything needed. Old cache versions are +automatically cleaned up when a new version is extracted. ### Hybrid Approach @@ -86,6 +114,34 @@ mise run vm:rootfs # Full rootfs (~2GB, includes images) mise run vm:build # Rebuild binary with full rootfs ``` +## Backend Comparison + +| | libkrun (default) | QEMU | +|---|---|---| +| Platforms | macOS (Hypervisor.framework), Linux (KVM) | Linux (KVM) only | +| Device transport | virtio-MMIO | virtio-PCI | +| Networking | gvproxy (user-space, no root needed) | TAP (requires root/CAP_NET_ADMIN) | +| Rootfs delivery | In-process (krun API) | virtiofsd (virtio-fs daemon) | +| Kernel delivery | Embedded in libkrunfw | Separate `vmlinux` file | +| Console | virtio-console (`hvc0`) | 8250 UART (`ttyS0`) | +| Shutdown | Automatic on PID 1 exit | ACPI poweroff (`poweroff -f`) | +| GPU passthrough | Not supported | VFIO PCI | +| Vsock | libkrun built-in | `AF_VSOCK` (kernel `vhost_vsock`) | +| VM control | krun C API | Command-line args | +| Binary source | Embedded in runtime | Host-installed | +| `--exec` mode | Direct init replacement | Wrapper script with ACPI shutdown | +| CLI flag | `--backend libkrun` | `--backend qemu` or `--gpu` | + +### Exec mode differences + +With libkrun, when `--exec <command>` is used, the command replaces the init process and +the VM exits when PID 1 exits. + +With QEMU, the VM does not automatically exit when PID 1 terminates. 
A wrapper init +script is dynamically written to the guest rootfs that mounts necessary filesystems, +executes the user command, captures the exit code, and calls `poweroff -f` to trigger +an ACPI shutdown that the hypervisor detects. + ## Network Profile The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and @@ -100,6 +156,26 @@ fast with an actionable error if they are missing. - Service VIPs: functional (ClusterIP, NodePort) - hostNetwork workarounds: not required +### Networking by backend + +- **libkrun**: Uses gvproxy for user-space virtio-net networking. No root privileges + needed. Port forwarding is handled via gvproxy configuration. +- **QEMU**: Uses TAP networking (requires root or CAP_NET_ADMIN). When `--net none` + is passed, networking is disabled entirely (useful for `--exec` mode tests). gvproxy + is not used with QEMU. + +## Guest Init Script + +The init script (`openshell-vm-init.sh`) runs as PID 1 in the guest. After mounting essential filesystems, it performs: + +1. **Kernel cmdline parsing** — exports environment variables passed via the kernel command line (`GPU_ENABLED`, `OPENSHELL_VM_STATE_DISK_DEVICE`, `VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). This runs after `/proc` is mounted so `/proc/cmdline` is available. + +2. **Cgroup v2 controller enablement** — enables `cpu`, `cpuset`, `memory`, `pids`, and `io` controllers in the root cgroup hierarchy (`cgroup.subtree_control`). k3s/kubelet requires these controllers; the `cpu` controller depends on `CONFIG_CGROUP_SCHED` in the kernel. + +3. **Networking** — detects `eth0` and attempts DHCP (via `udhcpc`). On failure, falls back to static IP configuration using `VM_NET_IP` and `VM_NET_GW` from the kernel cmdline (set by the QEMU backend for TAP networking). DNS is configured from `VM_NET_DNS` if set, overriding any stale `/etc/resolv.conf` entries. + +4. 
**Capability validation** — verifies required kernel features (bridge networking, netfilter, cgroups) and fails fast with actionable errors if missing. + ## Runtime Provenance At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: @@ -128,21 +204,46 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end + subgraph GPU["Linux CI (build-gpu-deps.sh)"] + BUILD_GPU["Build virtiofsd\n(for QEMU backend)"] + end + + subgraph NV["Linux CI (build-nvidia-modules.sh)"] + BUILD_NV["Compile NVIDIA .ko against VM kernel"] + end + + subgraph QEMU["Host-installed"] + QEMU_BIN["qemu-system-x86_64\n(not built — must be on host PATH)"] + end + subgraph Output["target/libkrun-build/"] LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] + VIRTIOFSD["virtiofsd\n(QEMU backend)"] + VMLINUX["vmlinux\n(shared by QEMU)"] + NV_KO["nvidia-modules/*.ko\n(GPU builds only)"] end KCONF --> BUILD_L BUILD_L --> LIB_SO + BUILD_L --> VMLINUX + BUILD_L -->|kernel source tree| BUILD_NV + BUILD_NV --> NV_KO KCONF --> BUILD_M BUILD_M --> LIB_DY + BUILD_GPU --> VIRTIOFSD ``` +The `vmlinux` kernel is extracted from the libkrunfw build and reused by QEMU. +Both backends boot the same kernel — the kconfig fragment includes drivers for +both virtio-MMIO (libkrun) and virtio-PCI (QEMU) transports. + ## Kernel Config Fragment The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel: +libkrunfw kernel. A single kernel binary is shared by both backends (libkrun and +QEMU) — backend-specific drivers coexist safely (the kernel probes whichever +transport the hypervisor provides). 
| Feature | Key Configs | Purpose | |---------|-------------|---------| @@ -158,11 +259,18 @@ libkrunfw kernel: | IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing | | IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) | | Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS | -| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits | +| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_CGROUP_CPUACCT`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS`, `CONFIG_CGROUP_FREEZER` | Container resource limits | +| Cgroup CPU | `CONFIG_CGROUP_SCHED`, `CONFIG_FAIR_GROUP_SCHED`, `CONFIG_CFS_BANDWIDTH` | cgroup v2 `cpu` controller for k3s/kubelet | | TUN/TAP | `CONFIG_TUN` | CNI plugin support | | Dummy interface | `CONFIG_DUMMY` | Fallback networking | | Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | | Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | +| PCI / GPU | `CONFIG_PCI`, `CONFIG_PCI_MSI`, `CONFIG_DRM` | GPU passthrough via VFIO | +| Kernel modules | `CONFIG_MODULES`, `CONFIG_MODULE_UNLOAD` | Loading NVIDIA drivers in guest | +| virtio-PCI transport | `CONFIG_VIRTIO_PCI` | QEMU device bus (libkrun uses MMIO) | +| Serial console | `CONFIG_SERIAL_8250`, `CONFIG_SERIAL_8250_CONSOLE` | QEMU console (`ttyS0`) | +| ACPI | `CONFIG_ACPI` | QEMU power management / clean shutdown | +| x2APIC | `CONFIG_X86_X2APIC` | Multi-vCPU support (QEMU uses x2APIC MADT entries) | See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with inline comments explaining why each option is needed. @@ -189,13 +297,22 @@ The standalone `openshell-vm` binary supports `openshell-vm exec -- `openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style commands work the same way they would inside the VM shell. 
+### Vsock by backend + +- **libkrun**: Uses libkrun's built-in vsock port mapping, which transparently + bridges the guest vsock port to a host Unix socket. +- **QEMU**: Uses `vhost-vsock-pci` with kernel `AF_VSOCK` sockets. The exec + bridge opens a kernel `AF_VSOCK` socket to the guest CID and bridges it to + the same Unix domain socket path used by the other backend. Requires the + `vhost_vsock` kernel module on the host. + ## Build Commands ```bash # One-time setup: download pre-built runtime (~30s) mise run vm:setup -# Build and run +# Build and run (libkrun, default) mise run vm # Build embedded binary with base rootfs (~120MB, recommended) @@ -210,6 +327,29 @@ mise run vm:build # Rebuild binary FROM_SOURCE=1 mise run vm:setup # Build runtime from source mise run vm:build # Then build embedded binary +# Build GPU runtime bundle (Linux only) +mise run vm:bundle-runtime # Builds virtiofsd + extracts vmlinux + +# Validate QEMU host prerequisites +mise run vm:qemu-check + +# Install QEMU if not present (Ubuntu/Debian) +sudo apt install qemu-system-x86 + +# Load vhost-vsock kernel module (required for QEMU vsock) +sudo modprobe vhost_vsock +echo "vhost_vsock" | sudo tee /etc/modules-load.d/vhost_vsock.conf + +# Build with GPU support (Linux x86_64 only) +FROM_SOURCE=1 mise run vm:setup # Build kernel from source (module compilation needs it) +mise run vm:nvidia-modules # Compile NVIDIA .ko files against VM kernel +mise run vm:rootfs -- --base --gpu # Build GPU rootfs with injected kernel modules +mise run vm:build # Rebuild binary with GPU rootfs + +# Run with QEMU backend +openshell-vm --backend qemu # Requires qemu-system-x86_64 on host +openshell-vm --gpu # Auto-selects QEMU for GPU passthrough + # Wipe everything and start over mise run vm:clean ``` @@ -221,20 +361,23 @@ rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all -supported platforms. 
Runs on-demand or when the kernel config / pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), gvproxy, and virtiofsd +for all supported platforms. Runs on-demand or when the kernel config / pinned versions +change. | Platform | Runner | Build Method | |----------|--------|-------------| -| Linux ARM64 | `build-arm64` (self-hosted) | Native `build-libkrun.sh` | -| Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` | -| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` | +| Linux ARM64 | `build-arm64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | +| Linux x86_64 | `build-amd64` (self-hosted) | `build-libkrun.sh` + `build-gpu-deps.sh` | +| macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` (no GPU support) | -Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and -provenance metadata. +Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, +and provenance metadata. Linux artifacts additionally include virtiofsd and the +extracted `vmlinux` kernel. Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. +libkrunfw is always Linux regardless of host platform. Virtiofsd is +Linux-only (macOS does not support VFIO/KVM passthrough). ### VM Binary (`release-vm-dev.yml`) @@ -263,6 +406,10 @@ macOS binaries produced via osxcross are not codesigned. Users must self-sign: codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - ./openshell-vm ``` +> **Note:** QEMU smoke tests (`vm_boot_smoke.rs`) are gated on `OPENSHELL_VM_BACKEND=qemu`. +> These tests require `qemu-system-x86_64` on the runner and are currently manual-only. +> Run `mise run vm:qemu-check` to validate prerequisites before running QEMU tests. + ## Rollout Strategy 1. 
Custom runtime is embedded by default when building with `mise run vm:build`. diff --git a/architecture/vm-gpu-passthrough.md b/architecture/vm-gpu-passthrough.md new file mode 100644 index 000000000..621e27c0c --- /dev/null +++ b/architecture/vm-gpu-passthrough.md @@ -0,0 +1,470 @@ +# VM GPU Passthrough + +> Status: Experimental and work in progress (WIP). GPU passthrough for the VM backend is under active development. + +## Overview + +OpenShell's VM backend can pass a physical NVIDIA GPU into a microVM using VFIO (Virtual Function I/O). This gives the guest direct access to GPU hardware, enabling CUDA workloads and `nvidia-smi` inside sandboxes without virtualization overhead. + +GPU passthrough uses QEMU (instead of the default libkrun backend) to attach a VFIO device to the VM. The guest sees a real PCI GPU device and loads standard NVIDIA drivers. + +## Architecture + +``` +Host │ Guest (microVM) +──────────────────────────────│─────────────────────────── + NVIDIA GPU (PCI BDF addr) │ nvidia driver + CUDA + ↕ bound to vfio-pci │ ↕ + /dev/vfio/ │ /dev/nvidia* + ↕ │ ↕ + QEMU (VFIO) ────│→ PCI device visible + ↕ │ ↕ + TAP networking │ k3s + device plugin + virtiofsd (rootfs) │ ↕ + │ sandbox pods (nvidia.com/gpu) +``` + +### Backend selection + +| Flag | Backend | GPU attached? | +|------|---------|---------------| +| (none) | libkrun | No | +| `--gpu` | QEMU | Yes | +| `--gpu 0000:41:00.0` | QEMU | Yes | +| `--backend qemu` | QEMU | Optional | + +Auto mode (`--backend auto`, the default) selects QEMU when `--gpu` is used, and libkrun otherwise. + +### Automatic GPU binding + +When `--gpu` is passed (with or without a specific PCI address), the launcher automatically prepares the GPU for VFIO passthrough: + +1. **Probe** — scans `/sys/bus/pci/devices` for NVIDIA devices (vendor `0x10de`). +2. **Safety checks** — for each candidate GPU, verifies it is safe to claim (see below). If any check fails, the launcher refuses to proceed and exits with an actionable error. 
+3. **Bind** — unbinds the selected GPU from the `nvidia` driver and binds it to `vfio-pci`. Also binds any IOMMU group peers to `vfio-pci` for group cleanliness. +4. **Launch** — starts QEMU with the VFIO device attached and sets `GPU_ENABLED=true` in the guest kernel cmdline. +5. **Rebind on shutdown** — when the VM exits (clean shutdown, Ctrl+C, or crash), the launcher rebinds the GPU back to the `nvidia` driver and clears `driver_override`, restoring host GPU access. Cleanup is guaranteed by a `GpuBindGuard` RAII guard that calls restore on drop, covering normal exit, early return, and panic. Only `SIGKILL` (kill -9) bypasses the guard — see Troubleshooting below for manual recovery. + +When a specific PCI address is given (`--gpu 0000:41:00.0`), the launcher targets that exact device. When `--gpu` is used without an address (`auto` mode), the launcher selects the best available GPU using the multi-GPU selection strategy. + +### Safety checks + +All safety checks are hard failures — if any check fails, the launcher prints an error and exits without binding. The one exception is display-manager-related blocking: when the GPU is held by Xorg or a Wayland compositor, the launcher prompts the user interactively to stop the display manager (see Single-GPU caveats). 
+ +| Check | What it detects | Failure behavior | +|-------|----------------|------------------| +| **Display attached** | GPU drives an active DRM framebuffer or is the primary rendering device | Interactive prompt to stop display-manager; error if declined or non-interactive | +| **Active processes** | Processes holding `/dev/nvidia*` file descriptors (CUDA jobs, monitoring) | Error if non-display processes; interactive prompt if only display servers | +| **IOMMU enabled** | `/sys/kernel/iommu_groups/` exists and the GPU has a group assignment | Error: "IOMMU is not enabled — add intel_iommu=on or amd_iommu=on to kernel cmdline" | +| **VFIO modules loaded** | `vfio-pci` and `vfio_iommu_type1` kernel modules are loaded | Error: "vfio-pci kernel module not loaded — run: sudo modprobe vfio-pci" | +| **Permissions** | Write access to sysfs bind/unbind and `/dev/vfio/` | Error: "insufficient permissions — run as root or with CAP_NET_ADMIN" | + +### Multi-GPU selection (`--gpu` auto mode) + +On hosts with multiple NVIDIA GPUs, the launcher selects a GPU using this priority: + +1. **Already on vfio-pci** with a clean IOMMU group — use immediately (no rebind needed). +2. **Idle (no processes, no display)** — preferred for binding. +3. **Skip** GPUs with active displays or running processes. + +If no GPU passes all safety checks, the launcher fails with per-device status listing what blocked each GPU. + +## Host preparation + +The launcher handles GPU driver binding automatically. The host only needs IOMMU and VFIO kernel modules configured. + +### 1. Enable IOMMU + +IOMMU must be enabled in both BIOS/UEFI and the Linux kernel. + +**Intel systems:** + +```shell +# Add to kernel command line (e.g. /etc/default/grub GRUB_CMDLINE_LINUX) +intel_iommu=on iommu=pt +``` + +**AMD systems:** + +```shell +# AMD IOMMU is usually enabled by default; verify or add: +amd_iommu=on iommu=pt +``` + +After editing, run `update-grub` (or equivalent) and reboot. 
Verify IOMMU is active: + +```shell +dmesg | grep -i iommu +# Should show: "DMAR: IOMMU enabled" or "AMD-Vi: AMD IOMMUv2" +``` + +### 2. Load VFIO kernel modules + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 + +# Persist across reboots +echo "vfio-pci" | sudo tee /etc/modules-load.d/vfio-pci.conf +echo "vfio_iommu_type1" | sudo tee /etc/modules-load.d/vfio_iommu_type1.conf +``` + +### 3. Device permissions + +The launcher needs root (or `CAP_NET_ADMIN`) to bind/unbind GPU drivers and configure TAP networking: + +```shell +# Option A: run as root (simplest) +sudo openshell-vm --gpu + +# Option B: set udev rules for /dev/vfio/<group> access (still needs sysfs write via root) +echo 'SUBSYSTEM=="vfio", OWNER="root", GROUP="kvm", MODE="0660"' | \ + sudo tee /etc/udev/rules.d/99-vfio.rules +sudo udevadm control --reload-rules +sudo usermod -aG kvm $USER +``` + +### What the launcher does automatically + +When `--gpu` is passed, the launcher performs the following steps that previously required manual intervention: + +1. **Identifies NVIDIA GPUs** via sysfs (`/sys/bus/pci/devices/*/vendor`) +2. **Runs safety checks** — display, active processes, IOMMU, VFIO modules (see Safety checks above) +3. **Unbinds from nvidia** — writes to `/sys/bus/pci/devices/<bdf>/driver/unbind` +4. **Sets driver override** — writes `vfio-pci` to `/sys/bus/pci/devices/<bdf>/driver_override` +5. **Binds to vfio-pci** — writes to `/sys/bus/pci/drivers/vfio-pci/bind` +6. **Handles IOMMU group peers** — binds other devices in the same IOMMU group to `vfio-pci` +7. **On shutdown** — reverses all bindings, clears `driver_override`, rebinds to `nvidia` + +## Single-GPU caveats + +When the host has only one NVIDIA GPU: + +- **Display manager prompt.** When the GPU drives an active display or is held by a display server (Xorg, Wayland compositor), the launcher detects this and prompts the user interactively: + + ```text + WARNING: GPU 0000:2d:00.0 is in use by the display manager. 
+ Display server processes: Xorg (PID 1234) + Active display outputs are connected to this GPU. + + Stopping the display manager will terminate your graphical session. + You will lose access to any open GUI applications. + + The display manager will be restarted automatically when the VM exits. + Stop display-manager and proceed with GPU passthrough? [y/N] + ``` + + If the user confirms, the launcher runs `systemctl stop display-manager`, waits for Xorg to release the GPU, then proceeds with VFIO binding. A `DisplayManagerGuard` ensures that `systemctl start display-manager` is called when the VM exits (clean shutdown, Ctrl+C, error, or panic). In non-interactive mode (stdin is not a TTY), the prompt is skipped and the launcher exits with an error instructing the user to stop the display manager manually. +- **Recovery is automatic.** When the VM exits (clean shutdown, Ctrl+C, or process crash), the launcher rebinds the GPU to the `nvidia` driver, clears `driver_override`, and restarts the display manager if it was stopped. No manual intervention is needed. +- **Process check.** If non-display CUDA processes are also using the GPU (visible via `/dev/nvidia*` file descriptors), the prompt warns about those processes too. The launcher lists all PIDs and process names so the user can make an informed decision. + +## Supported GPUs + +GPU passthrough is validated with NVIDIA data center GPUs. Consumer GPUs may work but are not officially supported (NVIDIA restricts GeForce passthrough in some driver versions). 
+ +| GPU | Architecture | Compute Capability | Status | +|-----|-------------|-------------------|--------| +| A100 | Ampere | 8.0 | Supported | +| A30 | Ampere | 8.0 | Supported | +| H100 | Hopper | 9.0 | Supported | +| H200 | Hopper | 9.0 | Supported | +| L40 | Ada Lovelace | 8.9 | Supported | +| L40S | Ada Lovelace | 8.9 | Supported | +| L4 | Ada Lovelace | 8.9 | Supported | + +## GPU build pipeline + +GPU passthrough requires NVIDIA kernel modules compiled against the VM kernel. The full build pipeline is: + +```shell +# 1. Build kernel from source (needed for module compilation) +FROM_SOURCE=1 mise run vm:setup + +# 2. Compile NVIDIA .ko files against the VM kernel +mise run vm:nvidia-modules + +# 3. Build GPU rootfs and inject kernel modules +mise run vm:rootfs -- --base --gpu + +# 4. Compile binary and package runtime +mise run vm:build +``` + +### NVIDIA kernel module build (`vm:nvidia-modules`) + +The `build-nvidia-modules.sh` script clones [NVIDIA/open-gpu-kernel-modules](https://github.com/NVIDIA/open-gpu-kernel-modules) at the tag pinned by `NVIDIA_DRIVER_TAG` in `pins.env` and compiles the open kernel modules against the VM kernel source tree produced by `build-libkrun.sh`. + +The driver tag must match the exact version of `nvidia-headless-570-open` installed in the guest rootfs. A mismatch causes "API mismatch" errors from `nvidia-smi`. The current pin is `570.211.01`. + +The build produces these modules: + +| Module | Purpose | +|--------|---------| +| `nvidia.ko` | Core GPU driver | +| `nvidia-uvm.ko` | Unified Virtual Memory (CUDA managed memory) | +| `nvidia-modeset.ko` | Display mode setting | +| `nvidia-drm.ko` | DRM/KMS integration | +| `nvidia-peermem.ko` | GPUDirect RDMA (optional) | + +### Module injection (`vm:rootfs --gpu`) + +When `build-rootfs.sh` runs with `--gpu`, it: + +1. Reads `kernel-version.txt` (exported by `build-libkrun.sh`) to determine the kernel release string. +2. 
Copies `.ko` files from `target/libkrun-build/nvidia-modules/` into the rootfs at `/lib/modules/<kernel-version>/kernel/drivers/video/nvidia/`. +3. Runs `depmod` to generate module dependency metadata so `modprobe` works at boot. + +The VM init script loads `nvidia`, `nvidia_uvm`, and `nvidia_modeset` during boot when `GPU_ENABLED=true` is set on the kernel command line. + +## CLI usage + +### Auto-select GPU + +```shell +# openshell-vm binary (VM backend directly) +sudo openshell-vm --gpu + +# openshell CLI (gateway deployment — requires VM backend) +OPENSHELL_GATEWAY_BACKEND=vm sudo openshell gateway start --gpu +``` + +> **Note:** The default gateway backend is Docker (containers). GPU passthrough +> requires the VM backend. Set `OPENSHELL_GATEWAY_BACKEND=vm` (or `microvm`) +> to use the VM path with `openshell gateway start`. + +### Specific PCI address (multi-GPU hosts) + +```shell +sudo openshell-vm --gpu 0000:41:00.0 +``` + +### Backend selection + +The `--backend` flag controls hypervisor selection independently of `--gpu`: + +```shell +sudo openshell-vm --gpu # auto: selects QEMU for GPU +sudo openshell-vm --backend qemu # explicit QEMU, no GPU +sudo openshell-vm --gpu --backend qemu # force QEMU with GPU +sudo openshell-vm --backend libkrun # explicit libkrun (no GPU support) +``` + +### Diagnostics + +When `--gpu` is passed, the launcher runs safety checks before unbinding. If +checks fail, it exits with an actionable error: + +```text +$ sudo openshell-vm --gpu +GPU passthrough blocked by safety checks. + + Detected devices: + 0000:41:00.0: has active display outputs + 0000:42:00.0: in use by PIDs: 12345 (python3), 12400 (nvidia-smi) + + No GPU is available for passthrough. 
+``` + +On a headless server with an idle GPU, the pre-unbind preparation runs first: + +```text +$ sudo openshell-vm --gpu +GPU 0000:41:00.0: disabled nvidia persistence mode +GPU 0000:41:00.0: unloaded nvidia_uvm +GPU 0000:41:00.0: unloaded nvidia_drm +GPU 0000:41:00.0: unloaded nvidia_modeset +GPU 0000:41:00.0: device already unbound after nvidia module cleanup +GPU: binding 0000:41:00.0 for VFIO passthrough +``` + +On shutdown (Ctrl+C or VM exit), the original driver is restored: + +```text +^C +GPU: restoring 0000:41:00.0 (cleanup) +GPU: rebinding 0000:41:00.0 to nvidia +``` + +## VM Networking (QEMU) + +QEMU uses TAP-based networking instead of the gvproxy user-mode networking used by the libkrun backend. This has several implications for connectivity and port forwarding. + +### Network topology + +``` +Host Guest (microVM) +───────────────────────────────────── ────────────────────────── + eth0 (or primary NIC) eth0 (virtio-net) + ↕ ↕ + iptables MASQUERADE ←── NAT ──→ 192.168.249.2/24 + ↕ ↕ default gw 192.168.249.1 + vmtap0 (TAP device) ↕ + 192.168.249.1/24 ←─── L2 bridge ──→ (kernel routes) + ↕ + 127.0.0.1:{port} ←── TCP proxy ──→ {port} (k3s NodePort) +``` + +### How it works + +The QEMU backend configures networking in three layers: + +**1. TAP device and guest IP assignment** + +QEMU creates a TAP device on the host side with IP `192.168.249.1/24`. The guest is assigned `192.168.249.2/24` via kernel command line parameters (`VM_NET_IP`, `VM_NET_GW`, `VM_NET_DNS`). The init script reads these from `/proc/cmdline` and uses them as the static fallback when DHCP is unavailable. + +**2. Host-side NAT and IP forwarding** + +After booting the VM, the launcher: +- Enables IP forwarding (`/proc/sys/net/ipv4/ip_forward`) +- Adds iptables MASQUERADE rules for the `192.168.249.0/24` subnet +- Adds FORWARD rules to allow traffic to/from the VM + +This gives the guest internet access through the host. Rules are cleaned up on VM shutdown. + +**3. 
TCP port forwarding** + +Unlike gvproxy (which provides built-in port forwarding), TAP networking requires explicit port forwarding. The launcher starts a userspace TCP proxy for each port mapping (e.g., `30051:30051`). The proxy binds to `127.0.0.1:{host_port}` and forwards connections to `192.168.249.2:{guest_port}`. + +### DNS resolution + +The launcher detects the host's upstream DNS server using a three-step lookup: + +1. Reads `/etc/resolv.conf` and picks the first nameserver that does not start with `127.` (skipping systemd-resolved's `127.0.0.53` stub and other loopback addresses). +2. If all nameservers in `/etc/resolv.conf` are loopback, falls back to `/run/systemd/resolve/resolv.conf` (the upstream resolv.conf maintained by systemd-resolved). +3. If no non-loopback nameserver is found in either file, falls back to `8.8.8.8`. + +The resolved DNS server is passed to the guest via `VM_NET_DNS=<ip>` on the kernel command line. The init script writes it to `/etc/resolv.conf` inside the guest, unconditionally overriding any stale entries from previous boot cycles. 
+ +### Key constants + +| Constant | Value | Purpose | +|----------|-------|---------| +| `TAP_HOST_IP` | `192.168.249.1` | Host side of the TAP device | +| `TAP_GUEST_IP` | `192.168.249.2` | Guest static IP | +| `TAP_SUBNET` | `192.168.249.0/24` | Subnet for iptables rules | +| `TAP_NETMASK` | `255.255.255.0` | Subnet mask in VM payload | + +### Differences from libkrun/gvproxy networking + +| Feature | libkrun + gvproxy | QEMU + TAP | +|---------|------------------|-----------| +| Network mode | User-mode (SLIRP-like) | Kernel TAP device | +| DHCP | Built-in (gvproxy) | None (static IP via cmdline) | +| Guest IP | `192.168.127.2/24` | `192.168.249.2/24` | +| Port forwarding | Built-in (gvproxy `-forward`) | Userspace TCP proxy | +| Privileges | Unprivileged | Root or `CAP_NET_ADMIN` | +| NAT | Handled by gvproxy | iptables MASQUERADE | +| DNS | gvproxy provides | Host resolver passed via cmdline | + +### Troubleshooting networking + +**"lookup registry-1.docker.io: Try again" (DNS failure)** + +The VM cannot resolve DNS. Check: + +```shell +# Verify the host DNS is non-loopback +grep nameserver /etc/resolv.conf +# If only 127.0.0.53 (systemd-resolved), find the upstream: +resolvectl status | grep 'DNS Servers' + +# Verify iptables rules are in place +sudo iptables -t nat -L POSTROUTING -n -v | grep 192.168.249 +sudo iptables -L FORWARD -n -v | grep 192.168.249 + +# Verify IP forwarding is enabled +cat /proc/sys/net/ipv4/ip_forward +``` + +**Gateway health check fails (port 30051 unreachable)** + +The TCP port forwarder may not have started, or the guest service is not yet listening: + +```shell +# Check if the port forwarder is bound on the host +ss -tlnp | grep 30051 + +# Check if the guest is reachable +ping -c1 192.168.249.2 +``` + +### Host mTLS cache and state disk + +The launcher caches mTLS certificates on the host after the first successful boot (warm boot path). 
If the state disk is deleted or `--reset` is used, the VM generates new PKI that won't match the cached certs. The launcher detects this — when the state disk is freshly created or reset, it clears the stale host mTLS cache and runs the cold-boot PKI fetch path. This prevents `transport error` failures on the gateway health check after a state disk reset. + +## Troubleshooting + +### "no NVIDIA PCI device found" + +The host has no NVIDIA GPU installed, or the PCI device is not visible: + +```shell +lspci -nn | grep -i nvidia +# If empty, the GPU is not detected at the PCI level +``` + +### "has active display outputs" / "in use by display manager" + +The GPU drives a DRM framebuffer or is held by a display server (Xorg, Wayland compositor). If running interactively, the launcher prompts to stop the display manager. If running non-interactively or the user declines, options: + +- Use a different GPU for the monitor (iGPU, secondary card) +- Stop the display manager manually: `sudo systemctl stop display-manager` +- On headless servers, this should not occur — verify with `ls /sys/class/drm/card*/device` + +### "in use by PIDs: ..." + +Active non-display processes hold `/dev/nvidia*` file descriptors. The check is host-wide (across all NVIDIA GPUs, not per-device). The launcher lists the PIDs and process names. Stop those processes before retrying. If the only processes are display servers (Xorg, gnome-shell, etc.), the launcher will offer to stop the display manager instead. + +### "IOMMU not enabled or device has no IOMMU group" + +IOMMU must be enabled in both BIOS/UEFI and kernel cmdline. See Host Preparation above. + +### "VFIO kernel modules not loaded" + +```shell +sudo modprobe vfio-pci +sudo modprobe vfio_iommu_type1 +``` + +### "insufficient sysfs permissions — run as root" + +The launcher needs root to write to sysfs bind/unbind paths. Run with `sudo`. 
+ +### GPU not rebound after crash + +If the launcher process is killed with `SIGKILL` (kill -9), the cleanup handler cannot run and the GPU remains on `vfio-pci`. Manually rebind: + +```shell +PCI_ADDR="0000:41:00.0" +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver/unbind +echo "" | sudo tee /sys/bus/pci/devices/$PCI_ADDR/driver_override +echo "$PCI_ADDR" | sudo tee /sys/bus/pci/drivers/nvidia/bind +``` + +### nvidia driver unbind deadlock (kernel bug) + +Some nvidia driver versions deadlock in their sysfs `unbind` handler — the `write()` syscall to `/sys/bus/pci/drivers/nvidia/unbind` never returns. When this happens, the subprocess enters uninterruptible sleep (D state) and becomes unkillable even by `SIGKILL`. The GPU's PCI subsystem state is corrupted and all subsequent PCI operations on the device hang. Only a host reboot clears this state. + +This is a kernel/nvidia driver bug, not an openshell-vm issue. Three mitigation layers are in place: + +1. **Pre-unbind preparation**: Before the raw sysfs unbind, the launcher disables nvidia persistence mode (`nvidia-smi -pm 0`) and unloads nvidia submodules (`nvidia_uvm`, `nvidia_drm`, `nvidia_modeset`) via `modprobe -r`. This often cascade-removes the base nvidia module entirely, unbinding the device automatically without ever touching the dangerous sysfs path. + +2. **Subprocess isolation with timeout**: All sysfs writes (and the nvidia prep commands) run in a subprocess with a timeout (10s for sysfs, 15s for prep). On timeout, the subprocess is killed and dropped without calling `wait()` — preventing the parent process from being dragged into D-state. + +3. **Post-timeout verification**: If the unbind subprocess times out but the device is actually unbound at the hardware level (which the nvidia bug can cause — the operation completes but the syscall never returns), the launcher detects this and continues with the VFIO bind. 
+ +If you hit this issue repeatedly, check for nvidia driver updates or file a bug with NVIDIA. + +### VM boots but `nvidia-smi` fails inside guest + +- Verify the GPU rootfs includes NVIDIA drivers: `chroot /path/to/rootfs which nvidia-smi` +- Check that NVIDIA kernel modules load: `openshell-vm exec -- lsmod | grep nvidia` +- Inspect dmesg for NVIDIA driver errors: `openshell-vm exec -- dmesg | grep -i nvidia` + +## Related + +- [Custom VM Runtime](custom-vm-runtime.md) — building and customizing the libkrun VM runtime +- [System Architecture](system-architecture.md) — overall OpenShell architecture +- Implementation: + - [`crates/openshell-vfio/src/lib.rs`](../crates/openshell-vfio/src/lib.rs) — GPU binding and VFIO setup + - [`crates/openshell-vm/src/backend/qemu.rs`](../crates/openshell-vm/src/backend/qemu.rs) — QEMU backend diff --git a/crates/openshell-bootstrap/src/docker.rs b/crates/openshell-bootstrap/src/docker.rs index be086e534..e62a6e13d 100644 --- a/crates/openshell-bootstrap/src/docker.rs +++ b/crates/openshell-bootstrap/src/docker.rs @@ -26,12 +26,13 @@ const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell"; /// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a /// concrete device ID based on whether CDI is enabled on the daemon. 
///
-/// | Input        | Output                                                       |
-/// |--------------|--------------------------------------------------------------|
-/// | `[]`         | `[]` — no GPU                                                |
-/// | `["legacy"]` | `["legacy"]` — pass through to the non-CDI fallback path     |
-/// | `["auto"]`   | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]`   |
-/// | `[cdi-ids…]` | unchanged                                                    |
+/// | Input               | Output                                                       |
+/// |---------------------|--------------------------------------------------------------|
+/// | `[]`                | `[]` — no GPU                                                |
+/// | `["vm-passthrough"]`| `["vm-passthrough"]` — GPU via QEMU/VFIO, no Docker device   |
+/// | `["legacy"]`        | `["legacy"]` — pass through to the non-CDI fallback path     |
+/// | `["auto"]`          | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]`   |
+/// | `[cdi-ids…]`        | unchanged                                                    |
 pub(crate) fn resolve_gpu_device_ids(gpu: &[String], cdi_enabled: bool) -> Vec<String> {
     match gpu {
         [] => vec![],
@@ -622,6 +623,11 @@ pub async fn ensure_container(
     // Docker resolves them against the host CDI spec at /etc/cdi/
     match device_ids {
         [] => {}
+        [id] if id == "vm-passthrough" => {
+            // GPU passthrough is handled by QEMU/VFIO inside the container,
+            // not by Docker. No DeviceRequest needed — GPU_ENABLED=true
+            // (set below) deploys the NVIDIA device plugin in k3s.
+ } [id] if id == "legacy" => { host_config.device_requests = Some(vec![DeviceRequest { driver: Some("nvidia".to_string()), @@ -1436,6 +1442,13 @@ mod tests { ); } + #[test] + fn resolve_gpu_vm_passthrough() { + let ids = vec!["vm-passthrough".to_string()]; + assert_eq!(resolve_gpu_device_ids(&ids, true), ids); + assert_eq!(resolve_gpu_device_ids(&ids, false), ids); + } + #[test] fn resolve_gpu_cdi_ids_passthrough() { let ids = vec!["nvidia.com/gpu=all".to_string()]; diff --git a/crates/openshell-cli/Cargo.toml b/crates/openshell-cli/Cargo.toml index b3a006fdd..20ba1e5f7 100644 --- a/crates/openshell-cli/Cargo.toml +++ b/crates/openshell-cli/Cargo.toml @@ -21,6 +21,7 @@ openshell-policy = { path = "../openshell-policy" } openshell-providers = { path = "../openshell-providers" } openshell-prover = { path = "../openshell-prover" } openshell-tui = { path = "../openshell-tui" } +openshell-vfio = { path = "../openshell-vfio" } serde = { workspace = true } serde_json = { workspace = true } prost-types = { workspace = true } diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 292922411..05d1fb7c1 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -807,18 +807,21 @@ enum GatewayCommands { #[arg(long, env = "OPENSHELL_REGISTRY_TOKEN")] registry_token: Option, - /// Enable NVIDIA GPU passthrough. + /// Enable NVIDIA GPU support for the gateway cluster. /// - /// Passes all host GPUs into the cluster container and deploys the - /// NVIDIA k8s-device-plugin so Kubernetes workloads can request - /// `nvidia.com/gpu` resources. Requires NVIDIA drivers and the - /// NVIDIA Container Toolkit on the host. + /// **Docker path (default):** passes GPUs into the gateway container via + /// the NVIDIA Container Toolkit — CDI when the daemon supports it, else + /// Docker's `--gpus all` — and deploys the NVIDIA device plugin. 
Use
+    /// `--gpu` or `--gpu auto` only; PCI addresses are not valid CDI device
+    /// names on this path.
+    ///
+    /// **MicroVM path:** set `OPENSHELL_GATEWAY_BACKEND=vm` for deployments
+    /// that use the VM gateway. Then you may pass `--gpu` / `--gpu auto` for
+    /// VFIO auto-select, or `--gpu 0000:41:00.0` (PCI BDF) for a specific GPU.
+    /// Requires IOMMU and the GPU bound to `vfio-pci`. See
+    /// `architecture/vm-gpu-passthrough.md`.
+    #[arg(long, num_args = 0..=1, default_missing_value = "auto")]
+    gpu: Option<String>,
 },

 /// Stop the gateway (preserves state).
@@ -1129,10 +1132,9 @@ enum SandboxCommands {
     /// Request GPU resources for the sandbox.
     ///
     /// When no gateway is running, auto-bootstrap starts a GPU-enabled
-    /// gateway using the same automatic injection selection as
-    /// `openshell gateway start --gpu`. GPU intent is also inferred
-    /// automatically for known GPU-designated image names such as
-    /// `nvidia-gpu`.
+    /// gateway using the Docker NVIDIA path (`--gpu auto`), same as
+    /// `openshell gateway start --gpu` without the microVM backend. GPU
+    /// intent is also inferred for known GPU image names (e.g. `nvidia-gpu`).
#[arg(long)]
     gpu: bool,

@@ -1655,12 +1657,11 @@ async fn main() -> Result<()> {
             registry_token,
             gpu,
         } => {
-            let gpu = if gpu {
-                vec!["auto".to_string()]
-            } else {
-                vec![]
+            let gpu = match gpu {
+                Some(val) => vec![val],
+                None => vec![],
             };
-            run::gateway_admin_deploy(
+            let _gpu_guard = run::gateway_admin_deploy(
                 &name,
                 remote.as_deref(),
                 ssh_key.as_deref(),
diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index c41b53518..a104ace4d 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -1434,7 +1434,9 @@ pub async fn gateway_admin_deploy(
     registry_username: Option<&str>,
     registry_token: Option<&str>,
     gpu: Vec<String>,
-) -> Result<()> {
+) -> Result<Option<openshell_vfio::GpuBindGuard>> {
+    let (gpu, gpu_guard) = prepare_gateway_deploy_gpu(gpu, remote.as_deref())?;
+
     let location = if remote.is_some() { "remote" } else { "local" };

     // Build remote options once so we can reuse them for the existence check
@@ -1457,7 +1459,7 @@ pub async fn gateway_admin_deploy(
                 "{} Gateway '{name}' is already running.",
                 "✓".green().bold()
             );
-            return Ok(());
+            return Ok(gpu_guard);
         }
     }
 }
@@ -1518,7 +1520,7 @@ pub async fn gateway_admin_deploy(
     save_active_gateway(name)?;
     eprintln!("{} Active gateway set to '{name}'", "✓".green().bold());

-    Ok(())
+    Ok(gpu_guard)
 }

 /// Resolve the remote SSH destination for a gateway.
@@ -5193,6 +5195,125 @@ fn format_timestamp_ms(ms: i64) -> String {
     }
 }

+/// Environment variable selecting the gateway deployment backend for GPU checks.
+///
+/// VFIO sysfs probes apply only to the microVM (`openshell-vm`) deploy path.
+/// The default `openshell gateway start` flow uses Docker with the NVIDIA
+/// Container Toolkit; leave this unset for that path.
+const OPENSHELL_GATEWAY_BACKEND_ENV: &str = "OPENSHELL_GATEWAY_BACKEND";
+
+fn gateway_deploy_uses_vm_backend() -> bool {
+    std::env::var(OPENSHELL_GATEWAY_BACKEND_ENV)
+        .ok()
+        .map(|v| {
+            matches!(
+                v.trim().to_ascii_lowercase().as_str(),
+                "vm" | "microvm" | "openshell-vm"
+            )
+        })
+        .unwrap_or(false)
+}
+
+/// Heuristic: value looks like a PCI domain:bus:dev.fn address (Linux sysfs BDF).
+fn looks_like_pci_bdf(s: &str) -> bool {
+    let s = s.trim();
+    let rest = if let Some((prefix, after_colon)) = s.split_once(':') {
+        if prefix.len() == 4 && prefix.chars().all(|c| c.is_ascii_hexdigit()) {
+            after_colon
+        } else {
+            s
+        }
+    } else {
+        return false;
+    };
+
+    let Some((bus, dev_fn)) = rest.split_once(':') else {
+        return false;
+    };
+    if bus.len() != 2 || !bus.chars().all(|c| c.is_ascii_hexdigit()) {
+        return false;
+    }
+    let Some((dev, func)) = dev_fn.split_once('.') else {
+        return false;
+    };
+    if dev.len() != 2 || !dev.chars().all(|c| c.is_ascii_hexdigit()) {
+        return false;
+    }
+    if func.len() != 1 || !func.chars().all(|c| ('0'..='7').contains(&c)) {
+        return false;
+    }
+    true
+}
+
+/// Validate `--gpu` for `gateway start`, run VFIO checks only for the VM deploy path,
+/// and normalize Docker-path requests to CDI-compatible `auto`.
+fn prepare_gateway_deploy_gpu(
+    gpu: Vec<String>,
+    remote: Option<&str>,
+) -> Result<(Vec<String>, Option<openshell_vfio::GpuBindGuard>)> {
+    if gpu.is_empty() {
+        return Ok((gpu, None));
+    }
+
+    if gateway_deploy_uses_vm_backend() {
+        if remote.is_none() {
+            let guard = check_gpu_readiness(&gpu)?;
+            // Signal that GPU is enabled but passthrough is handled by QEMU/VFIO,
+            // not by Docker CDI. The bootstrap sets GPU_ENABLED=true for the
+            // k3s NVIDIA device plugin but skips Docker DeviceRequests.
+ let updated_gpu = vec!["vm-passthrough".to_string()]; + return Ok((updated_gpu, Some(guard))); + } else { + eprintln!( + "{} Local VFIO GPU probe skipped (--remote): GPU readiness is checked on the remote host during deployment.", + "ℹ".cyan().bold() + ); + } + return Ok((gpu, None)); + } + + let Some(first) = gpu.first() else { + return Ok((gpu, None)); + }; + if first.as_str() != "auto" { + if looks_like_pci_bdf(first) { + return Err(miette!( + "PCI address GPU selection ({first}) is only supported for the microVM gateway backend.\n\n\ + `openshell gateway start` uses Docker by default (NVIDIA Container Toolkit / CDI, or Docker `--gpus all`). \ + Use `--gpu` or `--gpu auto` for that path.\n\n\ + For VFIO passthrough, set {}=vm and follow architecture/vm-gpu-passthrough.md.", + OPENSHELL_GATEWAY_BACKEND_ENV, + )); + } + return Err(miette!( + "Unrecognized --gpu value `{first}` for Docker gateway deploy. Use `--gpu` or `--gpu auto`.", + )); + } + + Ok((vec!["auto".to_string()], None)) +} + +/// Bind a GPU for VFIO passthrough and return an RAII guard that restores it on drop. 
+fn check_gpu_readiness(gpu: &[String]) -> Result<openshell_vfio::GpuBindGuard> {
+    use openshell_vfio::{GpuBindGuard, prepare_gpu_for_passthrough};
+
+    let requested_addr = gpu
+        .first()
+        .filter(|v| v.as_str() != "auto")
+        .map(|v| v.as_str());
+
+    let bind_state = prepare_gpu_for_passthrough(requested_addr).map_err(|e| miette!("{e}"))?;
+
+    eprintln!(
+        "{} GPU {} bound to vfio-pci (was: {})",
+        "✓".green().bold(),
+        bind_state.pci_addr,
+        bind_state.original_driver,
+    );
+
+    Ok(GpuBindGuard::new(bind_state))
+}
+
 #[cfg(test)]
 mod tests {
     use super::{
@@ -5416,6 +5537,16 @@ mod tests {
         assert!(sandbox_should_persist(false, Some(&spec)));
     }

+    #[test]
+    fn looks_like_pci_bdf_recognizes_sysfs_addresses() {
+        assert!(super::looks_like_pci_bdf("0000:41:00.0"));
+        assert!(super::looks_like_pci_bdf("41:00.0"));
+        assert!(super::looks_like_pci_bdf(" 0a:1f.7 "));
+        assert!(!super::looks_like_pci_bdf("auto"));
+        assert!(!super::looks_like_pci_bdf("nvidia.com/gpu=all"));
+        assert!(!super::looks_like_pci_bdf("00:00.8")); // invalid function
+    }
+
     #[test]
     fn image_requests_gpu_matches_known_gpu_image_names() {
         for image in [
diff --git a/crates/openshell-vfio/Cargo.toml b/crates/openshell-vfio/Cargo.toml
new file mode 100644
index 000000000..d4c4f32de
--- /dev/null
+++ b/crates/openshell-vfio/Cargo.toml
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-vfio" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +description = "Host-side NVIDIA GPU VFIO bind/unbind for VM passthrough" + +[dependencies] +nix = { workspace = true } + +[dev-dependencies] +tempfile = "3" + +[lints] +workspace = true diff --git a/crates/openshell-vfio/src/lib.rs b/crates/openshell-vfio/src/lib.rs new file mode 100644 index 000000000..675928db7 --- /dev/null +++ b/crates/openshell-vfio/src/lib.rs @@ -0,0 +1,2935 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Host-side NVIDIA GPU VFIO bind/unbind for VM passthrough. + +//! +//! This module scans Linux sysfs (`/sys/bus/pci/devices`) for NVIDIA GPUs +//! (vendor ID `0x10de`), checks their driver binding, and verifies IOMMU +//! group cleanliness — the prerequisites for passing a physical GPU into +//! a VM via VFIO. +//! +//! Returns per-device readiness for multi-GPU hosts. +//! +//! On non-Linux platforms, probing returns an empty list. + +use std::fmt; +use std::path::PathBuf; +use std::time::Duration; + +/// Per-device readiness state for NVIDIA GPU VFIO passthrough. +/// +/// Each variant represents a distinct readiness state for a single PCI device. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum HostNvidiaVfioReadiness { + /// The current platform does not support VFIO passthrough (non-Linux). + UnsupportedPlatform, + + /// No PCI device with NVIDIA vendor ID (`0x10de`) was found. + NoNvidiaDevice, + + /// An NVIDIA device exists but is bound to the nvidia (or other non-VFIO) driver. + BoundToNvidia, + + /// An NVIDIA device is bound to `vfio-pci` and its IOMMU group is clean — ready for passthrough. 
+ VfioBoundReady, + + /// An NVIDIA device is bound to `vfio-pci` but its IOMMU group contains + /// devices not bound to `vfio-pci`, which prevents safe passthrough. + VfioBoundDirtyGroup, + + /// Some NVIDIA devices are bound to `vfio-pci` while others use + /// a different driver (mixed fleet). + MixedVfioAndOther, +} + +impl fmt::Display for HostNvidiaVfioReadiness { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::UnsupportedPlatform => write!( + f, + "VFIO passthrough is not supported on this platform (Linux required)" + ), + Self::NoNvidiaDevice => write!(f, "no NVIDIA PCI device found"), + Self::BoundToNvidia => { + write!(f, "NVIDIA device found but not bound to vfio-pci driver") + } + Self::VfioBoundReady => write!( + f, + "NVIDIA device bound to vfio-pci and IOMMU group is clean" + ), + Self::VfioBoundDirtyGroup => write!( + f, + "NVIDIA device bound to vfio-pci but IOMMU group contains non-VFIO devices" + ), + Self::MixedVfioAndOther => write!( + f, + "some NVIDIA devices are on vfio-pci while others use a different driver" + ), + } + } +} + +const NVIDIA_VENDOR_ID: &str = "0x10de"; + +#[cfg(target_os = "linux")] +const SYSFS_WRITE_TIMEOUT: Duration = Duration::from_secs(10); + +/// Reject sysfs data containing characters outside the safe set for shell +/// interpolation. All legitimate sysfs writes in this crate use PCI BDF +/// addresses, driver names, or single digits — this blocks anything else. +#[cfg(target_os = "linux")] +fn validate_sysfs_data(data: &str) -> Result<(), std::io::Error> { + if data.is_empty() + || data + .bytes() + .all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b'.' 
|| b == b':') + { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("sysfs data contains unexpected characters: {data:?}"), + )) + } +} + +#[cfg(target_os = "linux")] +fn sysfs_write_with_timeout( + path: &std::path::Path, + data: &str, + timeout: Duration, +) -> Result<(), std::io::Error> { + use std::process::{Command, Stdio}; + use std::thread; + + if data.is_empty() { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!( + "sysfs_write_with_timeout called with empty data for {}", + path.display() + ), + )); + } + validate_sysfs_data(data)?; + + let mut child = Command::new("sh") + .arg("-c") + .arg(format!( + r#"printf '%s' '{}' > '{}'"#, + data.replace('\'', "'\\''"), + path.display().to_string().replace('\'', "'\\''") + )) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!( + "failed to spawn sysfs write subprocess for {}: {e}", + path.display() + ), + ) + })?; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + if status.success() { + return Ok(()); + } + let mut stderr_buf = String::new(); + if let Some(mut stderr) = child.stderr.take() { + use std::io::Read; + let _ = stderr.read_to_string(&mut stderr_buf); + } + let hint = if stderr_buf.contains("Permission denied") { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "sysfs write to {} failed (exit {}){hint}: {stderr_buf}", + path.display(), + status.code().unwrap_or(-1), + ), + )); + } + Ok(None) => { + if start.elapsed() > timeout { + let pid = child.id(); + let _ = child.kill(); + // CRITICAL: Do NOT call child.wait() here. 
If the child + // is stuck in uninterruptible sleep (D-state) — which is + // the nvidia unbind deadlock scenario — wait() will block + // the parent indefinitely, making it unkillable too. + // + // Dropping the Child struct closes pipe handles but does + // NOT wait. The zombie child is reparented to init and + // reaped when/if it eventually exits. + drop(child); + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "sysfs write to {} timed out after {:.0}s (subprocess pid {pid}) — \ + possible nvidia driver deadlock. The subprocess may still be \ + stuck in kernel space; a reboot may be required to clear it.", + path.display(), + timeout.as_secs_f64(), + ), + )); + } + thread::sleep(poll_interval); + } + Err(e) => return Err(e), + } + } +} + +/// Check whether a PCI device supports MSI-X by walking the PCI capability +/// list in the sysfs `config` file. MSI-X is capability ID `0x11`. +/// +/// MSI-X support is tracked for informational purposes. QEMU handles +/// devices with or without MSI-X via legacy interrupt emulation fallback. +#[cfg(target_os = "linux")] +pub fn check_msix_support(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let config_path = sysfs.sys_bus_pci_devices().join(pci_addr).join("config"); + let config = match std::fs::read(&config_path) { + Ok(data) => data, + Err(_) => return false, + }; + + // PCI config space: capability pointer at offset 0x34. + if config.len() < 0x35 { + return false; + } + + // Status register (offset 0x06, bit 4) indicates capability list present. + if config.len() > 0x07 && (config[0x06] & 0x10) == 0 { + return false; + } + + // PCI spec: capability pointers are DWORD-aligned (low 2 bits reserved). + let mut cap_ptr = (config[0x34] & 0xFC) as usize; + // Walk the capability linked list (max 48 iterations to avoid infinite loops). 
+ for _ in 0..48 { + if cap_ptr == 0 || cap_ptr + 1 >= config.len() { + break; + } + let cap_id = config[cap_ptr]; + if cap_id == 0x11 { + return true; + } + cap_ptr = (config[cap_ptr + 1] & 0xFC) as usize; + } + false +} + +#[cfg(not(target_os = "linux"))] +pub fn check_msix_support(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +/// Validates that `addr` matches the PCI BDF format `DDDD:BB:DD.F`. +fn validate_pci_addr(addr: &str) -> Result<(), std::io::Error> { + let bytes = addr.as_bytes(); + let valid = bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' + && bytes[..4].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[5..7].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[8..10].iter().all(|b| b.is_ascii_hexdigit()) + && bytes[11] >= b'0' + && bytes[11] <= b'7'; + if valid { + Ok(()) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("invalid PCI address '{addr}': expected DDDD:BB:DD.F format"), + )) + } +} + +/// Probe the host for NVIDIA GPU VFIO readiness by scanning Linux sysfs. +/// +/// Returns a per-device list of `(pci_address, readiness)` tuples for every +/// NVIDIA GPU found. On non-Linux platforms the list is empty. +/// +/// On Linux, walks `/sys/bus/pci/devices/` and for each device: +/// 1. Reads `vendor` to check for NVIDIA (`0x10de`). +/// 2. Reads the `driver` symlink to determine which kernel driver is bound. +/// 3. If bound to `vfio-pci`, inspects the `iommu_group/devices/` directory +/// to verify all group members are also on `vfio-pci`. 
+pub fn probe_host_nvidia_vfio_readiness() -> Vec<(String, HostNvidiaVfioReadiness)> { + #[cfg(not(target_os = "linux"))] + { + Vec::new() + } + + #[cfg(target_os = "linux")] + { + probe_linux_sysfs() + } +} + +#[cfg(target_os = "linux")] +fn probe_linux_sysfs() -> Vec<(String, HostNvidiaVfioReadiness)> { + use std::fs; + use std::path::Path; + + let pci_devices = Path::new("/sys/bus/pci/devices"); + let entries = match fs::read_dir(pci_devices) { + Ok(e) => e, + Err(_) => return Vec::new(), + }; + + let mut results = Vec::new(); + + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + + let vendor = match fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + + if vendor != NVIDIA_VENDOR_ID { + continue; + } + + let pci_addr = entry.file_name().to_string_lossy().to_string(); + + let driver_link = dev_path.join("driver"); + let driver_name = fs::read_link(&driver_link).ok().and_then(|target| { + target + .file_name() + .map(|name| name.to_string_lossy().to_string()) + }); + + let state = match driver_name.as_deref() { + Some("vfio-pci") => { + let iommu_group_devices = dev_path.join("iommu_group/devices"); + let group_clean = match fs::read_dir(&iommu_group_devices) { + Ok(group_entries) => group_entries.filter_map(Result::ok).all(|ge| { + let peer_path = iommu_group_devices.join(ge.file_name()).join("driver"); + fs::read_link(&peer_path) + .ok() + .and_then(|t| t.file_name().map(|n| n.to_string_lossy().to_string())) + .as_deref() + == Some("vfio-pci") + }), + Err(_) => false, + }; + + if group_clean { + HostNvidiaVfioReadiness::VfioBoundReady + } else { + HostNvidiaVfioReadiness::VfioBoundDirtyGroup + } + } + _ => HostNvidiaVfioReadiness::BoundToNvidia, + }; + + results.push((pci_addr, state)); + } + + results +} + +/// Returns whether any NVIDIA GPU is fully available for VM passthrough. +/// +/// Requires `OPENSHELL_VM_GPU_E2E=1` to activate probing. 
When the env var
+/// is unset or not `"1"`, returns `false` unconditionally so non-GPU CI
+/// runners are never affected.
+///
+/// When activated, checks two conditions:
+/// 1. At least one NVIDIA device reports [`VfioBoundReady`].
+/// 2. The QEMU binary (`qemu-system-x86_64`) exists in `runtime_dir` or on PATH (if provided).
+pub fn nvidia_gpu_available_for_vm_passthrough(runtime_dir: Option<PathBuf>) -> bool {
+    if std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() != Ok("1") {
+        return false;
+    }
+
+    let has_vfio_ready = probe_host_nvidia_vfio_readiness()
+        .iter()
+        .any(|(_, state)| *state == HostNvidiaVfioReadiness::VfioBoundReady);
+
+    if !has_vfio_ready {
+        return false;
+    }
+
+    let has_qemu = runtime_dir
+        .map(|dir| dir.join("qemu-system-x86_64").is_file())
+        .unwrap_or(false);
+    let has_qemu_on_path = std::process::Command::new("qemu-system-x86_64")
+        .arg("--version")
+        .stdout(std::process::Stdio::null())
+        .stderr(std::process::Stdio::null())
+        .status()
+        .is_ok();
+
+    has_qemu || has_qemu_on_path
+}
+
+/// Sysfs root path, defaulting to "/" in production and a temp dir in tests.
+#[derive(Debug, Clone)] +pub struct SysfsRoot(PathBuf); + +impl Default for SysfsRoot { + fn default() -> Self { + Self(PathBuf::from("/")) + } +} + +impl SysfsRoot { + #[cfg(test)] + pub fn new(root: PathBuf) -> Self { + Self(root) + } + + pub fn sys_bus_pci_devices(&self) -> PathBuf { + self.0.join("sys/bus/pci/devices") + } + + pub fn sys_class_drm(&self) -> PathBuf { + self.0.join("sys/class/drm") + } + + pub fn sys_module(&self, module: &str) -> PathBuf { + self.0.join("sys/module").join(module) + } + + pub fn sys_bus_pci_drivers(&self, driver: &str) -> PathBuf { + self.0.join("sys/bus/pci/drivers").join(driver) + } + + pub fn sys_kernel_iommu_groups(&self) -> PathBuf { + self.0.join("sys/kernel/iommu_groups") + } + + fn is_real_sysfs(&self) -> bool { + self.0 == std::path::Path::new("/") + } + + #[cfg(target_os = "linux")] + fn write_sysfs(&self, path: &std::path::Path, data: &str) -> Result<(), std::io::Error> { + if self.is_real_sysfs() { + if data.is_empty() { + // Clearing a sysfs attribute requires a direct write() syscall. + // Shell-based approaches (`printf '%s' '' > file`) produce zero + // bytes of output, and sysfs doesn't support truncation — so the + // kernel store function is never invoked and the attribute keeps + // its old value. A direct write("\n") always works: the kernel + // strips trailing newlines in store functions like + // driver_override_store(), resulting in an empty string that + // clears the attribute. Uses O_WRONLY only (no O_CREAT/O_TRUNC) + // for sysfs compatibility. This path does NOT use the timeout + // wrapper because clearing attributes never hangs — unlike driver + // unbind which can deadlock in nvidia's remove(). 
+ use std::io::Write; + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(path) + .map_err(|e| { + std::io::Error::new( + e.kind(), + format!("failed to open {} for clearing: {e}", path.display()), + ) + })?; + f.write_all(b"\n").map_err(|e| { + std::io::Error::new( + e.kind(), + format!("failed to write newline to {}: {e}", path.display()), + ) + })?; + return Ok(()); + } + sysfs_write_with_timeout(path, data, SYSFS_WRITE_TIMEOUT) + } else { + std::fs::write(path, data).map_err(|e| { + std::io::Error::new(e.kind(), format!("failed to write {}: {e}", path.display())) + }) + } + } +} + +#[cfg(target_os = "linux")] +pub fn check_display_attached(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use std::fs; + + let drm_dir = sysfs.sys_class_drm(); + let entries = match fs::read_dir(&drm_dir) { + Ok(e) => e, + Err(_) => return false, + }; + + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if !name.starts_with("card") || name.contains('-') { + continue; + } + + let card_dir = entry.path(); + let device_link = card_dir.join("device"); + + let target = match fs::read_link(&device_link) { + Ok(t) => t, + Err(_) => continue, + }; + if !target.to_string_lossy().ends_with(pci_addr) { + continue; + } + + let boot_vga_path = card_dir.join("device").join("boot_vga"); + if let Ok(val) = fs::read_to_string(&boot_vga_path) { + if val.trim() == "1" { + return true; + } + } + + if let Ok(sub_entries) = fs::read_dir(&card_dir) { + for sub in sub_entries.filter_map(Result::ok) { + let sub_name = sub.file_name().to_string_lossy().to_string(); + if sub_name.starts_with(&format!("{name}-")) { + if let Ok(status) = fs::read_to_string(sub.path().join("status")) { + if status.trim() == "connected" { + return true; + } + } + } + } + } + } + + false +} + +#[cfg(not(target_os = "linux"))] +pub fn check_display_attached(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +/// Checks 
whether any process on the host has an open handle to an NVIDIA GPU +/// device (`/dev/nvidia*`). This is a host-wide check across ALL NVIDIA GPUs, +/// not scoped to a single PCI address. Returns a list of (pid, comm) pairs. +pub fn check_active_gpu_processes() -> std::io::Result> { + use std::fs; + + let mut result = Vec::new(); + + let proc_dir = match fs::read_dir("/proc") { + Ok(d) => d, + Err(e) => { + return Err(std::io::Error::new( + e.kind(), + format!( + "cannot scan /proc for active GPU processes: {e} — \ + refusing to unbind (fail-closed)" + ), + )); + } + }; + + for proc_entry in proc_dir.filter_map(Result::ok) { + let pid: u32 = match proc_entry.file_name().to_string_lossy().parse() { + Ok(p) => p, + Err(_) => continue, + }; + + let fd_dir = proc_entry.path().join("fd"); + let fds = match fs::read_dir(&fd_dir) { + Ok(d) => d, + Err(_) => continue, + }; + + for fd_entry in fds.filter_map(Result::ok) { + if let Ok(target) = fs::read_link(fd_entry.path()) { + if target.to_string_lossy().starts_with("/dev/nvidia") { + let comm = fs::read_to_string(format!("/proc/{pid}/comm")) + .unwrap_or_default() + .trim() + .to_string(); + result.push((pid, comm)); + break; + } + } + } + } + + Ok(result) +} + +#[cfg(not(target_os = "linux"))] +pub fn check_active_gpu_processes() -> std::io::Result> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub fn check_iommu_enabled(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let iommu_groups = sysfs.sys_kernel_iommu_groups(); + if !iommu_groups.is_dir() { + return false; + } + sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group") + .exists() +} + +#[cfg(not(target_os = "linux"))] +pub fn check_iommu_enabled(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub fn check_vfio_modules_loaded(sysfs: &SysfsRoot) -> bool { + sysfs.sys_module("vfio_pci").is_dir() && sysfs.sys_module("vfio_iommu_type1").is_dir() +} + +#[cfg(not(target_os = "linux"))] +pub fn 
check_vfio_modules_loaded(_sysfs: &SysfsRoot) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub fn check_sysfs_permissions(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + use nix::unistd::{AccessFlags, access}; + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + let driver_override = dev_dir.join("driver_override"); + let unbind = dev_dir.join("driver/unbind"); + let bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + + let writable = |path: &std::path::Path| -> bool { access(path, AccessFlags::W_OK).is_ok() }; + + let unbind_ok = !unbind.exists() || writable(&unbind); + writable(&driver_override) && unbind_ok && writable(&bind) +} + +#[cfg(not(target_os = "linux"))] +pub fn check_sysfs_permissions(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +#[cfg(target_os = "linux")] +pub fn current_driver(sysfs: &SysfsRoot, pci_addr: &str) -> Option { + let driver_link = sysfs.sys_bus_pci_devices().join(pci_addr).join("driver"); + std::fs::read_link(&driver_link) + .ok() + .and_then(|target| target.file_name().map(|n| n.to_string_lossy().to_string())) +} + +#[cfg(not(target_os = "linux"))] +pub fn current_driver(_sysfs: &SysfsRoot, _pci_addr: &str) -> Option { + None +} + +/// Nvidia kernel modules that hold internal references to GPU devices and can +/// prevent a clean unbind. Unloaded in order (most-dependent first). +#[cfg(target_os = "linux")] +const NVIDIA_SUBMODULES: &[&str] = &["nvidia_uvm", "nvidia_drm", "nvidia_modeset"]; + +/// Timeout for nvidia prep commands (nvidia-smi, modprobe). These commands +/// can wedge if the nvidia driver is in a bad state. +#[cfg(target_os = "linux")] +const NVIDIA_PREP_TIMEOUT: Duration = Duration::from_secs(15); + +/// Run a command with a timeout. Returns `Some(ExitStatus)` on success, +/// `None` on timeout or spawn failure. On timeout, kills the child and +/// drops it without calling `wait()` (same D-state safety as sysfs writes). 
+#[cfg(target_os = "linux")] +fn run_with_timeout( + mut cmd: std::process::Command, + timeout: Duration, +) -> Option { + use std::thread; + + let mut child = match cmd.spawn() { + Ok(c) => c, + Err(_) => return None, + }; + + let poll_interval = Duration::from_millis(100); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => return Some(status), + Ok(None) => { + if start.elapsed() > timeout { + let _ = child.kill(); + drop(child); + return None; + } + thread::sleep(poll_interval); + } + Err(_) => return None, + } + } +} + +/// Best-effort preparation of the nvidia driver before a raw sysfs unbind. +/// +/// Reduces the probability of the nvidia unbind deadlock by: +/// 1. Disabling persistence mode (nvidia-persistenced holds device refs). +/// 2. Unloading nvidia submodules that keep internal references open. +/// +/// All commands run with a timeout — if `nvidia-smi` or `modprobe` hangs +/// (which can happen when the nvidia driver is in a bad state), the parent +/// process is not blocked. Failures are logged but not fatal. +#[cfg(target_os = "linux")] +fn nvidia_pre_unbind_prep(pci_addr: &str) { + use std::process::{Command, Stdio}; + + // 1. Disable persistence mode via nvidia-smi (if available). + let mut cmd = Command::new("nvidia-smi"); + cmd.args(["-i", pci_addr, "-pm", "0"]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: disabled nvidia persistence mode"); + } + None => { + eprintln!( + "GPU {pci_addr}: nvidia-smi timed out after {:.0}s — skipping persistence mode", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + + // 2. Unload nvidia submodules that hold device references. + // This is best-effort — modules may be in use by other GPUs. 
+ for module in NVIDIA_SUBMODULES { + let mut cmd = Command::new("modprobe"); + cmd.args(["-r", module]) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU {pci_addr}: unloaded {module}"); + } + None => { + eprintln!( + "GPU {pci_addr}: modprobe -r {module} timed out after {:.0}s", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + } + _ => {} + } + } +} + +/// Reset a PCI device to clear stale IOMMU state after VFIO passthrough. +/// +/// Tries the device's own `reset` file (FLR) first. If that doesn't exist, +/// locates the parent PCI bridge and triggers a secondary bus reset (SBR). +/// Either reset clears stale IOMMU page table entries that would otherwise +/// cause `RmInitAdapter` failures when the nvidia driver initialises. +#[cfg(target_os = "linux")] +fn pci_reset_device(sysfs: &SysfsRoot, pci_addr: &str) { + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + // Try device-level FLR first. + let device_reset = dev_dir.join("reset"); + if device_reset.exists() { + eprintln!("GPU {pci_addr}: performing PCI function-level reset"); + match sysfs.write_sysfs(&device_reset, "1") { + Ok(()) => { + std::thread::sleep(Duration::from_secs(1)); + eprintln!("GPU {pci_addr}: FLR complete"); + return; + } + Err(e) => { + eprintln!("GPU {pci_addr}: FLR failed ({e}), trying bridge SBR"); + } + } + } + + // Fall back to secondary bus reset on the parent bridge. The sysfs + // device path is a symlink whose real path encodes the PCI topology: + // /sys/devices/pci0000:00/0000:00:03.1/0000:2d:00.0 + // The parent directory (0000:00:03.1) is the bridge. 
+ if let Ok(real) = std::fs::canonicalize(&dev_dir) { + if let Some(bridge_dir) = real.parent() { + let bridge_reset = bridge_dir.join("reset"); + if bridge_reset.exists() { + let bridge_name = bridge_dir + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown"); + eprintln!("GPU {pci_addr}: performing secondary bus reset on bridge {bridge_name}"); + if let Err(e) = std::fs::write(&bridge_reset, "1") { + eprintln!("GPU {pci_addr}: bridge SBR failed: {e}"); + } else { + std::thread::sleep(Duration::from_secs(1)); + eprintln!("GPU {pci_addr}: SBR complete"); + } + } + } + } +} + +/// Reload nvidia kernel modules so the driver's sysfs bind file exists. +/// +/// Called during restore to ensure `modprobe nvidia` brings back the driver +/// that `nvidia_pre_unbind_prep` may have unloaded. Loads the base `nvidia` +/// module plus its dependent submodules in the correct order. +#[cfg(target_os = "linux")] +fn nvidia_reload_modules() { + use std::process::{Command, Stdio}; + + // Load in dependency order: base module first, then dependents. + // If the base "nvidia" module fails, skip submodules (they depend on it). 
+ for (i, module) in ["nvidia", "nvidia_modeset", "nvidia_uvm", "nvidia_drm"] + .iter() + .enumerate() + { + let mut cmd = Command::new("modprobe"); + cmd.arg(module) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::piped()); + match run_with_timeout(cmd, NVIDIA_PREP_TIMEOUT) { + Some(s) if s.success() => { + eprintln!("GPU: loaded {module} for restore"); + } + None => { + eprintln!( + "GPU: modprobe {module} timed out after {:.0}s during restore", + NVIDIA_PREP_TIMEOUT.as_secs_f64() + ); + break; + } + Some(s) => { + eprintln!( + "GPU: modprobe {module} exited {} during restore (non-fatal)", + s.code().unwrap_or(-1) + ); + if i == 0 { + break; + } + } + } + } +} + +#[cfg(target_os = "linux")] +pub fn bind_gpu_to_vfio(sysfs: &SysfsRoot, pci_addr: &str) -> Result { + validate_pci_addr(pci_addr)?; + let drv = current_driver(sysfs, pci_addr); + + if drv.as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + if drv.is_some() { + let is_nvidia = drv.as_deref() == Some("nvidia"); + if is_nvidia && sysfs.is_real_sysfs() { + nvidia_pre_unbind_prep(pci_addr); + + // nvidia_pre_unbind_prep may cascade-remove the nvidia module when + // all submodules are unloaded, which automatically unbinds the device. + // Re-check before attempting the sysfs unbind write. + if current_driver(sysfs, pci_addr).is_none() { + eprintln!("GPU {pci_addr}: device already unbound after nvidia module cleanup"); + } else if current_driver(sysfs, pci_addr).as_deref() == Some("vfio-pci") { + return Ok("vfio-pci".to_string()); + } + } + + // Only attempt the sysfs unbind if a driver is still bound. 
+ if current_driver(sysfs, pci_addr).is_some() { + let unbind = dev_dir.join("driver/unbind"); + let unbind_result = sysfs.write_sysfs(&unbind, pci_addr); + + if let Err(ref e) = unbind_result { + if e.kind() == std::io::ErrorKind::TimedOut { + // The nvidia unbind deadlock can complete the unbind at the + // hardware level while the syscall never returns to userspace. + // Check if the device is actually unbound despite the timeout. + if current_driver(sysfs, pci_addr).is_none() { + eprintln!( + "GPU {pci_addr}: sysfs unbind timed out but device is unbound — \ + continuing (zombie subprocess may linger until reboot)" + ); + } else { + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!( + "Failed to unbind {pci_addr}: timed out and device is still \ + bound to {}. A reboot may be required.", + drv.as_deref().unwrap_or("unknown"), + ), + )); + } + } else { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } + } + } + + let driver_override = dev_dir.join("driver_override"); + if let Err(e) = sysfs.write_sysfs(&driver_override, "vfio-pci") { + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to write driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + + let vfio_bind = sysfs.sys_bus_pci_drivers("vfio-pci").join("bind"); + if let Err(e) = sysfs.write_sysfs(&vfio_bind, pci_addr) { + let _ = sysfs.write_sysfs(&driver_override, ""); + if let Some(ref orig) = drv { + let orig_bind = sysfs.sys_bus_pci_drivers(orig).join("bind"); + let _ = 
sysfs.write_sysfs(&orig_bind, pci_addr); + } + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind to vfio-pci at {path}{hint} — is the vfio-pci module loaded?", + path = vfio_bind.display() + ), + )); + } + + // When the device had no driver (e.g. nvidia modules were already unloaded + // from a previous crash, or display-manager was stopped), infer the restore + // target from vendor + PCI class so the right driver is rebound on exit. + let original = match drv { + Some(d) if !d.is_empty() => d, + _ => { + let vendor = std::fs::read_to_string(dev_dir.join("vendor")) + .map(|v| v.trim().to_lowercase()) + .unwrap_or_default(); + let class = std::fs::read_to_string(dev_dir.join("class")) + .map(|c| c.trim().to_lowercase()) + .unwrap_or_default(); + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x0403") { + // NVIDIA HDA audio companion (HDMI/DP audio) + eprintln!( + "GPU {pci_addr}: no driver was bound, defaulting restore target to snd_hda_intel (audio device)" + ); + "snd_hda_intel".to_string() + } else if vendor == NVIDIA_VENDOR_ID { + eprintln!( + "GPU {pci_addr}: no driver was bound, defaulting restore target to nvidia" + ); + "nvidia".to_string() + } else { + String::new() + } + } + }; + + Ok(original) +} + +#[cfg(not(target_os = "linux"))] +pub fn bind_gpu_to_vfio(_sysfs: &SysfsRoot, _pci_addr: &str) -> Result { + Ok(String::new()) +} + +#[cfg(target_os = "linux")] +pub fn rebind_gpu_to_original( + sysfs: &SysfsRoot, + pci_addr: &str, + original_driver: &str, +) -> Result<(), std::io::Error> { + validate_pci_addr(pci_addr)?; + let dev_dir = sysfs.sys_bus_pci_devices().join(pci_addr); + + // Restore is best-effort: attempt every step even if earlier ones fail, + // so a partial failure (e.g. unbind succeeds but driver_override clear + // fails) doesn't leave the device in a worse state than before. 
Track + // the first error to return at the end. + let mut first_err: Option = None; + + // Step 1: Unbind from the current driver. Without this, modprobe for + // the original driver fails with "No such device" because the kernel + // still considers the PCI slot claimed. + let cur_drv = current_driver(sysfs, pci_addr); + if cur_drv.as_deref() == Some("vfio-pci") { + let vfio_unbind = sysfs.sys_bus_pci_drivers("vfio-pci").join("unbind"); + if let Err(e) = sysfs.write_sysfs(&vfio_unbind, pci_addr) { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + eprintln!( + "GPU {pci_addr}: failed to unbind from vfio-pci at {}{hint} — continuing restore", + vfio_unbind.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind {pci_addr} from vfio-pci at {path}{hint}", + path = vfio_unbind.display() + ), + )); + } + } + } else if cur_drv.is_some() { + let unbind = dev_dir.join("driver/unbind"); + if let Err(e) = sysfs.write_sysfs(&unbind, pci_addr) { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + eprintln!( + "GPU {pci_addr}: failed to unbind from {} at {}{hint} — continuing restore", + cur_drv.as_deref().unwrap_or("unknown"), + unbind.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to unbind device at {path}{hint}", + path = unbind.display() + ), + )); + } + } + } + + // Step 2: Clear driver_override so modprobe can claim the device. This + // is required even when the device is already unbound — a killed VM + // process can leave driver_override set to "vfio-pci" with no driver + // actually bound. 
+ let driver_override = dev_dir.join("driver_override"); + if let Err(e) = sysfs.write_sysfs(&driver_override, "") { + let hint = if e.kind() == std::io::ErrorKind::PermissionDenied { + " — run as root" + } else { + "" + }; + eprintln!( + "GPU {pci_addr}: failed to clear driver_override at {}{hint} — continuing restore", + driver_override.display() + ); + if first_err.is_none() { + first_err = Some(std::io::Error::new( + e.kind(), + format!( + "Failed to clear driver_override at {path}{hint}", + path = driver_override.display() + ), + )); + } + } + + // Step 3: PCI device reset to clear stale IOMMU state. + // After VFIO passthrough (especially on AMD-Vi systems), the GPU may + // retain stale IOMMU page table entries. Without a reset, modprobe + // nvidia fails with RmInitAdapter errors and IO_PAGE_FAULTs. + if sysfs.is_real_sysfs() { + pci_reset_device(sysfs, pci_addr); + } + + // Step 4: Reload modules and bind to the original driver. + if !original_driver.is_empty() && original_driver != "none" { + if original_driver == "nvidia" && sysfs.is_real_sysfs() { + nvidia_reload_modules(); + } else if sysfs.is_real_sysfs() { + let _ = std::process::Command::new("modprobe") + .arg(original_driver) + .output(); + } + + // modprobe may have auto-bound the device (now that driver_override is + // cleared). Skip the explicit bind if already on the right driver. 
+ let cur = current_driver(sysfs, pci_addr); + if cur.as_deref() == Some(original_driver) { + eprintln!("GPU {pci_addr}: already bound to {original_driver}"); + } else { + let bind = sysfs.sys_bus_pci_drivers(original_driver).join("bind"); + if let Err(e) = sysfs.write_sysfs(&bind, pci_addr) { + eprintln!( + "GPU {pci_addr}: explicit bind to {original_driver} failed ({e}), \ + falling back to PCI rescan" + ); + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + if let Err(rescan_err) = sysfs.write_sysfs(&rescan, "1") { + eprintln!("GPU {pci_addr}: PCI rescan write failed: {rescan_err}"); + } + std::thread::sleep(Duration::from_secs(1)); + + match current_driver(sysfs, pci_addr) { + None => { + let bind_err = std::io::Error::new( + e.kind(), + format!( + "Failed to restore {pci_addr} to {original_driver}: \ + explicit bind and PCI rescan both failed. \ + Manual fix:\n \ + echo {pci_addr} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind\n \ + echo | sudo tee /sys/bus/pci/devices/{pci_addr}/driver_override\n \ + sudo modprobe {original_driver}" + ), + ); + if first_err.is_none() { + first_err = Some(bind_err); + } + } + Some(new_drv) => { + eprintln!("GPU {pci_addr}: PCI rescan bound device to {new_drv}"); + } + } + } + } + } else { + let rescan = sysfs.0.join("sys/bus/pci/rescan"); + if let Err(rescan_err) = sysfs.write_sysfs(&rescan, "1") { + eprintln!("GPU {pci_addr}: PCI rescan write failed: {rescan_err}"); + } + } + + if first_err.is_none() { + if current_driver(sysfs, pci_addr).is_none() { + eprintln!( + "GPU {pci_addr}: warning: driver link missing in sysfs after restore \ + (nvidia-smi may still work via character devices). 
\ + To re-create the sysfs link: echo {pci_addr} | sudo tee /sys/bus/pci/drivers/{original_driver}/bind" + ); + } + } + + match first_err { + Some(e) => Err(e), + None => Ok(()), + } +} + +#[cfg(not(target_os = "linux"))] +pub fn rebind_gpu_to_original( + _sysfs: &SysfsRoot, + _pci_addr: &str, + _original_driver: &str, +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +pub fn iommu_group_peers(sysfs: &SysfsRoot, pci_addr: &str) -> Result, std::io::Error> { + validate_pci_addr(pci_addr)?; + let iommu_devices = sysfs + .sys_bus_pci_devices() + .join(pci_addr) + .join("iommu_group/devices"); + + let entries = match std::fs::read_dir(&iommu_devices) { + Ok(e) => e, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(vec![]), + Err(e) => return Err(e), + }; + + let mut peers = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if name != pci_addr { + peers.push(name); + } + } + Ok(peers) +} + +#[cfg(not(target_os = "linux"))] +pub fn iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub fn bind_iommu_group_peers( + sysfs: &SysfsRoot, + pci_addr: &str, +) -> Result, std::io::Error> { + let peers = iommu_group_peers(sysfs, pci_addr)?; + let mut restore_list = Vec::new(); + + for peer in peers { + match bind_gpu_to_vfio(sysfs, &peer) { + Ok(original) => { + if original != "vfio-pci" { + restore_list.push((peer, original)); + } + } + Err(e) => { + let _ = rebind_iommu_group_peers(sysfs, &restore_list); + return Err(std::io::Error::new( + e.kind(), + format!( + "Failed to bind IOMMU peer {peer}: {e}. 
Rolled back {} peer(s).", + restore_list.len() + ), + )); + } + } + } + + Ok(restore_list) +} + +#[cfg(not(target_os = "linux"))] +pub fn bind_iommu_group_peers( + _sysfs: &SysfsRoot, + _pci_addr: &str, +) -> Result, std::io::Error> { + Ok(vec![]) +} + +#[cfg(target_os = "linux")] +pub fn rebind_iommu_group_peers( + sysfs: &SysfsRoot, + peers: &[(String, String)], +) -> Result<(), std::io::Error> { + let mut first_err = None; + for (peer_addr, original_driver) in peers { + if let Err(e) = rebind_gpu_to_original(sysfs, peer_addr, original_driver) { + eprintln!("IOMMU peer {peer_addr}: failed to restore to {original_driver}: {e}"); + if first_err.is_none() { + first_err = Some(e); + } + } + } + match first_err { + Some(e) => Err(e), + None => Ok(()), + } +} + +#[cfg(not(target_os = "linux"))] +pub fn rebind_iommu_group_peers( + _sysfs: &SysfsRoot, + _peers: &[(String, String)], +) -> Result<(), std::io::Error> { + Ok(()) +} + +#[cfg(target_os = "linux")] +fn is_iommu_group_clean(sysfs: &SysfsRoot, pci_addr: &str) -> bool { + let peers = match iommu_group_peers(sysfs, pci_addr) { + Ok(p) => p, + Err(_) => return false, + }; + peers + .iter() + .all(|peer| current_driver(sysfs, peer).as_deref() == Some("vfio-pci")) +} + +#[cfg(not(target_os = "linux"))] +fn is_iommu_group_clean(_sysfs: &SysfsRoot, _pci_addr: &str) -> bool { + false +} + +/// Discover IOMMU group peers already on vfio-pci (inherited from a previous +/// session) and infer their original driver from the PCI class code so they +/// can be restored on exit. 
+#[cfg(target_os = "linux")] +fn inherited_peer_binds(sysfs: &SysfsRoot, gpu_addr: &str) -> Vec<(String, String)> { + iommu_group_peers(sysfs, gpu_addr) + .unwrap_or_default() + .into_iter() + .filter(|peer| peer != gpu_addr) + .filter_map(|peer| { + if current_driver(sysfs, &peer).as_deref() != Some("vfio-pci") { + return None; + } + let class = + std::fs::read_to_string(sysfs.sys_bus_pci_devices().join(&peer).join("class")) + .unwrap_or_default() + .trim() + .to_lowercase(); + // 0x0403xx = multimedia audio controller — typically snd_hda_intel + let orig = if class.starts_with("0x0403") { + "snd_hda_intel" + } else { + "nvidia" + }; + Some((peer, orig.to_string())) + }) + .collect() +} + +#[cfg(not(target_os = "linux"))] +fn inherited_peer_binds(_sysfs: &SysfsRoot, _gpu_addr: &str) -> Vec<(String, String)> { + vec![] +} + +/// Captures the bind state for a GPU so it can be restored on shutdown. +#[derive(Debug)] +pub struct GpuBindState { + /// PCI address of the GPU that was bound. + pub pci_addr: String, + /// Driver the GPU was on before binding (e.g. "nvidia"). + pub original_driver: String, + /// IOMMU group peers that were rebound, with their original drivers. + pub peer_binds: Vec<(String, String)>, + /// Whether this instance performed the bind (false if GPU was already on vfio-pci). + pub did_bind: bool, + /// Whether the GPU supports MSI-X (informational; QEMU handles both cases). + pub has_msix: bool, +} + +impl GpuBindState { + /// Shell commands to manually restore the GPU and its peers to their + /// original drivers. Useful for printing recovery instructions when + /// the process might be force-killed (SIGKILL). 
+ pub fn recovery_commands(&self) -> String { + let mut cmds = Vec::new(); + + cmds.push(format!( + "echo {} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind", + self.pci_addr + )); + + for (peer_addr, _) in &self.peer_binds { + cmds.push(format!( + "echo {} | sudo tee /sys/bus/pci/drivers/vfio-pci/unbind", + peer_addr + )); + } + + cmds.push(format!( + "echo | sudo tee /sys/bus/pci/devices/{}/driver_override", + self.pci_addr + )); + + for (peer_addr, _) in &self.peer_binds { + cmds.push(format!( + "echo | sudo tee /sys/bus/pci/devices/{}/driver_override", + peer_addr + )); + } + + if self.original_driver == "nvidia" || self.original_driver.is_empty() { + cmds.push("sudo modprobe nvidia".to_string()); + } + + let mut peer_drivers: Vec<&str> = Vec::new(); + for (_, original_drv) in &self.peer_binds { + if !original_drv.is_empty() + && original_drv != "nvidia" + && !peer_drivers.contains(&original_drv.as_str()) + { + peer_drivers.push(original_drv.as_str()); + } + } + for drv in peer_drivers { + cmds.push(format!("sudo modprobe {drv}")); + } + + cmds.join("\n") + } + + /// Restore the GPU and its IOMMU peers to their original drivers. + pub fn restore(&self) -> Result<(), std::io::Error> { + self.restore_with_sysfs(&SysfsRoot::default()) + } + + pub fn restore_with_sysfs(&self, sysfs: &SysfsRoot) -> Result<(), std::io::Error> { + if !self.did_bind { + return Ok(()); + } + + // Restore IOMMU peers (e.g. the HDA audio companion) BEFORE the GPU. + // nvidia_reload_modules() during GPU restore can claim peer devices + // through nvidia-modeset/nvidia-drm if they're still unbound, racing + // with the snd_hda_intel rebind. Restoring peers first avoids this. 
+ let peer_result = rebind_iommu_group_peers(sysfs, &self.peer_binds); + if let Err(ref e) = peer_result { + eprintln!("GPU: peer restore failed: {e}"); + } + + eprintln!( + "GPU: rebinding {} to {}", + self.pci_addr, self.original_driver + ); + let gpu_result = rebind_gpu_to_original(sysfs, &self.pci_addr, &self.original_driver); + + if let Err(ref gpu_err) = gpu_result { + return Err(std::io::Error::new(gpu_err.kind(), gpu_err.to_string())); + } + peer_result + } +} + +/// RAII guard that restores GPU driver binding when dropped. +/// +/// Ensures the GPU is rebound to its original driver on normal exit, +/// early return (?), or panic. Cannot protect against SIGKILL. +pub struct GpuBindGuard { + state: Option, +} + +impl GpuBindGuard { + pub fn new(state: GpuBindState) -> Self { + Self { state: Some(state) } + } + + /// Take the state out, preventing restore on drop. + pub fn disarm(&mut self) -> Option { + self.state.take() + } + + /// Access the inner bind state, if present. + pub fn state(&self) -> Option<&GpuBindState> { + self.state.as_ref() + } + + /// Get the PCI address of the bound GPU, if any. + pub fn pci_addr(&self) -> Option<&str> { + self.state.as_ref().map(|s| s.pci_addr.as_str()) + } +} + +impl Drop for GpuBindGuard { + fn drop(&mut self) { + if let Some(ref state) = self.state { + eprintln!( + "GPU: restoring {} to {} (cleanup)", + state.pci_addr, state.original_driver + ); + if let Err(e) = state.restore() { + eprintln!("GPU: restore failed: {e}"); + } + } + } +} + +/// Known display server process names (matched against `/proc/PID/comm`). +const DISPLAY_SERVER_NAMES: &[&str] = &[ + "Xorg", + "X", + "Xwayland", + "gnome-shell", + "kwin_wayland", + "kwin_x11", + "sway", + "weston", + "mutter", +]; + +/// Returns `true` if `comm` matches a known display server process name. +pub fn is_display_server_process(comm: &str) -> bool { + DISPLAY_SERVER_NAMES.contains(&comm) +} + +/// Information about display manager processes blocking GPU passthrough. 
+/// +/// Returned by [`detect_display_blocker`] when a GPU that would otherwise +/// be eligible for passthrough is held by Xorg or a Wayland compositor. +#[derive(Debug, Clone)] +pub struct DisplayBlockerInfo { + /// PCI address of the GPU blocked by the display manager. + pub pci_addr: String, + /// Display-server processes holding `/dev/nvidia*` device files open. + pub display_processes: Vec<(u32, String)>, + /// Whether the GPU has active display outputs (DRM connectors). + pub has_active_outputs: bool, + /// Non-display processes also holding `/dev/nvidia*` device files. + /// If non-empty, stopping the display manager alone won't free the GPU. + pub other_processes: Vec<(u32, String)>, +} + +/// Detect whether a display manager is blocking GPU passthrough. +/// +/// Returns `Some(info)` when at least one GPU that would otherwise pass +/// safety checks is blocked by display-server processes (Xorg, Wayland +/// compositor) or has active display outputs. The caller can use this to +/// prompt the user to stop the display manager before retrying. +/// +/// Returns `None` when no display-related blocker is detected (GPUs may +/// still be blocked by other issues like missing IOMMU or permissions). 
+pub fn detect_display_blocker(requested_bdf: Option<&str>) -> Option { + detect_display_blocker_with_sysfs(&SysfsRoot::default(), requested_bdf) +} + +#[cfg(target_os = "linux")] +pub fn detect_display_blocker_with_sysfs( + sysfs: &SysfsRoot, + requested_bdf: Option<&str>, +) -> Option { + let addrs: Vec = match requested_bdf { + Some(bdf) => { + if validate_pci_addr(bdf).is_err() { + return None; + } + vec![bdf.to_string()] + } + None => find_nvidia_gpu_addrs(sysfs), + }; + + if addrs.is_empty() { + return None; + } + + let active_procs = check_active_gpu_processes().unwrap_or_default(); + + let display_procs: Vec<(u32, String)> = active_procs + .iter() + .filter(|(_, comm)| is_display_server_process(comm)) + .cloned() + .collect(); + + let other_procs: Vec<(u32, String)> = active_procs + .iter() + .filter(|(_, comm)| !is_display_server_process(comm)) + .cloned() + .collect(); + + for addr in &addrs { + if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") { + continue; + } + + let has_outputs = check_display_attached(sysfs, addr); + + if has_outputs || !display_procs.is_empty() { + return Some(DisplayBlockerInfo { + pci_addr: addr.clone(), + display_processes: display_procs, + other_processes: other_procs, + has_active_outputs: has_outputs, + }); + } + } + + None +} + +#[cfg(not(target_os = "linux"))] +pub fn detect_display_blocker_with_sysfs( + _sysfs: &SysfsRoot, + _requested_bdf: Option<&str>, +) -> Option { + None +} + +/// Find all NVIDIA GPU PCI addresses (class 0x03xxxx) in sysfs. 
+#[cfg(target_os = "linux")] +fn find_nvidia_gpu_addrs(sysfs: &SysfsRoot) -> Vec { + let pci_dir = sysfs.sys_bus_pci_devices(); + let Ok(entries) = std::fs::read_dir(&pci_dir) else { + return vec![]; + }; + + let mut addrs = Vec::new(); + for entry in entries.filter_map(Result::ok) { + let dev_path = entry.path(); + let vendor = match std::fs::read_to_string(dev_path.join("vendor")) { + Ok(v) => v.trim().to_lowercase(), + Err(_) => continue, + }; + let class = match std::fs::read_to_string(dev_path.join("class")) { + Ok(c) => c.trim().to_lowercase(), + Err(_) => continue, + }; + if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") { + addrs.push(entry.file_name().to_string_lossy().to_string()); + } + } + addrs.sort(); + addrs +} + +/// Prepare a GPU for VFIO passthrough: run safety checks, select, and bind. +/// +/// When `requested_bdf` is Some, targets that specific device. +/// When None (auto mode), selects the best available GPU. +/// +/// All safety checks are hard failures — if any check fails, this returns +/// an error and does not bind anything. 
+pub fn prepare_gpu_for_passthrough(
+    requested_bdf: Option<&str>,
+) -> Result<GpuBindState, std::io::Error> {
+    prepare_gpu_with_sysfs(&SysfsRoot::default(), requested_bdf)
+}
+
+/// Same as [`prepare_gpu_for_passthrough`] but against an explicit sysfs
+/// root, so tests can run the full flow inside a tempdir.
+// NOTE(review): the generic parameters on these Result/Vec/Option types were
+// stripped by a text-extraction pass; restored here as
+// `Result<GpuBindState, std::io::Error>` etc. — confirm against the original.
+pub fn prepare_gpu_with_sysfs(
+    sysfs: &SysfsRoot,
+    requested_bdf: Option<&str>,
+) -> Result<GpuBindState, std::io::Error> {
+    match requested_bdf {
+        Some(bdf) => prepare_specific_gpu(sysfs, bdf),
+        None => prepare_auto_gpu(sysfs),
+    }
+}
+
+/// Run all safety checks against one user-requested BDF and bind it.
+/// Unlike auto mode, every failed check is reported for this device.
+fn prepare_specific_gpu(sysfs: &SysfsRoot, bdf: &str) -> Result<GpuBindState, std::io::Error> {
+    validate_pci_addr(bdf)?;
+
+    let dev_dir = sysfs.sys_bus_pci_devices().join(bdf);
+    if !dev_dir.exists() {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            format!("PCI device {bdf} not found in sysfs"),
+        ));
+    }
+
+    // Identity checks: must be an NVIDIA (0x10de) display-class device.
+    let vendor = std::fs::read_to_string(dev_dir.join("vendor"))
+        .map(|v| v.trim().to_lowercase())
+        .unwrap_or_default();
+    if vendor != NVIDIA_VENDOR_ID {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::InvalidInput,
+            format!("PCI device {bdf} is not an NVIDIA device (vendor: {vendor})"),
+        ));
+    }
+    let class = std::fs::read_to_string(dev_dir.join("class"))
+        .map(|c| c.trim().to_lowercase())
+        .unwrap_or_default();
+    if !class.starts_with("0x03") {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::InvalidInput,
+            format!("PCI device {bdf} is not a GPU (class: {class})"),
+        ));
+    }
+
+    // MSI-X absence is a warning, not a failure — QEMU falls back to
+    // legacy interrupt emulation.
+    let has_msix = check_msix_support(sysfs, bdf);
+    if !has_msix {
+        eprintln!("GPU {bdf}: no MSI-X support (QEMU will use legacy interrupt emulation)");
+    }
+
+    // Fast path: device already on vfio-pci with a clean IOMMU group
+    // (inherited from a previous session, e.g. after a crash). Skip the
+    // idle/display checks and take ownership for restore-on-exit.
+    if current_driver(sysfs, bdf).as_deref() == Some("vfio-pci") && is_iommu_group_clean(sysfs, bdf)
+    {
+        let peer_binds = inherited_peer_binds(sysfs, bdf);
+        eprintln!(
+            "GPU {bdf}: already on vfio-pci (inherited from previous session), \
+            will restore to nvidia on exit ({} peer(s) also tracked)",
+            peer_binds.len()
+        );
+        return Ok(GpuBindState {
+            pci_addr: bdf.to_string(),
+            // The true original driver is unknowable here; nvidia is the
+            // only sensible restore target for an NVIDIA GPU.
+            original_driver: "nvidia".to_string(),
+            peer_binds,
+            did_bind: true,
+            has_msix,
+        });
+    }
+
+    if check_display_attached(sysfs, bdf) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            format!("GPU {bdf}: has active display outputs"),
+        ));
+    }
+
+    // Hard failure if idleness cannot be verified — never yank a GPU out
+    // from under running processes.
+    let procs = check_active_gpu_processes().map_err(|e| {
+        std::io::Error::new(
+            e.kind(),
+            format!("GPU {bdf}: cannot verify GPU is idle — {e}"),
+        )
+    })?;
+    if !procs.is_empty() {
+        let desc: Vec<String> = procs
+            .iter()
+            .map(|(pid, comm)| format!("{pid} ({comm})"))
+            .collect();
+        let display_procs: Vec<&str> = procs
+            .iter()
+            .filter(|(_, comm)| is_display_server_process(comm))
+            .map(|(_, comm)| comm.as_str())
+            .collect();
+        let mut msg = format!("GPU {bdf}: in use by PIDs: {}", desc.join(", "));
+        // If a display server (Xorg/gnome-shell/...) holds the GPU, give
+        // the user the exact commands to release and later restore it.
+        if !display_procs.is_empty() {
+            msg.push_str(&format!(
+                "\n\n {} {} a display server \
+                — stop the display manager to release the GPU:\n \
+                sudo systemctl stop display-manager\
+                \n\n The display manager will need to be restarted after the VM exits:\n \
+                sudo systemctl start display-manager",
+                display_procs.join(", "),
+                if display_procs.len() == 1 {
+                    "is"
+                } else {
+                    "are"
+                },
+            ));
+        }
+        return Err(std::io::Error::new(std::io::ErrorKind::Other, msg));
+    }
+
+    if !check_iommu_enabled(sysfs, bdf) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            format!("GPU {bdf}: IOMMU not enabled or device has no IOMMU group"),
+        ));
+    }
+
+    if !check_vfio_modules_loaded(sysfs) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            format!("GPU {bdf}: VFIO kernel modules not loaded"),
+        ));
+    }
+
+    if !check_sysfs_permissions(sysfs, bdf) {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::PermissionDenied,
+            format!("GPU {bdf}: insufficient sysfs permissions — run as root"),
+        ));
+    }
+
+    // Bind the GPU first, then every other device in its IOMMU group
+    // (VFIO requires the whole group). If peer binding fails, roll the
+    // GPU back to its original driver before propagating the error.
+    let original_driver = bind_gpu_to_vfio(sysfs, bdf)?;
+    let peer_binds = match bind_iommu_group_peers(sysfs, bdf) {
+        Ok(peers) => peers,
+        Err(e) => {
+            let _ = rebind_gpu_to_original(sysfs, bdf, &original_driver);
+            return Err(e);
+        }
+    };
+
+    Ok(GpuBindState {
+        pci_addr: bdf.to_string(),
+        original_driver,
+        peer_binds,
+        did_bind: true,
+        has_msix,
+    })
+}
+
+/// Auto-select the best GPU for passthrough.
+///
+/// Selection is two-phase: (1) a GPU already on vfio-pci with a clean
+/// IOMMU group (inherited from a previous session) wins outright; (2)
+/// otherwise idle GPUs passing all safety checks are collected and the
+/// best (MSI-X preferred) is bound. Blocked devices are remembered so
+/// the failure message can explain every one of them.
+// NOTE(review): generic parameters on Result/Option/Vec below were stripped
+// by a text-extraction pass and have been restored — confirm against the
+// original. Unlike find_nvidia_gpu_addrs, an unreadable PCI dir is a hard
+// error here (intentional: auto mode must explain why nothing was found).
+fn prepare_auto_gpu(sysfs: &SysfsRoot) -> Result<GpuBindState, std::io::Error> {
+    let pci_dir = sysfs.sys_bus_pci_devices();
+    let entries = std::fs::read_dir(&pci_dir).map_err(|e| {
+        std::io::Error::new(e.kind(), format!("cannot read {}: {e}", pci_dir.display()))
+    })?;
+
+    let mut nvidia_addrs = Vec::new();
+    for entry in entries.filter_map(Result::ok) {
+        let dev_path = entry.path();
+        let vendor = match std::fs::read_to_string(dev_path.join("vendor")) {
+            Ok(v) => v.trim().to_lowercase(),
+            Err(_) => continue,
+        };
+        let class = match std::fs::read_to_string(dev_path.join("class")) {
+            Ok(c) => c.trim().to_lowercase(),
+            Err(_) => continue,
+        };
+        if vendor == NVIDIA_VENDOR_ID && class.starts_with("0x03") {
+            nvidia_addrs.push(entry.file_name().to_string_lossy().to_string());
+        }
+    }
+
+    if nvidia_addrs.is_empty() {
+        return Err(std::io::Error::new(
+            std::io::ErrorKind::NotFound,
+            "no NVIDIA PCI device found",
+        ));
+    }
+
+    // Deterministic iteration order regardless of readdir order.
+    nvidia_addrs.sort();
+
+    // Phase 1: prefer GPUs already on vfio-pci with clean IOMMU group.
+    // MSI-X GPUs get slight priority (better interrupt performance).
+    let mut vfio_msix: Option<String> = None;
+    let mut vfio_no_msix: Option<String> = None;
+    for addr in &nvidia_addrs {
+        if current_driver(sysfs, addr).as_deref() == Some("vfio-pci")
+            && is_iommu_group_clean(sysfs, addr)
+        {
+            if check_msix_support(sysfs, addr) {
+                if vfio_msix.is_none() {
+                    vfio_msix = Some(addr.clone());
+                }
+            } else if vfio_no_msix.is_none() {
+                vfio_no_msix = Some(addr.clone());
+            }
+        }
+    }
+    if let Some(addr) = vfio_msix {
+        let peer_binds = inherited_peer_binds(sysfs, &addr);
+        eprintln!(
+            "GPU {addr}: already on vfio-pci (inherited from previous session), \
+            will restore to nvidia on exit ({} peer(s) also tracked)",
+            peer_binds.len()
+        );
+        return Ok(GpuBindState {
+            pci_addr: addr,
+            // Inherited binds restore to nvidia — the actual pre-session
+            // driver is unknowable.
+            original_driver: "nvidia".to_string(),
+            peer_binds,
+            did_bind: true,
+            has_msix: true,
+        });
+    }
+    if let Some(ref addr) = vfio_no_msix {
+        let peer_binds = inherited_peer_binds(sysfs, addr);
+        eprintln!("GPU {addr}: no MSI-X support (QEMU will use legacy interrupt emulation)");
+        eprintln!(
+            "GPU {addr}: already on vfio-pci (inherited from previous session), \
+            will restore to nvidia on exit ({} peer(s) also tracked)",
+            peer_binds.len()
+        );
+        return Ok(GpuBindState {
+            pci_addr: addr.clone(),
+            original_driver: "nvidia".to_string(),
+            peer_binds,
+            did_bind: true,
+            has_msix: false,
+        });
+    }
+
+    // Phase 2: try to bind idle GPUs. Collect eligible candidates, then
+    // pick the best one (MSI-X preferred over non-MSI-X).
+    let mut blocked: Vec<(String, String)> = Vec::new();
+    let mut has_display_blocker = false;
+    // One global process scan — any active GPU process conservatively
+    // blocks every candidate, since per-device attribution is unavailable.
+    let active_procs = check_active_gpu_processes()
+        .map_err(|e| std::io::Error::new(e.kind(), format!("cannot verify GPUs are idle — {e}")))?;
+
+    let mut idle_candidates: Vec<(String, bool)> = Vec::new();
+
+    for addr in &nvidia_addrs {
+        // Still on vfio-pci here ⇒ Phase 1 rejected it, so its IOMMU
+        // group must be dirty.
+        if current_driver(sysfs, addr).as_deref() == Some("vfio-pci") {
+            blocked.push((addr.clone(), "IOMMU group not clean".to_string()));
+            continue;
+        }
+
+        if check_display_attached(sysfs, addr) {
+            has_display_blocker = true;
+            blocked.push((addr.clone(), "has active display outputs".to_string()));
+            continue;
+        }
+
+        if !active_procs.is_empty() {
+            let display_names: Vec<&str> = active_procs
+                .iter()
+                .filter(|(_, comm)| is_display_server_process(comm))
+                .map(|(_, comm)| comm.as_str())
+                .collect();
+            if !display_names.is_empty() {
+                has_display_blocker = true;
+            }
+            let desc: Vec<String> = active_procs
+                .iter()
+                .map(|(pid, comm)| format!("{pid} ({comm})"))
+                .collect();
+            blocked.push((addr.clone(), format!("in use by PIDs: {}", desc.join(", "))));
+            continue;
+        }
+
+        if !check_iommu_enabled(sysfs, addr) {
+            blocked.push((addr.clone(), "IOMMU not enabled".to_string()));
+            continue;
+        }
+
+        if !check_vfio_modules_loaded(sysfs) {
+            blocked.push((addr.clone(), "VFIO modules not loaded".to_string()));
+            continue;
+        }
+
+        if !check_sysfs_permissions(sysfs, addr) {
+            blocked.push((addr.clone(), "insufficient sysfs permissions".to_string()));
+            continue;
+        }
+
+        let has_msix = check_msix_support(sysfs, addr);
+        idle_candidates.push((addr.clone(), has_msix));
+    }
+
+    // Sort: MSI-X candidates first (better interrupt performance).
+ idle_candidates.sort_by_key(|(_, has_msix)| !has_msix); + + for (addr, has_msix) in &idle_candidates { + if !has_msix { + eprintln!("GPU {addr}: no MSI-X support (QEMU will use legacy interrupt emulation)"); + } + eprintln!("GPU: binding {addr} for VFIO passthrough"); + let original_driver = bind_gpu_to_vfio(sysfs, addr)?; + let peer_binds = match bind_iommu_group_peers(sysfs, addr) { + Ok(peers) => peers, + Err(e) => { + let _ = rebind_gpu_to_original(sysfs, addr, &original_driver); + return Err(e); + } + }; + + return Ok(GpuBindState { + pci_addr: addr.clone(), + original_driver, + peer_binds, + did_bind: true, + has_msix: *has_msix, + }); + } + + let mut msg = + String::from("GPU passthrough blocked by safety checks.\n\n Detected devices:\n"); + for (addr, reason) in &blocked { + msg.push_str(&format!(" {addr}: {reason}\n")); + } + if has_display_blocker { + msg.push_str( + "\n A display server is using the GPU. \ + Stop the display manager to release it:\n \ + sudo systemctl stop display-manager\ + \n\n The display manager will be restarted automatically if you use the --gpu flag,\ + \n or manually with: sudo systemctl start display-manager\n", + ); + } + msg.push_str("\n No GPU is available for passthrough."); + + Err(std::io::Error::new(std::io::ErrorKind::Other, msg)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::path::Path; + + #[test] + #[allow(unsafe_code)] + fn passthrough_gate_is_false_without_env_var() { + // SAFETY: test runs single-threaded; no other thread reads this var. 
+ unsafe { std::env::remove_var("OPENSHELL_VM_GPU_E2E") }; + assert!( + !nvidia_gpu_available_for_vm_passthrough(None), + "gate must return false when OPENSHELL_VM_GPU_E2E is unset" + ); + } + + #[test] + fn probe_returns_no_device_or_readiness_on_typical_ci() { + let results = probe_host_nvidia_vfio_readiness(); + + #[cfg(not(target_os = "linux"))] + assert!(results.is_empty(), "non-Linux should return empty Vec"); + + #[cfg(target_os = "linux")] + { + // CI machines typically have no NVIDIA GPU bound to vfio-pci. + // Accept an empty list or any per-device readiness state. + for (addr, state) in &results { + assert!(!addr.is_empty(), "PCI address should not be empty"); + assert!( + matches!( + state, + HostNvidiaVfioReadiness::BoundToNvidia + | HostNvidiaVfioReadiness::VfioBoundReady + | HostNvidiaVfioReadiness::VfioBoundDirtyGroup + ), + "unexpected per-device readiness state for {addr}: {state:?}" + ); + } + } + } + + #[test] + fn display_impl_is_meaningful() { + let states = [ + HostNvidiaVfioReadiness::UnsupportedPlatform, + HostNvidiaVfioReadiness::NoNvidiaDevice, + HostNvidiaVfioReadiness::BoundToNvidia, + HostNvidiaVfioReadiness::VfioBoundReady, + HostNvidiaVfioReadiness::VfioBoundDirtyGroup, + HostNvidiaVfioReadiness::MixedVfioAndOther, + ]; + for state in &states { + let msg = format!("{state}"); + assert!(!msg.is_empty(), "Display for {state:?} should not be empty"); + } + } + + /// Build a minimal PCI config space (64 bytes) with a capability list + /// containing a single MSI-X entry (cap ID 0x11) so `check_msix_support` + /// sees the device as passthrough-capable. + fn mock_pci_config_with_msix() -> Vec { + let mut cfg = vec![0u8; 64]; + // Status register (offset 0x06): set bit 4 = capabilities list present. + cfg[0x06] = 0x10; + // Capabilities pointer (offset 0x34): first cap at 0x40. + cfg[0x34] = 0x40; + // Extend to include the capability at offset 0x40. + cfg.resize(0x42, 0); + // Cap at 0x40: ID = 0x11 (MSI-X), next = 0x00 (end of list). 
+ cfg[0x40] = 0x11; + cfg[0x41] = 0x00; + cfg + } + + fn mock_pci_device(root: &Path, pci_addr: &str, vendor: &str, driver: Option<&str>) { + use std::fs; + let dev_dir = root.join("sys/bus/pci/devices").join(pci_addr); + fs::create_dir_all(&dev_dir).unwrap(); + fs::write(dev_dir.join("vendor"), vendor).unwrap(); + fs::write(dev_dir.join("class"), "0x030000").unwrap(); + fs::write(dev_dir.join("config"), mock_pci_config_with_msix()).unwrap(); + if let Some(drv) = driver { + let driver_dir = root.join("sys/bus/pci/drivers").join(drv); + fs::create_dir_all(&driver_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink(&driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(dev_dir.join("driver_override"), "").unwrap(); + } + + fn mock_drm_card(root: &Path, card: &str, pci_addr: &str, outputs: &[(&str, &str)]) { + use std::fs; + let card_dir = root.join("sys/class/drm").join(card); + fs::create_dir_all(&card_dir).unwrap(); + #[cfg(unix)] + std::os::unix::fs::symlink( + root.join("sys/bus/pci/devices").join(pci_addr), + card_dir.join("device"), + ) + .unwrap(); + for (output, status) in outputs { + let out_dir = card_dir.join(format!("{card}-{output}")); + fs::create_dir_all(&out_dir).unwrap(); + fs::write(out_dir.join("status"), status).unwrap(); + } + } + + fn mock_iommu_group(root: &Path, group_id: u32, members: &[&str]) { + use std::fs; + let group_dir = root.join(format!("sys/kernel/iommu_groups/{group_id}/devices")); + fs::create_dir_all(&group_dir).unwrap(); + for member in members { + let dev_dir = root.join("sys/bus/pci/devices").join(member); + fs::create_dir_all(&dev_dir).unwrap(); + #[cfg(unix)] + { + let iommu_group_target = root.join(format!("sys/kernel/iommu_groups/{group_id}")); + let _ = + std::os::unix::fs::symlink(&iommu_group_target, dev_dir.join("iommu_group")); + let _ = std::os::unix::fs::symlink(&dev_dir, group_dir.join(member)); + } + } + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_detects_active_framebuffer() { 
+ let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + assert!(check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_on_headless() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_attached_false_no_drm_card() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(!check_display_attached(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_fails_without_groups_dir() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(!check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_check_passes_with_group() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + assert!(check_iommu_enabled(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_loaded_true() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + 
fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + assert!(check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn vfio_modules_missing() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + assert!(!check_vfio_modules_loaded(&sysfs)); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_writable() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + fs::write(bind_dir.join("bind"), "").unwrap(); + assert!(check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_driver_override() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let driver_override = root + .path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"); + fs::set_permissions(&driver_override, fs::Permissions::from_mode(0o444)).unwrap(); + assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn permissions_readonly_bind() { + use std::os::unix::fs::PermissionsExt; + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let bind_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&bind_dir).unwrap(); + let bind_path = bind_dir.join("bind"); + fs::write(&bind_path, "").unwrap(); + fs::set_permissions(&bind_path, fs::Permissions::from_mode(0o444)).unwrap(); + 
assert!(!check_sysfs_permissions(&sysfs, "0000:41:00.0")); + } + + fn mock_bindable_gpu(root: &Path, pci_addr: &str) { + mock_pci_device(root, pci_addr, "0x10de", Some("nvidia")); + let drv_unbind = root.join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + mock_iommu_group(root, 15, &[pci_addr]); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_gpu_writes_correct_sysfs_paths() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let unbind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/nvidia/unbind")).unwrap(); + assert_eq!(unbind_content, "0000:41:00.0"); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.0/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + + let bind_content = + fs::read_to_string(root.path().join("sys/bus/pci/drivers/vfio-pci/bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_returns_original_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_noop_when_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + 
fs::write(vfio_dir.join("bind"), "").unwrap(); + + let nvidia_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::create_dir_all(nvidia_unbind.parent().unwrap()).unwrap(); + fs::write(&nvidia_unbind, "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(result, "vfio-pci"); + + let unbind_content = fs::read_to_string(&nvidia_unbind).unwrap(); + assert_eq!( + unbind_content, "", + "nvidia unbind should NOT have been written" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_clears_driver_override() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + #[test] + #[cfg(target_os = "linux")] + fn rebind_writes_to_original_driver_bind() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = 
fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_gpu_to_original(&sysfs, "0000:41:00.0", "nvidia").unwrap(); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_listed_correctly() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", None); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let peers = iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!(peers, vec!["0000:41:00.1"]); + } + + #[test] + #[cfg(target_os = "linux")] + fn iommu_peers_bound_together() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + restore, + vec![("0000:41:00.1".to_string(), "nvidia".to_string())] + ); + + let override_content = fs::read_to_string( + root.path() + .join("sys/bus/pci/devices/0000:41:00.1/driver_override"), + ) + .unwrap(); + assert_eq!(override_content, "vfio-pci"); + } + + #[test] + #[cfg(target_os = "linux")] + fn peer_restore_rebinds_to_original() { + let root = 
tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + mock_pci_device(root.path(), "0000:41:00.1", "0x10de", Some("nvidia")); + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + mock_iommu_group(root.path(), 15, &["0000:41:00.0", "0000:41:00.1"]); + + let restore = bind_iommu_group_peers(&sysfs, "0000:41:00.0").unwrap(); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.1"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + rebind_iommu_group_peers(&sysfs, &restore).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + } + + fn mock_multi_gpu_host(root: &Path) { + // GPU 0: on nvidia, has display attached + mock_bindable_gpu(root, "0000:41:00.0"); + mock_drm_card(root, "card0", "0000:41:00.0", &[("DP-1", "connected")]); + + // GPU 1: on nvidia, idle (no display, no processes) + mock_bindable_gpu(root, "0000:42:00.0"); + + // GPU 2: already on vfio-pci, clean IOMMU group + mock_pci_device(root, "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root, 17, &["0000:43:00.0"]); + + fs::create_dir_all(root.join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.join("sys/module/vfio_iommu_type1")).unwrap(); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_prefers_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + 
mock_multi_gpu_host(root.path()); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:43:00.0"); + assert!( + state.did_bind, + "inherited vfio-pci should set did_bind=true for restore" + ); + assert_eq!(state.original_driver, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_selects_idle_gpu_when_no_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + let drv_unbind = root.path().join("sys/bus/pci/drivers/nvidia/unbind"); + fs::write(&drv_unbind, "").unwrap(); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, None).unwrap(); + assert_eq!(state.pci_addr, "0000:42:00.0"); + assert!(state.did_bind); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_when_all_blocked() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 15, &["0000:41:00.0"]); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card1", + "0000:42:00.0", + &[("HDMI-1", "connected")], + ); + 
mock_iommu_group(root.path(), 16, &["0000:42:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("display"), + "error should mention display: {msg}" + ); + assert!( + msg.contains("0000:41:00.0"), + "error should list first GPU: {msg}" + ); + assert!( + msg.contains("0000:42:00.0"), + "error should list second GPU: {msg}" + ); + assert!( + msg.contains("sudo systemctl stop display-manager"), + "error should suggest stopping display-manager: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_blocked_by_display_includes_restart_hint() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:61:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:61:00.0", + &[("DP-1", "connected")], + ); + mock_iommu_group(root.path(), 20, &["0000:61:00.0"]); + + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let err = prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("sudo systemctl stop display-manager"), + "error should include display-manager stop command: {msg}" + ); + assert!( + msg.contains("sudo systemctl start display-manager"), + "error should include display-manager restart command: {msg}" + ); + assert!( + msg.contains("0000:61:00.0"), + "error should list the blocked GPU: {msg}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn auto_fails_on_empty_host() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + fs::create_dir_all(root.path().join("sys/bus/pci/devices")).unwrap(); + + let err = 
prepare_gpu_with_sysfs(&sysfs, None).unwrap_err(); + assert!( + err.to_string().contains("no NVIDIA PCI device found"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_binds_target() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert_eq!(state.pci_addr, "0000:41:00.0"); + assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_validates_format() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("invalid")).unwrap_err(); + assert!( + err.to_string().contains("invalid PCI address"), + "unexpected error: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_display_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + err.to_string().contains("display"), + "error should mention display: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn specific_bdf_fails_iommu_check() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + + let err = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap_err(); + assert!( + 
err.to_string().contains("IOMMU"), + "error should mention IOMMU: {err}" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_round_trips() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_bindable_gpu(root.path(), "0000:41:00.0"); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:41:00.0")).unwrap(); + assert!(state.did_bind); + assert_eq!(state.original_driver, "nvidia"); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:41:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + #[cfg(unix)] + { + let _ = fs::remove_file(dev_dir.join("driver")); + std::os::unix::fs::symlink(&vfio_driver_dir, dev_dir.join("driver")).unwrap(); + } + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!(override_content, ""); + + let bind_content = fs::read_to_string(nvidia_dir.join("bind")).unwrap(); + assert_eq!(bind_content, "0000:41:00.0"); + } + + #[test] + #[cfg(target_os = "linux")] + fn restore_inherited_vfio_rebinds_to_nvidia() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:43:00.0", "0x10de", Some("vfio-pci")); + mock_iommu_group(root.path(), 17, &["0000:43:00.0"]); + fs::create_dir_all(root.path().join("sys/module/vfio_pci")).unwrap(); + fs::create_dir_all(root.path().join("sys/module/vfio_iommu_type1")).unwrap(); + + let state = prepare_gpu_with_sysfs(&sysfs, Some("0000:43:00.0")).unwrap(); + assert!( + 
state.did_bind, + "inherited vfio-pci state should set did_bind=true" + ); + assert_eq!( + state.original_driver, "nvidia", + "inherited vfio-pci should target nvidia for restore" + ); + + let dev_dir = root.path().join("sys/bus/pci/devices/0000:43:00.0"); + let vfio_driver_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_driver_dir).unwrap(); + fs::write(vfio_driver_dir.join("unbind"), "").unwrap(); + let nvidia_dir = root.path().join("sys/bus/pci/drivers/nvidia"); + fs::create_dir_all(&nvidia_dir).unwrap(); + fs::write(nvidia_dir.join("bind"), "").unwrap(); + + state.restore_with_sysfs(&sysfs).unwrap(); + + let override_content = fs::read_to_string(dev_dir.join("driver_override")).unwrap(); + assert_eq!( + override_content, "", + "driver_override should be cleared after restore" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn bind_unbound_nvidia_defaults_to_nvidia_driver() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + // Device with no driver bound (simulating post-crash state). 
+ mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + let vfio_dir = root.path().join("sys/bus/pci/drivers/vfio-pci"); + fs::create_dir_all(&vfio_dir).unwrap(); + fs::write(vfio_dir.join("bind"), "").unwrap(); + + let result = bind_gpu_to_vfio(&sysfs, "0000:41:00.0").unwrap(); + assert_eq!( + result, "nvidia", + "unbound NVIDIA device should default to nvidia as restore driver" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_detected_in_config() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", None); + assert!(check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_absent_msi_only() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + // Config with MSI (cap 0x05) only, no MSI-X (0x11). 
+ let mut cfg = vec![0u8; 0x42]; + cfg[0x06] = 0x10; // capabilities list present + cfg[0x34] = 0x40; // cap pointer + cfg[0x40] = 0x05; // MSI capability + cfg[0x41] = 0x00; // end of list + fs::write(dev_dir.join("config"), &cfg).unwrap(); + assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_empty_cap_list() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + let mut cfg = vec![0u8; 0x40]; + cfg[0x06] = 0x10; // capabilities list present + cfg[0x34] = 0x00; // null cap pointer + fs::write(dev_dir.join("config"), &cfg).unwrap(); + assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + #[cfg(target_os = "linux")] + fn msix_circular_cap_list() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + let dev_dir = root.path().join("sys/bus/pci/devices").join("0000:41:00.0"); + fs::create_dir_all(&dev_dir).unwrap(); + // Circular: cap at 0x40 points back to 0x40. + let mut cfg = vec![0u8; 0x42]; + cfg[0x06] = 0x10; + cfg[0x34] = 0x40; + cfg[0x40] = 0x05; // MSI (not MSI-X) + cfg[0x41] = 0x40; // points back to self + fs::write(dev_dir.join("config"), &cfg).unwrap(); + // Should terminate via the 48-iteration guard, not hang. 
+ assert!(!check_msix_support(&sysfs, "0000:41:00.0")); + } + + #[test] + fn guard_has_pci_addr() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: true, + }; + let guard = GpuBindGuard::new(state); + assert_eq!(guard.pci_addr(), Some("0000:41:00.0")); + } + + #[test] + fn guard_disarm_returns_state() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: true, + }; + let mut guard = GpuBindGuard::new(state); + let taken = guard.disarm(); + assert!(taken.is_some()); + assert_eq!(guard.pci_addr(), None); + } + + #[test] + fn guard_disarm_prevents_double_restore() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: true, + has_msix: true, + }; + let mut guard = GpuBindGuard::new(state); + let _ = guard.disarm(); + let second = guard.disarm(); + assert!(second.is_none()); + } + + #[test] + fn recovery_commands_includes_gpu_and_peers() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![("0000:41:00.1".to_string(), "snd_hda_intel".to_string())], + did_bind: true, + has_msix: true, + }; + let cmds = state.recovery_commands(); + assert!( + cmds.contains("vfio-pci/unbind"), + "should unbind GPU from vfio-pci" + ); + assert!( + cmds.contains("0000:41:00.0"), + "should reference GPU address" + ); + assert!( + cmds.contains("0000:41:00.1"), + "should reference peer address" + ); + assert!( + cmds.contains("driver_override"), + "should clear driver_override" + ); + assert!( + cmds.contains("modprobe nvidia"), + "should reload nvidia modules" + ); + assert!( + cmds.contains("modprobe snd_hda_intel"), + "should reload peer original driver" + ); + } + + #[test] + fn 
guard_drop_noop_when_did_not_bind() { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + has_msix: true, + }; + let guard = GpuBindGuard::new(state); + drop(guard); + } + + #[test] + fn guard_drop_on_panic_is_safe() { + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let state = GpuBindState { + pci_addr: "0000:41:00.0".to_string(), + original_driver: "nvidia".to_string(), + peer_binds: vec![], + did_bind: false, + has_msix: true, + }; + let _guard = GpuBindGuard::new(state); + panic!("test panic"); + })); + assert!(result.is_err()); + } + + #[test] + fn display_server_process_detection() { + assert!(is_display_server_process("Xorg")); + assert!(is_display_server_process("X")); + assert!(is_display_server_process("Xwayland")); + assert!(is_display_server_process("gnome-shell")); + assert!(is_display_server_process("kwin_wayland")); + assert!(is_display_server_process("sway")); + assert!(is_display_server_process("mutter")); + + assert!(!is_display_server_process("firefox")); + assert!(!is_display_server_process("python3")); + assert!(!is_display_server_process("nvidia-smi")); + assert!(!is_display_server_process("cuda_app")); + assert!(!is_display_server_process("")); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_detected_with_active_outputs() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!(info.is_some(), "should detect display blocker"); + let info = info.unwrap(); + assert_eq!(info.pci_addr, "0000:41:00.0"); + assert!(info.has_active_outputs); + } + + #[test] + #[cfg(target_os = "linux")] + fn 
display_blocker_none_when_gpu_already_vfio() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("vfio-pci")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!( + info.is_none(), + "should not detect blocker when GPU is already on vfio-pci" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_none_on_headless_idle_gpu() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "disconnected")], + ); + + let info = detect_display_blocker_with_sysfs(&sysfs, Some("0000:41:00.0")); + assert!( + info.is_none(), + "headless idle GPU should not trigger display blocker" + ); + } + + #[test] + #[cfg(target_os = "linux")] + fn display_blocker_auto_finds_blocked_gpu() { + let root = tempfile::tempdir().unwrap(); + let sysfs = SysfsRoot::new(root.path().to_path_buf()); + + mock_pci_device(root.path(), "0000:41:00.0", "0x10de", Some("nvidia")); + mock_drm_card( + root.path(), + "card0", + "0000:41:00.0", + &[("DP-1", "connected")], + ); + + mock_pci_device(root.path(), "0000:42:00.0", "0x10de", Some("vfio-pci")); + + let info = detect_display_blocker_with_sysfs(&sysfs, None); + assert!(info.is_some()); + assert_eq!(info.unwrap().pci_addr, "0000:41:00.0"); + } +} diff --git a/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs new file mode 100644 index 000000000..08c658f7a --- /dev/null +++ b/crates/openshell-vfio/tests/gpu_passthrough_implementation.rs @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for GPU passthrough on real hardware. +//! +//! Gated by `OPENSHELL_VM_GPU_E2E=1`. On machines without a real GPU, +//! all tests early-return and pass. + +use openshell_vfio::{ + GpuBindGuard, HostNvidiaVfioReadiness, prepare_gpu_for_passthrough, + probe_host_nvidia_vfio_readiness, +}; + +fn gpu_e2e_enabled() -> bool { + std::env::var("OPENSHELL_VM_GPU_E2E").as_deref() == Ok("1") +} + +#[test] +fn nvidia_gpu_passthrough_is_available() { + if !gpu_e2e_enabled() { + eprintln!("OPENSHELL_VM_GPU_E2E not set — skipping GPU passthrough gate test"); + return; + } + assert!( + openshell_vfio::nvidia_gpu_available_for_vm_passthrough(None), + "GPU passthrough gate returned false on a GPU CI runner — \ + check VFIO binding and VM runtime bundle" + ); +} + +#[test] +fn bind_and_rebind_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("bound GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::VfioBoundReady); + + state.restore().expect("restore should succeed"); + + let results = probe_host_nvidia_vfio_readiness(); + let (_, readiness) = results + .iter() + .find(|(a, _)| a == &state.pci_addr) + .expect("restored GPU should appear in probe"); + assert_eq!(*readiness, HostNvidiaVfioReadiness::BoundToNvidia); +} + +#[test] +fn safety_checks_pass_on_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + // `prepare_gpu_for_passthrough` runs all safety checks internally + // (display-attached, IOMMU enabled, VFIO modules loaded, sysfs + // permissions). Success here validates that the CI GPU is headless, + // IOMMU is on, and VFIO modules are loaded. 
+ let state = prepare_gpu_for_passthrough(None) + .expect("all safety checks should pass on a headless CI GPU"); + assert!(!state.pci_addr.is_empty()); + + state.restore().expect("restore should succeed"); +} + +#[test] +fn guard_restores_on_drop_real_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("should find and bind a GPU"); + let pci_addr = state.pci_addr.clone(); + + let guard = GpuBindGuard::new(state); + drop(guard); + + let output = std::process::Command::new("nvidia-smi") + .arg("--query-gpu=pci.bus_id") + .arg("--format=csv,noheader") + .output() + .expect("nvidia-smi should be available after guard drop"); + assert!( + output.status.success(), + "nvidia-smi failed after guard drop" + ); + + let stdout = String::from_utf8_lossy(&output.stdout); + let normalized_addr = pci_addr.to_uppercase(); + assert!( + stdout.to_uppercase().contains(&normalized_addr), + "nvidia-smi should list the restored GPU {pci_addr}, got: {stdout}" + ); +} + +#[test] +fn auto_select_finds_ci_gpu() { + if !gpu_e2e_enabled() { + return; + } + + let state = prepare_gpu_for_passthrough(None).expect("auto-select should find a GPU on CI"); + assert!(!state.pci_addr.is_empty()); + assert!(state.did_bind); + + state.restore().expect("restore should succeed"); +} diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml index 7d74b3139..aa3d85a4a 100644 --- a/crates/openshell-vm/Cargo.toml +++ b/crates/openshell-vm/Cargo.toml @@ -28,6 +28,7 @@ miette = { workspace = true } nix = { workspace = true } openshell-bootstrap = { path = "../openshell-bootstrap" } openshell-core = { path = "../openshell-core" } +openshell-vfio = { path = "../openshell-vfio" } serde = { workspace = true } serde_json = "1" tar = "0.4" @@ -46,5 +47,8 @@ tokio-rustls = { workspace = true } [build-dependencies] zstd = "0.13" +[dev-dependencies] +tempfile = "3" + [lints] workspace = true diff --git a/crates/openshell-vm/build.rs 
b/crates/openshell-vm/build.rs index 33fab9a78..7c709defd 100644 --- a/crates/openshell-vm/build.rs +++ b/crates/openshell-vm/build.rs @@ -12,7 +12,7 @@ //! Environment: //! `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` - Path to compressed artifacts -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::{env, fs}; fn main() { @@ -28,6 +28,7 @@ fn main() { "libkrunfw.5.dylib.zst", "gvproxy.zst", "rootfs.tar.zst", + "rootfs-gpu.tar.zst", ] { println!("cargo:rerun-if-changed={dir}/{name}"); } @@ -68,24 +69,30 @@ fn main() { return; } - // Copy compressed files to OUT_DIR - let files = [ + // Copy compressed files to OUT_DIR. + // Core artifacts are required; rootfs has two variants (base and GPU) and + // the presence of either one is sufficient. + let core_files = [ (format!("{libkrun_name}.zst"), format!("{libkrun_name}.zst")), ( format!("{libkrunfw_name}.zst"), format!("{libkrunfw_name}.zst"), ), ("gvproxy.zst".to_string(), "gvproxy.zst".to_string()), - ("rootfs.tar.zst".to_string(), "rootfs.tar.zst".to_string()), ]; let mut all_found = true; - for (src_name, dst_name) in &files { + let mut total_embedded_size: u64 = 0; + + let copy_artifact = |src_name: &str, + dst_name: &str, + compressed_dir: &Path, + out_dir: &Path, + total: &mut u64| + -> bool { let src_path = compressed_dir.join(src_name); let dst_path = out_dir.join(dst_name); - if src_path.exists() { - // Remove existing file first (may be read-only from previous build) if dst_path.exists() { let _ = fs::remove_file(&dst_path); } @@ -98,25 +105,104 @@ fn main() { ) }); let size = fs::metadata(&dst_path).map(|m| m.len()).unwrap_or(0); + *total += size; println!("cargo:warning=Embedded {src_name}: {size} bytes"); + true } else { + false + } + }; + + for (src_name, dst_name) in &core_files { + if !copy_artifact( + src_name, + dst_name, + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ) { println!( "cargo:warning=Missing compressed artifact: {}", - src_path.display() + 
compressed_dir.join(src_name).display() ); all_found = false; } } + // Rootfs: accept either the base rootfs or the GPU rootfs (or both). + let has_base = copy_artifact( + "rootfs.tar.zst", + "rootfs.tar.zst", + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ); + let has_gpu = copy_artifact( + "rootfs-gpu.tar.zst", + "rootfs-gpu.tar.zst", + &compressed_dir, + &out_dir, + &mut total_embedded_size, + ); + if !has_base && !has_gpu { + println!( + "cargo:warning=Missing rootfs artifact: neither rootfs.tar.zst nor rootfs-gpu.tar.zst found in {}", + compressed_dir.display() + ); + } else if !has_base { + println!( + "cargo:warning=Only rootfs-gpu.tar.zst found (base rootfs.tar.zst absent). \ + This is fine for GPU-only builds; run `mise run vm:setup` to get the base rootfs." + ); + } else if !has_gpu { + println!( + "cargo:warning=Only rootfs.tar.zst found (GPU rootfs-gpu.tar.zst absent). \ + This is fine for non-GPU builds; run `mise run vm:rootfs -- --gpu` to get the GPU rootfs." + ); + } + + // Write empty stubs for any missing rootfs variant so that + // `include_bytes!()` in embedded.rs always resolves. The embedded module + // treats zero-length slices as "not available". + for (found, name) in [ + (has_base, "rootfs.tar.zst"), + (has_gpu, "rootfs-gpu.tar.zst"), + ] { + if !found { + let stub = out_dir.join(name); + if !stub.exists() { + fs::write(&stub, b"") + .unwrap_or_else(|e| panic!("Failed to write stub {name}: {e}")); + } + } + } + if !all_found { println!("cargo:warning=Some artifacts missing. Run: mise run vm:setup"); generate_stub_resources(&out_dir); } + + // Warn when total embedded data approaches the x86_64 small code model limit. + // The default code model uses R_X86_64_PC32 (±2 GiB) relocations; embedding + // blobs that push .rodata past 2 GiB will cause linker failures unless + // RUSTFLAGS="-C code-model=large" is set. The vm:build task does this + // automatically, but direct cargo invocations may not. 
+ const LARGE_BLOB_THRESHOLD: u64 = 1_800_000_000; // ~1.8 GiB + if target_arch == "x86_64" && total_embedded_size > LARGE_BLOB_THRESHOLD { + println!( + "cargo:warning=Total embedded data is {total_embedded_size} bytes ({:.1} GiB).", + total_embedded_size as f64 / (1024.0 * 1024.0 * 1024.0) + ); + println!("cargo:warning=This exceeds the x86_64 small code model limit (~2 GiB)."); + println!( + "cargo:warning=Ensure RUSTFLAGS includes '-C code-model=large' or use `mise run vm:build`." + ); + } } /// Generate stub (empty) resource files so the build can complete. /// The embedded module will fail at runtime if these stubs are used. -fn generate_stub_resources(out_dir: &PathBuf) { +fn generate_stub_resources(out_dir: &Path) { let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let (libkrun_name, libkrunfw_name) = match target_os.as_str() { @@ -129,6 +215,7 @@ fn generate_stub_resources(out_dir: &PathBuf) { format!("{libkrunfw_name}.zst"), "gvproxy.zst".to_string(), "rootfs.tar.zst".to_string(), + "rootfs-gpu.tar.zst".to_string(), ]; for name in &stubs { diff --git a/crates/openshell-vm/pins.env b/crates/openshell-vm/pins.env index b3d802292..cedc15d85 100644 --- a/crates/openshell-vm/pins.env +++ b/crates/openshell-vm/pins.env @@ -42,3 +42,37 @@ GVPROXY_VERSION="${GVPROXY_VERSION:-v0.8.8}" # Repo: https://github.com/containers/libkrunfw # Pinned: 2026-03-27 (main branch HEAD at time of pinning) LIBKRUNFW_REF="${LIBKRUNFW_REF:-463f717bbdd916e1352a025b6fb2456e882b0b39}" + +# ── virtiofsd (virtio-fs daemon for QEMU rootfs) ──────────────────────── +# Repo: https://gitlab.com/virtio-fs/virtiofsd +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" + +# ── NVIDIA GPU support (GPU rootfs variant) ──────────────────────────── +# Driver branch: 570.x (open kernel modules, data-center/workstation) +# +# Compatibility matrix: +# Minimum driver version: 570 (NVIDIA 570.x open kernel modules) +# Minimum compute capability: sm_70 (Volta V100 and newer) +# Supported 
architectures: Volta (V100), Turing (T4, RTX 20xx), +# Ampere (A100, A10, RTX 30xx), +# Hopper (H100, H200), Ada Lovelace (L40S), +# Blackwell (B100, B200) +# Guest architecture: x86_64 only (NVIDIA does not publish +# aarch64 data-center drivers in APT form) +# Host requirements: IOMMU enabled, GPU bound to vfio-pci driver, +# host driver version >= guest driver version +# +# The 570.x branch uses the open kernel module flavour +# (nvidia-headless-570-open), required for data-center GPUs (Turing+). +# Consumer GPUs (GeForce) may work but are not officially supported +# for VFIO passthrough. +NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}" +NVIDIA_CONTAINER_TOOLKIT_VERSION="${NVIDIA_CONTAINER_TOOLKIT_VERSION:-1.19.0}" + +# NVIDIA open kernel module source tag (must match nvidia-headless-570-open version). +# Repo: https://github.com/NVIDIA/open-gpu-kernel-modules +# The tag must be the exact driver version so that the compiled kernel modules +# match the userspace libraries installed by nvidia-headless-570-open in the +# rootfs. A mismatch causes "API mismatch" errors from nvidia-smi. 
+# Find the APT version: apt-cache show nvidia-headless-570-open | grep Version +NVIDIA_DRIVER_TAG="${NVIDIA_DRIVER_TAG:-570.211.01}" diff --git a/crates/openshell-vm/runtime/kernel/openshell.kconfig b/crates/openshell-vm/runtime/kernel/openshell.kconfig index b5f0330af..d1244ad32 100644 --- a/crates/openshell-vm/runtime/kernel/openshell.kconfig +++ b/crates/openshell-vm/runtime/kernel/openshell.kconfig @@ -115,6 +115,10 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PIDS=y CONFIG_MEMCG=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_FREEZER=y # ── Disable kernel headers archive (avoids cpio issues in CI) ────────── # CONFIG_IKHEADERS is not set @@ -126,3 +130,58 @@ CONFIG_POSIX_MQUEUE_SYSCTL=y # ── Security features required by the sandbox runtime ─────────────────── CONFIG_SECURITY_LANDLOCK=y CONFIG_SECCOMP_FILTER=y + +# ── PCI / GPU passthrough (harmless for non-GPU boots) ────────────────── +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_DRM=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# MTRR — required dependency for CONFIG_X86_PAT below. +CONFIG_MTRR=y + +# MMU notifier — required by NVIDIA UVM module for GPU memory management. +CONFIG_MMU_NOTIFIER=y + +# PAT (Page Attribute Table) — required for correct GPU memory mapping. +# Without this, the NVIDIA driver compiles a fallback code path in nv-pat.c +# that calls __flush_tlb(), which was removed in kernel 6.12+. All modern +# x86_64 CPUs support PAT; every distro kernel enables it. +CONFIG_X86_PAT=y + +# ── Firmware loading (required for NVIDIA GSP firmware) ────────────────── +# The NVIDIA open kernel modules use request_firmware() to load GSP firmware +# from /lib/firmware/nvidia//. 
Without CONFIG_FW_LOADER, the kernel +# has no firmware loading infrastructure and GPU init fails with: +# NVRM: RmFetchGspRmImages: No firmware image found +# On kernel 6.12+, CONFIG_FW_LOADER includes the sysfs loading interface +# (previously CONFIG_FW_LOADER_SYSFS, now merged). +CONFIG_FW_LOADER=y + +# ── Compressed firmware support ────────────────────────────────────────── +# NVIDIA driver packages (570.x+) ship GSP firmware as compressed files +# (gsp_*.bin.xz). Without decompression support, request_firmware() fails +# to find the firmware even when the files exist in /lib/firmware/. +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_LOADER_COMPRESS_XZ=y +CONFIG_FW_LOADER_COMPRESS_ZSTD=y + +# ── QEMU backend support ───────────────────────────────────────────────── +# QEMU uses virtio-PCI transport (libkrun uses virtio-MMIO). Both drivers +# coexist safely — the kernel probes whichever transport the hypervisor +# provides. +CONFIG_VIRTIO_PCI=y + +# Serial console for QEMU (8250/16550 UART). libkrun uses virtio-console +# which is already enabled in the base config. +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y + +# ACPI support for QEMU power management. Required for `poweroff -f` +# to trigger a clean ACPI shutdown that QEMU detects. +CONFIG_ACPI=y + +# x2APIC support — QEMU uses x2APIC MADT entries for multi-vCPU VMs. +# Without this, only the bootstrap CPU is activated. +CONFIG_X86_X2APIC=y diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..efcf7ed10 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -18,11 +18,16 @@ # - NO pre-initialized k3s state (cold start on first boot) # First boot will be slower (~30-60s) as k3s initializes and pulls images. # +# With --gpu, installs NVIDIA driver packages and the nvidia-container-toolkit +# into the rootfs, producing a GPU-capable variant. 
The launcher selects this +# rootfs when `--gpu` is passed. Only supported on x86_64 (NVIDIA does not +# publish aarch64 data-center drivers for Ubuntu in this packaging form). +# # Supports aarch64 and x86_64 guest architectures. The target architecture # is auto-detected from the host but can be overridden with --arch. # # Usage: -# ./build-rootfs.sh [--base] [--arch aarch64|x86_64] [output_dir] +# ./build-rootfs.sh [--base] [--gpu] [--arch aarch64|x86_64] [output_dir] # # If output_dir is omitted, the rootfs is built under target/rootfs-build. # @@ -43,12 +48,15 @@ fi # ── Argument parsing ─────────────────────────────────────────────────── BASE_ONLY=false +GPU_BUILD=false GUEST_ARCH="" POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case "$1" in --base) BASE_ONLY=true; shift ;; + --gpu) + GPU_BUILD=true; shift ;; --arch) GUEST_ARCH="$2"; shift 2 ;; *) @@ -90,6 +98,14 @@ case "$GUEST_ARCH" in ;; esac +# GPU builds are only supported on x86_64 — NVIDIA does not publish +# aarch64 data-center driver packages in the same APT repository. +if [ "$GPU_BUILD" = true ] && [ "$GUEST_ARCH" != "x86_64" ]; then + echo "ERROR: --gpu is only supported for x86_64 guest architecture." >&2 + echo " Current arch: ${GUEST_ARCH}" >&2 + exit 1 +fi + # Project root (two levels up from crates/openshell-vm/scripts/) PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" DEFAULT_ROOTFS="${PROJECT_ROOT}/target/rootfs-build" @@ -119,12 +135,76 @@ verify_checksum() { fi } +verify_gpu_rootfs() { + local rootfs_dir="$1" + local kernel_version="$2" + local driver_tag="$3" + local driver_version="$4" + + echo "==> Verifying GPU components in rootfs..." + if [ ! -f "${rootfs_dir}/usr/bin/nvidia-smi" ]; then + echo "ERROR: nvidia-smi not found in rootfs." + exit 1 + fi + echo " nvidia-smi: found" + if ls "${rootfs_dir}"/usr/bin/nvidia-container-runtime* >/dev/null 2>&1; then + echo " nvidia-container-runtime: found" + else + echo "WARNING: nvidia-container-runtime not found — GPU pods may not work." 
+ fi + if [ -z "${kernel_version}" ]; then + echo "ERROR: VM_KERNEL_VERSION not set — kernel module injection may have been skipped" >&2 + exit 1 + fi + if [ -d "${rootfs_dir}/lib/modules/${kernel_version}" ]; then + local mod_count + mod_count=$(find "${rootfs_dir}/lib/modules/${kernel_version}" -name "nvidia*.ko" | wc -l) + echo " nvidia kernel modules: ${mod_count} found (kernel ${kernel_version})" + if [ "$mod_count" -eq 0 ]; then + echo "ERROR: no nvidia kernel modules in /lib/modules/${kernel_version}/" + echo " Run: mise run vm:nvidia-modules" + exit 1 + fi + else + echo "ERROR: /lib/modules/${kernel_version}/ not found in rootfs" + echo " Run: mise run vm:nvidia-modules" + exit 1 + fi + local fw_dir="${rootfs_dir}/lib/firmware/nvidia/${driver_tag}" + if [ ! -d "${fw_dir}" ]; then + fw_dir="${rootfs_dir}/usr/lib/firmware/nvidia/${driver_tag}" + fi + if [ -d "${fw_dir}" ]; then + local fw_count + fw_count=$(ls "${fw_dir}"/gsp_*.bin 2>/dev/null | wc -l) + echo " GSP firmware: ${fw_count} files found" + for fw in "${fw_dir}"/gsp_*.bin; do + [ -f "$fw" ] || continue + echo " $(basename "$fw") ($(du -h "$fw" | cut -f1))" + done + if [ "$fw_count" -eq 0 ]; then + echo "ERROR: No GSP firmware files (gsp_*.bin) in ${fw_dir}" >&2 + echo " nvidia-smi will fail with: RmFetchGspRmImages: No firmware image found" >&2 + exit 1 + fi + else + echo "ERROR: GSP firmware directory not found" >&2 + echo " Checked: ${rootfs_dir}/lib/firmware/nvidia/${driver_tag}/" >&2 + echo " and: ${rootfs_dir}/usr/lib/firmware/nvidia/${driver_tag}/" >&2 + echo " Install: nvidia-firmware-${driver_version}-${driver_tag}" >&2 + exit 1 + fi +} + if [ "$BASE_ONLY" = true ]; then echo "==> Building base openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" echo " k3s version: ${K3S_VERSION}" echo " Output: ${ROOTFS_DIR}" echo " Mode: base (no pre-loaded images, cold start)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit 
${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi else echo "==> Building openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -132,6 +212,9 @@ else echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" echo " Output: ${ROOTFS_DIR}" echo " Mode: full (pre-loaded images, pre-initialized)" + if [ "$GPU_BUILD" = true ]; then + echo " GPU: yes (NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION})" + fi fi echo "" @@ -222,38 +305,110 @@ fi docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true echo "==> Building base image..." -docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' +if [ "$GPU_BUILD" = true ]; then + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" \ + --build-arg "NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}" \ + --build-arg "NVIDIA_DRIVER_TAG=${NVIDIA_DRIVER_TAG}" \ + --build-arg "NVIDIA_CONTAINER_TOOLKIT_VERSION=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \ + -f - . <<'DOCKERFILE' ARG BASE_IMAGE FROM ${BASE_IMAGE} +ARG NVIDIA_DRIVER_VERSION +ARG NVIDIA_DRIVER_TAG +ARG NVIDIA_CONTAINER_TOOLKIT_VERSION RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ e2fsprogs \ iptables \ iproute2 \ + kmod \ python3 \ busybox-static \ sqlite3 \ util-linux \ zstd \ + gnupg \ + curl \ && rm -rf /var/lib/apt/lists/* # busybox-static provides udhcpc for DHCP inside the VM. RUN mkdir -p /usr/share/udhcpc && \ ln -sf /bin/busybox /sbin/udhcpc RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +# ── NVIDIA driver and container toolkit ────────────────────────────── +# Add the NVIDIA package repository and install the open kernel module +# flavour of the driver plus nvidia-container-toolkit. The open modules +# are required for data-center GPUs (Turing+ / compute capability >= 7.0). 
+# Userspace packages are pinned to $NVIDIA_DRIVER_TAG so they match the +# kernel modules compiled by build-nvidia-modules.sh. +RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list +RUN apt-get update && \ + HEADLESS_VER=$(apt-cache madison nvidia-headless-${NVIDIA_DRIVER_VERSION}-open \ + | awk -v tag="${NVIDIA_DRIVER_TAG}" '$3 ~ "^"tag {print $3; exit}') && \ + UTILS_VER=$(apt-cache madison nvidia-utils-${NVIDIA_DRIVER_VERSION} \ + | awk -v tag="${NVIDIA_DRIVER_TAG}" '$3 ~ "^"tag {print $3; exit}') && \ + if [ -z "$HEADLESS_VER" ] || [ -z "$UTILS_VER" ]; then \ + echo "ERROR: No APT package found for driver tag ${NVIDIA_DRIVER_TAG}" >&2; \ + echo " headless: ${HEADLESS_VER:-not found}"; \ + echo " utils: ${UTILS_VER:-not found}"; \ + exit 1; \ + fi && \ + echo "Pinning NVIDIA packages: headless=${HEADLESS_VER} utils=${UTILS_VER}" && \ + apt-get install -y --no-install-recommends \ + nvidia-headless-${NVIDIA_DRIVER_VERSION}-open=${HEADLESS_VER} \ + nvidia-utils-${NVIDIA_DRIVER_VERSION}=${UTILS_VER} \ + nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}-1 \ + && rm -rf /var/lib/apt/lists/* +# Configure the NVIDIA container runtime as the default for containerd. +RUN nvidia-ctk runtime configure --runtime=containerd --set-as-default DOCKERFILE +else + docker build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ + --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . 
<<'DOCKERFILE' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + e2fsprogs \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + sqlite3 \ + util-linux \ + zstd \ + && rm -rf /var/lib/apt/lists/* +# busybox-static provides udhcpc for DHCP inside the VM. +RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +DOCKERFILE +fi # Create a container and export the filesystem echo "==> Creating container..." docker create --platform "${DOCKER_PLATFORM}" --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true echo "==> Exporting filesystem..." -# Previous builds may leave overlayfs work/ dirs with permissions that -# prevent rm on macOS. Force-fix permissions before removing. +# Previous builds (especially VM pre-init) may leave root-owned files +# (k3s data, CNI, kubelet) that prevent non-root removal. Try normal +# cleanup first, fall back to sudo if needed. if [ -d "${ROOTFS_DIR}" ]; then + if [ -z "${ROOTFS_DIR}" ] || [ "${ROOTFS_DIR}" = "/" ]; then + echo "ERROR: ROOTFS_DIR is empty or root — refusing to rm -rf" >&2 + exit 1 + fi chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true - rm -rf "${ROOTFS_DIR}" + if ! rm -rf "${ROOTFS_DIR}" 2>/dev/null; then + echo " Root-owned files detected in ${ROOTFS_DIR}, using sudo to clean..." + sudo rm -rf "${ROOTFS_DIR}" + fi fi mkdir -p "${ROOTFS_DIR}" docker export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - @@ -363,6 +518,71 @@ for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do fi done +# ── Inject GPU manifests (when building GPU rootfs) ─────────────────── +# These are deployed by openshell-vm-init.sh when GPU_ENABLED=true. +GPU_MANIFEST_SRC="${SCRIPT_DIR}/gpu-manifests" +GPU_MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + echo "==> Injecting GPU manifests..." 
+ mkdir -p "${GPU_MANIFEST_DEST}" + GPU_MANIFEST_COPIED=0 + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${GPU_MANIFEST_DEST}/" + echo " $(basename "$manifest")" + GPU_MANIFEST_COPIED=$((GPU_MANIFEST_COPIED + 1)) + done + # Sentinel only when at least one manifest was staged (empty glob must not create it). + if [ "$GPU_MANIFEST_COPIED" -gt 0 ]; then + echo "gpu" > "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" + else + echo "WARNING: No GPU manifests (*.yaml) found in ${GPU_MANIFEST_SRC}; not writing .rootfs-gpu sentinel." >&2 + fi +fi + +# ── Inject NVIDIA kernel modules (GPU rootfs only) ──────────────────── +# The kernel modules are compiled separately by build-nvidia-modules.sh +# against the VM kernel source tree. We inject them here so modprobe +# can load nvidia.ko at VM boot time. +if [ "$GPU_BUILD" = true ]; then + NVIDIA_MODULES_DIR="${PROJECT_ROOT}/target/libkrun-build/nvidia-modules" + + # Read the kernel version exported by build-libkrun.sh. + KERNEL_VERSION_FILE="${PROJECT_ROOT}/target/libkrun-build/kernel-version.txt" + if [ -f "$KERNEL_VERSION_FILE" ]; then + VM_KERNEL_VERSION="$(cat "$KERNEL_VERSION_FILE")" + else + echo "ERROR: kernel-version.txt not found at ${KERNEL_VERSION_FILE}" >&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 + fi + + MODULE_DEST="${ROOTFS_DIR}/lib/modules/${VM_KERNEL_VERSION}/kernel/drivers/video/nvidia" + + if [ -d "${NVIDIA_MODULES_DIR}" ] && ls "${NVIDIA_MODULES_DIR}"/*.ko >/dev/null 2>&1; then + echo "==> Injecting NVIDIA kernel modules (kernel ${VM_KERNEL_VERSION})..." + mkdir -p "${MODULE_DEST}" + cp "${NVIDIA_MODULES_DIR}"/*.ko "${MODULE_DEST}/" + for mod in "${MODULE_DEST}"/*.ko; do + echo " $(basename "$mod") ($(du -h "$mod" | cut -f1))" + done + + # Generate module dependency metadata so modprobe works. 
+ KERNEL_DIR_NAME="$(grep '^KERNEL_VERSION' "${PROJECT_ROOT}/target/libkrun-build/libkrunfw/Makefile" | head -1 | awk '{print $3}')" + SYSTEM_MAP="${PROJECT_ROOT}/target/libkrun-build/libkrunfw/${KERNEL_DIR_NAME}/System.map" + if [ -f "$SYSTEM_MAP" ]; then + depmod -a -b "${ROOTFS_DIR}" -F "$SYSTEM_MAP" "${VM_KERNEL_VERSION}" + else + depmod -a -b "${ROOTFS_DIR}" "${VM_KERNEL_VERSION}" + fi + echo " depmod: module dependencies generated" + else + echo "ERROR: NVIDIA kernel modules not found at ${NVIDIA_MODULES_DIR}" >&2 + echo " Run: tasks/scripts/vm/build-nvidia-modules.sh" >&2 + exit 1 + fi +fi + # ── Base mode: mark rootfs type and skip pre-loading ─────────────────── if [ "$BASE_ONLY" = true ]; then @@ -384,10 +604,22 @@ if [ "$BASE_ONLY" = true ]; then exit 1 fi + if [ "$GPU_BUILD" = true ]; then + if [ ! -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ]; then + echo "ERROR: GPU sentinel file not found in rootfs." + exit 1 + fi + verify_gpu_rootfs "${ROOTFS_DIR}" "${VM_KERNEL_VERSION:-}" "${NVIDIA_DRIVER_TAG}" "${NVIDIA_DRIVER_VERSION}" + fi + echo "" echo "==> Base rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" - echo " Type: base (cold start, images pulled on demand)" + if [ "$GPU_BUILD" = true ]; then + echo " Type: base + GPU (cold start, NVIDIA driver ${NVIDIA_DRIVER_VERSION})" + else + echo " Type: base (cold start, images pulled on demand)" + fi echo "" echo "Note: First boot will take ~30-60s as k3s initializes." echo " Container images will be pulled from registries on first use." @@ -475,6 +707,15 @@ for manifest in "${MANIFEST_DEST}"/*.yaml; do cp "$manifest" "${INIT_MANIFESTS}/" done +# GPU manifests: same pre-init path as other auto-deploy manifests so k3s +# sees them during cluster bake (not only under /opt/openshell/gpu-manifests). 
+if [ "$GPU_BUILD" = true ] && [ -d "${GPU_MANIFEST_DEST}" ]; then + for manifest in "${GPU_MANIFEST_DEST}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${INIT_MANIFESTS}/" + done +fi + # Patch HelmChart for local images and VM settings. HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -589,6 +830,7 @@ else fi # Pre-initialize directly on virtio-fs. Runtime boots attach a separate # block-backed state disk and seed it from the rootfs on first launch. +rm -f "${ROOTFS_DIR}-console.log" 2>/dev/null || sudo rm -f "${ROOTFS_DIR}-console.log" 2>/dev/null || true OPENSHELL_VM_DISABLE_STATE_DISK=1 "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" --reset & VM_PID=$! @@ -599,6 +841,13 @@ cleanup_vm() { kill "${VM_PID}" 2>/dev/null || true wait "${VM_PID}" 2>/dev/null || true fi + # Kill orphaned gvproxy processes left by the VM (holds port 30051). + local gvproxy_pids + gvproxy_pids=$(pgrep -f "gvproxy.*listen-qemu" 2>/dev/null || true) + if [ -n "$gvproxy_pids" ]; then + echo " Killing orphaned gvproxy: $gvproxy_pids" + kill $gvproxy_pids 2>/dev/null || true + fi } trap cleanup_vm EXIT @@ -616,15 +865,16 @@ for i in $(seq 1 120); do sleep 1 done -# Wait for containerd to be ready. +# Wait for containerd to be ready. The first boot after a --reset may +# need extra time for k3s to extract its data dir and start containerd. echo " Waiting for containerd..." -for i in $(seq 1 60); do +for i in $(seq 1 180); do if vm_exec k3s ctr version >/dev/null 2>&1; then echo " Containerd ready (${i}s)" break fi - if [ "$i" -eq 60 ]; then - echo "ERROR: containerd did not become ready in 60s" + if [ "$i" -eq 180 ]; then + echo "ERROR: containerd did not become ready in 180s" exit 1 fi sleep 1 @@ -669,8 +919,8 @@ done # per-boot layer extraction that previously added ~3-5s per container. echo " Pre-unpacking container images..." 
for img in \ - "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" \ - "ghcr.io/nvidia/openshell/gateway:latest"; do + "${COMMUNITY_SANDBOX_IMAGE}" \ + "${SERVER_IMAGE}"; do if vm_exec k3s ctr -n k8s.io images ls -q 2>/dev/null | grep -qF "$img"; then echo " unpacking: $img" vm_exec k3s ctr -n k8s.io run --rm "$img" "pre-unpack-$(date +%s)" true 2>/dev/null || true @@ -741,10 +991,18 @@ if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then exit 1 fi +# ── GPU verification (full mode) ────────────────────────────────────── +if [ "$GPU_BUILD" = true ]; then + verify_gpu_rootfs "${ROOTFS_DIR}" "${VM_KERNEL_VERSION:-}" "${NVIDIA_DRIVER_TAG}" "${NVIDIA_DRIVER_VERSION}" +fi + echo "" echo "==> Rootfs ready at: ${ROOTFS_DIR}" echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" +if [ "$GPU_BUILD" = true ]; then + echo " GPU: NVIDIA driver ${NVIDIA_DRIVER_VERSION}, toolkit ${NVIDIA_CONTAINER_TOOLKIT_VERSION}" +fi # Show k3s data size K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" diff --git a/crates/openshell-vm/scripts/gpu-manifests/README.md b/crates/openshell-vm/scripts/gpu-manifests/README.md new file mode 100644 index 000000000..c72deb1aa --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/README.md @@ -0,0 +1,41 @@ +# GPU Rootfs Manifests + +These Kubernetes manifests are injected into the VM rootfs when +`build-rootfs.sh --gpu` is used. During a **full** rootfs build they are +also copied into the k3s auto-deploy manifest directory so they are +applied at pre-init time. + +**Phase 2:** deployment from `openshell-vm-init.sh` when +`GPU_ENABLED=true` is not implemented yet; that path will copy or +reconcile these manifests at VM boot. 
+ +## NVIDIA Driver Compatibility + +| Property | Value | +|---|---| +| Driver branch | 570.x (open kernel modules) | +| Minimum compute capability | sm_70 (Volta V100 and newer) | +| Container toolkit | nvidia-container-toolkit 1.17.x | +| Device plugin Helm chart | 0.18.2 | + +### Why open kernel modules? + +The 570.x open kernel modules are required for data-center GPUs +(Volta, Turing, Ampere, Hopper, Blackwell). They are the +NVIDIA-recommended driver for passthrough and container workloads. +Consumer GPUs (GeForce) prior to Turing (sm_75) are **not supported** +with open modules — use the proprietary driver branch if needed. + +### Host requirements + +- IOMMU enabled in BIOS and kernel (`intel_iommu=on` or `amd_iommu=on`) +- GPU bound to `vfio-pci` driver on the host +- `/dev/vfio/vfio` and `/dev/vfio/` accessible +- Host NVIDIA driver version >= 570 (must match or exceed guest driver) + +### Files + +- `nvidia-device-plugin.yaml` — HelmChart CR that deploys the NVIDIA + k8s-device-plugin via the k3s Helm controller. +- `nvidia-runtime-class.yaml` — RuntimeClass object so pods can use + `runtimeClassName: nvidia`. diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml new file mode 100644 index 000000000..c1cbeaa8a --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-device-plugin.yaml @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# HelmChart CR for auto-deploying the NVIDIA k8s-device-plugin via k3s Helm controller. +# +# This manifest is copied into /var/lib/rancher/k3s/server/manifests/ by the +# VM init script when GPU_ENABLED=true. It is the VM-specific equivalent of +# deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml used by the +# Docker-based gateway. 
+# +# The chart installs: +# - NVIDIA device plugin DaemonSet (advertises nvidia.com/gpu resources) +# +# NFD and GFD are disabled; the device plugin's default nodeAffinity +# (which requires nvidia.com/gpu.present=true) is overridden to empty +# so it schedules on any node without requiring NFD/GFD labels. +# +# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that +# devices are injected via CDI hooks before container start. Sandbox pods only +# need the nvidia.com/gpu resource request — no runtimeClassName is required. +# +# k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" +# RuntimeClass automatically, so no manual RuntimeClass manifest is needed. + +apiVersion: helm.cattle.io/v1 +kind: HelmChart +metadata: + name: nvidia-device-plugin + namespace: kube-system +spec: + repo: https://nvidia.github.io/k8s-device-plugin + chart: nvidia-device-plugin + version: "0.18.2" + targetNamespace: nvidia-device-plugin + createNamespace: true + valuesContent: |- + runtimeClassName: nvidia + deviceListStrategy: cdi-cri + deviceIDStrategy: index + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" + gfd: + enabled: false + nfd: + enabled: false + affinity: null diff --git a/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml new file mode 100644 index 000000000..fe2ccbd6e --- /dev/null +++ b/crates/openshell-vm/scripts/gpu-manifests/nvidia-runtime-class.yaml @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# RuntimeClass for NVIDIA GPU workloads. +# Deployed alongside the device plugin when GPU_ENABLED=true. +# Pods requesting nvidia.com/gpu resources should set +# runtimeClassName: nvidia to use the NVIDIA container runtime. 
+--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh index 1cb686a31..ab871e334 100755 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ b/crates/openshell-vm/scripts/openshell-vm-init.sh @@ -46,6 +46,31 @@ mkdir -p /sys/fs/cgroup mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & wait +# ── Parse kernel cmdline for env vars ───────────────────────────────── +# The QEMU backend passes environment variables via kernel cmdline +# (KEY=VALUE tokens). These are not automatically exported to init. +# Must run after /proc is mounted. +if [ -f /proc/cmdline ]; then + for token in $(cat /proc/cmdline); do + case "$token" in + GPU_ENABLED=*|OPENSHELL_VM_STATE_DISK_DEVICE=*|VM_NET_IP=*|VM_NET_GW=*|VM_NET_DNS=*) + export "$token" + ;; + esac + done +fi + +# Enable cgroup v2 controllers in the root cgroup hierarchy. +# k3s/kubelet requires cpu, cpuset, memory, and pids controllers. +# The kernel must have CONFIG_CGROUP_SCHED=y for the cpu controller. +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + for ctrl in cpu cpuset memory pids io; do + if grep -qw "$ctrl" /sys/fs/cgroup/cgroup.controllers; then + echo "+$ctrl" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true + fi + done +fi + ts "filesystems mounted" # ── Networking ────────────────────────────────────────────────────────── @@ -97,20 +122,26 @@ DHCP_SCRIPT # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries # -A 1: wait 1s before first retry (aggressive for local gvproxy) if ! 
udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then - ts "WARNING: DHCP failed, falling back to static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "WARNING: DHCP failed, falling back to static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi else - # Fallback to static config if no DHCP client available. - ts "no DHCP client, using static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + STATIC_IP="${VM_NET_IP:-192.168.127.2}" + STATIC_GW="${VM_NET_GW:-192.168.127.1}" + ts "no DHCP client, using static config ($STATIC_IP gw $STATIC_GW)" + ip addr add "${STATIC_IP}/24" dev eth0 2>/dev/null || true + ip route add default via "$STATIC_GW" 2>/dev/null || true fi - # Ensure DNS is configured. DHCP should have set /etc/resolv.conf, - # but if it didn't (or static fallback was used), provide a default. - if [ ! -s /etc/resolv.conf ]; then + # Ensure DNS is configured. When VM_NET_DNS is set (TAP networking), + # always use it — the rootfs may have a stale resolv.conf from a + # previous gvproxy run that points to an unreachable gateway. + if [ -n "${VM_NET_DNS:-}" ]; then + echo "nameserver $VM_NET_DNS" > /etc/resolv.conf + elif [ ! -s /etc/resolv.conf ]; then echo "nameserver 8.8.8.8" > /etc/resolv.conf echo "nameserver 8.8.4.4" >> /etc/resolv.conf fi @@ -248,12 +279,20 @@ find /run -name '*.sock' -delete 2>/dev/null || true # start; clear it so k3s doesn't fail node re-registration validation. rm -f /var/lib/rancher/k3s/server/cred/node-passwd 2>/dev/null || true +# Clean stale k3s TLS certificates from previous boots. 
If k3s crashes +# mid-write it can leave partially-written (0-byte or non-PEM) cert files +# that cause "tls: failed to find any PEM data in certificate input" on +# restart. Wiping the TLS directory forces k3s to regenerate self-signed +# certs on startup. This is safe for both cold and warm boots — the certs +# are ephemeral per-cluster and recreated automatically by k3s. +rm -rf /var/lib/rancher/k3s/server/tls 2>/dev/null || true + # Clean stale containerd runtime state from previous boots. # -# The rootfs persists across VM restarts via virtio-fs. The overlayfs -# snapshotter now lives on the host-backed state disk when present, so -# snapshot data and meta.db persist across boots. We only clean runtime -# state (shim PIDs, sockets) that becomes stale when the VM restarts. +# The rootfs persists across VM restarts via virtio-fs. The snapshotter +# (overlayfs on state disk, native on virtiofs) persists across boots, +# so snapshot data and meta.db survive. We only clean runtime state +# (shim PIDs, sockets) that becomes stale when the VM restarts. if [ -d "$CONTAINERD_DIR" ]; then # Remove runtime task state (stale shim PIDs, sockets from dead processes). rm -rf "${CONTAINERD_DIR}/io.containerd.runtime.v2.task" 2>/dev/null || true @@ -265,24 +304,27 @@ if [ -d "$CONTAINERD_DIR" ]; then # Clean stale ingest temp files from the content store. rm -rf "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" 2>/dev/null || true mkdir -p "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" - # meta.db and overlayfs snapshots persist across boots on virtio-fs. - # No need to delete meta.db — snapshot metadata remains valid since - # the snapshotter directory is no longer backed by volatile tmpfs. + # meta.db and snapshots persist across boots. ts "cleaned containerd runtime state (meta.db + snapshots preserved)" fi rm -rf /run/k3s 2>/dev/null || true -# Ensure the overlayfs snapshotter directory exists. 
The snapshotter -# runs directly on virtio-fs, so layer data and snapshot metadata -# persist across VM restarts. This eliminates the need to re-import -# image tarballs and re-extract layers on every boot, significantly -# reducing sandbox creation time. -OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" -mkdir -p "$OVERLAYFS_DIR" +# Select snapshotter based on the backing filesystem. overlayfs requires +# filesystem features (redirect_dir xattrs) that virtiofs does not +# support. When containerd lives on the block-backed state disk (ext4), +# overlayfs works and provides efficient layer sharing. On virtiofs +# (no state disk), fall back to the native snapshotter which uses +# simple directory copies and works on any POSIX filesystem. if [ "$STATE_DISK_ACTIVE" = true ]; then - ts "overlayfs snapshotter on block-backed containerd state" + SNAPSHOTTER="overlayfs" + OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" + mkdir -p "$OVERLAYFS_DIR" + ts "snapshotter: overlayfs on block-backed containerd state" else - ts "overlayfs snapshotter on virtio-fs (persistent)" + SNAPSHOTTER="native" + NATIVE_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.native" + mkdir -p "$NATIVE_DIR" + ts "snapshotter: native on virtio-fs (overlayfs unsupported on virtiofs)" fi ts "stale artifacts cleaned" @@ -366,6 +408,69 @@ if [ "$_caps_ok" = false ]; then exit 1 fi +# ── GPU: NVIDIA driver and device plugin ───────────────────────────── +# When the VM is launched with --gpu, the Rust launcher passes +# GPU_ENABLED=true. Load the NVIDIA kernel modules, verify the device +# is visible via nvidia-smi, and confirm that the container runtime is +# available before k3s starts. + +if [ "${GPU_ENABLED:-false}" = "true" ]; then + ts "GPU mode enabled — loading NVIDIA drivers" + + if ! 
command -v modprobe >/dev/null 2>&1; then + echo "FATAL: modprobe not found — the kmod package is missing from the GPU rootfs" >&2 + echo "Fix: add 'kmod' to the apt-get install list in build-rootfs.sh and rebuild" >&2 + exit 1 + fi + + # ── Stage NVIDIA GSP firmware onto tmpfs for reliable loading ───── + # The kernel's request_firmware() calls kernel_read_file_from_path_initns() + # which must read the full firmware blob (64MB+ for GSP) through the VFS + # layer. On virtiofs (FUSE-based), each read is a round-trip through the + # virtio ring to virtiofsd. This can fail or stall on non-DAX virtiofs + # configurations (QEMU vhost-user-fs-pci without cache-size). + # + # Copying firmware to /run (tmpfs) eliminates the FUSE path entirely — + # kernel_read_file() reads directly from page cache backed by RAM. + NVIDIA_FW_SRC="/lib/firmware/nvidia" + NVIDIA_FW_TMPFS="/run/firmware/nvidia" + if [ -d "$NVIDIA_FW_SRC" ]; then + mkdir -p "/run/firmware" + cp -a "$NVIDIA_FW_SRC" "/run/firmware/" + ts "staged NVIDIA firmware to tmpfs ($(du -sh "$NVIDIA_FW_TMPFS" | cut -f1))" + + if [ -f /sys/module/firmware_class/parameters/path ]; then + echo -n "/run/firmware" > /sys/module/firmware_class/parameters/path + ts "firmware_class.path set to /run/firmware" + fi + else + echo "WARNING: NVIDIA firmware directory not found at $NVIDIA_FW_SRC" >&2 + echo " modprobe nvidia will likely fail with: RmFetchGspRmImages: No firmware image found" >&2 + fi + + modprobe nvidia || { echo "FATAL: failed to load nvidia kernel module" >&2; exit 1; } + modprobe nvidia_uvm || { echo "FATAL: failed to load nvidia_uvm kernel module" >&2; exit 1; } + modprobe nvidia_modeset || { echo "FATAL: failed to load nvidia_modeset kernel module" >&2; exit 1; } + ts "NVIDIA kernel modules loaded" + + # Firmware is now in kernel memory; free the tmpfs copy. + rm -rf /run/firmware 2>/dev/null || true + + if ! 
nvidia-smi > /dev/null 2>&1; then + echo "FATAL: GPU_ENABLED=true but nvidia-smi failed — GPU not visible to guest" >&2 + echo "Check: VFIO passthrough, IOMMU groups, guest kernel modules" >&2 + exit 1 + fi + ts "nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)" + + if command -v nvidia-container-runtime >/dev/null 2>&1; then + ts "nvidia-container-runtime: $(command -v nvidia-container-runtime)" + else + echo "FATAL: nvidia-container-runtime not found — GPU pods will fail" >&2 + exit 1 + fi +fi + # ── Deploy bundled manifests (cold boot only) ─────────────────────────── # On pre-initialized rootfs, manifests are already in place from the # build-time k3s boot. Skip this entirely for fast startup. @@ -411,6 +516,29 @@ else ts "skipping manifest deploy (pre-initialized)" fi +# ── GPU manifests (device plugin, runtime class) ───────────────────── +# Deployed on every boot (not just cold boot) so the device plugin is +# always present when GPU_ENABLED=true. Mirrors cluster-entrypoint.sh. +if [ "${GPU_ENABLED:-false}" = "true" ]; then + GPU_MANIFESTS="/opt/openshell/gpu-manifests" + if [ ! -d "$GPU_MANIFESTS" ]; then + echo "FATAL: GPU_ENABLED=true but GPU manifests directory missing: $GPU_MANIFESTS" >&2 + exit 1 + fi + mkdir -p "$K3S_MANIFESTS" + _gpu_manifest_deployed=false + for manifest in "$GPU_MANIFESTS"/*.yaml; do + [ -f "$manifest" ] || continue + _gpu_manifest_deployed=true + cp "$manifest" "$K3S_MANIFESTS/" + ts "deployed GPU manifest: $(basename "$manifest")" + done + if [ "$_gpu_manifest_deployed" = false ]; then + echo "FATAL: GPU_ENABLED=true but no YAML manifests found in $GPU_MANIFESTS" >&2 + exit 1 + fi +fi + # Patch manifests for VM deployment constraints. 
HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" if [ -f "$HELMCHART" ]; then @@ -737,9 +865,9 @@ K3S_ARGS=( --node-ip="$NODE_IP" --kube-apiserver-arg=bind-address=0.0.0.0 --resolv-conf=/etc/resolv.conf - --tls-san=localhost,127.0.0.1,10.0.2.15,192.168.127.2 + --tls-san="localhost,127.0.0.1,10.0.2.15,192.168.127.2,$NODE_IP" --flannel-backend=none - --snapshotter=overlayfs + --snapshotter="$SNAPSHOTTER" --kube-proxy-arg=proxy-mode=nftables --kube-proxy-arg=nodeport-addresses=0.0.0.0/0 # virtio-fs passthrough reports the host disk usage, which is @@ -755,7 +883,7 @@ K3S_ARGS=( # container create after an image import may still be slow if # containerd needs to extract layers. 10m is a conservative safety # margin; typical operations complete much faster with persistent - # overlayfs snapshots. + # snapshots (overlayfs on state disk, native on virtiofs). --kubelet-arg=runtime-request-timeout=10m ) @@ -803,30 +931,51 @@ setsid sh -c ' ' & fi -# ── Clear stale kine bootstrap lock ───────────────────────────────────── -# k3s uses kine with a SQLite backend at state.db. When k3s starts, kine -# sets a bootstrap lock row; if k3s is killed before completing bootstrap -# (SIGKILL, host crash, power loss), the lock persists and the next k3s -# instance hangs forever on: -# "Bootstrap key already locked — waiting for data to be populated by -# another server" +# ── Kine database health check ─────────────────────────────────────────── +# k3s uses kine with a SQLite backend at state.db. Two failure modes: +# +# 1. Page-level corruption (SQLITE_CORRUPT) — from a killed VM mid-write. +# Detected via PRAGMA quick_check; the DB is removed so k3s starts fresh. +# The host-side recover_corrupt_kine_db() in exec.rs only checks the +# virtiofs path, so it misses corruption on the state disk (--gpu). +# This in-VM check is the authoritative corruption gate. # -# We clear the lock row before starting k3s so that a warm boot with -# persistent state.db succeeds. 
If state.db doesn't exist (first boot or -# --reset), this is a harmless no-op. If state.db is corrupt, sqlite3 -# fails silently (|| true) and the host-side corruption check in exec.rs -# will have already removed the file. +# 2. Stale bootstrap lock — kine sets a lock row on startup; if k3s is +# killed before completing bootstrap, the lock persists and the next +# instance hangs on "Bootstrap key already locked". Cleared via DELETE. KINE_DB="/var/lib/rancher/k3s/server/db/state.db" if [ -f "$KINE_DB" ]; then - ts "clearing stale kine bootstrap lock (if any)" - # If sqlite3 fails (corrupt DB, missing binary), log the failure. - # The host-side corruption check in exec.rs handles the corrupt case, - # but we should still know about it. - if ! sqlite3 "$KINE_DB" "DELETE FROM kine WHERE name LIKE '/bootstrap/%';" 2>/dev/null; then - ts "WARNING: failed to clear kine bootstrap lock — k3s may hang if DB is corrupt" + # When the state disk is in use, the kine DB lives on the block device, + # not on the virtiofs rootfs. The host-side recover_corrupt_kine_db() + # in exec.rs can only check the virtiofs path, so it misses corruption + # on the state disk. Run a quick_check here inside the VM where the + # bind-mount is active and the DB is at its final runtime path. + _kine_corrupt=false + if command -v sqlite3 >/dev/null 2>&1; then + _qc_result=$(sqlite3 "$KINE_DB" "PRAGMA quick_check;" 2>&1) || _kine_corrupt=true + if [ "$_kine_corrupt" = false ] && [ "$_qc_result" != "ok" ]; then + _kine_corrupt=true + fi + else + # No sqlite3 binary — can't verify, try to proceed. + ts "WARNING: sqlite3 not available, skipping kine DB integrity check" fi - if ! 
sqlite3 "$KINE_DB" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null; then - ts "WARNING: failed to checkpoint kine WAL" + + if [ "$_kine_corrupt" = true ]; then + ts "WARNING: kine database is corrupt ($_qc_result), removing for clean boot" + rm -f "$KINE_DB" "${KINE_DB}-wal" "${KINE_DB}-shm" + ts "corrupt kine DB removed — k3s will recreate from manifests" + else + ts "clearing stale kine bootstrap lock (if any)" + if ! sqlite3 "$KINE_DB" "DELETE FROM kine WHERE name LIKE '/bootstrap/%';" 2>/dev/null; then + ts "WARNING: failed to clear kine bootstrap lock — removing DB for safety" + rm -f "$KINE_DB" "${KINE_DB}-wal" "${KINE_DB}-shm" + fi + if [ -f "$KINE_DB" ]; then + if ! sqlite3 "$KINE_DB" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null; then + ts "WARNING: failed to checkpoint kine WAL" + fi + fi fi fi diff --git a/crates/openshell-vm/src/backend/libkrun.rs b/crates/openshell-vm/src/backend/libkrun.rs new file mode 100644 index 000000000..3ab2d6631 --- /dev/null +++ b/crates/openshell-vm/src/backend/libkrun.rs @@ -0,0 +1,470 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! libkrun hypervisor backend. +//! +//! Implements [`VmBackend`] using the libkrun C API for lightweight microVMs. +//! This is the original backend — on macOS it uses Hypervisor.framework, +//! on Linux it uses KVM. + +use std::ffi::CString; +use std::path::Path; +use std::time::Instant; + +use super::{VmBackend, setup_gvproxy_port_forwarding, start_gvproxy}; +use crate::exec::{clear_vm_runtime_state, write_vm_runtime_state}; +use crate::{ + GvproxyGuard, NetBackend, StateDiskConfig, VmConfig, VmError, VsockPort, bootstrap_gateway, + c_string_array, check, ffi, gateway_host_port, health, path_to_cstring, vm_rootfs_key, +}; + +/// libkrun hypervisor backend. 
+pub struct LibkrunBackend; + +impl VmBackend for LibkrunBackend { + fn launch(&self, config: &VmConfig) -> Result { + launch_libkrun(config) + } +} + +/// VM context wrapping the libkrun FFI context ID. +struct VmContext { + krun: &'static ffi::LibKrun, + ctx_id: u32, +} + +impl VmContext { + fn create(log_level: u32) -> Result { + let krun = ffi::libkrun()?; + unsafe { + check( + (krun.krun_init_log)( + ffi::KRUN_LOG_TARGET_DEFAULT, + crate::clamp_log_level(log_level), + ffi::KRUN_LOG_STYLE_AUTO, + ffi::KRUN_LOG_OPTION_NO_ENV, + ), + "krun_init_log", + )?; + } + + let ctx_id = unsafe { (krun.krun_create_ctx)() }; + if ctx_id < 0 { + return Err(VmError::Krun { + func: "krun_create_ctx", + code: ctx_id, + }); + } + + Ok(Self { + krun, + ctx_id: ctx_id as u32, + }) + } + + fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), + "krun_set_vm_config", + ) + } + } + + fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { + let rootfs_c = path_to_cstring(rootfs)?; + unsafe { + check( + (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), + "krun_set_root", + ) + } + } + + fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { + let Some(add_disk3) = self.krun.krun_add_disk3 else { + return Err(VmError::HostSetup( + "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" + .to_string(), + )); + }; + + let block_id_c = CString::new(state_disk.block_id.as_str())?; + let disk_path_c = path_to_cstring(&state_disk.path)?; + unsafe { + check( + add_disk3( + self.ctx_id, + block_id_c.as_ptr(), + disk_path_c.as_ptr(), + ffi::KRUN_DISK_FORMAT_RAW, + false, + false, + crate::state_disk_sync_mode(), + ), + "krun_add_disk3", + ) + } + } + + fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { + let workdir_c = CString::new(workdir)?; + unsafe { + check( + (self.krun.krun_set_workdir)(self.ctx_id, 
workdir_c.as_ptr()), + "krun_set_workdir", + ) + } + } + + fn disable_implicit_vsock(&self) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_disable_implicit_vsock)(self.ctx_id), + "krun_disable_implicit_vsock", + ) + } + } + + fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), + "krun_add_vsock", + ) + } + } + + #[cfg(target_os = "macos")] + fn add_net_unixgram( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + flags: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixgram)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + flags, + ), + "krun_add_net_unixgram", + ) + } + } + + #[allow(dead_code)] + fn add_net_unixstream( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixstream)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + 0, + ), + "krun_add_net_unixstream", + ) + } + } + + fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { + let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); + let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; + unsafe { + check( + (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), + "krun_set_port_map", + ) + } + } + + fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { + let socket_c = path_to_cstring(&port.socket_path)?; + unsafe { + check( + (self.krun.krun_add_vsock_port2)( + self.ctx_id, + port.port, + socket_c.as_ptr(), + port.listen, + ), + "krun_add_vsock_port2", + ) + } + } + + fn set_console_output(&self, path: &Path) -> Result<(), VmError> { + let console_c = path_to_cstring(path)?; + unsafe { + check( + (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), + 
"krun_set_console_output", + ) + } + } + + fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { + let exec_c = CString::new(exec_path)?; + let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); + let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; + let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); + let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; + + unsafe { + check( + (self.krun.krun_set_exec)( + self.ctx_id, + exec_c.as_ptr(), + argv_ptrs.as_ptr(), + env_ptrs.as_ptr(), + ), + "krun_set_exec", + ) + } + } + + fn start_enter(&self) -> i32 { + unsafe { (self.krun.krun_start_enter)(self.ctx_id) } + } +} + +impl Drop for VmContext { + fn drop(&mut self) { + unsafe { + let ret = (self.krun.krun_free_ctx)(self.ctx_id); + if ret < 0 { + eprintln!( + "warning: krun_free_ctx({}) failed with code {ret}", + self.ctx_id + ); + } + } + } +} + +/// Launch a VM using the libkrun backend. +/// +/// This contains the VM-specific configuration, networking, fork/exec, +/// signal forwarding, bootstrap, and cleanup logic that was previously +/// inline in `lib.rs::launch()`. +#[allow(clippy::similar_names)] +fn launch_libkrun(config: &VmConfig) -> Result { + let launch_start = Instant::now(); + + let vm = VmContext::create(config.log_level)?; + vm.set_vm_config(config.vcpus, config.mem_mib)?; + vm.set_root(&config.rootfs)?; + if let Some(state_disk) = &config.state_disk { + vm.add_state_disk(state_disk)?; + } + vm.set_workdir(&config.workdir)?; + + let mut gvproxy_guard: Option = None; + let mut gvproxy_api_sock: Option = None; + + match &config.net { + NetBackend::Tsi => {} + NetBackend::None => { + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + eprintln!("Networking: disabled (no TSI, no virtio-net)"); + } + NetBackend::Gvproxy { .. 
} => { + let gvproxy_setup = start_gvproxy(config, launch_start)?; + + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; + + const NET_FEATURE_CSUM: u32 = 1 << 0; + const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; + const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; + const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; + const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; + const NET_FEATURE_HOST_UFO: u32 = 1 << 14; + const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_UFO; + + #[cfg(target_os = "linux")] + vm.add_net_unixstream(&gvproxy_setup.net_sock, &mac, COMPAT_NET_FEATURES)?; + #[cfg(target_os = "macos")] + { + const NET_FLAG_VFKIT: u32 = 1 << 0; + vm.add_net_unixgram( + &gvproxy_setup.net_sock, + &mac, + COMPAT_NET_FEATURES, + NET_FLAG_VFKIT, + )?; + } + + eprintln!( + "Networking: gvproxy (virtio-net) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + gvproxy_api_sock = Some(gvproxy_setup.api_sock); + gvproxy_guard = Some(gvproxy_setup.guard); + } + } + + if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { + vm.set_port_map(&config.port_map)?; + } + + for vsock_port in &config.vsock_ports { + if let Some(parent) = vsock_port.socket_path.parent() { + std::fs::create_dir_all(parent).map_err(|e| { + VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) + })?; + } + let _ = std::fs::remove_file(&vsock_port.socket_path); + vm.add_vsock_port(vsock_port)?; + } + + let console_log = config.console_output.clone().unwrap_or_else(|| { + config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) + }); + vm.set_console_output(&console_log)?; + + let mut env: Vec = if config.env.is_empty() { + vec![ + "HOME=/root", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + 
"TERM=xterm", + ] + .into_iter() + .map(ToOwned::to_owned) + .collect() + } else { + config.env.clone() + }; + if let Some(state_disk) = &config.state_disk + && !env + .iter() + .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) + { + env.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + if config.gpu_enabled { + env.push("GPU_ENABLED=true".to_string()); + } + vm.set_exec(&config.exec_path, &config.args, &env)?; + + // Fork and enter the VM + let boot_start = Instant::now(); + eprintln!("Booting microVM..."); + + let pid = unsafe { libc::fork() }; + match pid { + -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), + 0 => { + let ret = vm.start_enter(); + eprintln!("krun_start_enter failed: {ret}"); + std::process::exit(1); + } + _ => { + if !config.is_exec_mode() { + let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); + if let Err(err) = + write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid, false) + { + unsafe { + libc::kill(pid, libc::SIGTERM); + } + drop(gvproxy_guard); + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + eprintln!( + "VM started (child pid {pid}) [{:.1}s]", + boot_start.elapsed().as_secs_f64() + ); + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + if let Some(ref api_sock) = gvproxy_api_sock { + setup_gvproxy_port_forwarding(api_sock, &config.port_map)?; + } + + if !config.is_exec_mode() && !config.port_map.is_empty() { + let gateway_port = gateway_host_port(config); + bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; + health::wait_for_gateway_ready(gateway_port, &config.gateway_name, config.gpu_enabled)?; + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + 
crate::CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); + unsafe { + libc::signal( + libc::SIGINT, + crate::forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + crate::forward_signal as *const () as libc::sighandler_t, + ); + } + + let mut status: libc::c_int = 0; + unsafe { + libc::waitpid(pid, &raw mut status, 0); + } + crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + + if !config.is_exec_mode() { + clear_vm_runtime_state(&config.rootfs); + } + if let Some(mut guard) = gvproxy_guard + && let Some(mut child) = guard.disarm() + { + let _ = child.kill(); + let _ = child.wait(); + eprintln!("gvproxy stopped"); + } + + if libc::WIFEXITED(status) { + let code = libc::WEXITSTATUS(status); + eprintln!("VM exited with code {code}"); + return Ok(code); + } else if libc::WIFSIGNALED(status) { + let sig = libc::WTERMSIG(status); + eprintln!("VM killed by signal {sig}"); + return Ok(128 + sig); + } + + Ok(status) + } + } +} diff --git a/crates/openshell-vm/src/backend/mod.rs b/crates/openshell-vm/src/backend/mod.rs new file mode 100644 index 000000000..0fe4abab1 --- /dev/null +++ b/crates/openshell-vm/src/backend/mod.rs @@ -0,0 +1,618 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VM backend abstraction layer. +//! +//! Defines the [`VmBackend`] trait that all hypervisor backends implement, +//! and shared infrastructure (gvproxy startup, networking helpers) used by +//! the libkrun and QEMU backends. + +pub mod libkrun; +pub mod qemu; + +use std::os::unix::net::UnixStream; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use crate::{ + GvproxyGuard, NetBackend, VmConfig, VmError, gvproxy_expose, gvproxy_socket_dir, + kill_stale_gvproxy, kill_stale_gvproxy_by_port, pick_gvproxy_ssh_port, vm_rootfs_key, +}; + +/// Trait implemented by each hypervisor backend (libkrun, QEMU). 
+pub trait VmBackend { + /// Launch a VM with the given configuration. + /// + /// Returns the VM exit code. + fn launch(&self, config: &VmConfig) -> Result<i32, VmError>; +} + +/// Result of starting a gvproxy instance, used by both backends. +pub(crate) struct GvproxySetup { + pub(crate) guard: GvproxyGuard, + pub(crate) api_sock: PathBuf, + pub(crate) net_sock: PathBuf, +} + +/// Start gvproxy for the given configuration. +/// +/// Shared between libkrun and QEMU backends. Handles stale process +/// cleanup, socket setup, and process spawning with exponential backoff +/// waiting for the network socket. +pub(crate) fn start_gvproxy( + config: &VmConfig, + launch_start: Instant, +) -> Result<GvproxySetup, VmError> { + let binary = match &config.net { + NetBackend::Gvproxy { binary } => binary, + _ => { + return Err(VmError::HostSetup( + "start_gvproxy called without Gvproxy net backend".into(), + )); + } + }; + + if !binary.exists() { + return Err(VmError::BinaryNotFound { + path: binary.display().to_string(), + hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), + }); + } + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + let sock_base = gvproxy_socket_dir(&config.rootfs)?; + let net_sock = sock_base.with_extension("v"); + let api_sock = sock_base.with_extension("a"); + + kill_stale_gvproxy(&config.rootfs); + for pm in &config.port_map { + if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::<u16>().ok()) { + kill_stale_gvproxy_by_port(host_port); + } + } + + let _ = std::fs::remove_file(&net_sock); + let _ = std::fs::remove_file(&api_sock); + let krun_sock = sock_base.with_extension("v-krun.sock"); + let _ = std::fs::remove_file(&krun_sock); + + eprintln!("Starting gvproxy: {}", binary.display()); + let ssh_port = pick_gvproxy_ssh_port()?; + let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); + let gvproxy_log_file = std::fs::File::create(&gvproxy_log) + 
.map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; + + #[cfg(target_os = "linux")] + let (gvproxy_net_flag, gvproxy_net_url) = + ("-listen-qemu", format!("unix://{}", net_sock.display())); + #[cfg(target_os = "macos")] + let (gvproxy_net_flag, gvproxy_net_url) = ( + "-listen-vfkit", + format!("unixgram://{}", net_sock.display()), + ); + + let child = std::process::Command::new(binary) + .arg(gvproxy_net_flag) + .arg(&gvproxy_net_url) + .arg("-listen") + .arg(format!("unix://{}", api_sock.display())) + .arg("-ssh-port") + .arg(ssh_port.to_string()) + .stdout(std::process::Stdio::null()) + .stderr(gvproxy_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; + + eprintln!( + "gvproxy started (pid {}, ssh port {}) [{:.1}s]", + child.id(), + ssh_port, + launch_start.elapsed().as_secs_f64() + ); + + { + let deadline = Instant::now() + Duration::from_secs(5); + let mut interval = Duration::from_millis(5); + while !net_sock.exists() { + if Instant::now() >= deadline { + return Err(VmError::Fork( + "gvproxy socket did not appear within 5s".to_string(), + )); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(100)); + } + } + + Ok(GvproxySetup { + guard: GvproxyGuard::new(child), + api_sock, + net_sock, + }) +} + +/// Set up port forwarding via the gvproxy HTTP API. +/// +/// Translates `host:guest` port map entries into gvproxy expose calls. 
+pub(crate) fn setup_gvproxy_port_forwarding( + api_sock: &Path, + port_map: &[String], +) -> Result<(), VmError> { + let fwd_start = Instant::now(); + { + let deadline = Instant::now() + Duration::from_secs(2); + let mut interval = Duration::from_millis(5); + while !api_sock.exists() { + if Instant::now() >= deadline { + eprintln!("warning: gvproxy API socket not ready after 2s, attempting anyway"); + break; + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + } + + let guest_ip = "192.168.127.2"; + + for pm in port_map { + let parts: Vec<&str> = pm.split(':').collect(); + let (host_port, guest_port) = match parts.len() { + 2 => (parts[0], parts[1]), + 1 => (parts[0], parts[0]), + _ => { + eprintln!(" skipping invalid port mapping: {pm}"); + continue; + } + }; + + let expose_body = format!( + r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# + ); + + let mut expose_ok = false; + let mut retry_interval = Duration::from_millis(100); + let expose_deadline = Instant::now() + Duration::from_secs(10); + loop { + match gvproxy_expose(api_sock, &expose_body) { + Ok(()) => { + eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); + expose_ok = true; + break; + } + Err(e) => { + if Instant::now() >= expose_deadline { + eprintln!(" port {host_port}: {e} (retries exhausted)"); + break; + } + std::thread::sleep(retry_interval); + retry_interval = (retry_interval * 2).min(Duration::from_secs(1)); + } + } + } + if !expose_ok { + return Err(VmError::HostSetup(format!( + "failed to forward port {host_port} via gvproxy" + ))); + } + } + eprintln!( + "Port forwarding ready [{:.1}s]", + fwd_start.elapsed().as_secs_f64() + ); + + Ok(()) +} + +// ── TAP networking constants ──────────────────────────────────────────── +// The QEMU backend uses 192.168.249.1/24 on the host side of the TAP +// device. The guest uses .2 with the host as its gateway. + +/// Fixed MAC for the guest TAP interface. 
Only one VM runs per host. +pub(crate) const GUEST_MAC: &str = "5a:94:ef:e4:0c:ee"; + +pub(crate) const TAP_HOST_IP: &str = "192.168.249.1"; +pub(crate) const TAP_GUEST_IP: &str = "192.168.249.2"; +pub(crate) const TAP_SUBNET: &str = "192.168.249.0/24"; + +/// Wait for a Unix socket to appear on the filesystem. +pub(crate) fn wait_for_socket( + socket_path: &Path, + label: &str, + timeout: Duration, +) -> Result<(), VmError> { + let deadline = Instant::now() + timeout; + let mut interval = Duration::from_millis(10); + + while !socket_path.exists() { + if Instant::now() >= deadline { + return Err(VmError::HostSetup(format!( + "{label} socket did not appear within {}s: {}", + timeout.as_secs(), + socket_path.display(), + ))); + } + std::thread::sleep(interval); + interval = (interval * 2).min(Duration::from_millis(200)); + } + + Ok(()) +} + +/// Run a command, returning an error if it fails. +pub(crate) fn run_cmd(cmd: &str, args: &[&str]) -> Result<(), VmError> { + let output = std::process::Command::new(cmd) + .args(args) + .output() + .map_err(|e| VmError::HostSetup(format!("{cmd}: {e}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(VmError::HostSetup(format!( + "{cmd} {}: {}", + args.join(" "), + stderr.trim() + ))); + } + + Ok(()) +} + +/// Escape a string for use in a shell script. +/// +/// Uses an allowlist of safe characters; anything outside the list gets +/// single-quoted. Single quotes inside the value are escaped with the +/// standard `'\''` idiom. +pub(crate) fn shell_escape(s: &str) -> String { + if s.is_empty() { + return "''".to_string(); + } + if s.bytes().all(|b| { + matches!(b, + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' + | b'_' | b'-' | b'.' | b'/' | b':' | b'@' | b'=' + ) + }) { + return s.to_string(); + } + format!("'{}'", s.replace('\'', "'\\''")) +} + +/// Parse a DNS server from resolv.conf content. 
+/// +/// Returns the first non-`127.x.x.x` nameserver, or `8.8.8.8` if none found. +pub(crate) fn parse_dns_server(content: &str) -> String { + content + .lines() + .filter(|line| line.starts_with("nameserver")) + .filter_map(|line| line.split_whitespace().nth(1)) + .find(|ip| !ip.starts_with("127.")) + .map(String::from) + .unwrap_or_else(|| "8.8.8.8".to_string()) +} + +/// Read the host's primary DNS server. +/// +/// Checks `/etc/resolv.conf` first. If every nameserver there is a loopback +/// address (e.g. systemd-resolved's `127.0.0.53`), falls back to the +/// upstream resolv.conf at `/run/systemd/resolve/resolv.conf` which +/// contains the real upstream nameservers. Final fallback is `8.8.8.8`. +pub(crate) fn host_dns_server() -> String { + for path in &["/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"] { + if let Ok(content) = std::fs::read_to_string(path) { + let server = parse_dns_server(&content); + if server != "8.8.8.8" { + return server; + } + } + } + "8.8.8.8".to_string() +} + +// ── Kernel command line ───────────────────────────────────────────────── + +/// Build the kernel command line shared by all backends that use virtiofs +/// rootfs and the standard init path. +pub(crate) fn build_kernel_cmdline( + config: &VmConfig, + effective_exec_path: &str, + use_tap_net: bool, +) -> String { + let mut parts = vec![ + "console=ttyS0".to_string(), + "root=rootfs".to_string(), + "rootfstype=virtiofs".to_string(), + "rw".to_string(), + "panic=-1".to_string(), + format!("init={effective_exec_path}"), + ]; + + if config.gpu_enabled && config.vfio_device.is_some() { + parts.push("GPU_ENABLED=true".to_string()); + // Tell the kernel firmware loader to search /lib/firmware explicitly. + // The init script stages firmware to tmpfs and overrides this via + // sysfs, but the cmdline provides an early fallback so + // request_firmware() can find GSP blobs on the virtiofs rootfs even + // before init runs the staging logic. 
+ parts.push("firmware_class.path=/lib/firmware".to_string()); + } + if let Some(state_disk) = &config.state_disk { + parts.push(format!( + "OPENSHELL_VM_STATE_DISK_DEVICE={}", + state_disk.guest_device + )); + } + for var in &config.env { + if var.contains('=') && !var.contains(' ') && !var.contains('"') { + parts.push(var.clone()); + } + } + + if use_tap_net { + parts.push(format!("VM_NET_IP={TAP_GUEST_IP}")); + parts.push(format!("VM_NET_GW={TAP_HOST_IP}")); + parts.push(format!("VM_NET_DNS={}", host_dns_server())); + } + + parts.join(" ") +} + +// ── TAP host networking ───────────────────────────────────────────────── + +/// Set up host-side networking so the guest can reach the internet via TAP. +/// +/// 1. Enable IP forwarding (saving the original value for teardown) +/// 2. MASQUERADE outbound traffic from the VM subnet +/// 3. Allow forwarding to/from the VM subnet +/// +/// Returns the original value of `ip_forward` so the caller can restore it. +pub(crate) fn setup_tap_host_networking() -> Result { + let original_ip_forward = std::fs::read_to_string("/proc/sys/net/ipv4/ip_forward") + .map(|s| s.trim().to_string()) + .unwrap_or_else(|_| "0".to_string()); + + std::fs::write("/proc/sys/net/ipv4/ip_forward", "1") + .map_err(|e| VmError::HostSetup(format!("enable IP forwarding: {e}")))?; + + let _ = run_cmd( + "iptables", + &[ + "-t", + "nat", + "-D", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", + ], + ); + run_cmd( + "iptables", + &[ + "-t", + "nat", + "-A", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", + ], + )?; + + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"], + ); + run_cmd( + "iptables", + &["-A", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"], + )?; + + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-d", TAP_SUBNET, "-j", "ACCEPT"], + ); + run_cmd( + "iptables", + &["-A", "FORWARD", "-d", TAP_SUBNET, "-j", 
"ACCEPT"], + )?; + + eprintln!("host networking: IP forwarding + NAT masquerade for {TAP_SUBNET}"); + Ok(original_ip_forward) +} + +/// Remove the iptables rules added by [`setup_tap_host_networking`] and +/// restore the original `ip_forward` sysctl value. +pub(crate) fn teardown_tap_host_networking(original_ip_forward: &str) { + let _ = run_cmd( + "iptables", + &[ + "-t", + "nat", + "-D", + "POSTROUTING", + "-s", + TAP_SUBNET, + "!", + "-d", + TAP_SUBNET, + "-j", + "MASQUERADE", + ], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-s", TAP_SUBNET, "-j", "ACCEPT"], + ); + let _ = run_cmd( + "iptables", + &["-D", "FORWARD", "-d", TAP_SUBNET, "-j", "ACCEPT"], + ); + if original_ip_forward != "1" { + let _ = std::fs::write("/proc/sys/net/ipv4/ip_forward", original_ip_forward); + } + eprintln!( + "host networking: cleaned up iptables rules, restored ip_forward={original_ip_forward}" + ); +} + +// ── TCP port forwarding ───────────────────────────────────────────────── + +/// Start a background TCP proxy that forwards `127.0.0.1:{host_port}` +/// to `{guest_ip}:{guest_port}`. +/// +/// Each accepted connection spawns two threads for bidirectional copy. +/// The listener thread runs until the process exits. 
+pub(crate) fn start_tcp_port_forwarder( + host_port: u16, + guest_ip: &str, + guest_port: u16, +) -> Result<(), VmError> { + use std::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind(("127.0.0.1", host_port)) + .map_err(|e| VmError::HostSetup(format!("bind port forwarder on :{host_port}: {e}")))?; + + let guest_addr = format!("{guest_ip}:{guest_port}"); + eprintln!("port forwarder: 127.0.0.1:{host_port} -> {guest_addr}"); + + std::thread::spawn(move || { + for stream in listener.incoming() { + let client = match stream { + Ok(s) => s, + Err(_) => continue, + }; + + let addr = guest_addr.clone(); + std::thread::spawn(move || { + if let Ok(remote) = TcpStream::connect(&addr) { + forward_tcp_bidirectional(client, remote); + } + }); + } + }); + + Ok(()) +} + +/// Copy data bidirectionally between two TCP streams until either side closes. +fn forward_tcp_bidirectional(client: std::net::TcpStream, remote: std::net::TcpStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut remote_r) = remote.try_clone() else { + return; + }; + let mut remote_w = remote; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut remote_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut remote_r, &mut client_w); + }); +} + +// ── Bidirectional Unix stream bridge ──────────────────────────────────── + +/// Spawn two threads that copy data between two Unix streams. 
+pub(crate) fn bridge_bidirectional(client: UnixStream, guest: UnixStream) { + let Ok(mut client_r) = client.try_clone() else { + return; + }; + let mut client_w = client; + let Ok(mut guest_r) = guest.try_clone() else { + return; + }; + let mut guest_w = guest; + + std::thread::spawn(move || { + let _ = std::io::copy(&mut client_r, &mut guest_w); + }); + std::thread::spawn(move || { + let _ = std::io::copy(&mut guest_r, &mut client_w); + }); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_dns_server_returns_first_non_loopback() { + let content = "nameserver 10.0.0.1\nnameserver 8.8.8.8\n"; + assert_eq!(parse_dns_server(content), "10.0.0.1"); + } + + #[test] + fn parse_dns_server_skips_systemd_resolved() { + let content = "nameserver 127.0.0.53\nnameserver 1.1.1.1\n"; + assert_eq!(parse_dns_server(content), "1.1.1.1"); + } + + #[test] + fn parse_dns_server_skips_all_loopback_variants() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\nnameserver 172.16.0.1\n"; + assert_eq!(parse_dns_server(content), "172.16.0.1"); + } + + #[test] + fn parse_dns_server_falls_back_when_only_loopback() { + let content = "nameserver 127.0.0.1\nnameserver 127.0.0.53\n"; + assert_eq!(parse_dns_server(content), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_handles_empty_content() { + assert_eq!(parse_dns_server(""), "8.8.8.8"); + } + + #[test] + fn parse_dns_server_ignores_comments_and_other_lines() { + let content = "# Generated by NetworkManager\nsearch example.com\nnameserver 10.1.2.3\n"; + assert_eq!(parse_dns_server(content), "10.1.2.3"); + } + + #[test] + fn shell_escape_empty_string() { + assert_eq!(shell_escape(""), "''"); + } + + #[test] + fn shell_escape_simple_string() { + assert_eq!(shell_escape("hello"), "hello"); + } + + #[test] + fn shell_escape_string_with_single_quotes() { + assert_eq!(shell_escape("it's"), "'it'\\''s'"); + } + + #[test] + fn shell_escape_string_with_spaces() { + assert_eq!(shell_escape("hello world"), "'hello 
world'"); + } + + #[test] + fn shell_escape_string_with_double_quotes() { + assert_eq!(shell_escape(r#"say "hi""#), r#"'say "hi"'"#); + } + + #[test] + fn shell_escape_string_with_backslash() { + assert_eq!(shell_escape("path\\to"), "'path\\to'"); + } +} diff --git a/crates/openshell-vm/src/backend/qemu.rs b/crates/openshell-vm/src/backend/qemu.rs new file mode 100644 index 000000000..10a9d7149 --- /dev/null +++ b/crates/openshell-vm/src/backend/qemu.rs @@ -0,0 +1,1048 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! QEMU backend for GPU passthrough VMs. +//! +//! Uses QEMU's command-line interface with KVM acceleration and VFIO device +//! passthrough. This backend is Linux-only and requires a separate kernel +//! image (`vmlinux`) and `virtiofsd` for the root filesystem. +//! +//! QEMU handles VFIO devices with or without MSI-X capability, falling +//! back to legacy interrupt emulation when MSI-X is unavailable. + +use std::os::unix::net::UnixStream; +use std::os::unix::process::CommandExt; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +use super::{ + GUEST_MAC, TAP_GUEST_IP, TAP_HOST_IP, VmBackend, bridge_bidirectional, build_kernel_cmdline, + run_cmd, setup_tap_host_networking, shell_escape, start_tcp_port_forwarder, + teardown_tap_host_networking, wait_for_socket, +}; +use crate::exec::{ + VM_EXEC_VSOCK_PORT, clear_vm_runtime_state, vm_exec_socket_path, write_vm_runtime_state, +}; +use crate::{NetBackend, VmConfig, VmError, vm_rootfs_key}; + +const VSOCK_GUEST_CID: u32 = 3; +const QEMU_BINARY_NAME: &str = "qemu-system-x86_64"; + +/// QEMU hypervisor backend for GPU passthrough. +pub struct QemuBackend { + qemu_binary: PathBuf, + vmlinux: PathBuf, + virtiofsd: PathBuf, +} + +impl QemuBackend { + /// Create a new QEMU backend, validating required binaries. 
+ pub fn new() -> Result<Self, VmError> { + let runtime_dir = crate::configured_runtime_dir()?; + + let qemu_binary = { + let bundled = runtime_dir.join(QEMU_BINARY_NAME); + if bundled.is_file() { + bundled + } else { + find_in_path(QEMU_BINARY_NAME).ok_or_else(|| VmError::BinaryNotFound { + path: bundled.display().to_string(), + hint: "QEMU backend requires qemu-system-x86_64. Install QEMU or set OPENSHELL_VM_RUNTIME_DIR".to_string(), + })? + } + }; + + let vmlinux = runtime_dir.join("vmlinux"); + if !vmlinux.is_file() { + return Err(VmError::BinaryNotFound { + path: vmlinux.display().to_string(), + hint: "QEMU backend requires a vmlinux kernel. Run the GPU build pipeline" + .to_string(), + }); + } + + let virtiofsd = runtime_dir.join("virtiofsd"); + if !virtiofsd.is_file() { + return Err(VmError::BinaryNotFound { + path: virtiofsd.display().to_string(), + hint: "QEMU backend requires virtiofsd. Run the GPU build pipeline".to_string(), + }); + } + + // Verify vhost-vsock is available. QEMU's vhost-vsock-pci device + // needs /dev/vhost-vsock (provided by the vhost_vsock kernel module). + // A plain AF_VSOCK socket() can succeed with just the vsock module, + // but connect() will fail with ENODEV if vhost_vsock isn't loaded. 
+ if !Path::new("/dev/vhost-vsock").exists() { + return Err(VmError::HostSetup( + "/dev/vhost-vsock not found.\n\ + QEMU backend requires the vhost_vsock kernel module.\n\ + Fix: sudo modprobe vhost_vsock" + .to_string(), + )); + } + { + let fd = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_STREAM, 0) }; + if fd < 0 { + let err = std::io::Error::last_os_error(); + return Err(VmError::HostSetup(format!( + "AF_VSOCK socket creation failed: {err}\n\ + QEMU backend requires the vhost_vsock kernel module.\n\ + Fix: sudo modprobe vhost_vsock" + ))); + } + unsafe { libc::close(fd) }; + } + + Ok(Self { + qemu_binary, + vmlinux, + virtiofsd, + }) + } +} + +impl VmBackend for QemuBackend { + fn launch(&self, config: &VmConfig) -> Result<i32, VmError> { + launch_qemu(self, config) + } +} + +/// Search `$PATH` for a binary by name. +fn find_in_path(name: &str) -> Option<PathBuf> { + let path_var = std::env::var_os("PATH")?; + for dir in std::env::split_paths(&path_var) { + let candidate = dir.join(name); + if candidate.is_file() { + return Some(candidate); + } + } + None +} + +const TAP_DEVICE_NAME: &str = "vmtap0"; + +/// Create and configure the TAP device before QEMU starts. +/// +/// QEMU with `script=no` expects the TAP device to already exist. +fn setup_tap_device() -> Result<(), VmError> { + // Clean up stale TAP device from a previous crashed run. + if Path::new(&format!("/sys/class/net/{TAP_DEVICE_NAME}")).exists() { + eprintln!("TAP device {TAP_DEVICE_NAME} already exists, removing stale device"); + let _ = run_cmd("ip", &["link", "delete", TAP_DEVICE_NAME]); + } + run_cmd( + "ip", + &["tuntap", "add", "dev", TAP_DEVICE_NAME, "mode", "tap"], + )?; + run_cmd( + "ip", + &[ + "addr", + "add", + &format!("{TAP_HOST_IP}/24"), + "dev", + TAP_DEVICE_NAME, + ], + )?; + run_cmd("ip", &["link", "set", TAP_DEVICE_NAME, "up"])?; + eprintln!("TAP device {TAP_DEVICE_NAME} created with {TAP_HOST_IP}"); + Ok(()) +} + +/// Remove the TAP device created by [`setup_tap_device`]. 
+fn teardown_tap_device() { + let _ = run_cmd("ip", &["link", "delete", TAP_DEVICE_NAME]); + eprintln!("TAP device {TAP_DEVICE_NAME} removed"); +} + +// ── Build QEMU command-line arguments ─────────────────────────────────── + +fn build_qemu_args( + backend: &QemuBackend, + config: &VmConfig, + effective_exec_path: &str, + vfio_device: Option<&str>, + virtiofsd_sock: &Path, + state_disk_path: Option<&Path>, + use_tap_net: bool, + guest_cid: u32, + console_log: &Path, +) -> Vec { + let mut args = Vec::new(); + + // Machine, CPU, resources + args.extend([ + "-machine".into(), + "q35,accel=kvm".into(), + "-cpu".into(), + "host".into(), + "-smp".into(), + config.vcpus.to_string(), + "-m".into(), + format!("{}M", config.mem_mib), + ]); + + // Kernel + args.extend(["-kernel".into(), backend.vmlinux.display().to_string()]); + + let cmdline = build_kernel_cmdline(config, effective_exec_path, use_tap_net); + args.extend(["-append".into(), cmdline]); + + // virtiofs rootfs + args.extend([ + "-chardev".into(), + format!("socket,id=vfsock,path={}", virtiofsd_sock.display()), + "-device".into(), + "vhost-user-fs-pci,chardev=vfsock,tag=rootfs".into(), + "-object".into(), + format!( + "memory-backend-file,id=mem,size={}M,mem-path=/dev/shm,share=on", + config.mem_mib + ), + "-numa".into(), + "node,memdev=mem".into(), + ]); + + // State disk + if let Some(disk_path) = state_disk_path { + args.extend([ + "-drive".into(), + format!("file={},format=raw,if=virtio", disk_path.display()), + ]); + } + + // PCIe root ports — Q35's pcie.0 root bus does not support + // hotplugging. VFIO and vhost-vsock-pci need dedicated root ports + // to initialize correctly under the Q35 PCIe topology. + // virtio-net-pci and vhost-user-fs-pci are QEMU-emulated devices + // that work directly on the root bus without dedicated root ports. 
+ const PCIE_SLOT_VFIO: u8 = 1; + const PCIE_SLOT_VSOCK: u8 = 2; + + // VFIO device passthrough + if let Some(bdf) = vfio_device { + args.extend([ + "-device".into(), + format!("pcie-root-port,id=vfio-rp,chassis={PCIE_SLOT_VFIO},slot={PCIE_SLOT_VFIO}"), + "-device".into(), + format!("vfio-pci,host={bdf},bus=vfio-rp"), + ]); + } + + // vsock + args.extend([ + "-device".into(), + format!("pcie-root-port,id=vsock-rp,chassis={PCIE_SLOT_VSOCK},slot={PCIE_SLOT_VSOCK}"), + "-device".into(), + format!("vhost-vsock-pci,guest-cid={guest_cid},bus=vsock-rp"), + ]); + + // TAP networking + if use_tap_net { + args.extend([ + "-netdev".into(), + "tap,id=net0,ifname=vmtap0,script=no,downscript=no".into(), + "-device".into(), + format!("virtio-net-pci,netdev=net0,mac={GUEST_MAC}"), + ]); + } + + // Console / display — disable monitor explicitly to prevent + // stdin from being interpreted as monitor commands. + args.extend([ + "-serial".into(), + format!("file:{}", console_log.display()), + "-display".into(), + "none".into(), + "-monitor".into(), + "none".into(), + "-no-reboot".into(), + ]); + + args +} + +// ── Launch ────────────────────────────────────────────────────────────── + +#[allow(clippy::similar_names)] +fn launch_qemu(backend: &QemuBackend, config: &VmConfig) -> Result { + let launch_start = Instant::now(); + + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let rootfs_key = vm_rootfs_key(&config.rootfs); + + let sock_dir = PathBuf::from(format!("/tmp/ovm-qemu-{}", std::process::id())); + if let Ok(entries) = std::fs::read_dir("/tmp") { + for entry in entries.filter_map(Result::ok) { + let name = entry.file_name().to_string_lossy().to_string(); + if name.starts_with("ovm-qemu-") && entry.path() != sock_dir { + let is_stale = name + .strip_prefix("ovm-qemu-") + .and_then(|pid_str| pid_str.parse::().ok()) + .map(|pid| unsafe { libc::kill(pid, 0) } != 0) + .unwrap_or(true); + if is_stale { + let _ = 
std::fs::remove_dir_all(entry.path()); + } + } + } + } + std::fs::create_dir_all(&sock_dir).map_err(|e| { + VmError::HostSetup(format!("create socket dir {}: {e}", sock_dir.display())) + })?; + + let virtiofsd_sock_path = sock_dir.join("virtiofsd.sock"); + let console_log = config + .console_output + .clone() + .unwrap_or_else(|| run_dir.join(format!("{rootfs_key}-console.log"))); + + let _ = std::fs::remove_file(&virtiofsd_sock_path); + + // Start virtiofsd + eprintln!("Starting virtiofsd: {}", backend.virtiofsd.display()); + let virtiofsd_log = run_dir.join(format!("{rootfs_key}-virtiofsd.log")); + let virtiofsd_log_file = std::fs::File::create(&virtiofsd_log) + .map_err(|e| VmError::Fork(format!("create virtiofsd log: {e}")))?; + + let mut virtiofsd_cmd = std::process::Command::new(&backend.virtiofsd); + virtiofsd_cmd + .arg(format!("--socket-path={}", virtiofsd_sock_path.display())) + .arg(format!("--shared-dir={}", config.rootfs.display())) + .arg("--cache=always") + .stdout(std::process::Stdio::null()) + .stderr(virtiofsd_log_file); + #[allow(unsafe_code)] + unsafe { + virtiofsd_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut virtiofsd_child = virtiofsd_cmd + .spawn() + .map_err(|e| VmError::Fork(format!("start virtiofsd: {e}")))?; + + let virtiofsd_pid = virtiofsd_child.id() as i32; + crate::VIRTIOFSD_PID.store(virtiofsd_pid, std::sync::atomic::Ordering::Relaxed); + + eprintln!( + "virtiofsd started (pid {virtiofsd_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + wait_for_socket(&virtiofsd_sock_path, "virtiofsd", Duration::from_secs(5))?; + + let use_tap_net = !matches!(config.net, NetBackend::None); + + // Build exec wrapper for --exec mode + let is_exec_mode = config.is_exec_mode(); + let wrapper_path = config.rootfs.join("tmp/qemu-exec-wrapper.sh"); + let effective_exec_path; + if is_exec_mode { + let args_str = config + .args + .iter() + .map(|a| shell_escape(a)) + .collect::>() + 
.join(" "); + + let env_str = config + .env + .iter() + .map(|v| format!("export {}", shell_escape(v))) + .collect::>() + .join("\n"); + + let wrapper = format!( + "#!/bin/sh\n\ + mount -t proc proc /proc 2>/dev/null\n\ + mount -t sysfs sysfs /sys 2>/dev/null\n\ + mount -t devtmpfs devtmpfs /dev 2>/dev/null\n\ + {env_str}\n\ + cd {workdir}\n\ + {exec} {args}\n\ + RC=$?\n\ + if command -v poweroff >/dev/null 2>&1; then\n\ + poweroff -f\n\ + elif [ -x /usr/bin/busybox ]; then\n\ + /usr/bin/busybox poweroff -f\n\ + else\n\ + echo o > /proc/sysrq-trigger\n\ + fi\n\ + exit $RC\n", + env_str = env_str, + workdir = shell_escape(&config.workdir), + exec = shell_escape(&config.exec_path), + args = args_str, + ); + + if let Some(parent) = wrapper_path.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| VmError::HostSetup(format!("create wrapper dir: {e}")))?; + } + std::fs::write(&wrapper_path, &wrapper) + .map_err(|e| VmError::HostSetup(format!("write exec wrapper: {e}")))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = std::fs::set_permissions(&wrapper_path, std::fs::Permissions::from_mode(0o755)); + } + effective_exec_path = "/tmp/qemu-exec-wrapper.sh".to_string(); + } else { + effective_exec_path = config.exec_path.clone(); + } + + // Build QEMU command line + let state_disk_path = config.state_disk.as_ref().map(|sd| sd.path.as_path()); + let qemu_args = build_qemu_args( + backend, + config, + &effective_exec_path, + config.vfio_device.as_deref(), + &virtiofsd_sock_path, + state_disk_path, + use_tap_net, + VSOCK_GUEST_CID, + &console_log, + ); + + // Create TAP device before QEMU starts (QEMU with script=no expects it). 
+ if use_tap_net { + setup_tap_device()?; + } + + // Spawn QEMU + eprintln!("Starting QEMU: {}", backend.qemu_binary.display()); + let qemu_log = run_dir.join(format!("{rootfs_key}-qemu.log")); + let qemu_log_file = std::fs::File::create(&qemu_log) + .map_err(|e| VmError::Fork(format!("create QEMU log: {e}")))?; + + let mut qemu_cmd = std::process::Command::new(&backend.qemu_binary); + qemu_cmd + .args(&qemu_args) + .stdout(std::process::Stdio::null()) + .stderr(qemu_log_file); + #[allow(unsafe_code)] + unsafe { + qemu_cmd.pre_exec(|| { + libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); + Ok(()) + }); + } + let mut qemu_child = qemu_cmd + .spawn() + .map_err(|e| VmError::Fork(format!("start QEMU: {e}")))?; + + let qemu_pid = qemu_child.id() as i32; + eprintln!( + "QEMU started (pid {qemu_pid}) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + + // Install signal handlers immediately so SIGTERM during the long + // gateway bootstrap (30-120s) forwards to QEMU instead of killing + // the parent via the default handler (which skips Drop and leaves + // the GPU bound to vfio-pci). + // + // We use sigaction with SA_RESTART so that the wait() syscall in the + // main thread auto-restarts after the handler returns, rather than + // failing with EINTR. This prevents a second signal from killing the + // process before cleanup runs. 
+ crate::CHILD_PID.store(qemu_pid, std::sync::atomic::Ordering::Relaxed); + unsafe { + let mut sa: libc::sigaction = std::mem::zeroed(); + sa.sa_sigaction = crate::forward_signal as *const () as libc::sighandler_t; + sa.sa_flags = libc::SA_RESTART; + libc::sigemptyset(&raw mut sa.sa_mask); + libc::sigaction(libc::SIGTERM, &sa, std::ptr::null_mut()); + libc::sigaction(libc::SIGINT, &sa, std::ptr::null_mut()); + } + + // Set up host-side TAP networking + let mut original_ip_forward: Option = None; + if use_tap_net { + match setup_tap_host_networking() { + Ok(orig) => original_ip_forward = Some(orig), + Err(e) => { + eprintln!("WARNING: host networking setup failed: {e}"); + eprintln!(" The VM may not have internet access."); + } + } + } + + // Start AF_VSOCK exec bridge + let exec_socket = vm_exec_socket_path(&config.rootfs); + start_vsock_exec_bridge_af_vsock( + &exec_socket, + VSOCK_GUEST_CID, + VM_EXEC_VSOCK_PORT, + qemu_child.id(), + )?; + + // Write runtime state (vsock_bridge: true — uses AF_VSOCK bridging) + if !config.is_exec_mode() { + if let Err(err) = write_vm_runtime_state(&config.rootfs, qemu_pid, &console_log, None, true) + { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + return Err(err); + } + } + + // TCP port forwarding for TAP networking + if use_tap_net { + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + if parts.len() == 2 { + if let (Ok(hp), Ok(gp)) = (parts[0].parse::(), parts[1].parse::()) { + if let Err(e) = start_tcp_port_forwarder(hp, TAP_GUEST_IP, gp) { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + 
teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } + } + } + } + } + + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + // Gateway bootstrap and health check + if !config.is_exec_mode() && !config.port_map.is_empty() { + let gateway_port = crate::gateway_host_port(config); + if let Err(e) = crate::bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port) + .and_then(|_| crate::health::wait_for_gateway_ready(gateway_port, &config.gateway_name, config.gpu_enabled)) + { + let _ = qemu_child.kill(); + let _ = qemu_child.wait(); + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + if use_tap_net { + teardown_tap_device(); + } + clear_vm_runtime_state(&config.rootfs); + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + return Err(e); + } + } + + eprintln!("Ready [{:.1}s total]", launch_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + // Wait for QEMU to exit. SA_RESTART ensures the wait() syscall + // auto-restarts after our signal handler runs, so QEMU gets a + // chance to shut down gracefully before we proceed to cleanup. + let status = qemu_child + .wait() + .map_err(|e| VmError::HostSetup(format!("wait for QEMU: {e}")))?; + + // Clear all signal-related atomics now that QEMU has exited. 
+ crate::CHILD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + crate::VIRTIOFSD_PID.store(0, std::sync::atomic::Ordering::Relaxed); + + let was_shutdown = crate::SHUTDOWN_REQUESTED.load(std::sync::atomic::Ordering::Relaxed); + if was_shutdown { + eprintln!("Shutdown signal received, running explicit cleanup..."); + } + + // ── Explicit cleanup (does NOT rely on Drop) ────────────────── + // + // This runs whether QEMU exited normally or was signalled. The + // signal handler forwarded SIGTERM to the process group, but we + // still need to clean up host-side state. + + // 1. Kill virtiofsd + let _ = virtiofsd_child.kill(); + let _ = virtiofsd_child.wait(); + eprintln!("virtiofsd stopped"); + + // 2. Tear down TAP device + if use_tap_net { + teardown_tap_device(); + } + + // 3. Tear down host networking (iptables) + if let Some(ref orig) = original_ip_forward { + teardown_tap_host_networking(orig); + } + + // 4. Clean up runtime state files + if !config.is_exec_mode() { + clear_vm_runtime_state(&config.rootfs); + } + + // 5. Clean up socket directories and temporary files + let _ = std::fs::remove_dir_all(&sock_dir); + let _ = std::fs::remove_file(&exec_socket); + if is_exec_mode { + let _ = std::fs::remove_file(&wrapper_path); + } + + let code = status.code().unwrap_or(1); + eprintln!("VM exited with code {code}"); + Ok(code) +} + +// ── AF_VSOCK exec bridge ──────────────────────────────────────────────── + +/// Start a background bridge: exec Unix socket → guest AF_VSOCK. +/// +/// QEMU uses kernel `vhost-vsock-pci` which exposes guest vsock via the +/// kernel's `AF_VSOCK` address family. We connect directly to the guest +/// CID and port using raw `AF_VSOCK` sockets. 
+fn start_vsock_exec_bridge_af_vsock(
+    exec_socket: &Path,
+    guest_cid: u32,
+    guest_port: u32,
+    qemu_pid: u32,
+) -> Result<(), VmError> {
+    use std::os::unix::net::UnixListener;
+
+    if let Some(parent) = exec_socket.parent() {
+        std::fs::create_dir_all(parent).map_err(|e| {
+            VmError::HostSetup(format!("create exec bridge dir {}: {e}", parent.display()))
+        })?;
+    }
+    let _ = std::fs::remove_file(exec_socket);
+
+    let listener = UnixListener::bind(exec_socket).map_err(|e| {
+        VmError::HostSetup(format!(
+            "bind vsock exec bridge {}: {e}",
+            exec_socket.display()
+        ))
+    })?;
+
+    eprintln!(
+        "vsock exec bridge (AF_VSOCK): {} → CID {} port {}",
+        exec_socket.display(),
+        guest_cid,
+        guest_port,
+    );
+
+    std::thread::spawn(move || {
+        af_vsock_bridge_accept_loop(listener, guest_cid, guest_port, qemu_pid);
+    });
+
+    Ok(())
+}
+
+/// Connect to a guest vsock port via kernel AF_VSOCK.
+///
+/// Returns the connected socket wrapped as a `UnixStream`. The `UnixStream`
+/// type is used solely for its `Read`/`Write` trait impls which delegate to
+/// raw `read()`/`write()` syscalls — address-family-specific methods like
+/// `peer_addr()` must not be called on the returned stream.
+fn connect_af_vsock(cid: u32, port: u32) -> std::io::Result<UnixStream> {
+    use std::os::unix::io::FromRawFd;
+
+    let fd = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_STREAM, 0) };
+    if fd < 0 {
+        return Err(std::io::Error::last_os_error());
+    }
+
+    let addr = libc::sockaddr_vm {
+        svm_family: libc::AF_VSOCK as u16,
+        svm_reserved1: 0,
+        svm_port: port,
+        svm_cid: cid,
+        svm_zero: [0; 4],
+    };
+
+    let ret = unsafe {
+        libc::connect(
+            fd,
+            std::ptr::from_ref(&addr).cast::<libc::sockaddr>(),
+            size_of::<libc::sockaddr_vm>() as libc::socklen_t,
+        )
+    };
+
+    if ret < 0 {
+        let err = std::io::Error::last_os_error();
+        unsafe { libc::close(fd) };
+        return Err(err);
+    }
+
+    // SAFETY: fd is a valid, connected socket. We wrap it as UnixStream
+    // purely for Read/Write access used by bridge_bidirectional().
+ Ok(unsafe { UnixStream::from_raw_fd(fd) }) +} + +/// Whether a vsock connect error is transient (expected during VM boot). +/// +/// The guest exec agent takes time to start, and the vhost-vsock transport +/// may not be fully initialized when QEMU first launches. These errors +/// resolve on their own once the guest is ready. +fn is_transient_vsock_error(e: &std::io::Error) -> bool { + if e.kind() == std::io::ErrorKind::ConnectionRefused { + return true; + } + match e.raw_os_error() { + Some(code) => { + code == libc::ENODEV // vsock transport not ready + || code == libc::EHOSTUNREACH // guest CID not reachable yet + || code == libc::ECONNRESET // connection reset during startup + || code == libc::ETIMEDOUT // connect timed out + } + None => false, + } +} + +/// Accept loop for the AF_VSOCK bridge background thread. +/// +/// Connection failures during boot are expected — the guest exec agent +/// isn't listening yet. We keep retrying since the bootstrap caller has +/// its own 120s timeout. If the QEMU process exits, we stop immediately +/// rather than retrying against a dead CID for 120s. +fn af_vsock_bridge_accept_loop( + listener: std::os::unix::net::UnixListener, + guest_cid: u32, + port: u32, + qemu_pid: u32, +) { + // Give QEMU time to initialize the vhost-vsock-pci device and register + // the CID with the kernel transport before accepting connections. 
+ std::thread::sleep(Duration::from_secs(2)); + + let mut fatal_failures: u32 = 0; + let mut logged_transient = false; + + for stream in listener.incoming() { + if !is_process_alive(qemu_pid) { + eprintln!("vsock bridge: QEMU (pid {qemu_pid}) exited, stopping bridge"); + return; + } + + let client = match stream { + Ok(s) => s, + Err(e) => { + eprintln!("vsock bridge: accept: {e}"); + continue; + } + }; + + match connect_af_vsock(guest_cid, port) { + Ok(guest) => { + fatal_failures = 0; + bridge_bidirectional(client, guest); + } + Err(e) if is_transient_vsock_error(&e) => { + if !is_process_alive(qemu_pid) { + eprintln!( + "vsock bridge: QEMU (pid {qemu_pid}) exited — \ + check console log for VM boot errors" + ); + return; + } + if !logged_transient { + eprintln!( + "vsock bridge: guest not ready on CID {guest_cid} port {port} ({e}), \ + will keep retrying..." + ); + logged_transient = true; + } + std::thread::sleep(Duration::from_secs(1)); + } + Err(e) => { + fatal_failures += 1; + if fatal_failures <= 2 { + eprintln!("vsock bridge: AF_VSOCK connect failed: {e}"); + } + if fatal_failures >= 5 { + eprintln!("vsock bridge: too many AF_VSOCK failures, stopping bridge"); + return; + } + std::thread::sleep(Duration::from_secs(1)); + } + } + } +} + +fn is_process_alive(pid: u32) -> bool { + unsafe { libc::kill(pid as i32, 0) == 0 } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_backend() -> QemuBackend { + QemuBackend { + qemu_binary: "/usr/bin/qemu-system-x86_64".into(), + vmlinux: "/boot/vmlinux".into(), + virtiofsd: "/usr/bin/virtiofsd".into(), + } + } + + fn base_config() -> VmConfig { + VmConfig { + rootfs: "/tmp/rootfs".into(), + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/openshell-vm-init.sh".into(), + args: vec![], + env: vec![], + workdir: "/".into(), + port_map: vec![], + vsock_ports: vec![], + log_level: 1, + console_output: None, + net: NetBackend::None, + reset: false, + gateway_name: "test".into(), + state_disk: None, + gpu_enabled: false, 
+ gpu_has_msix: false, + vfio_device: None, + backend: crate::VmBackendChoice::Qemu, + } + } + + #[test] + fn build_qemu_args_basic() { + let backend = test_backend(); + let config = base_config(); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!(args.contains(&"-machine".to_string())); + assert!(args.contains(&"q35,accel=kvm".to_string())); + assert!(args.contains(&"-cpu".to_string())); + assert!(args.contains(&"host".to_string())); + assert!(args.contains(&"-smp".to_string())); + assert!(args.contains(&"4".to_string())); + assert!(args.contains(&"-m".to_string())); + assert!(args.contains(&"8192M".to_string())); + assert!(args.contains(&"-monitor".to_string())); + assert!(args.contains(&"none".to_string())); + assert!(args.contains(&"-no-reboot".to_string())); + assert!(!args.iter().any(|a| a.contains("vfio-pci"))); + assert!(!args.iter().any(|a| a.contains("tap"))); + assert!( + args.iter() + .any(|a| a.contains("pcie-root-port,id=vsock-rp")), + "args should contain PCIe root port for vsock: {args:?}" + ); + assert!( + args.iter() + .any(|a| a.contains("vhost-vsock-pci,guest-cid=3,bus=vsock-rp")), + "args should contain vsock on root port: {args:?}" + ); + } + + #[test] + fn build_qemu_args_with_vfio() { + let backend = test_backend(); + let mut config = base_config(); + config.gpu_enabled = true; + config.vfio_device = Some("0000:41:00.0".into()); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + args.iter() + .any(|a| a.contains("vfio-pci,host=0000:41:00.0,bus=vfio-rp")), + "args should contain VFIO device on root port: {args:?}" + ); + assert!( + args.iter().any(|a| a.contains("pcie-root-port,id=vfio-rp")), + "args should 
contain PCIe root port for VFIO: {args:?}" + ); + } + + #[test] + fn build_qemu_args_with_tap_net() { + let backend = test_backend(); + let mut config = base_config(); + config.net = NetBackend::Gvproxy { + binary: "/usr/bin/gvproxy".into(), + }; + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + true, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + args.iter().any(|a| a.contains("tap,id=net0")), + "args should contain TAP netdev: {args:?}" + ); + assert!( + args.iter() + .any(|a| a.contains("virtio-net-pci,netdev=net0")), + "args should contain virtio-net device: {args:?}" + ); + } + + #[test] + fn build_qemu_args_without_net() { + let backend = test_backend(); + let config = base_config(); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + None, + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + assert!( + !args.iter().any(|a| a.contains("tap")), + "args should not contain TAP: {args:?}" + ); + assert!( + !args.iter().any(|a| a.contains("virtio-net")), + "args should not contain virtio-net: {args:?}" + ); + } + + #[test] + fn build_qemu_args_gpu_enabled_cmdline() { + let backend = test_backend(); + let mut config = base_config(); + config.gpu_enabled = true; + config.vfio_device = Some("0000:41:00.0".into()); + + let args = build_qemu_args( + &backend, + &config, + &config.exec_path, + config.vfio_device.as_deref(), + Path::new("/tmp/virtiofsd.sock"), + None, + false, + VSOCK_GUEST_CID, + Path::new("/tmp/console.log"), + ); + + let append_idx = args.iter().position(|a| a == "-append").unwrap(); + let cmdline = &args[append_idx + 1]; + assert!( + cmdline.contains("GPU_ENABLED=true"), + "cmdline should contain GPU_ENABLED=true: {cmdline}" + ); + assert!( + cmdline.contains("firmware_class.path=/lib/firmware"), + "cmdline should contain firmware_class.path for GPU: {cmdline}" 
+ ); + } + + #[test] + fn transient_vsock_errors_classified_correctly() { + // Kind-based: ConnectionRefused + let refused = std::io::Error::from(std::io::ErrorKind::ConnectionRefused); + assert!( + is_transient_vsock_error(&refused), + "ConnectionRefused should be transient" + ); + + // OS-error-based transient codes + let enodev = std::io::Error::from_raw_os_error(libc::ENODEV); + assert!( + is_transient_vsock_error(&enodev), + "ENODEV should be transient" + ); + + let ehostunreach = std::io::Error::from_raw_os_error(libc::EHOSTUNREACH); + assert!( + is_transient_vsock_error(&ehostunreach), + "EHOSTUNREACH should be transient" + ); + + let econnreset = std::io::Error::from_raw_os_error(libc::ECONNRESET); + assert!( + is_transient_vsock_error(&econnreset), + "ECONNRESET should be transient" + ); + + let etimedout = std::io::Error::from_raw_os_error(libc::ETIMEDOUT); + assert!( + is_transient_vsock_error(&etimedout), + "ETIMEDOUT should be transient" + ); + + // Non-transient errors + let eperm = std::io::Error::from_raw_os_error(libc::EPERM); + assert!( + !is_transient_vsock_error(&eperm), + "EPERM should not be transient" + ); + + let eacces = std::io::Error::from_raw_os_error(libc::EACCES); + assert!( + !is_transient_vsock_error(&eacces), + "EACCES should not be transient" + ); + + let other = std::io::Error::new(std::io::ErrorKind::Other, "something else"); + assert!( + !is_transient_vsock_error(&other), + "ErrorKind::Other should not be transient" + ); + } +} diff --git a/crates/openshell-vm/src/embedded.rs b/crates/openshell-vm/src/embedded.rs index 731f34b10..6a4a2d3f6 100644 --- a/crates/openshell-vm/src/embedded.rs +++ b/crates/openshell-vm/src/embedded.rs @@ -26,6 +26,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.5.dylib.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + 
pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.dylib"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.5.dylib"; } @@ -36,6 +37,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.so"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; } @@ -46,6 +48,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); + pub const ROOTFS_GPU: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs-gpu.tar.zst")); pub const LIBKRUN_NAME: &str = "libkrun.so"; pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; } @@ -61,6 +64,7 @@ mod resources { pub const LIBKRUNFW: &[u8] = &[]; pub const GVPROXY: &[u8] = &[]; pub const ROOTFS: &[u8] = &[]; + pub const ROOTFS_GPU: &[u8] = &[]; pub const LIBKRUN_NAME: &str = "libkrun"; pub const LIBKRUNFW_NAME: &str = "libkrunfw"; } @@ -232,11 +236,16 @@ pub fn cleanup_old_rootfs() -> Result<(), VmError> { cleanup_old_versions_in_base(&base, ¤t_version_dir) } -/// Check if the rootfs is embedded (non-empty). +/// Check if the base rootfs is embedded (non-empty). pub fn has_embedded_rootfs() -> bool { !resources::ROOTFS.is_empty() } +/// Check if the GPU rootfs is embedded (non-empty). 
+pub fn has_embedded_gpu_rootfs() -> bool { + !resources::ROOTFS_GPU.is_empty() +} + // ── Internal helpers ─────────────────────────────────────────────────────── /// Build a cache key that combines the version string with a short content diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs index 6195556e1..e7fe27e12 100644 --- a/crates/openshell-vm/src/exec.rs +++ b/crates/openshell-vm/src/exec.rs @@ -48,6 +48,21 @@ fn safe_remove_dir_all(path: &Path) -> Result { pub const VM_EXEC_VSOCK_PORT: u32 = 10_777; +/// How to connect to the VM exec agent. +/// +/// libkrun bridges each guest vsock port to a host Unix socket via +/// `krun_add_vsock_port2`. QEMU uses kernel AF_VSOCK via vhost-vsock-pci, +/// bridged through a host Unix socket by the exec bridge thread. +#[derive(Debug, Clone)] +pub enum VsockConnectMode { + /// Connect via a host Unix socket (libkrun per-port bridging). + UnixSocket(PathBuf), + /// Connect via a vsock proxy bridge (QEMU AF_VSOCK). + /// The path points to a bridged Unix socket that connects to + /// guest CID 3, port [`VM_EXEC_VSOCK_PORT`]. + VsockBridge(PathBuf), +} + const VM_STATE_NAME: &str = "vm-state.json"; const VM_LOCK_NAME: &str = "vm.lock"; const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml"; @@ -72,6 +87,10 @@ pub struct VmRuntimeState { /// PID of the gvproxy process (if networking uses gvproxy). #[serde(default, skip_serializing_if = "Option::is_none")] pub gvproxy_pid: Option, + /// Whether this VM uses vsock-bridge mode (QEMU AF_VSOCK) vs + /// Unix socket mode (libkrun). Defaults to false for backward compat. 
+ #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub vsock_bridge: bool, } #[derive(Debug, Serialize)] @@ -132,6 +151,7 @@ pub fn write_vm_runtime_state( pid: i32, console_log: &Path, gvproxy_pid: Option, + vsock_bridge: bool, ) -> Result<(), VmError> { let state = VmRuntimeState { pid, @@ -141,6 +161,7 @@ pub fn write_vm_runtime_state( console_log: console_log.to_path_buf(), started_at_ms: now_ms()?, gvproxy_pid, + vsock_bridge, }; let path = vm_state_path(rootfs); let bytes = serde_json::to_vec_pretty(&state) @@ -154,8 +175,10 @@ pub fn write_vm_runtime_state( pub fn clear_vm_runtime_state(rootfs: &Path) { let state_path = vm_state_path(rootfs); + let lock_path = vm_lock_path(rootfs); let socket_path = vm_exec_socket_path(rootfs); let _ = fs::remove_file(state_path); + let _ = fs::remove_file(lock_path); let _ = fs::remove_file(socket_path); } @@ -285,6 +308,13 @@ pub fn reset_runtime_state(rootfs: &Path, gateway_name: &str) -> Result<(), VmEr /// create a fresh database on startup and cluster state will be re-applied from /// the auto-deploy manifests in `server/manifests/`. /// +/// **Limitation — state disk:** When a state disk is configured (common with +/// `--gpu`), the kine DB lives inside the raw disk image, not on the virtiofs +/// rootfs. This host-side check only sees the virtiofs path and cannot detect +/// corruption on the state disk. The init script (`openshell-vm-init.sh`) runs +/// `PRAGMA quick_check` inside the VM where the state disk is mounted, catching +/// corruption that this function misses. +/// /// **Stale bootstrap locks** (a kine application-level issue where a killed k3s /// server leaves a lock row that causes the next instance to hang) are handled /// separately by the init script (`openshell-vm-init.sh`), which runs @@ -358,6 +388,10 @@ fn remove_kine_db_files(db_path: &Path) -> Result<(), VmError> { /// automatically. 
This provides a reliable guard against two VM processes /// sharing the same rootfs — even if the state file is deleted. /// +/// When the lock file already contains a PID from a previous holder that +/// is no longer alive, a warning is logged and any stale VM state files +/// are cleaned up proactively. +/// /// Returns `Ok(File)` on success. The caller must keep the `File` alive /// for as long as the VM is running. pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { @@ -383,14 +417,13 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { if rc != 0 { let err = std::io::Error::last_os_error(); if err.raw_os_error() == Some(libc::EWOULDBLOCK) { - // Another process holds the lock — read its PID for diagnostics. + // Another process holds the flock. Read the PID recorded in + // the file for diagnostics — but verify it's still alive, + // because the file may contain a stale PID from a crashed + // predecessor while a different process now holds the flock. let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); + return Err(stale_lock_error(rootfs, holder_pid, &lock_path)); } return Err(VmError::RuntimeState(format!( "lock rootfs {}: {err}", @@ -398,7 +431,11 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { ))); } - // Lock acquired — write our PID (truncate first, then write). + // Lock acquired — check for stale state from a crashed predecessor. + // Read the previous PID before we overwrite it. + cleanup_stale_state_on_lock_acquire(rootfs, &lock_path); + + // Write our PID (truncate first, then write). // This is informational only; the flock is the real guard. let _ = file.set_len(0); { @@ -409,6 +446,56 @@ pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { Ok(file) } +/// Build an appropriate error when flock returns EWOULDBLOCK. 
+///
+/// If the PID recorded in the lock file is dead, the flock holder is a
+/// different (unknown) process — provide enhanced diagnostics so the user
+/// isn't misled by a stale PID.
+fn stale_lock_error(rootfs: &Path, recorded_pid: &str, _lock_path: &Path) -> VmError {
+    if let Ok(pid) = recorded_pid.parse::<i32>() {
+        if pid > 0 && !process_alive(pid) {
+            return VmError::RuntimeState(format!(
+                "rootfs {} is locked, but the recorded holder (pid {pid}) is dead. \
+                 A different openshell-vm process likely holds the lock. \
+                 Check for running openshell-vm processes (`ps aux | grep openshell-vm`) \
+                 and stop them before retrying.",
+                rootfs.display(),
+            ));
+        }
+    }
+    VmError::RuntimeState(format!(
+        "another process (pid {recorded_pid}) is using rootfs {}. \
+         Stop the running VM first",
+        rootfs.display()
+    ))
+}
+
+/// After successfully acquiring the flock, check whether the lock file
+/// contained a PID from a dead process (crash recovery). If so, log a
+/// warning and clean up stale VM state/socket files.
+fn cleanup_stale_state_on_lock_acquire(rootfs: &Path, lock_path: &Path) {
+    let prev_contents = fs::read_to_string(lock_path).unwrap_or_default();
+    let Ok(prev_pid) = prev_contents.trim().parse::<i32>() else {
+        return;
+    };
+    if prev_pid <= 0 || process_alive(prev_pid) {
+        return;
+    }
+
+    eprintln!("Warning: cleaning up stale lock from dead process (pid {prev_pid})");
+
+    let state_path = vm_state_path(rootfs);
+    if let Ok(bytes) = fs::read(&state_path) {
+        if let Ok(state) = serde_json::from_slice::<VmRuntimeState>(&bytes) {
+            if !process_alive(state.pid) {
+                eprintln!("  Removing stale VM state (pid {})", state.pid);
+                let _ = fs::remove_file(&state_path);
+                let _ = fs::remove_file(vm_exec_socket_path(rootfs));
+            }
+        }
+    }
+}
+
+/// Check whether the rootfs lock file is currently held by another process.
/// /// Returns `Ok(())` if the lock is free (or can be acquired), and an @@ -431,11 +518,7 @@ fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { if err.raw_os_error() == Some(libc::EWOULDBLOCK) { let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); + return Err(stale_lock_error(rootfs, holder_pid, &lock_path)); } } else { // We acquired the lock — release it immediately since we're only probing. @@ -446,35 +529,35 @@ fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { } pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { - // Primary guard: check the flock. This works even if the state file - // has been deleted, because the kernel holds the lock until the - // owning process exits. + // The flock is the definitive guard: the kernel releases it + // automatically when the owning process exits (even via SIGKILL). + // If this succeeds, no VM process holds the rootfs. check_rootfs_lock_free(rootfs)?; - // Secondary guard: check the state file for any stale state. - match load_vm_runtime_state(Some(rootfs)) { - Ok(state) => Err(VmError::RuntimeState(format!( - "VM is already running (pid {}) with exec socket {}", - state.pid, - state.socket_path.display() - ))), - Err(VmError::RuntimeState(message)) - if message.starts_with("read VM runtime state") - || message.starts_with("VM is not running") => - { - clear_vm_runtime_state(rootfs); - Ok(()) - } - Err(err) => Err(err), - } + // Flock is free — no VM process holds the rootfs lock. Any remaining + // state file is stale (from a killed/crashed VM or PID reuse by an + // unrelated process). Clean it up unconditionally. 
+ clear_vm_runtime_state(rootfs); + Ok(()) } pub fn exec_running_vm(options: VmExecOptions) -> Result { let state = load_vm_runtime_state(options.rootfs.as_deref())?; - let mut stream = UnixStream::connect(&state.socket_path).map_err(|e| { + + let connect_mode = if state.vsock_bridge { + VsockConnectMode::VsockBridge(state.socket_path.clone()) + } else { + VsockConnectMode::UnixSocket(state.socket_path.clone()) + }; + + let socket_path = match &connect_mode { + VsockConnectMode::UnixSocket(p) | VsockConnectMode::VsockBridge(p) => p, + }; + + let mut stream = UnixStream::connect(socket_path).map_err(|e| { VmError::Exec(format!( "connect to VM exec socket {}: {e}", - state.socket_path.display() + socket_path.display() )) })?; let mut writer = stream diff --git a/crates/openshell-vm/src/health.rs b/crates/openshell-vm/src/health.rs index 096a35d1f..ce9a10169 100644 --- a/crates/openshell-vm/src/health.rs +++ b/crates/openshell-vm/src/health.rs @@ -76,20 +76,60 @@ async fn grpc_health_check(gateway_port: u16, gateway_name: &str) -> Result<(), } } +/// Default health check timeout for standard (non-GPU) VMs. +const DEFAULT_HEALTH_TIMEOUT_SECS: u64 = 90; + +/// Extended health check timeout for GPU-enabled VMs. +/// +/// Cold boot with GPU passthrough involves pulling container images (no layer +/// cache on a fresh state disk) and loading NVIDIA drivers/firmware, which +/// legitimately takes longer than a standard VM boot. +const GPU_HEALTH_TIMEOUT_SECS: u64 = 240; + +/// Initial poll interval between health check attempts. +const INITIAL_POLL_INTERVAL_SECS: u64 = 2; + +/// Maximum poll interval (exponential backoff cap). +const MAX_POLL_INTERVAL_SECS: u64 = 10; + +/// How often to emit a progress log line during the health check wait. +const PROGRESS_LOG_INTERVAL_SECS: u64 = 15; + /// Wait for the gateway service to be fully ready by polling the gRPC health endpoint. 
/// /// This replaces the TCP-only probe with a proper gRPC health check that verifies /// the service is actually responding to requests, not just accepting connections. /// +/// When `gpu_enabled` is true, the timeout is extended to accommodate cold-boot +/// scenarios where container image pulls and NVIDIA driver/firmware loading push +/// total startup well past the standard 90-second window. +/// +/// Uses exponential backoff between retry attempts (2s initial, 10s cap) to +/// avoid hammering the endpoint while still detecting readiness promptly. +/// /// Returns `Ok(())` when the gateway is confirmed healthy, or `Err` if the health /// check fails or times out. Falls back to TCP probe if mTLS materials aren't /// available yet. -pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<(), VmError> { +pub fn wait_for_gateway_ready( + gateway_port: u16, + gateway_name: &str, + gpu_enabled: bool, +) -> Result<(), VmError> { let start = std::time::Instant::now(); - let timeout = Duration::from_secs(90); - let poll_interval = Duration::from_secs(1); + let timeout_secs = if gpu_enabled { + GPU_HEALTH_TIMEOUT_SECS + } else { + DEFAULT_HEALTH_TIMEOUT_SECS + }; + let timeout = Duration::from_secs(timeout_secs); + let mut poll_interval = Duration::from_secs(INITIAL_POLL_INTERVAL_SECS); + let max_poll_interval = Duration::from_secs(MAX_POLL_INTERVAL_SECS); + let progress_interval = Duration::from_secs(PROGRESS_LOG_INTERVAL_SECS); - eprintln!("Waiting for gateway gRPC health check..."); + eprintln!( + "Waiting for gateway gRPC health check (timeout {timeout_secs}s{})...", + if gpu_enabled { ", GPU mode" } else { "" } + ); // Create a runtime for async health checks let rt = match tokio::runtime::Builder::new_current_thread() @@ -103,7 +143,16 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( } }; + let mut attempt: u32 = 0; + let mut last_progress_log = start; + // The initial value is never read (overwritten on 
each loop iteration before + // the progress log), but we need a valid String to satisfy the borrow checker. + #[allow(unused_assignments)] + let mut last_error = String::new(); + loop { + attempt += 1; + // Try gRPC health check let result = rt.block_on(async { tokio::time::timeout( @@ -119,26 +168,40 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( return Ok(()); } Ok(Err(e)) => { + last_error = e.clone(); // gRPC call completed but failed if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway health check failed after {:.0}s: {e}", + "gateway health check failed after {:.0}s (attempt {attempt}): {e}", timeout.as_secs_f64() ))); } } Err(_) => { + last_error = "health probe timed out".to_string(); // Timeout on the health check itself if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway health check timed out after {:.0}s", + "gateway health check timed out after {:.0}s (attempt {attempt})", timeout.as_secs_f64() ))); } } } + // Periodic progress logging so operators know the check is still running + if last_progress_log.elapsed() >= progress_interval { + eprintln!( + " health check: attempt {attempt}, elapsed {:.0}s/{timeout_secs}s ({last_error})", + start.elapsed().as_secs_f64() + ); + last_progress_log = std::time::Instant::now(); + } + std::thread::sleep(poll_interval); + + // Exponential backoff: double the interval up to the cap + poll_interval = std::cmp::min(poll_interval * 2, max_poll_interval); } } @@ -146,11 +209,18 @@ pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<( fn wait_for_tcp_only( gateway_port: u16, timeout: Duration, - poll_interval: Duration, + mut poll_interval: Duration, ) -> Result<(), VmError> { let start = std::time::Instant::now(); + let max_poll_interval = Duration::from_secs(MAX_POLL_INTERVAL_SECS); + let progress_interval = Duration::from_secs(PROGRESS_LOG_INTERVAL_SECS); + let timeout_secs = timeout.as_secs(); + 
let mut attempt: u32 = 0; + let mut last_progress_log = start; loop { + attempt += 1; + if host_tcp_probe(gateway_port) { eprintln!( "Service reachable (TCP) [{:.1}s]", @@ -161,12 +231,22 @@ fn wait_for_tcp_only( if start.elapsed() >= timeout { return Err(VmError::Bootstrap(format!( - "gateway TCP probe failed after {:.0}s", + "gateway TCP probe failed after {:.0}s (attempt {attempt})", timeout.as_secs_f64() ))); } + // Periodic progress logging + if last_progress_log.elapsed() >= progress_interval { + eprintln!( + " TCP probe: attempt {attempt}, elapsed {:.0}s/{timeout_secs}s", + start.elapsed().as_secs_f64() + ); + last_progress_log = std::time::Instant::now(); + } + std::thread::sleep(poll_interval); + poll_interval = std::cmp::min(poll_interval * 2, max_poll_interval); } } diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs index 2b78a7669..15e2cbde6 100644 --- a/crates/openshell-vm/src/lib.rs +++ b/crates/openshell-vm/src/lib.rs @@ -14,6 +14,7 @@ #![allow(unsafe_code)] +pub mod backend; mod embedded; mod exec; mod ffi; @@ -22,12 +23,13 @@ mod health; use std::ffi::CString; use std::path::{Path, PathBuf}; use std::ptr; -use std::time::Instant; +use std::time::{Duration, Instant}; pub use exec::{ - VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_capture, exec_running_vm, recover_corrupt_kine_db, - reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, + VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, VsockConnectMode, acquire_rootfs_lock, + clear_vm_runtime_state, ensure_vm_not_running, exec_capture, exec_running_vm, + recover_corrupt_kine_db, reset_runtime_state, vm_exec_socket_path, vm_state_path, + write_vm_runtime_state, }; // ── Error type ───────────────────────────────────────────────────────── @@ -45,6 +47,22 @@ pub enum VmError { )] RootfsNotFound { path: String }, + /// The GPU rootfs directory does not exist. 
+ #[error( + "GPU rootfs not found: {path}\n\ + The --gpu flag requires a rootfs built with GPU support (NVIDIA drivers,\n\ + nvidia-container-toolkit, and GPU manifests).\n\ + Build one with:\n\ + \x20 mise run vm:rootfs -- --base --gpu\n\ + \x20 mise run vm:build\n\ + Or manually:\n\ + \x20 - Place rootfs-gpu.tar.zst in the openshell-vm.runtime/ sidecar directory\n\ + \x20 - Or set OPENSHELL_VM_GPU_ROOTFS_TARBALL=/path/to/rootfs-gpu.tar.zst\n\ + \x20 - Or copy the extracted rootfs to: {path}\n\ + \x20 - Or use: openshell-vm --gpu --rootfs " + )] + GpuRootfsNotFound { path: String }, + /// A path contained invalid UTF-8. #[error("path is not valid UTF-8: {0}")] InvalidPath(String), @@ -98,6 +116,18 @@ fn check(ret: i32, func: &'static str) -> Result<(), VmError> { // ── Configuration ────────────────────────────────────────────────────── +/// Hypervisor backend selection. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum VmBackendChoice { + /// Auto-select: QEMU when a VFIO device is configured, libkrun otherwise. + #[default] + Auto, + /// Force the libkrun backend. + Libkrun, + /// Force the QEMU backend (Linux-only, supports VFIO GPU passthrough). + Qemu, +} + /// Networking backend for the microVM. #[derive(Debug, Clone)] pub enum NetBackend { @@ -202,9 +232,28 @@ pub struct VmConfig { /// Optional host-backed raw block image for mutable guest state. pub state_disk: Option, + + /// Whether GPU passthrough is enabled for this VM. + pub gpu_enabled: bool, + + /// Whether the GPU supports MSI-X. Retained for informational purposes + /// but no longer affects backend selection (QEMU handles both cases). + pub gpu_has_msix: bool, + + /// VFIO PCI device address for GPU passthrough (e.g. `0000:41:00.0`). + /// When set, the QEMU backend is used instead of libkrun. + pub vfio_device: Option, + + /// Hypervisor backend override. Defaults to [`VmBackendChoice::Auto`]. 
+ pub backend: VmBackendChoice, } impl VmConfig { + /// Returns true when the VM runs in exec mode (one-shot command) rather than gateway mode. + pub(crate) fn is_exec_mode(&self) -> bool { + self.exec_path != "/srv/openshell-vm-init.sh" + } + /// Default gateway configuration: boots k3s server inside the VM. /// /// Runs `/srv/openshell-vm-init.sh` which mounts essential filesystems, @@ -245,6 +294,10 @@ impl VmConfig { reset: false, gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"), state_disk: Some(state_disk), + gpu_enabled: false, + gpu_has_msix: true, + vfio_device: None, + backend: VmBackendChoice::Auto, } } } @@ -277,6 +330,130 @@ pub fn named_rootfs_dir(instance_name: &str) -> Result { .join("rootfs")) } +/// Resolve the GPU rootfs path for a named instance. +/// +/// Layout: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances/{name}/rootfs-gpu` +/// +/// The GPU rootfs is built separately with `build-rootfs.sh --gpu` and is +/// never embedded (too large with NVIDIA drivers). If it doesn't exist, +/// callers should return [`VmError::GpuRootfsNotFound`]. +pub fn named_gpu_rootfs_dir(instance_name: &str) -> Result { + let name = sanitize_instance_name(instance_name)?; + let base = openshell_bootstrap::paths::openshell_vm_base_dir() + .map_err(|e| VmError::RuntimeState(format!("resolve openshell-vm base dir: {e}")))?; + Ok(base + .join(env!("CARGO_PKG_VERSION")) + .join("instances") + .join(name) + .join("rootfs-gpu")) +} + +/// Ensure a GPU rootfs exists for the named instance. +/// +/// When the GPU rootfs directory doesn't exist, looks for a +/// `rootfs-gpu.tar.zst` tarball in these locations (in order): +/// +/// 1. Sidecar runtime dir: `/openshell-vm.runtime/rootfs-gpu.tar.zst` +/// 2. Environment variable: `OPENSHELL_VM_GPU_ROOTFS_TARBALL` +/// +/// If found, extracts to the instance `rootfs-gpu` path. This mirrors the +/// pattern used by [`ensure_named_rootfs`] for the standard rootfs. 
+/// +/// Validates that the rootfs contains the `.rootfs-gpu` sentinel written +/// by `build-rootfs.sh --gpu`, catching the case where a regular rootfs +/// was accidentally placed at the `rootfs-gpu` path. +pub fn ensure_gpu_rootfs(instance_name: &str) -> Result { + let gpu_rootfs = named_gpu_rootfs_dir(instance_name)?; + if !gpu_rootfs.is_dir() { + if let Some(tarball) = find_gpu_rootfs_tarball() { + extract_gpu_rootfs_tarball(&tarball, &gpu_rootfs)?; + } else { + return Err(VmError::GpuRootfsNotFound { + path: gpu_rootfs.display().to_string(), + }); + } + } + + let sentinel = gpu_rootfs.join("opt/openshell/.rootfs-gpu"); + if !sentinel.is_file() { + return Err(VmError::GpuRootfsNotFound { + path: format!( + "{} (directory exists but missing .rootfs-gpu sentinel — \ + was it built with --gpu?)", + gpu_rootfs.display() + ), + }); + } + + eprintln!("GPU rootfs: {}", gpu_rootfs.display()); + Ok(gpu_rootfs) +} + +const GPU_ROOTFS_TARBALL_ENV: &str = "OPENSHELL_VM_GPU_ROOTFS_TARBALL"; +const GPU_ROOTFS_TARBALL_NAME: &str = "rootfs-gpu.tar.zst"; + +/// Search for a GPU rootfs tarball in known locations. +fn find_gpu_rootfs_tarball() -> Option { + // 1. Sidecar runtime dir next to the binary + if let Ok(exe) = std::env::current_exe() { + if let Some(exe_dir) = exe.parent() { + let sidecar = exe_dir + .join("openshell-vm.runtime") + .join(GPU_ROOTFS_TARBALL_NAME); + if sidecar.is_file() { + return Some(sidecar); + } + } + } + + // 2. Environment variable override + if let Some(path) = std::env::var_os(GPU_ROOTFS_TARBALL_ENV) { + let path = PathBuf::from(path); + if path.is_file() { + return Some(path); + } + } + + None +} + +/// Extract a `rootfs-gpu.tar.zst` tarball into the given destination directory. 
+fn extract_gpu_rootfs_tarball(tarball: &Path, dest: &Path) -> Result<(), VmError> { + eprintln!( + "Extracting GPU rootfs...\n source: {}\n dest: {}", + tarball.display(), + dest.display() + ); + + let file = std::fs::File::open(tarball).map_err(|e| { + VmError::HostSetup(format!( + "open GPU rootfs tarball {}: {e}", + tarball.display() + )) + })?; + + let decoder = zstd::Decoder::new(std::io::BufReader::new(file)).map_err(|e| { + VmError::HostSetup(format!( + "create zstd decoder for {}: {e}", + tarball.display() + )) + })?; + + std::fs::create_dir_all(dest).map_err(|e| { + VmError::HostSetup(format!("create GPU rootfs dir {}: {e}", dest.display())) + })?; + + let mut archive = tar::Archive::new(decoder); + archive.unpack(dest).map_err(|e| { + // Clean up partial extraction + let _ = std::fs::remove_dir_all(dest); + VmError::HostSetup(format!("extract GPU rootfs tarball: {e}")) + })?; + + eprintln!(" GPU rootfs extracted to {}", dest.display()); + Ok(()) +} + /// Ensure a named instance rootfs exists, extracting from the embedded /// rootfs tarball on first use. /// @@ -365,7 +542,9 @@ fn sanitize_instance_name(name: &str) -> Result { /// Build a null-terminated C string array from a slice of strings. /// /// Returns both the `CString` owners (to keep them alive) and the pointer array. 
-fn c_string_array(strings: &[&str]) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { +pub(crate) fn c_string_array( + strings: &[&str], +) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { let owned: Vec = strings .iter() .map(|s| CString::new(*s)) @@ -570,7 +749,7 @@ fn extract_json_string(json: &str, key: &str) -> Option { map.get(key)?.as_str().map(ToOwned::to_owned) } -fn clamp_log_level(level: u32) -> u32 { +pub(crate) fn clamp_log_level(level: u32) -> u32 { match level { 0 => ffi::KRUN_LOG_LEVEL_OFF, 1 => ffi::KRUN_LOG_LEVEL_ERROR, @@ -581,258 +760,29 @@ fn clamp_log_level(level: u32) -> u32 { } } -struct VmContext { - krun: &'static ffi::LibKrun, - ctx_id: u32, -} - -impl VmContext { - fn create(log_level: u32) -> Result { - let krun = ffi::libkrun()?; - unsafe { - check( - (krun.krun_init_log)( - ffi::KRUN_LOG_TARGET_DEFAULT, - clamp_log_level(log_level), - ffi::KRUN_LOG_STYLE_AUTO, - ffi::KRUN_LOG_OPTION_NO_ENV, - ), - "krun_init_log", - )?; - } - - let ctx_id = unsafe { (krun.krun_create_ctx)() }; - if ctx_id < 0 { - return Err(VmError::Krun { - func: "krun_create_ctx", - code: ctx_id, - }); - } - - Ok(Self { - krun, - ctx_id: ctx_id as u32, - }) - } - - fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), - "krun_set_vm_config", - ) - } - } - - fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { - let rootfs_c = path_to_cstring(rootfs)?; - unsafe { - check( - (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), - "krun_set_root", - ) - } - } - - fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { - let Some(add_disk3) = self.krun.krun_add_disk3 else { - return Err(VmError::HostSetup( - "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" - .to_string(), - )); - }; - - let block_id_c = CString::new(state_disk.block_id.as_str())?; - let disk_path_c = 
path_to_cstring(&state_disk.path)?; - unsafe { - check( - add_disk3( - self.ctx_id, - block_id_c.as_ptr(), - disk_path_c.as_ptr(), - ffi::KRUN_DISK_FORMAT_RAW, - false, - false, - state_disk_sync_mode(), - ), - "krun_add_disk3", - ) - } - } - - fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { - let workdir_c = CString::new(workdir)?; - unsafe { - check( - (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), - "krun_set_workdir", - ) - } - } - - fn disable_implicit_vsock(&self) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_disable_implicit_vsock)(self.ctx_id), - "krun_disable_implicit_vsock", - ) - } - } - - fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), - "krun_add_vsock", - ) - } - } - - #[cfg(target_os = "macos")] - fn add_net_unixgram( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - flags: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixgram)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - flags, - ), - "krun_add_net_unixgram", - ) - } - } - - #[allow(dead_code)] // FFI binding for future use (e.g. 
Linux networking) - fn add_net_unixstream( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixstream)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - 0, - ), - "krun_add_net_unixstream", - ) - } - } - - fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { - let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; - unsafe { - check( - (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), - "krun_set_port_map", - ) - } - } - - fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { - let socket_c = path_to_cstring(&port.socket_path)?; - unsafe { - check( - (self.krun.krun_add_vsock_port2)( - self.ctx_id, - port.port, - socket_c.as_ptr(), - port.listen, - ), - "krun_add_vsock_port2", - ) - } - } - - fn set_console_output(&self, path: &Path) -> Result<(), VmError> { - let console_c = path_to_cstring(path)?; - unsafe { - check( - (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), - "krun_set_console_output", - ) - } - } - - fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { - let exec_c = CString::new(exec_path)?; - let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); - let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; - let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); - let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; - - unsafe { - check( - (self.krun.krun_set_exec)( - self.ctx_id, - exec_c.as_ptr(), - argv_ptrs.as_ptr(), - env_ptrs.as_ptr(), - ), - "krun_set_exec", - ) - } - } - - fn start_enter(&self) -> i32 { - unsafe { (self.krun.krun_start_enter)(self.ctx_id) } - } -} - -impl Drop for VmContext { - fn drop(&mut self) { - unsafe { - let ret = 
(self.krun.krun_free_ctx)(self.ctx_id); - if ret < 0 { - eprintln!( - "warning: krun_free_ctx({}) failed with code {ret}", - self.ctx_id - ); - } - } - } -} - /// RAII guard that kills and waits on a gvproxy child process when dropped. /// /// This prevents orphaned gvproxy processes when early `?` returns in the /// launch function cause the child to be dropped before cleanup code runs. /// Call [`GvproxyGuard::disarm`] to take ownership of the child when it /// should outlive the guard (i.e., after a successful fork). -struct GvproxyGuard { +pub(crate) struct GvproxyGuard { child: Option, } impl GvproxyGuard { - fn new(child: std::process::Child) -> Self { + pub(crate) fn new(child: std::process::Child) -> Self { Self { child: Some(child) } } /// Take the child out of the guard, preventing it from being killed on drop. /// Use this after the launch is successful and the parent will manage cleanup. - fn disarm(&mut self) -> Option { + pub(crate) fn disarm(&mut self) -> Option { self.child.take() } /// Get the child's PID without disarming. - fn id(&self) -> Option { + pub(crate) fn id(&self) -> Option { self.child.as_ref().map(std::process::Child::id) } } @@ -852,7 +802,7 @@ impl Drop for GvproxyGuard { /// /// Sends a raw HTTP/1.1 POST request over the unix socket to avoid /// depending on `curl` being installed on the host. -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { +pub(crate) fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { use std::io::{Read, Write}; use std::os::unix::net::UnixStream; @@ -908,7 +858,7 @@ fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { /// runtime state. If the state file was deleted (e.g. the user ran /// `rm -rf` on the data directory), we fall back to killing any gvproxy /// process holding the target ports. 
-fn kill_stale_gvproxy(rootfs: &Path) { +pub(crate) fn kill_stale_gvproxy(rootfs: &Path) { kill_stale_gvproxy_by_state(rootfs); } @@ -929,7 +879,7 @@ fn kill_stale_gvproxy_by_state(rootfs: &Path) { /// /// Used as a fallback when the VM state file is missing (e.g. after the /// user deleted the data directory while a VM was running). -fn kill_stale_gvproxy_by_port(port: u16) { +pub(crate) fn kill_stale_gvproxy_by_port(port: u16) { // Use lsof to find PIDs listening on the target port. let output = std::process::Command::new("lsof") .args(["-ti", &format!(":{port}")]) @@ -953,23 +903,54 @@ fn kill_stale_gvproxy_by_port(port: u16) { fn kill_gvproxy_pid(gvproxy_pid: u32) { let pid_i32 = gvproxy_pid as libc::pid_t; let is_alive = unsafe { libc::kill(pid_i32, 0) } == 0; - if is_alive { - // Verify the process is actually gvproxy before killing. - // Without this check, PID reuse could cause us to kill an - // unrelated process. - if !is_process_named(pid_i32, "gvproxy") { - eprintln!( - "Stale gvproxy pid {gvproxy_pid} is no longer gvproxy (PID reused), skipping kill" - ); + if !is_alive { + return; + } + + if !is_process_named(pid_i32, "gvproxy") { + eprintln!( + "Stale gvproxy pid {gvproxy_pid} is no longer gvproxy (PID reused), skipping kill" + ); + return; + } + + unsafe { + libc::kill(pid_i32, libc::SIGTERM); + } + eprintln!("Killing stale gvproxy process (pid {gvproxy_pid})..."); + + // Wait up to 2 seconds for graceful shutdown, then escalate to SIGKILL. + let deadline = Instant::now() + Duration::from_secs(2); + loop { + std::thread::sleep(Duration::from_millis(50)); + if unsafe { libc::kill(pid_i32, 0) } != 0 { + eprintln!("Stale gvproxy (pid {gvproxy_pid}) terminated"); + std::thread::sleep(Duration::from_millis(100)); return; } - unsafe { - libc::kill(pid_i32, libc::SIGTERM); + if Instant::now() >= deadline { + break; } - eprintln!("Killed stale gvproxy process (pid {gvproxy_pid})"); - // Brief pause for the port to be released. 
- std::thread::sleep(std::time::Duration::from_millis(200)); } + + eprintln!("gvproxy (pid {gvproxy_pid}) did not exit after SIGTERM, sending SIGKILL"); + unsafe { + libc::kill(pid_i32, libc::SIGKILL); + } + + // Wait for the process to be reaped (up to 2 more seconds). + let kill_deadline = Instant::now() + Duration::from_secs(2); + loop { + std::thread::sleep(Duration::from_millis(50)); + if unsafe { libc::kill(pid_i32, 0) } != 0 { + break; + } + if Instant::now() >= kill_deadline { + eprintln!("warning: gvproxy (pid {gvproxy_pid}) still alive after SIGKILL"); + break; + } + } + std::thread::sleep(Duration::from_millis(100)); } /// Check whether a process with the given PID has the expected name. @@ -1009,7 +990,7 @@ fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool { false } -fn vm_rootfs_key(rootfs: &Path) -> String { +pub(crate) fn vm_rootfs_key(rootfs: &Path) -> String { let name = rootfs .file_name() .and_then(|part| part.to_str()) @@ -1078,7 +1059,7 @@ fn ensure_state_disk_image(state_disk: &StateDiskConfig) -> Result<(), VmError> Ok(()) } -fn state_disk_sync_mode() -> u32 { +pub(crate) fn state_disk_sync_mode() -> u32 { #[cfg(target_os = "macos")] { ffi::KRUN_SYNC_RELAXED @@ -1126,12 +1107,13 @@ fn secure_socket_base(subdir: &str) -> Result { dir.display() ))); } - // Verify ownership matches current user. + // Verify ownership matches current user. Root (uid 0) can safely + // use any directory, so skip this check under sudo / as root. 
#[cfg(unix)] { use std::os::unix::fs::MetadataExt as _; - let uid = unsafe { libc::getuid() }; - if meta.uid() != uid { + let uid = unsafe { libc::geteuid() }; + if uid != 0 && meta.uid() != uid { return Err(VmError::HostSetup(format!( "socket directory {} is owned by uid {} but we are uid {} — refusing to use it", dir.display(), @@ -1154,7 +1136,7 @@ fn secure_socket_base(subdir: &str) -> Result { Ok(dir) } -fn gvproxy_socket_dir(rootfs: &Path) -> Result { +pub(crate) fn gvproxy_socket_dir(rootfs: &Path) -> Result { let dir = secure_socket_base("ovm-gv")?; // macOS unix socket path limit is tight (~104 bytes). Keep paths very short. @@ -1162,16 +1144,44 @@ fn gvproxy_socket_dir(rootfs: &Path) -> Result { Ok(dir.join(id)) } -fn gateway_host_port(config: &VmConfig) -> u16 { - config - .port_map - .first() - .and_then(|pm| pm.split(':').next()) - .and_then(|port| port.parse::().ok()) - .unwrap_or(DEFAULT_GATEWAY_PORT) +/// Validate that a VFIO PCI address matches the BDF format `DDDD:BB:DD.F`. +/// +/// Rejects strings containing `/`, `..`, or non-hex characters to prevent +/// path traversal when the address is interpolated into sysfs paths. +fn validate_vfio_address(addr: &str) -> Result<(), VmError> { + let bytes = addr.as_bytes(); + if bytes.len() == 12 + && bytes[4] == b':' + && bytes[7] == b':' + && bytes[10] == b'.' + && bytes[..4].iter().all(u8::is_ascii_hexdigit) + && bytes[5..7].iter().all(u8::is_ascii_hexdigit) + && bytes[8..10].iter().all(u8::is_ascii_hexdigit) + && bytes[11].is_ascii_digit() + && bytes[11] <= b'7' + { + return Ok(()); + } + Err(VmError::HostSetup(format!( + "invalid VFIO PCI address '{addr}': expected BDF format DDDD:BB:DD.F (e.g. 
0000:41:00.0)" + ))) } -fn pick_gvproxy_ssh_port() -> Result { +pub(crate) fn gateway_host_port(config: &VmConfig) -> u16 { + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + if parts.len() == 2 { + if let Ok(guest) = parts[1].parse::() { + if guest == GUEST_GATEWAY_NODEPORT { + return parts[0].parse::().unwrap_or(DEFAULT_GATEWAY_PORT); + } + } + } + } + DEFAULT_GATEWAY_PORT +} + +pub(crate) fn pick_gvproxy_ssh_port() -> Result { let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) .map_err(|e| VmError::HostSetup(format!("allocate gvproxy ssh port on localhost: {e}")))?; let port = listener @@ -1182,7 +1192,7 @@ fn pick_gvproxy_ssh_port() -> Result { Ok(port) } -fn path_to_cstring(path: &Path) -> Result { +pub(crate) fn path_to_cstring(path: &Path) -> Result { let s = path .to_str() .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?; @@ -1236,7 +1246,7 @@ pub fn launch(config: &VmConfig) -> Result { #[cfg(target_os = "linux")] check_kvm_access()?; - if config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.is_exec_mode() { ensure_vm_not_running(&config.rootfs)?; } @@ -1245,7 +1255,7 @@ pub fn launch(config: &VmConfig) -> Result { // is killed (even SIGKILL), the OS releases the lock automatically. // This prevents a second launch or rootfs rebuild from corrupting a // running VM's filesystem via virtio-fs. - let _rootfs_lock = if config.exec_path == "/srv/openshell-vm-init.sh" { + let _rootfs_lock = if !config.is_exec_mode() { Some(acquire_rootfs_lock(&config.rootfs)?) } else { None @@ -1257,7 +1267,7 @@ pub fn launch(config: &VmConfig) -> Result { // every normal boot (not --reset, which wipes k3s/server/ entirely). // Must happen after the lock so we know no other VM process is using // the rootfs. 
- if !config.reset && config.exec_path == "/srv/openshell-vm-init.sh" { + if !config.reset && !config.is_exec_mode() { recover_corrupt_kine_db(&config.rootfs)?; } @@ -1277,11 +1287,22 @@ pub fn launch(config: &VmConfig) -> Result { state_disk.path.display() ))); } - if let Some(state_disk) = &config.state_disk { + let fresh_state_disk = if let Some(state_disk) = &config.state_disk { + let existed_before = state_disk.path.is_file(); ensure_state_disk_image(state_disk)?; + !existed_before + } else { + false + }; + + // When the state disk is freshly created (deleted by user, --reset, or + // first boot), the VM will generate new PKI. Clear any cached host-side + // mTLS certs so `bootstrap_gateway` runs the cold-boot PKI fetch path + // instead of using stale certs that won't match the new VM CA. + if fresh_state_disk || config.reset { + clear_warm_boot_certs(&config.gateway_name); } - let launch_start = Instant::now(); eprintln!("rootfs: {}", config.rootfs.display()); if let Some(state_disk) = &config.state_disk { eprintln!( @@ -1292,421 +1313,58 @@ pub fn launch(config: &VmConfig) -> Result { } eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); - // The runtime is embedded in the binary and extracted on first use. - // Can be overridden via OPENSHELL_VM_RUNTIME_DIR for development. - let runtime_gvproxy = resolve_runtime_bundle()?; - let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { - VmError::HostSetup(format!( - "runtime bundle file has no parent directory: {}", - runtime_gvproxy.display() - )) - })?; - configure_runtime_loader_env(runtime_dir)?; raise_nofile_limit(); - // ── Log runtime provenance ───────────────────────────────────── - // After configuring the loader, trigger library loading so that - // provenance is captured before we proceed with VM configuration. 
- let _ = ffi::libkrun()?; - log_runtime_provenance(runtime_dir); - - // ── Configure the microVM ────────────────────────────────────── + // ── Dispatch to the appropriate backend ───────────────────────── - let vm = VmContext::create(config.log_level)?; - vm.set_vm_config(config.vcpus, config.mem_mib)?; - vm.set_root(&config.rootfs)?; - if let Some(state_disk) = &config.state_disk { - vm.add_state_disk(state_disk)?; + enum SelectedBackend { + Libkrun, + Qemu, } - vm.set_workdir(&config.workdir)?; - - // Networking setup — use a drop guard so gvproxy is killed if we - // return early via `?` before reaching the parent's cleanup code. - let mut gvproxy_guard: Option = None; - let mut gvproxy_api_sock: Option = None; - match &config.net { - NetBackend::Tsi => { - // Default TSI — no special setup needed. - } - NetBackend::None => { - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - eprintln!("Networking: disabled (no TSI, no virtio-net)"); - } - NetBackend::Gvproxy { binary } => { - if !binary.exists() { - return Err(VmError::BinaryNotFound { - path: binary.display().to_string(), - hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), - }); + let selected = match config.backend { + VmBackendChoice::Libkrun => SelectedBackend::Libkrun, + VmBackendChoice::Qemu => SelectedBackend::Qemu, + VmBackendChoice::Auto => { + if config.gpu_enabled || config.vfio_device.is_some() { + SelectedBackend::Qemu + } else { + SelectedBackend::Libkrun } + } + }; - // Create temp socket paths - let run_dir = config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .to_path_buf(); - let rootfs_key = vm_rootfs_key(&config.rootfs); - let sock_base = gvproxy_socket_dir(&config.rootfs)?; - let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); - - // Kill any stale gvproxy process from a previous run. 
- // First try via the saved PID in the state file, then fall - // back to killing any gvproxy holding our target ports (covers - // the case where the state file was deleted). - kill_stale_gvproxy(&config.rootfs); - for pm in &config.port_map { - if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { - kill_stale_gvproxy_by_port(host_port); - } - } + match selected { + SelectedBackend::Qemu => { + #[cfg(not(target_os = "linux"))] + return Err(VmError::HostSetup( + "QEMU backend requires Linux with KVM".into(), + )); - // Clean stale sockets (including the -krun.sock file that - // libkrun creates as its datagram endpoint on macOS). - let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); - let krun_sock = sock_base.with_extension("v-krun.sock"); - let _ = std::fs::remove_file(&krun_sock); - - // Start gvproxy - eprintln!("Starting gvproxy: {}", binary.display()); - let ssh_port = pick_gvproxy_ssh_port()?; - let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); - let gvproxy_log_file = std::fs::File::create(&gvproxy_log) - .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; - - // On Linux, gvproxy uses QEMU mode (SOCK_STREAM) since the vfkit - // unixgram scheme is macOS/vfkit-specific. On macOS, use vfkit mode. 
#[cfg(target_os = "linux")] - let (gvproxy_net_flag, gvproxy_net_url) = - ("-listen-qemu", format!("unix://{}", net_sock.display())); - #[cfg(target_os = "macos")] - let (gvproxy_net_flag, gvproxy_net_url) = ( - "-listen-vfkit", - format!("unixgram://{}", net_sock.display()), - ); - - let child = std::process::Command::new(binary) - .arg(gvproxy_net_flag) - .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) - .arg("-ssh-port") - .arg(ssh_port.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(gvproxy_log_file) - .spawn() - .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; - - eprintln!( - "gvproxy started (pid {}, ssh port {}) [{:.1}s]", - child.id(), - ssh_port, - launch_start.elapsed().as_secs_f64() - ); - - // Wait for the socket to appear (exponential backoff: 5ms → 100ms). { - let deadline = Instant::now() + std::time::Duration::from_secs(5); - let mut interval = std::time::Duration::from_millis(5); - while !net_sock.exists() { - if Instant::now() >= deadline { - return Err(VmError::Fork( - "gvproxy socket did not appear within 5s".to_string(), - )); - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(100)); + if let Some(ref addr) = config.vfio_device { + validate_vfio_address(addr)?; } + let qemu_backend = backend::qemu::QemuBackend::new()?; + backend::VmBackend::launch(&qemu_backend, config) } - - // Disable implicit TSI and add virtio-net via gvproxy - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - // This MAC matches gvproxy's default static DHCP lease for - // 192.168.127.2. Using a different MAC can cause the gVisor - // network stack to misroute or drop packets. 
- let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; - - // COMPAT_NET_FEATURES from libkrun.h - const NET_FEATURE_CSUM: u32 = 1 << 0; - const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; - const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; - const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; - const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; - const NET_FEATURE_HOST_UFO: u32 = 1 << 14; - const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM - | NET_FEATURE_GUEST_CSUM - | NET_FEATURE_GUEST_TSO4 - | NET_FEATURE_GUEST_UFO - | NET_FEATURE_HOST_TSO4 - | NET_FEATURE_HOST_UFO; - - // On Linux use unixstream (SOCK_STREAM) to connect to gvproxy's - // QEMU listener. On macOS use unixgram (SOCK_DGRAM) with the vfkit - // magic byte for the vfkit listener. - #[cfg(target_os = "linux")] - vm.add_net_unixstream(&net_sock, &mac, COMPAT_NET_FEATURES)?; - #[cfg(target_os = "macos")] - { - const NET_FLAG_VFKIT: u32 = 1 << 0; - vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; - } - - eprintln!( - "Networking: gvproxy (virtio-net) [{:.1}s]", - launch_start.elapsed().as_secs_f64() - ); - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); } - } - - // Port mapping (TSI only) - if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { - vm.set_port_map(&config.port_map)?; - } - - for vsock_port in &config.vsock_ports { - if let Some(parent) = vsock_port.socket_path.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) + SelectedBackend::Libkrun => { + let runtime_gvproxy = resolve_runtime_bundle()?; + let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { + VmError::HostSetup(format!( + "runtime bundle file has no parent directory: {}", + runtime_gvproxy.display() + )) })?; - } - // libkrun returns EEXIST if the socket file is already present from a - // previous run. Remove any stale socket before registering the port. 
- let _ = std::fs::remove_file(&vsock_port.socket_path); - vm.add_vsock_port(vsock_port)?; - } - - // Console output - let console_log = config.console_output.clone().unwrap_or_else(|| { - config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) - }); - vm.set_console_output(&console_log)?; - - // envp: use provided env or minimal defaults - let mut env: Vec = if config.env.is_empty() { - vec![ - "HOME=/root", - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - ] - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - config.env.clone() - }; - if let Some(state_disk) = &config.state_disk - && !env - .iter() - .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) - { - env.push(format!( - "OPENSHELL_VM_STATE_DISK_DEVICE={}", - state_disk.guest_device - )); - } - vm.set_exec(&config.exec_path, &config.args, &env)?; + configure_runtime_loader_env(runtime_dir)?; - // ── Fork and enter the VM ────────────────────────────────────── - // - // krun_start_enter() never returns — it calls exit() when the guest - // process exits. We fork so the parent can monitor and report. 
- - let boot_start = Instant::now(); - eprintln!("Booting microVM..."); - - let pid = unsafe { libc::fork() }; - match pid { - -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), - 0 => { - // Child process: enter the VM (never returns on success) - let ret = vm.start_enter(); - eprintln!("krun_start_enter failed: {ret}"); - std::process::exit(1); - } - _ => { - // Parent: wait for child - if config.exec_path == "/srv/openshell-vm-init.sh" { - let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); - if let Err(err) = - write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid) - { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - // Guard drop will kill gvproxy automatically - drop(gvproxy_guard); - clear_vm_runtime_state(&config.rootfs); - return Err(err); - } - } - eprintln!( - "VM started (child pid {pid}) [{:.1}s]", - boot_start.elapsed().as_secs_f64() - ); - for pm in &config.port_map { - let host_port = pm.split(':').next().unwrap_or(pm); - eprintln!(" port {pm} -> http://localhost:{host_port}"); - } - eprintln!("Console output: {}", console_log.display()); - - // Set up gvproxy port forwarding via its HTTP API. - // The port_map entries use the same "host:guest" format - // as TSI, but here we translate them into gvproxy expose - // calls targeting the guest IP (192.168.127.2). - // - // Instead of a fixed 500ms sleep, poll the API socket with - // exponential backoff (5ms → 200ms, ~1s total budget). - if let Some(ref api_sock) = gvproxy_api_sock { - let fwd_start = Instant::now(); - // Wait for the API socket to appear (it lags slightly - // behind the vfkit data socket). 
- { - let deadline = Instant::now() + std::time::Duration::from_secs(2); - let mut interval = std::time::Duration::from_millis(5); - while !api_sock.exists() { - if Instant::now() >= deadline { - eprintln!( - "warning: gvproxy API socket not ready after 2s, attempting anyway" - ); - break; - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(200)); - } - } - - let guest_ip = "192.168.127.2"; - - for pm in &config.port_map { - let parts: Vec<&str> = pm.split(':').collect(); - let (host_port, guest_port) = match parts.len() { - 2 => (parts[0], parts[1]), - 1 => (parts[0], parts[0]), - _ => { - eprintln!(" skipping invalid port mapping: {pm}"); - continue; - } - }; - - let expose_body = format!( - r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# - ); - - // Retry with exponential backoff — gvproxy's internal - // netstack may not be ready immediately after socket creation. - let mut expose_ok = false; - let mut retry_interval = std::time::Duration::from_millis(100); - let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); - loop { - match gvproxy_expose(api_sock, &expose_body) { - Ok(()) => { - eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); - expose_ok = true; - break; - } - Err(e) => { - if Instant::now() >= expose_deadline { - eprintln!(" port {host_port}: {e} (retries exhausted)"); - break; - } - std::thread::sleep(retry_interval); - retry_interval = - (retry_interval * 2).min(std::time::Duration::from_secs(1)); - } - } - } - if !expose_ok { - return Err(VmError::HostSetup(format!( - "failed to forward port {host_port} via gvproxy" - ))); - } - } - eprintln!( - "Port forwarding ready [{:.1}s]", - fwd_start.elapsed().as_secs_f64() - ); - } - - // Bootstrap the OpenShell control plane and wait for the - // service to be reachable. Only for the gateway preset, and - // only when port forwarding is configured (i.e. 
the gateway - // is reachable from the host). During rootfs pre-init builds, - // no --port is specified so there is nothing to health-check - // — the build script has its own kubectl-based readiness - // checks inside the VM. - if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { - // Bootstrap stores host-side metadata and mTLS creds. - // With pre-baked rootfs (Path 1) this reads PKI directly - // from virtio-fs — no kubectl or port forwarding needed. - // Cold boot (Path 2) writes secret manifests into the - // k3s auto-deploy directory via virtio-fs. - let gateway_port = gateway_host_port(config); - bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - - // Wait for the gRPC health check to pass. This ensures - // the service is fully operational, not just accepting - // TCP connections. The health check confirms the full - // path (gvproxy → kube-proxy nftables → pod:8080) and - // that the gRPC service is responding to requests. - health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; - } - - eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); - eprintln!("Press Ctrl+C to stop."); - - // Forward signals to child - unsafe { - libc::signal( - libc::SIGINT, - forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - forward_signal as *const () as libc::sighandler_t, - ); - CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); - } - - let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(pid, &raw mut status, 0); - } - - // Clean up gvproxy — disarm the guard and do explicit cleanup - // so we can print the "stopped" message. 
- if config.exec_path == "/srv/openshell-vm-init.sh" { - clear_vm_runtime_state(&config.rootfs); - } - if let Some(mut guard) = gvproxy_guard - && let Some(mut child) = guard.disarm() - { - let _ = child.kill(); - let _ = child.wait(); - eprintln!("gvproxy stopped"); - } - - if libc::WIFEXITED(status) { - let code = libc::WEXITSTATUS(status); - eprintln!("VM exited with code {code}"); - return Ok(code); - } else if libc::WIFSIGNALED(status) { - let sig = libc::WTERMSIG(status); - eprintln!("VM killed by signal {sig}"); - return Ok(128 + sig); - } + let _ = ffi::libkrun()?; + log_runtime_provenance(runtime_dir); - Ok(status) + let libkrun_backend = backend::libkrun::LibkrunBackend; + backend::VmBackend::launch(&libkrun_backend, config) } } } @@ -1716,6 +1374,9 @@ pub fn launch(config: &VmConfig) -> Result { /// Default gateway port: host port mapped to the `OpenShell` `NodePort` (30051). const DEFAULT_GATEWAY_PORT: u16 = 30051; +/// The NodePort the OpenShell gateway listens on inside the VM. +pub const GUEST_GATEWAY_NODEPORT: u16 = 30051; + /// Bootstrap the `OpenShell` control plane after k3s is ready. /// /// Two paths: @@ -1727,7 +1388,11 @@ const DEFAULT_GATEWAY_PORT: u16 = 30051; /// 2. **First boot / post-reset**: poll the exec agent to `cat` each PEM file /// from `/opt/openshell/pki/` until the files exist (PKI generation has /// finished), then store them in `~/.config/openshell/gateways//mtls/`. -fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Result<(), VmError> { +pub(crate) fn bootstrap_gateway( + rootfs: &Path, + gateway_name: &str, + gateway_port: u16, +) -> Result<(), VmError> { let bootstrap_start = Instant::now(); let metadata = openshell_bootstrap::GatewayMetadata { @@ -1761,7 +1426,7 @@ fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Re // drift check and the host already has valid certs. If the agent // isn't reachable we skip silently rather than blocking boot for // 30s. 
- match fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(5)) { + match fetch_pki_over_exec(&exec_socket, Duration::from_secs(5)) { Ok(bundle) => { if let Err(e) = sync_host_certs_if_stale(gateway_name, &bundle) { eprintln!("Warning: cert sync check failed: {e}"); @@ -1788,7 +1453,7 @@ fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Re // We poll the exec agent with `cat ` for each PEM file until they // exist, retrying to handle the window between VM boot and PKI generation. eprintln!("Waiting for VM to generate PKI..."); - let pki_bundle = fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(120)) + let pki_bundle = fetch_pki_over_exec(&exec_socket, Duration::from_secs(120)) .map_err(|e| VmError::Bootstrap(format!("VM did not produce PKI within 120s: {e}")))?; eprintln!("PKI ready — storing client certs on host..."); @@ -1829,7 +1494,7 @@ const PKI_FILES: &[(&str, &str)] = &[ /// and PKI generation completing. fn fetch_pki_over_exec( exec_socket: &Path, - timeout: std::time::Duration, + timeout: Duration, ) -> Result { let deadline = Instant::now() + timeout; @@ -1837,7 +1502,7 @@ fn fetch_pki_over_exec( match try_read_pki_files(exec_socket) { Ok(bundle) => return Ok(bundle), Err(_) if Instant::now() < deadline => { - std::thread::sleep(std::time::Duration::from_millis(500)); + std::thread::sleep(Duration::from_millis(500)); } Err(e) => { return Err(VmError::Bootstrap(format!( @@ -1921,6 +1586,31 @@ fn is_warm_boot(gateway_name: &str) -> bool { true } +/// Remove cached mTLS certs from the host so the next `bootstrap_gateway` +/// call treats this as a cold boot and fetches fresh PKI from the VM. +/// +/// Called when the state disk is freshly created or `--reset` is used, +/// since the VM will generate new PKI that won't match stale host certs. 
+fn clear_warm_boot_certs(gateway_name: &str) { + let Ok(home) = std::env::var("HOME") else { + return; + }; + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + let mtls_dir = PathBuf::from(&config_base) + .join("openshell/gateways") + .join(gateway_name) + .join("mtls"); + + if mtls_dir.is_dir() { + if let Err(e) = std::fs::remove_dir_all(&mtls_dir) { + eprintln!("Warning: failed to clear stale mTLS certs: {e}"); + } else { + eprintln!("Cleared stale host mTLS certs"); + } + } +} + /// Compare the CA cert on the rootfs (authoritative source) against the /// host-side copy. If they differ, re-copy all client certs from the rootfs. /// @@ -1956,15 +1646,38 @@ fn sync_host_certs_if_stale( Ok(()) } -static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); +pub(crate) static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); + +pub(crate) static VIRTIOFSD_PID: std::sync::atomic::AtomicI32 = + std::sync::atomic::AtomicI32::new(0); -extern "C" fn forward_signal(_sig: libc::c_int) { +/// Set to `true` by the signal handler when a shutdown signal (SIGTERM/SIGINT) +/// is received. The main thread checks this after `qemu_child.wait()` returns +/// to ensure cleanup runs even if the wait was interrupted. +pub(crate) static SHUTDOWN_REQUESTED: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(false); + +/// Signal handler that forwards SIGTERM to child processes and sets the +/// shutdown flag. Only calls async-signal-safe functions (libc::kill, +/// atomic stores). No heap allocation, no println, no mutex. +pub(crate) extern "C" fn forward_signal(_sig: libc::c_int) { + SHUTDOWN_REQUESTED.store(true, std::sync::atomic::Ordering::Relaxed); + + // Always send SIGTERM to each child individually. The process-group + // approach (kill(-pgid)) is unreliable because setpgid() in QEMU's + // pre_exec silently fails — QEMU stays in its parent's group. 
let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); if pid > 0 { unsafe { libc::kill(pid, libc::SIGTERM); } } + let vfsd_pid = VIRTIOFSD_PID.load(std::sync::atomic::Ordering::Relaxed); + if vfsd_pid > 0 { + unsafe { + libc::kill(vfsd_pid, libc::SIGTERM); + } + } } #[cfg(test)] @@ -2082,4 +1795,85 @@ mod tests { let _ = fs::remove_dir_all(&dir); } + + #[test] + fn auto_selects_qemu_for_gpu() { + enum SelectedBackend { + Libkrun, + Qemu, + } + + let select = |backend: VmBackendChoice, gpu_enabled: bool| match backend { + VmBackendChoice::Libkrun => SelectedBackend::Libkrun, + VmBackendChoice::Qemu => SelectedBackend::Qemu, + VmBackendChoice::Auto => { + if gpu_enabled { + SelectedBackend::Qemu + } else { + SelectedBackend::Libkrun + } + } + }; + + assert!(matches!( + select(VmBackendChoice::Auto, true), + SelectedBackend::Qemu + )); + assert!(matches!( + select(VmBackendChoice::Auto, false), + SelectedBackend::Libkrun + )); + assert!(matches!( + select(VmBackendChoice::Qemu, false), + SelectedBackend::Qemu + )); + } + + fn config_with_port_map(port_map: Vec) -> VmConfig { + VmConfig { + rootfs: PathBuf::from("/tmp/fake-rootfs"), + vcpus: 1, + mem_mib: 512, + exec_path: "/bin/true".to_string(), + args: vec![], + env: vec![], + workdir: "/".to_string(), + port_map, + vsock_ports: vec![], + log_level: 0, + console_output: None, + net: NetBackend::Tsi, + reset: false, + gateway_name: "test".to_string(), + state_disk: None, + gpu_enabled: false, + gpu_has_msix: false, + vfio_device: None, + backend: VmBackendChoice::Auto, + } + } + + #[test] + fn gateway_host_port_default_mapping() { + let cfg = config_with_port_map(vec!["30051:30051".to_string()]); + assert_eq!(gateway_host_port(&cfg), 30051); + } + + #[test] + fn gateway_host_port_no_gateway_mapping_returns_default() { + let cfg = config_with_port_map(vec!["6443:6443".to_string(), "8080:8080".to_string()]); + assert_eq!(gateway_host_port(&cfg), DEFAULT_GATEWAY_PORT); + } + + #[test] + fn 
gateway_host_port_finds_remapped_gateway() { + let cfg = config_with_port_map(vec!["6443:6443".to_string(), "9999:30051".to_string()]); + assert_eq!(gateway_host_port(&cfg), 9999); + } + + #[test] + fn gateway_host_port_empty_port_map() { + let cfg = config_with_port_map(vec![]); + assert_eq!(gateway_host_port(&cfg), DEFAULT_GATEWAY_PORT); + } } diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs index bb9d854b1..9241db908 100644 --- a/crates/openshell-vm/src/main.rs +++ b/crates/openshell-vm/src/main.rs @@ -17,8 +17,9 @@ //! codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/openshell-vm //! ``` -use std::io::IsTerminal; +use std::io::{BufRead, IsTerminal}; use std::path::PathBuf; +use std::time::Duration; use clap::{Parser, Subcommand, ValueHint}; @@ -92,6 +93,16 @@ struct Cli { /// unclean shutdown. #[arg(long)] reset: bool, + + /// Enable GPU passthrough. Optionally specify a PCI address + /// (e.g. `0000:41:00.0`). Uses QEMU backend with VFIO. + #[arg(long, num_args = 0..=1, default_missing_value = "auto")] + gpu: Option, + + /// Hypervisor backend: "auto" (default), "libkrun", or "qemu". + /// Auto selects QEMU when --gpu is set, and libkrun otherwise. + #[arg(long, default_value = "auto")] + backend: String, } #[derive(Subcommand)] @@ -158,6 +169,19 @@ fn main() { } } + #[cfg(target_os = "linux")] + { + #[allow(unsafe_code)] + let ret = unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM) }; + if ret != 0 { + eprintln!( + "warning: prctl(PR_SET_PDEATHSIG) failed: {} — \ + signal propagation through sudo may not work", + std::io::Error::last_os_error() + ); + } + } + tracing_subscriber::fmt::init(); let cli = Cli::parse(); @@ -175,6 +199,102 @@ fn main() { } } +/// RAII guard that restarts the display manager when dropped. +/// +/// Created when the user confirms stopping the display manager for GPU +/// passthrough. 
On drop (normal exit, error, or panic), restarts the +/// service so the user's graphical session is restored. +struct DisplayManagerGuard; + +impl DisplayManagerGuard { + fn stop_display_manager() -> Result> { + eprintln!("Stopping display-manager..."); + let status = std::process::Command::new("systemctl") + .args(["stop", "display-manager"]) + .status()?; + if !status.success() { + return Err(format!( + "failed to stop display-manager (exit {})", + status.code().unwrap_or(-1) + ) + .into()); + } + eprintln!("display-manager stopped"); + // Give Xorg time to release GPU device handles. + std::thread::sleep(Duration::from_secs(2)); + Ok(Self) + } +} + +impl Drop for DisplayManagerGuard { + fn drop(&mut self) { + eprintln!("Restarting display-manager..."); + match std::process::Command::new("systemctl") + .args(["start", "display-manager"]) + .status() + { + Ok(s) if s.success() => eprintln!("display-manager restarted"), + Ok(s) => eprintln!( + "warning: display-manager restart failed (exit {})", + s.code().unwrap_or(-1) + ), + Err(e) => eprintln!("warning: could not restart display-manager: {e}"), + } + } +} + +/// Prompt the user to stop the display manager for GPU passthrough. +/// +/// Returns `true` if the user confirms. Always returns `false` when stdin +/// is not a terminal (non-interactive mode). 
+fn prompt_display_manager_stop(info: &openshell_vfio::DisplayBlockerInfo) -> bool { + if !std::io::stdin().is_terminal() { + return false; + } + + eprintln!(); + eprintln!( + "WARNING: GPU {} is in use by the display manager.", + info.pci_addr + ); + if !info.display_processes.is_empty() { + let procs: Vec = info + .display_processes + .iter() + .map(|(pid, comm)| format!("{comm} (PID {pid})")) + .collect(); + eprintln!(" Display server processes: {}", procs.join(", ")); + } + if info.has_active_outputs { + eprintln!(" Active display outputs are connected to this GPU."); + } + eprintln!(); + eprintln!("Stopping the display manager will terminate your graphical session."); + eprintln!("You will lose access to any open GUI applications."); + if !info.other_processes.is_empty() { + let procs: Vec = info + .other_processes + .iter() + .map(|(pid, comm)| format!("{comm} (PID {pid})")) + .collect(); + eprintln!(); + eprintln!( + "Other non-display processes are also using the GPU: {}", + procs.join(", ") + ); + eprintln!("These will also lose GPU access."); + } + eprintln!(); + eprintln!("The display manager will be restarted automatically when the VM exits."); + eprint!("Stop display-manager and proceed with GPU passthrough? [y/N] "); + + let mut input = String::new(); + if std::io::stdin().lock().read_line(&mut input).is_err() { + return false; + } + matches!(input.trim().to_lowercase().as_str(), "y" | "yes") +} + fn run(cli: Cli) -> Result> { if let Some(GatewayCommand::PrepareRootfs { force }) = &cli.command { let rootfs = openshell_vm::prepare_rootfs(cli.rootfs.clone(), &cli.name, *force)?; @@ -196,12 +316,16 @@ fn run(cli: Cli) -> Result> { return Err("openshell-vm exec requires a command when stdin is not a TTY".into()); } } + let exec_rootfs = if let Some(explicit) = cli.rootfs { + explicit + } else if cli.gpu.is_some() { + openshell_vm::named_gpu_rootfs_dir(&cli.name)? + } else { + openshell_vm::named_rootfs_dir(&cli.name)? 
+ }; return Ok(openshell_vm::exec_running_vm( openshell_vm::VmExecOptions { - rootfs: Some( - cli.rootfs - .unwrap_or(openshell_vm::named_rootfs_dir(&cli.name)?), - ), + rootfs: Some(exec_rootfs), command, workdir, env, @@ -223,12 +347,101 @@ fn run(cli: Cli) -> Result> { } }; - let rootfs = cli - .rootfs - .map_or_else(|| openshell_vm::ensure_named_rootfs(&cli.name), Ok)?; + let rootfs = if let Some(explicit) = cli.rootfs { + Ok(explicit) + } else if cli.gpu.is_some() { + openshell_vm::ensure_gpu_rootfs(&cli.name) + } else { + openshell_vm::ensure_named_rootfs(&cli.name) + }?; let gateway_name = openshell_vm::gateway_name(&cli.name)?; + // Check if the display manager is blocking GPU passthrough and offer + // to stop it interactively. The guard restarts display-manager on exit. + let _display_manager_guard: Option = if cli.gpu.is_some() { + let requested_bdf = match cli.gpu.as_deref() { + Some(addr) if addr != "auto" => Some(addr), + _ => None, + }; + + if let Some(blocker) = openshell_vfio::detect_display_blocker(requested_bdf) { + if prompt_display_manager_stop(&blocker) { + Some(DisplayManagerGuard::stop_display_manager()?) 
+ } else { + return Err(format!( + "GPU passthrough aborted: GPU {} is in use by the display manager.\n\ + To proceed, stop it manually before launching the VM:\n \ + sudo systemctl stop display-manager", + blocker.pci_addr + ) + .into()); + } + } else { + None + } + } else { + None + }; + + let (gpu_enabled, vfio_device, gpu_has_msix, _gpu_guard) = match cli.gpu { + Some(ref addr) if addr != "auto" => { + let state = openshell_vfio::prepare_gpu_for_passthrough(Some(addr))?; + let bdf = state.pci_addr.clone(); + let has_msix = state.has_msix; + ( + true, + Some(bdf), + has_msix, + Some(openshell_vfio::GpuBindGuard::new(state)), + ) + } + Some(_) => { + let state = openshell_vfio::prepare_gpu_for_passthrough(None)?; + let bdf = state.pci_addr.clone(); + let has_msix = state.has_msix; + ( + true, + Some(bdf), + has_msix, + Some(openshell_vfio::GpuBindGuard::new(state)), + ) + } + None => (false, None, true, None), + }; + + if let Some(ref guard) = _gpu_guard { + if let Some(state) = guard.state() { + if state.did_bind { + eprintln!( + "\nGPU recovery: if this process is force-killed (kill -9), \ + restore your GPU with:\n{}", + state.recovery_commands() + ); + } + } + } + + let backend_choice = match cli.backend.as_str() { + "qemu" => openshell_vm::VmBackendChoice::Qemu, + "libkrun" => { + if gpu_enabled { + return Err( + "--backend libkrun is incompatible with --gpu (libkrun does not support \ + VFIO passthrough). Use --backend auto or --backend qemu." 
+ .into(), + ); + } + openshell_vm::VmBackendChoice::Libkrun + } + "auto" => openshell_vm::VmBackendChoice::Auto, + other => { + return Err( + format!("unknown --backend: {other} (expected: auto, libkrun, qemu)").into(), + ); + } + }; + let mut config = if let Some(exec_path) = cli.exec { openshell_vm::VmConfig { rootfs, @@ -246,11 +459,27 @@ fn run(cli: Cli) -> Result> { reset: cli.reset, gateway_name, state_disk: None, + gpu_enabled, + gpu_has_msix, + vfio_device, + backend: backend_choice, } } else { let mut c = openshell_vm::VmConfig::gateway(rootfs); if !cli.port.is_empty() { c.port_map = cli.port; + let has_gateway = c.port_map.iter().any(|pm| { + pm.split(':').nth(1).and_then(|p| p.parse::().ok()) + == Some(openshell_vm::GUEST_GATEWAY_NODEPORT) + }); + if !has_gateway { + let gw_port = openshell_vm::GUEST_GATEWAY_NODEPORT; + c.port_map.push(format!("{gw_port}:{gw_port}")); + eprintln!( + "Auto-added gateway port mapping {gw_port}:{gw_port} \ + (required for health check and CLI access)" + ); + } } if let Some(v) = cli.vcpus { c.vcpus = v; @@ -261,6 +490,10 @@ fn run(cli: Cli) -> Result> { c.net = net_backend; c.reset = cli.reset; c.gateway_name = gateway_name; + c.gpu_enabled = gpu_enabled; + c.gpu_has_msix = gpu_has_msix; + c.vfio_device = vfio_device; + c.backend = backend_choice; if state_disk_disabled() { c.state_disk = None; } diff --git a/crates/openshell-vm/tests/vm_boot_smoke.rs b/crates/openshell-vm/tests/vm_boot_smoke.rs new file mode 100644 index 000000000..f16027129 --- /dev/null +++ b/crates/openshell-vm/tests/vm_boot_smoke.rs @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Non-GPU boot smoke tests for the QEMU backend. +//! +//! Boots a VM **without** VFIO/GPU passthrough and verifies the kernel boots +//! and init runs. This catches backend regressions on regular CI runners +//! that lack GPU hardware. +//! +//! 
Gated on `OPENSHELL_VM_BACKEND` — set to `qemu` to run the tests. +//! Skipped when the env var is absent. +//! +//! Requires the VM runtime bundle (vmlinux, virtiofsd, rootfs, and the +//! backend binary) to be installed. Set `OPENSHELL_VM_RUNTIME_DIR` or run +//! `mise run vm:bundle-runtime` first. +//! +//! Run explicitly: +//! +//! ```sh +//! OPENSHELL_VM_BACKEND=qemu cargo test -p openshell-vm --test vm_boot_smoke +//! ``` + +#![allow(unsafe_code)] + +use std::process::{Command, Stdio}; +use std::time::Duration; + +const GATEWAY: &str = env!("CARGO_BIN_EXE_openshell-vm"); + +fn runtime_bundle_dir() -> std::path::PathBuf { + std::path::Path::new(GATEWAY) + .parent() + .expect("openshell-vm binary has no parent") + .join("openshell-vm.runtime") +} + +fn require_bundle() { + let bundle = runtime_bundle_dir(); + if !bundle.is_dir() { + panic!( + "VM runtime bundle not found at {}. Run `mise run vm:bundle-runtime` first.", + bundle.display() + ); + } +} + +fn skip_unless_qemu() -> bool { + if std::env::var("OPENSHELL_VM_BACKEND").as_deref() != Ok("qemu") { + eprintln!("OPENSHELL_VM_BACKEND != qemu — skipping"); + return true; + } + false +} + +#[test] +fn qemu_exec_exits_cleanly() { + if skip_unless_qemu() { + return; + } + require_bundle(); + + let mut child = Command::new(GATEWAY) + .args(["--backend", "qemu", "--net", "none", "--exec", "/bin/true"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let timeout = Duration::from_secs(30); + let start = std::time::Instant::now(); + + loop { + match child.try_wait() { + Ok(Some(status)) => { + assert!( + status.success(), + "qemu --exec /bin/true exited with {status}" + ); + return; + } + Ok(None) => { + if start.elapsed() > timeout { + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGKILL) }; + let _ = child.wait(); + panic!("QEMU VM did not exit within {timeout:?}"); + } + std::thread::sleep(Duration::from_millis(500)); + } + Err(e) => panic!("error 
waiting for openshell-vm: {e}"), + } + } +} + +#[test] +fn qemu_boots_without_gpu() { + if skip_unless_qemu() { + return; + } + require_bundle(); + + if !nix_is_root() { + eprintln!("skipping full gateway boot — requires root for TAP networking"); + return; + } + + let mut child = Command::new(GATEWAY) + .args(["--backend", "qemu"]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("failed to start openshell-vm"); + + let addr: std::net::SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = std::time::Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "QEMU VM service on port 30051 not reachable within {timeout:?}" + ); +} + +fn nix_is_root() -> bool { + unsafe { libc::geteuid() == 0 } +} diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index b7e854677..59b133629 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -56,6 +56,7 @@ COPY crates/openshell-router/Cargo.toml crates/openshell-router/Cargo.toml COPY crates/openshell-sandbox/Cargo.toml crates/openshell-sandbox/Cargo.toml COPY crates/openshell-server/Cargo.toml crates/openshell-server/Cargo.toml COPY crates/openshell-tui/Cargo.toml crates/openshell-tui/Cargo.toml +COPY crates/openshell-vfio/Cargo.toml crates/openshell-vfio/Cargo.toml COPY crates/openshell-vm/Cargo.toml crates/openshell-vm/Cargo.toml COPY crates/openshell-core/build.rs crates/openshell-core/build.rs COPY proto/ proto/ @@ -73,6 +74,7 @@ RUN mkdir -p \ crates/openshell-sandbox/src \ crates/openshell-server/src \ crates/openshell-tui/src \ + crates/openshell-vfio/src \ crates/openshell-vm/src && \ touch 
crates/openshell-bootstrap/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-cli/src/main.rs && \ @@ -89,6 +91,7 @@ RUN mkdir -p \ touch crates/openshell-server/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-server/src/main.rs && \ touch crates/openshell-tui/src/lib.rs && \ + touch crates/openshell-vfio/src/lib.rs && \ touch crates/openshell-vm/src/lib.rs && \ printf 'fn main() {}\n' > crates/openshell-vm/src/main.rs diff --git a/tasks/scripts/vm/build-gpu-deps.sh b/tasks/scripts/vm/build-gpu-deps.sh new file mode 100755 index 000000000..7265a06c3 --- /dev/null +++ b/tasks/scripts/vm/build-gpu-deps.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build GPU passthrough dependencies for the QEMU backend. +# +# Builds virtiofsd from source. +# These are only needed on Linux for VFIO GPU passthrough. +# +# Artifacts produced: +# virtiofsd — filesystem daemon used by the QEMU backend +# +# The vmlinux kernel is extracted separately by build-libkrun.sh during +# the kernel build step. +# +# QEMU's own binary (qemu-system-x86_64) must be installed on the host +# separately — it is not built or downloaded by this script. +# Run `mise run vm:qemu-check` to validate QEMU prerequisites. 
+# +# Usage: +# ./build-gpu-deps.sh [--output-dir ] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/_lib.sh" +ROOT="$(vm_lib_root)" + +source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true + +VIRTIOFSD_VERSION="${VIRTIOFSD_VERSION:-v1.13.0}" +OUTPUT_DIR="${ROOT}/target/libkrun-build" + +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) OUTPUT_DIR="$2"; shift 2 ;; + *) echo "Unknown argument: $1" >&2; exit 1 ;; + esac +done + +if [ "$(uname -s)" != "Linux" ]; then + echo "Error: GPU passthrough is Linux-only" >&2 + exit 1 +fi + +mkdir -p "$OUTPUT_DIR" + +HOST_ARCH="$(uname -m)" +case "$HOST_ARCH" in + aarch64) VIRTIOFSD_ARCH="aarch64" ;; + x86_64) VIRTIOFSD_ARCH="x86_64" ;; + *) echo "Error: Unsupported architecture: ${HOST_ARCH}" >&2; exit 1 ;; +esac + +echo "==> Building virtiofsd ${VIRTIOFSD_VERSION} from source..." +VIRTIOFSD_SRC="$(mktemp -d)" +VIRTIOFSD_TARBALL_URL="https://gitlab.com/virtio-fs/virtiofsd/-/archive/${VIRTIOFSD_VERSION}/virtiofsd-${VIRTIOFSD_VERSION}.tar.gz" +curl -fsSL "$VIRTIOFSD_TARBALL_URL" | tar -xzf - -C "$VIRTIOFSD_SRC" --strip-components=1 +rm -f "${VIRTIOFSD_SRC}/Cargo.lock" + +CARGO_CMD="cargo" +if command -v mise &>/dev/null; then + CARGO_CMD="mise exec -- cargo" +fi +# Prevent external CARGO_TARGET_DIR from redirecting build output away from +# the local temp directory (e.g. Cursor sandbox sets this globally). 
+unset CARGO_TARGET_DIR +$CARGO_CMD build --release --manifest-path "${VIRTIOFSD_SRC}/Cargo.toml" +cp "${VIRTIOFSD_SRC}/target/release/virtiofsd" "${OUTPUT_DIR}/virtiofsd" +chmod +x "${OUTPUT_DIR}/virtiofsd" +rm -rf "$VIRTIOFSD_SRC" +echo " Built: virtiofsd" + +echo "" +echo "==> GPU passthrough binaries ready in ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/virtiofsd" 2>/dev/null || true diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 9e2217f50..c2a1a6d76 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -210,9 +210,25 @@ if [ -f openshell.kconfig ]; then # Re-run olddefconfig to fill in any new symbols introduced by the fragment. make -C "${KERNEL_SOURCES}" ARCH="${KARCH}" olddefconfig + # Force-enable hidden Kconfig bools required by out-of-tree NVIDIA modules. + # CONFIG_MMU_NOTIFIER is a hidden bool (no prompt) that can only be + # activated via "select" from another in-tree option. olddefconfig and + # syncconfig both strip it if nothing selects it. NVIDIA UVM needs it for + # GPU memory management. We patch the DRM Kconfig (already enabled as + # CONFIG_DRM=y) to select MMU_NOTIFIER, then re-run olddefconfig so the + # dependency chain (INTERVAL_TREE) is resolved properly. + if ! grep -q "select MMU_NOTIFIER" "${KERNEL_SOURCES}/drivers/gpu/drm/Kconfig"; then + sed -i '/^menuconfig DRM$/,/^[[:space:]]*select VIDEO/ { + /^[[:space:]]*select VIDEO/a\ +\tselect MMU_NOTIFIER + }' "${KERNEL_SOURCES}/drivers/gpu/drm/Kconfig" + echo " Patched DRM Kconfig to select MMU_NOTIFIER" + fi + make -C "${KERNEL_SOURCES}" ARCH="${KARCH}" olddefconfig + # Verify that the key options were actually applied. 
all_ok=true - for opt in CONFIG_BRIDGE CONFIG_NETFILTER CONFIG_NF_NAT; do + for opt in CONFIG_BRIDGE CONFIG_NETFILTER CONFIG_NF_NAT CONFIG_X86_PAT CONFIG_MMU_NOTIFIER CONFIG_FW_LOADER; do val="$(grep "^${opt}=" "${KERNEL_SOURCES}/.config" 2>/dev/null || true)" if [ -n "$val" ]; then echo " ${opt}: ${val#*=}" @@ -239,6 +255,25 @@ make -j"$(nproc)" cp libkrunfw.so* "$OUTPUT_DIR/" echo " Built: $(ls "$OUTPUT_DIR"/libkrunfw.so* | xargs -n1 basename | tr '\n' ' ')" +# Copy vmlinux kernel image for QEMU GPU passthrough. +# This is the uncompressed kernel built by libkrunfw's kernel build. +if [ -f "${KERNEL_SOURCES}/vmlinux" ]; then + cp "${KERNEL_SOURCES}/vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for QEMU GPU passthrough" +elif [ -f "vmlinux" ]; then + cp "vmlinux" "$OUTPUT_DIR/vmlinux" + echo " Copied vmlinux for QEMU GPU passthrough" +else + echo " Warning: vmlinux not found in kernel build tree (GPU passthrough will not be available)" >&2 +fi + +# Export kernel release string for downstream scripts (nvidia modules, rootfs). +# Uses kernelrelease (includes CONFIG_LOCALVERSION) so that module vermagic, +# rootfs module path, and the kernel's uname -r all agree. +KERNEL_RELEASE="$(make -s -C "${KERNEL_SOURCES}" kernelrelease)" +echo "${KERNEL_RELEASE}" > "${OUTPUT_DIR}/kernel-version.txt" +echo " Exported kernel version: ${KERNEL_RELEASE}" + cd "$BUILD_DIR" # ── Build libkrun (VMM) ───────────────────────────────────────────────── diff --git a/tasks/scripts/vm/build-nvidia-modules.sh b/tasks/scripts/vm/build-nvidia-modules.sh new file mode 100755 index 000000000..064c4bb0c --- /dev/null +++ b/tasks/scripts/vm/build-nvidia-modules.sh @@ -0,0 +1,182 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build NVIDIA open kernel modules against the VM kernel source tree. 
+#
+# Clones the NVIDIA open-gpu-kernel-modules repo at a pinned driver tag
+# and compiles the kernel modules against the kernel built by
+# build-libkrun.sh. The resulting .ko files are placed in the output
+# directory for injection into the GPU rootfs by build-rootfs.sh.
+#
+# Prerequisites:
+#   - Kernel source tree built by build-libkrun.sh
+#     (target/libkrun-build/libkrunfw/linux-<version>/)
+#   - Build tools: make, gcc
+#
+# Usage:
+#   ./build-nvidia-modules.sh [--output-dir <dir>]
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/_lib.sh"
+ROOT="$(vm_lib_root)"
+
+source "${ROOT}/crates/openshell-vm/pins.env" 2>/dev/null || true
+
+NVIDIA_DRIVER_VERSION="${NVIDIA_DRIVER_VERSION:-570}"
+
+BUILD_DIR="${ROOT}/target/libkrun-build"
+OUTPUT_DIR="${BUILD_DIR}/nvidia-modules"
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --output-dir) OUTPUT_DIR="$2"; shift 2 ;;
+    *) echo "Unknown argument: $1" >&2; exit 1 ;;
+  esac
+done
+
+if [ "$(uname -s)" != "Linux" ]; then
+  echo "Error: NVIDIA GPU module build is Linux-only" >&2
+  exit 1
+fi
+
+HOST_ARCH="$(uname -m)"
+if [ "$HOST_ARCH" != "x86_64" ]; then
+  echo "Error: NVIDIA GPU passthrough is only supported on x86_64 (got: ${HOST_ARCH})" >&2
+  exit 1
+fi
+
+# ── Locate the kernel source tree ────────────────────────────────────────
+
+LIBKRUNFW_DIR="${BUILD_DIR}/libkrunfw"
+if [ ! -f "${LIBKRUNFW_DIR}/Makefile" ]; then
+  echo "ERROR: libkrunfw not found at ${LIBKRUNFW_DIR}" >&2
+  echo "       The GPU module build requires the kernel source tree." >&2
+  echo "       Run: FROM_SOURCE=1 mise run vm:setup" >&2
+  exit 1
+fi
+
+KERNEL_DIR_NAME="$(grep '^KERNEL_VERSION' "${LIBKRUNFW_DIR}/Makefile" | head -1 | awk '{print $3}')"
+KERNEL_SOURCES="${LIBKRUNFW_DIR}/${KERNEL_DIR_NAME}"
+
+if [ ! -f "${KERNEL_SOURCES}/.config" ]; then
+  echo "ERROR: Kernel source tree not found at ${KERNEL_SOURCES}" >&2
+  echo "       Run: FROM_SOURCE=1 mise run vm:setup" >&2
+  exit 1
+fi
+
+if [ !
-f "${KERNEL_SOURCES}/Module.symvers" ]; then + echo "ERROR: Kernel tree at ${KERNEL_SOURCES} is missing Module.symvers." >&2 + echo " The kernel must have been fully built." >&2 + echo " Run: FROM_SOURCE=1 mise run vm:setup" >&2 + exit 1 +fi + +# Use kernelrelease to get the full version string (includes CONFIG_LOCALVERSION). +KERNEL_VERSION="$(make -s -C "${KERNEL_SOURCES}" kernelrelease)" +echo "==> Building NVIDIA ${NVIDIA_DRIVER_VERSION} kernel modules for kernel ${KERNEL_VERSION}" +echo " Kernel source: ${KERNEL_SOURCES}" +echo " Output: ${OUTPUT_DIR}" +echo "" + +# ── Prepare kernel tree for out-of-tree module builds ──────────────────── + +echo "==> Preparing kernel tree for external module builds..." +make -C "${KERNEL_SOURCES}" modules_prepare -j"$(nproc)" + +# ── Clone or reuse NVIDIA open-gpu-kernel-modules ──────────────────────── + +NVIDIA_DRIVER_TAG="${NVIDIA_DRIVER_TAG:-}" +if [ -z "${NVIDIA_DRIVER_TAG}" ]; then + echo "ERROR: NVIDIA_DRIVER_TAG not set in pins.env or environment." >&2 + echo " This must be the exact driver version tag matching the" >&2 + echo " nvidia-headless-${NVIDIA_DRIVER_VERSION}-open APT package." >&2 + echo " Find it: apt-cache show nvidia-headless-${NVIDIA_DRIVER_VERSION}-open | grep Version" >&2 + echo " Example: NVIDIA_DRIVER_TAG=570.86.16" >&2 + exit 1 +fi + +NVIDIA_SRC="${BUILD_DIR}/open-gpu-kernel-modules" + +if [ -d "${NVIDIA_SRC}" ]; then + EXISTING_TAG="$(git -C "${NVIDIA_SRC}" describe --tags --exact-match HEAD 2>/dev/null || true)" + if [ "${EXISTING_TAG}" = "${NVIDIA_DRIVER_TAG}" ]; then + echo "==> Using cached NVIDIA source (tag ${NVIDIA_DRIVER_TAG})" + else + echo "==> NVIDIA source tag mismatch (have: ${EXISTING_TAG:-unknown}, want: ${NVIDIA_DRIVER_TAG}), re-cloning..." + rm -rf "${NVIDIA_SRC}" + fi +fi + +if [ ! -d "${NVIDIA_SRC}" ]; then + echo "==> Cloning NVIDIA open-gpu-kernel-modules (tag ${NVIDIA_DRIVER_TAG})..." 
+ git clone --depth 1 --branch "${NVIDIA_DRIVER_TAG}" \ + https://github.com/NVIDIA/open-gpu-kernel-modules.git "${NVIDIA_SRC}" +fi + +# ── Build the kernel modules ───────────────────────────────────────────── + +echo "" +echo "==> Compiling NVIDIA kernel modules (this may take 2-5 minutes)..." +make -C "${NVIDIA_SRC}" -j"$(nproc)" modules \ + SYSSRC="${KERNEL_SOURCES}" \ + KERNEL_UNAME="${KERNEL_VERSION}" + +# ── Collect built modules ──────────────────────────────────────────────── + +mkdir -p "${OUTPUT_DIR}" + +# The NVIDIA kbuild produces modules at deterministic paths under kernel-open/. +declare -A MODULE_PATHS=( + [nvidia.ko]="kernel-open/nvidia.ko" + [nvidia-uvm.ko]="kernel-open/nvidia-uvm.ko" + [nvidia-modeset.ko]="kernel-open/nvidia-modeset.ko" + [nvidia-drm.ko]="kernel-open/nvidia-drm.ko" + [nvidia-peermem.ko]="kernel-open/nvidia-peermem.ko" +) + +EXPECTED_MODULES=(nvidia.ko nvidia-uvm.ko nvidia-modeset.ko nvidia-drm.ko nvidia-peermem.ko) + +for mod in "${EXPECTED_MODULES[@]}"; do + src_path="${NVIDIA_SRC}/${MODULE_PATHS[$mod]}" + if [ -f "$src_path" ]; then + cp "$src_path" "${OUTPUT_DIR}/" + echo " Built: $mod ($(du -h "$src_path" | cut -f1))" + fi +done + +# Normalize permissions. +chmod 644 "${OUTPUT_DIR}"/*.ko 2>/dev/null || true + +# nvidia-peermem.ko is optional (GPUDirect RDMA); the other four are required. +REQUIRED_MODULES=(nvidia.ko nvidia-uvm.ko nvidia-modeset.ko nvidia-drm.ko) +for mod in "${REQUIRED_MODULES[@]}"; do + if [ ! -f "${OUTPUT_DIR}/${mod}" ]; then + echo "ERROR: Required module ${mod} was not produced by the build." >&2 + echo " Check build output above for compilation errors." >&2 + exit 1 + fi +done + +echo "" +echo "==> NVIDIA modules ready at ${OUTPUT_DIR}" +ls -lah "${OUTPUT_DIR}/"*.ko + +# Verify module vermagic matches the kernel. +echo "" +echo "==> Verifying module compatibility..." 
+if command -v modinfo &>/dev/null; then + VERMAGIC="$(modinfo -F vermagic "${OUTPUT_DIR}/nvidia.ko" 2>/dev/null || true)" + if [ -n "$VERMAGIC" ]; then + echo " vermagic: ${VERMAGIC}" + if echo "$VERMAGIC" | grep -q "^${KERNEL_VERSION} "; then + echo " OK: modules match kernel ${KERNEL_VERSION}" + else + echo " ERROR: vermagic does not start with ${KERNEL_VERSION}" >&2 + echo " Modules will fail to load in the VM." >&2 + exit 1 + fi + fi +fi diff --git a/tasks/scripts/vm/build-rootfs-tarball.sh b/tasks/scripts/vm/build-rootfs-tarball.sh index 76e4f6297..d41b2ff25 100755 --- a/tasks/scripts/vm/build-rootfs-tarball.sh +++ b/tasks/scripts/vm/build-rootfs-tarball.sh @@ -9,36 +9,43 @@ # 2. Compresses it to a zstd tarball for embedding # # Usage: -# ./build-rootfs-tarball.sh [--base] +# ./build-rootfs-tarball.sh [--base] [--gpu] # # Options: # --base Build a base rootfs (~200-300MB) without pre-loaded images. # First boot will be slower but binary size is much smaller. # Default: full rootfs with pre-loaded images (~2GB+). +# --gpu Include NVIDIA drivers and nvidia-container-toolkit for GPU +# passthrough. Only supported on x86_64. # -# The resulting tarball is placed at target/vm-runtime-compressed/rootfs.tar.zst -# for inclusion in the embedded binary build. +# The resulting tarball is placed at: +# target/vm-runtime-compressed/rootfs.tar.zst (standard) +# target/vm-runtime-compressed/rootfs-gpu.tar.zst (--gpu) set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." 
&& pwd)" ROOTFS_BUILD_DIR="${ROOT}/target/rootfs-build" OUTPUT_DIR="${ROOT}/target/vm-runtime-compressed" -OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" # Parse arguments BASE_ONLY=false +GPU=false for arg in "$@"; do case "$arg" in --base) BASE_ONLY=true ;; + --gpu) + GPU=true + ;; --help|-h) - echo "Usage: $0 [--base]" + echo "Usage: $0 [--base] [--gpu]" echo "" echo "Options:" echo " --base Build base rootfs (~200-300MB) without pre-loaded images" echo " First boot will be slower but binary size is much smaller" + echo " --gpu Include NVIDIA drivers for GPU passthrough (x86_64 only)" exit 0 ;; *) @@ -63,28 +70,33 @@ if ! docker info &>/dev/null; then exit 1 fi +ROOTFS_ARGS=() +MODE_DESC="full (pre-loaded images, pre-initialized, ~2GB+)" if [ "$BASE_ONLY" = true ]; then - echo "==> Building BASE rootfs for embedding" - echo " Build dir: ${ROOTFS_BUILD_DIR}" - echo " Output: ${OUTPUT}" - echo " Mode: base (no pre-loaded images, ~200-300MB)" - echo "" - - # Build base rootfs - echo "==> Step 1/2: Building base rootfs..." - "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" --base "${ROOTFS_BUILD_DIR}" + ROOTFS_ARGS+=(--base) + MODE_DESC="base (no pre-loaded images, ~200-300MB)" +fi +if [ "$GPU" = true ]; then + ROOTFS_ARGS+=(--gpu) + MODE_DESC="${MODE_DESC}, GPU (NVIDIA drivers included)" +fi + +# GPU rootfs gets a distinct tarball name so both can coexist in the output dir +if [ "$GPU" = true ]; then + OUTPUT="${OUTPUT_DIR}/rootfs-gpu.tar.zst" else - echo "==> Building FULL rootfs for embedding" - echo " Build dir: ${ROOTFS_BUILD_DIR}" - echo " Output: ${OUTPUT}" - echo " Mode: full (pre-loaded images, pre-initialized, ~2GB+)" - echo "" - - # Build full rootfs - echo "==> Step 1/2: Building full rootfs (this may take 10-15 minutes)..." 
- "${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" "${ROOTFS_BUILD_DIR}" + OUTPUT="${OUTPUT_DIR}/rootfs.tar.zst" fi +echo "==> Building rootfs for embedding" +echo " Build dir: ${ROOTFS_BUILD_DIR}" +echo " Output: ${OUTPUT}" +echo " Mode: ${MODE_DESC}" +echo "" + +echo "==> Step 1/2: Building rootfs..." +"${ROOT}/crates/openshell-vm/scripts/build-rootfs.sh" "${ROOTFS_ARGS[@]}" "${ROOTFS_BUILD_DIR}" + # Compress to tarball echo "" echo "==> Step 2/2: Compressing rootfs to tarball..." @@ -107,10 +119,13 @@ echo "" echo "==> Rootfs tarball created successfully!" echo " Output: ${OUTPUT}" echo " Compressed: $(du -sh "${OUTPUT}" | cut -f1)" +TYPE_DESC="full (first boot ~3-5s, images pre-loaded)" if [ "$BASE_ONLY" = true ]; then - echo " Type: base (first boot ~30-60s, images pulled on demand)" -else - echo " Type: full (first boot ~3-5s, images pre-loaded)" + TYPE_DESC="base (first boot ~30-60s, images pulled on demand)" +fi +if [ "$GPU" = true ]; then + TYPE_DESC="${TYPE_DESC}, GPU" fi +echo " Type: ${TYPE_DESC}" echo "" echo "Next step: mise run vm:build" diff --git a/tasks/scripts/vm/bundle-vm-runtime.sh b/tasks/scripts/vm/bundle-vm-runtime.sh index 6c21e511d..83d53dcac 100755 --- a/tasks/scripts/vm/bundle-vm-runtime.sh +++ b/tasks/scripts/vm/bundle-vm-runtime.sh @@ -46,6 +46,9 @@ TARGETS=( "${ROOT}/target/release" ) +COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed" +GPU_ROOTFS_TARBALL="${COMPRESSED_DIR}/rootfs-gpu.tar.zst" + for target_dir in "${TARGETS[@]}"; do # Only stage if the binary exists (avoid creating orphan runtime dirs) if [ ! -f "${target_dir}/openshell-vm" ] && [ ! 
-f "${target_dir}/openshell-vm.d" ]; then @@ -61,5 +64,11 @@ for target_dir in "${TARGETS[@]}"; do install -m 0755 "$file" "${runtime_dir}/${name}" done + # Stage the GPU rootfs tarball if it was built + if [ -f "${GPU_ROOTFS_TARBALL}" ]; then + install -m 0644 "${GPU_ROOTFS_TARBALL}" "${runtime_dir}/rootfs-gpu.tar.zst" + echo "staged GPU rootfs tarball in ${runtime_dir}" + fi + echo "staged runtime bundle in ${runtime_dir}" done diff --git a/tasks/scripts/vm/compress-vm-runtime.sh b/tasks/scripts/vm/compress-vm-runtime.sh index efada8a2e..69e1d5658 100755 --- a/tasks/scripts/vm/compress-vm-runtime.sh +++ b/tasks/scripts/vm/compress-vm-runtime.sh @@ -91,11 +91,23 @@ if [ -z "${VM_RUNTIME_TARBALL:-}" ] && _check_compressed_artifacts "$OUTPUT_DIR" for f in "${OUTPUT_DIR}"/*.zst; do [ -f "$f" ] || continue name="$(basename "${f%.zst}")" - # Skip rootfs tarball — bundle-vm-runtime.sh doesn't need it - [[ "$name" == rootfs.tar ]] && continue + # Skip rootfs tarballs — bundle-vm-runtime.sh doesn't need them + [[ "$name" == rootfs.tar || "$name" == rootfs-gpu.tar ]] && continue zstd -d "$f" -o "${WORK_DIR}/${name}" -f -q chmod 0755 "${WORK_DIR}/${name}" done + # GPU passthrough binaries live in libkrun-build but are not part of the + # core compressed set. Copy them into WORK_DIR so bundle-vm-runtime.sh + # stages them alongside the core libraries. 
+ _BUILD_DIR="${ROOT}/target/libkrun-build" + for gpu_bin in vmlinux virtiofsd; do + if [ -f "${_BUILD_DIR}/${gpu_bin}" ]; then + cp "${_BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" + chmod 0755 "${WORK_DIR}/${gpu_bin}" + echo " Included GPU binary: ${gpu_bin}" + fi + done + echo " Decompressed files:" ls -lah "$WORK_DIR" @@ -126,8 +138,9 @@ if [ -n "${VM_RUNTIME_TARBALL:-}" ]; then echo "" compress_dir "$WORK_DIR" "$OUTPUT_DIR" - # Check for rootfs tarball (built separately) + # Check for rootfs tarballs (built separately) ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs.tar.zst" + GPU_ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs-gpu.tar.zst" if [ -f "$ROOTFS_TARBALL" ]; then echo " rootfs.tar.zst: $(du -h "$ROOTFS_TARBALL" | cut -f1) (pre-built)" else @@ -135,6 +148,9 @@ if [ -n "${VM_RUNTIME_TARBALL:-}" ]; then echo "Note: rootfs.tar.zst not found." echo " To build one, run: mise run vm:rootfs -- --base" fi + if [ -f "$GPU_ROOTFS_TARBALL" ]; then + echo " rootfs-gpu.tar.zst: $(du -h "$GPU_ROOTFS_TARBALL" | cut -f1) (pre-built)" + fi echo "" echo "==> Compressed artifacts in ${OUTPUT_DIR}:" @@ -256,6 +272,14 @@ case "$(uname -s)-$(uname -m)" in "https://github.com/containers/gvisor-tap-vsock/releases/download/${GVPROXY_VERSION}/gvproxy-linux-${GVPROXY_ARCH}" chmod +x "$WORK_DIR/gvproxy" fi + + # GPU passthrough binaries (optional — included when present in libkrun-build) + for gpu_bin in vmlinux virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "$WORK_DIR/" + echo " Included GPU binary: ${gpu_bin}" + fi + done ;; *) @@ -272,16 +296,20 @@ ls -lah "$WORK_DIR" echo "" compress_dir "$WORK_DIR" "$OUTPUT_DIR" -# Check for rootfs tarball (built separately by build-rootfs-tarball.sh) +# Check for rootfs tarballs (built separately by build-rootfs-tarball.sh) ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs.tar.zst" +GPU_ROOTFS_TARBALL="${OUTPUT_DIR}/rootfs-gpu.tar.zst" if [ -f "$ROOTFS_TARBALL" ]; then echo " rootfs.tar.zst: $(du -h "$ROOTFS_TARBALL" | cut -f1) (pre-built)" 
else echo "" echo "Note: rootfs.tar.zst not found." - echo " To build one, run: mise run vm:rootfs -- --base" - echo " Without it, the binary will still work but require the rootfs" - echo " to be built separately on first run." + echo " To build one, run: mise run vm:rootfs -- --base" + echo " Without it, the binary will still work but require the rootfs" + echo " to be built separately on first run." +fi +if [ -f "$GPU_ROOTFS_TARBALL" ]; then + echo " rootfs-gpu.tar.zst: $(du -h "$GPU_ROOTFS_TARBALL" | cut -f1) (pre-built)" fi echo "" diff --git a/tasks/scripts/vm/download-kernel-runtime.sh b/tasks/scripts/vm/download-kernel-runtime.sh index 8f0427af9..5e60d3c75 100755 --- a/tasks/scripts/vm/download-kernel-runtime.sh +++ b/tasks/scripts/vm/download-kernel-runtime.sh @@ -81,11 +81,11 @@ DOWNLOAD_DIR="${ROOT}/target/vm-runtime-download" mkdir -p "$DOWNLOAD_DIR" "$OUTPUT_DIR" echo "==> Downloading ${TARBALL_NAME} from ${RELEASE_TAG}..." +rm -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" gh release download "${RELEASE_TAG}" \ --repo "${REPO}" \ --pattern "${TARBALL_NAME}" \ - --dir "${DOWNLOAD_DIR}" \ - --clobber + --dir "${DOWNLOAD_DIR}" if [ ! -f "${DOWNLOAD_DIR}/${TARBALL_NAME}" ]; then echo "Error: Download failed — ${TARBALL_NAME} not found." 
>&2 diff --git a/tasks/scripts/vm/package-vm-runtime.sh b/tasks/scripts/vm/package-vm-runtime.sh index f97eec870..7f5e908c6 100755 --- a/tasks/scripts/vm/package-vm-runtime.sh +++ b/tasks/scripts/vm/package-vm-runtime.sh @@ -84,6 +84,13 @@ case "$PLATFORM" in versioned="$(ls "${PACKAGE_DIR}"/libkrunfw.so.5.* 2>/dev/null | head -n1 || true)" [ -n "$versioned" ] && cp "$versioned" "${PACKAGE_DIR}/libkrunfw.so.5" fi + # GPU passthrough binaries (optional — only included if present) + for gpu_bin in vmlinux virtiofsd; do + if [ -f "${BUILD_DIR}/${gpu_bin}" ]; then + cp "${BUILD_DIR}/${gpu_bin}" "${PACKAGE_DIR}/" + echo " Included GPU passthrough binary: ${gpu_bin}" + fi + done ;; darwin-aarch64) cp "${BUILD_DIR}/libkrun.dylib" "${PACKAGE_DIR}/" diff --git a/tasks/scripts/vm/qemu-check.sh b/tasks/scripts/vm/qemu-check.sh new file mode 100755 index 000000000..8629ff276 --- /dev/null +++ b/tasks/scripts/vm/qemu-check.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Validate QEMU host prerequisites for GPU passthrough. +# +# Checks that qemu-system-x86_64, vhost-vsock support, and required +# runtime artifacts (vmlinux, virtiofsd) are available. 
+#
+# Usage:
+#   ./qemu-check.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/_lib.sh"
+ROOT="$(vm_lib_root)"
+
+RUNTIME_DIR="${ROOT}/target/libkrun-build"
+
+pass=0
+fail=0
+
+# NOTE: avoid ((pass++)) here — with pass=0 the arithmetic command evaluates
+# to 0 and returns exit status 1, which aborts the script under `set -e`.
+ok() { echo " [OK] $1"; pass=$((pass+1)); }
+miss() { echo " [MISS] $1"; fail=$((fail+1)); }
+
+echo "==> QEMU host prerequisite check"
+echo ""
+
+# ── qemu-system-x86_64 ──────────────────────────────────────────────────
+
+echo "--- QEMU binary ---"
+if command -v qemu-system-x86_64 &>/dev/null; then
+  version="$(qemu-system-x86_64 --version | head -n1)"
+  ok "qemu-system-x86_64 found: ${version}"
+else
+  miss "qemu-system-x86_64 not found (install: sudo apt install qemu-system-x86)"
+fi
+
+# ── vhost-vsock ──────────────────────────────────────────────────────────
+
+echo "--- vhost-vsock ---"
+if [ -e /dev/vhost-vsock ]; then
+  ok "/dev/vhost-vsock exists"
+elif lsmod 2>/dev/null | grep -q vhost_vsock; then
+  ok "vhost_vsock module loaded (but /dev/vhost-vsock missing — check permissions)"
+else
+  miss "vhost_vsock not loaded (hint: sudo modprobe vhost_vsock)"
+fi
+
+# ── Runtime artifacts ────────────────────────────────────────────────────
+
+echo "--- Runtime artifacts (${RUNTIME_DIR}) ---"
+
+if [ -f "${RUNTIME_DIR}/vmlinux" ]; then
+  ok "vmlinux found"
+else
+  miss "vmlinux not found (run: FROM_SOURCE=1 mise run vm:setup)"
+fi
+
+if [ -f "${RUNTIME_DIR}/virtiofsd" ]; then
+  ok "virtiofsd found"
+else
+  miss "virtiofsd not found (run: mise run vm:gpu-deps)"
+fi
+
+# ── Summary ──────────────────────────────────────────────────────────────
+
+echo ""
+echo "==> Summary: ${pass} passed, ${fail} missing"
+
+if [ "$fail" -gt 0 ]; then
+  echo ""
+  echo "Fix the missing prerequisites above before running QEMU GPU passthrough."
+  exit 1
+fi
+
+echo ""
+echo "All QEMU prerequisites satisfied."
+exit 0 diff --git a/tasks/scripts/vm/sync-vm-rootfs.sh b/tasks/scripts/vm/sync-vm-rootfs.sh index 727a9dd18..2c22e360b 100755 --- a/tasks/scripts/vm/sync-vm-rootfs.sh +++ b/tasks/scripts/vm/sync-vm-rootfs.sh @@ -141,6 +141,22 @@ fi patch_vm_helmchart "${MANIFEST_DST}/openshell-helmchart.yaml" patch_vm_helmchart "${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests/openshell-helmchart.yaml" +# ── GPU manifests ────────────────────────────────────────────────────── +# Only sync if the rootfs was built with --gpu (sentinel file present). +GPU_MANIFEST_SRC="${ROOT}/crates/openshell-vm/scripts/gpu-manifests" +GPU_MANIFEST_DST="${ROOTFS_DIR}/opt/openshell/gpu-manifests" +if [ -f "${ROOTFS_DIR}/opt/openshell/.rootfs-gpu" ] && [ -d "${GPU_MANIFEST_SRC}" ]; then + mkdir -p "${GPU_MANIFEST_DST}" + for manifest in "${GPU_MANIFEST_SRC}"/*.yaml; do + [ -f "$manifest" ] || continue + base=$(basename "$manifest") + if ! cmp -s "$manifest" "${GPU_MANIFEST_DST}/${base}" 2>/dev/null; then + cp "$manifest" "${GPU_MANIFEST_DST}/${base}" + echo " updated: /opt/openshell/gpu-manifests/${base}" + fi + done +fi + # ── Gateway image tarball ────────────────────────────────────────────── # The VM rootfs airgap-imports openshell/gateway:dev from k3s/agent/images/. # Keep that tarball in sync with the local Docker image so `mise run e2e:vm` diff --git a/tasks/scripts/vm/vm-setup.sh b/tasks/scripts/vm/vm-setup.sh index e7ae06d08..8afd3883d 100755 --- a/tasks/scripts/vm/vm-setup.sh +++ b/tasks/scripts/vm/vm-setup.sh @@ -81,6 +81,11 @@ if [ "$FROM_SOURCE" = "1" ]; then linux-*) # Linux: build both libkrunfw and libkrun in one go "${ROOT}/tasks/scripts/vm/build-libkrun.sh" + if [ "${GPU:-0}" = "1" ]; then + echo "" + echo "==> Building GPU passthrough dependencies..." 
+ "${ROOT}/tasks/scripts/vm/build-gpu-deps.sh" + fi ;; esac echo "" diff --git a/tasks/vm.toml b/tasks/vm.toml index ca06b08c1..8c5fd1afc 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -5,6 +5,10 @@ # # Workflow: # mise run vm:setup # one-time: download pre-built runtime (~30s) +# # (with FROM_SOURCE=1: builds kernel + libkrun + GPU deps) +# mise run vm:gpu-deps # (standalone) build GPU passthrough binaries separately +# mise run vm:nvidia-modules # (GPU only) build NVIDIA kernel modules +# mise run vm:rootfs -- --base --gpu # build GPU rootfs with NVIDIA drivers # mise run vm # build + run the VM # mise run vm:clean # wipe everything and start over # @@ -26,7 +30,14 @@ run = [ description = "Build the openshell-vm binary with embedded runtime" run = [ "tasks/scripts/vm/compress-vm-runtime.sh", - "OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed cargo build -p openshell-vm", + """ + # The embedded rootfs.tar.zst can exceed 2 GiB, which overflows x86_64's + # default small code model (R_X86_64_PC32 ±2 GiB limit). Use the large + # code model so include_bytes!() blobs of any size link correctly. 
+ RUSTFLAGS="${RUSTFLAGS:-} -C code-model=large" \ + OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed \ + cargo build -p openshell-vm + """, "tasks/scripts/vm/codesign-openshell-vm.sh", "tasks/scripts/vm/bundle-vm-runtime.sh", ] @@ -39,6 +50,18 @@ run = "tasks/scripts/vm/vm-setup.sh" description = "Build the VM rootfs tarball (use -- --base for lightweight)" run = "tasks/scripts/vm/build-rootfs-tarball.sh" +["vm:nvidia-modules"] +description = "Build NVIDIA kernel modules for GPU VM rootfs (requires FROM_SOURCE=1 vm:setup)" +run = "tasks/scripts/vm/build-nvidia-modules.sh" + +["vm:gpu-deps"] +description = "Build GPU passthrough dependencies (virtiofsd) for the QEMU backend" +run = "tasks/scripts/vm/build-gpu-deps.sh" + +["vm:qemu-check"] +description = "Validate QEMU host prerequisites for GPU passthrough" +run = "tasks/scripts/vm/qemu-check.sh" + ["vm:clean"] description = "Remove all VM cached artifacts (runtime, rootfs, builds)" run = "tasks/scripts/vm/vm-clean.sh"